# [Notebook 1]: Build Vector Store and Index

In [0]:
%pip install pyth --upgrade --quiet
%pip install openai --upgrade --quiet
# Restart the Python process so the packages upgraded above are the ones later cells import.
# NOTE(review): `pyth` is never imported in the visible notebook — confirm this package name is intended.
dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m


### 1. Load in the past maintenance logs along with image filenames

In [0]:
import pickle

# Deserialize the saved logs; later cells iterate `full_logs` as (filename, log) pairs.
# pickle can execute code embedded in the file, so `data.pkl` must come from a trusted source.
with open('data.pkl', mode='rb') as pickle_handle:  # binary mode: pickle data is bytes
    full_logs = pickle.load(pickle_handle)

### 2. Use DBRX to create short summaries of each log

In [0]:
from openai import OpenAI

import os

# Access token for the Databricks serving endpoints. Prefer supplying it through
# the DATABRICKS_TOKEN environment variable so the secret never has to be typed
# into (and saved with) the notebook; the placeholder below is only a fallback.
DATABRICKS_ACCESS_TOKEN = os.environ.get("DATABRICKS_TOKEN", "YOUR_DATABRICKS_TOKEN")

# OpenAI client pointed at the workspace's serving endpoints instead of the
# default OpenAI API URL.
client = OpenAI(
    api_key=DATABRICKS_ACCESS_TOKEN,
    base_url="https://dbc-f499a870-66c0.cloud.databricks.com/serving-endpoints",
)

def create_summaries_of_prior_logs(openai_client, full_logs):
    """Ask the chat model for a short summary of every maintenance log.

    Args:
        openai_client: Client object exposing ``chat.completions.create``.
        full_logs: Iterable of ``(filename, log)`` pairs.

    Returns:
        A list of ``(filename, summary, log)`` tuples, in input order.
    """

    def request_summary(log):
        # The prompt text, including its leading indentation, is sent to the model verbatim.
        prompt = f"""
            I'll provide you with text for a maintenance inspection report below. Your job is to take this report and provide a summary that is less than 77 tokens and captures the important synthesis of the text. Only consider critical pieces of observations and recommendations. Always remember to include the recommendation (summarized) in your summary. Provide nothing but the summary, don't preface it with anything.

            ####
            {log}
        """
        conversation = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ]
        completion = openai_client.chat.completions.create(
            model="databricks-dbrx-instruct",
            messages=conversation,
            max_tokens=4096,
        )
        # Only the first choice's text is used as the summary.
        return completion.choices[0].message.content

    return [(filename, request_summary(log), log) for filename, log in full_logs]

# Summarize every loaded log; this issues one chat-completion request per log.
all_summaries = create_summaries_of_prior_logs(client, full_logs)

### 3. Create multi-modal embeddings using the CLIP model and store them in the vector DB

First, load the CLIP model to create image + text embeddings

In [0]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import numpy as np

# Checkpoint identifier shared by the model and its processor.
MODEL = 'openai/clip-vit-large-patch14'

# Load the CLIP weights and the matching processor once; get_clip_embedding() reuses both.
clip_model = CLIPModel.from_pretrained(MODEL)
clip_processor = CLIPProcessor.from_pretrained(MODEL)

def get_clip_embedding(text, image_path):
    """Embed a text/image pair with CLIP and return the mean of the two vectors.

    Args:
        text: Text to embed; the processor is asked to truncate over-long input.
        image_path: Path of the image file to embed.

    Returns:
        A NumPy array holding the element-wise average of the image embedding
        and the text embedding.
    """
    import torch  # local import: only needed for the no-grad context below

    # Open the image in a context manager so the file handle is closed as soon
    # as the processor has turned the pixels into tensors.
    with Image.open(image_path) as image:
        inputs = clip_processor(text=[text], images=[image], return_tensors="pt", padding=True, truncation=True)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = clip_model(**inputs)

    image_features = outputs.image_embeds  # embedding of the image
    text_features = outputs.text_embeds    # embedding of the text

    # Fuse both modalities into a single vector by averaging them.
    combined_embedding = (image_features + text_features) / 2
    return combined_embedding.squeeze().detach().numpy()

2024-05-06 02:10:55.062737: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Then, create the Delta Table

In [0]:

%sql
-- Table holding one row per past maintenance log.
--   id        : identity key, generated automatically when not supplied
--   summary   : summary text of the log
--   log       : log text
--   embedding : embedding vector stored as an array of floats
-- Change data feed is enabled on the table (presumably so a downstream index can sync from it — confirm).
CREATE TABLE IF NOT EXISTS past_maintenance_logs (
  id BIGINT GENERATED BY DEFAULT AS IDENTITY,
  summary STRING,
  log STRING,
  embedding ARRAY<FLOAT>
) TBLPROPERTIES (delta.enableChangeDataFeed = true);

Then loop over every summary, compute its combined image + text embedding, and append the result to the table

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pandas as pd
from tqdm import tqdm

# Reuse the active Spark session if there is one; otherwise create it.
spark = SparkSession.builder.appName("").getOrCreate()

def embed_and_store_images_summaries(image_path, log, summary,
                                     image_dir="./images",
                                     table_name="past_maintenance_logs"):
    """Embed one (image, summary) pair and append it as a row to the logs table.

    Args:
        image_path: Image filename, relative to ``image_dir``.
        log: Full text of the maintenance log.
        summary: Short summary of the log; embedded together with the image.
        image_dir: Directory containing the images. Defaults to ``./images``.
        table_name: Table to append to. Defaults to ``past_maintenance_logs``.
    """
    full_image_path = f"{image_dir}/{image_path}"
    embedding = get_clip_embedding(summary, full_image_path)

    # Single-row frame. No `id` value is supplied; the table declares `id`
    # as an identity column.
    pdf = pd.DataFrame({
        "summary": [summary],
        "log": [log],
        "embedding": [embedding],
    })
    schema = StructType([
        StructField("summary", StringType(), True),
        StructField("log", StringType(), True),
        StructField("embedding", ArrayType(FloatType(), True), True)
    ])
    df = spark.createDataFrame(pdf, schema=schema)
    # Append mode: existing rows are kept, so repeated calls add repeated rows.
    df.write.format("delta").mode("append").saveAsTable(table_name)

def populate_db(all_summaries):
    """Embed and store every summarized log, showing a progress bar.

    Args:
        all_summaries: Iterable of ``(filename, summary, log)`` tuples.
    """
    progress = tqdm(all_summaries, desc="Processing summaries")
    for image_name, short_summary, full_log in progress:
        # Tuple order is (filename, summary, log); the callee takes (image_path, log, summary).
        embed_and_store_images_summaries(image_path=image_name, log=full_log, summary=short_summary)
    print("Finished storing prior logs")

In [0]:
# Embed and store every summarized log. Rows are written in "append" mode,
# so running this cell again adds the same logs a second time.
populate_db(all_summaries)

Processing summaries:   0%|          | 0/25 [00:00<?, ?it/s]Processing summaries:   4%|▍         | 1/25 [00:05<02:13,  5.57s/it]Processing summaries:   8%|▊         | 2/25 [00:10<01:55,  5.01s/it]Processing summaries:  12%|█▏        | 3/25 [00:14<01:43,  4.70s/it]Processing summaries:  16%|█▌        | 4/25 [00:19<01:41,  4.82s/it]Processing summaries:  20%|██        | 5/25 [00:23<01:32,  4.62s/it]Processing summaries:  24%|██▍       | 6/25 [00:27<01:24,  4.44s/it]Processing summaries:  28%|██▊       | 7/25 [00:31<01:17,  4.28s/it]Processing summaries:  32%|███▏      | 8/25 [00:35<01:11,  4.22s/it]Processing summaries:  36%|███▌      | 9/25 [00:39<01:06,  4.15s/it]Processing summaries:  40%|████      | 10/25 [00:43<01:01,  4.10s/it]Processing summaries:  44%|████▍     | 11/25 [00:47<00:56,  4.02s/it]Processing summaries:  48%|████▊     | 12/25 [00:51<00:51,  3.94s/it]Processing summaries:  52%|█████▏    | 13/25 [00:55<00:48,  4.00s/it]Processing summaries:  56%|█████▌    

Finished storing prior logs



