In [None]:
! pip install --upgrade --user --quiet google-cloud-aiplatform google-cloud-storage tqdm pandas

In [None]:
import pandas as pd
from google.cloud import storage, aiplatform
from vertexai.preview.language_models import TextEmbeddingModel
import vertexai
import tqdm
import time

# Google Cloud project and region handed to vertexai.init() below.
PROJECT_ID = "unvailed-466101"
LOCATION = "us-central1"
# Name of the Cloud Storage bucket the exported embeddings file is copied to.
BUCKET_NAME = "unvailed_test_bucket_1"
# Source CSV; a relative path, so it is resolved against the notebook's working directory.
CSV_FILE_PATH = "Unvailed Vendors - Supported.csv"

# Point the Vertex AI SDK at the project/region configured above.
vertexai.init(project=PROJECT_ID, location=LOCATION)

# Load the source rows. Later cells read the `id`, `url`, `page title` and `content` columns.
df = pd.read_csv(CSV_FILE_PATH)

In [None]:
# Tidy the frame in one chain:
#   1. rename "page title" to the identifier-friendly "page_title";
#   2. drop every "(...)" fragment from the page text (non-greedy match);
#   3. flatten newlines to single spaces.
# Step 2 runs before step 3 and "." does not match a newline, so a parenthesised
# fragment that spans a line break is left in place.
df = df.rename(columns={"page title": "page_title"}).assign(
    content=lambda frame: (
        frame["content"]
        .str.replace(r"\(.*?\)", "", regex=True)
        .str.replace(r"\n", " ", regex=True)
    )
)

In [None]:
# Load the embedding model once; get_embeddings_wrapper() below calls it for every batch.
model = TextEmbeddingModel.from_pretrained("gemini-embedding-001")

def get_embeddings_wrapper(texts, batch_size=5):
    """Embed ``texts`` with the module-level ``model``, a few texts per request.

    Args:
        texts: Iterable of strings to embed. It is materialised into a list
            first, so generators and other non-sliceable iterables work too.
        batch_size: Maximum number of texts sent in one ``model.get_embeddings``
            call. Must be at least 1.
            NOTE(review): confirm the configured model accepts this many texts
            in a single request — some embedding models cap inputs per call.

    Returns:
        A list with the ``.values`` of every embedding returned by the model,
        in request order (presumably one entry per input text — the count is
        whatever the model returns). An empty input yields an empty list
        without contacting the model.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. A negative value used
            to produce an empty result silently.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    texts = list(texts)
    if not texts:
        return []

    embeddings = []
    for start in tqdm.tqdm(range(0, len(texts), batch_size)):
        # Fixed one-second pause ahead of every request to space out API calls.
        time.sleep(1)
        batch_embeddings = model.get_embeddings(texts[start:start + batch_size])
        embeddings.extend(embedding.values for embedding in batch_embeddings)
    return embeddings

# Embed every row's cleaned text and store the resulting vector alongside it.
# The helper sends the texts in batches and sleeps one second per batch, so this
# cell's run time grows with the number of rows.
content = df['content'].tolist()
df['embedding'] = get_embeddings_wrapper(content)

In [None]:
# Serialise the export columns as JSON Lines (one JSON record per row) and
# write the text to a local file for upload.
export_columns = ["id", "url", "page_title", "content", "embedding"]
jsonl_string = df[export_columns].to_json(orient="records", lines=True)
with open("vendors_supported.json", "w") as out_file:
    out_file.write(jsonl_string)

In [None]:
BUCKET_URI = f"gs://unvailed_test_bucket_1"
! gsutil cp vendors_supported.json {BUCKET_URI}

# Handle to the Matching Engine index, addressed by its full resource name.
index_id = "projects/271286489289/locations/us-central1/indexes/4143788845227311104"
index = aiplatform.MatchingEngineIndex(index_name=index_id)

# Handle to the index endpoint that the query cell sends find_neighbors() to.
index_endpoint_id = "projects/271286489289/locations/us-central1/indexEndpoints/8423862156717457408"
index_endpoint = aiplatform.MatchingEngineIndexEndpoint(index_endpoint_name=index_endpoint_id)

In [None]:
# Refresh Index - Do this every time new data is added to the Bucket
# gcs_path = "gs://unvailed_test_bucket_1"
# index.update_embeddings(contents_delta_uri=gcs_path)

In [None]:
# Natural-language question to look up in the index.
user_prompt = "What is Carl House?"

# Embed the question with the same helper used for the documents; wrapping it in
# a list gives a list of vectors, which is what `queries` is passed below.
input_embeddings = get_embeddings_wrapper([user_prompt])

# Ask the deployed index for the five nearest stored vectors per query.
response = index_endpoint.find_neighbors(
    queries=input_embeddings,
    num_neighbors=5,
    deployed_index_id="job_applicants_indx",
)

In [None]:
import numpy as np

# Print each neighbour's distance next to the text of the row it refers to.
# response[0] holds the neighbours for the first query (only one was sent).
for neighbor in response[0]:
    neighbor_id = str(neighbor.id)
    # Compare as strings so the lookup also works when pandas parsed the `id`
    # column as a number; a str-vs-int comparison would never match.
    similar = df[df["id"].astype(str) == neighbor_id]
    if similar.empty:
        # The index can hold ids that are absent from the frame loaded in this
        # session; report that instead of failing on an empty selection.
        print(f"{neighbor.distance:.4f} <no row with id {neighbor_id!r} in df>")
        continue
    print(f"{neighbor.distance:.4f} {similar['content'].iloc[0]}")

# print(response[0])