In [5]:
import streamlit as st
import pandas as pd
from google.cloud import aiplatform
from google.cloud import aiplatform_v1
import vertexai
from vertexai.language_models import TextEmbeddingModel

# Define constants
PROJECT_ID = "my-project-0004-346516"
LOCATION = "us-central1"
API_ENDPOINT = f"{LOCATION}-aiplatform.googleapis.com"
SHOTS_FILE_PATH = "../shots.csv"
CLIPS_DIR = ""

In [6]:

def list_deployed_indexes():
    """List all deployed indexes in Vertex AI Matching Engine."""
    client = aiplatform.gapic.IndexEndpointServiceClient(
        client_options={"api_endpoint": API_ENDPOINT}
    )
    parent = f"projects/{PROJECT_ID}/locations/{LOCATION}"
    index_list = []

    # List all index endpoints
    for index_endpoint in client.list_index_endpoints(parent=parent):
        for deployed_index in index_endpoint.deployed_indexes:
            index_list.append({
                "index_endpoint_name": index_endpoint.name,
                "deployed_index_id": deployed_index.id,
                "display_name": index_endpoint.display_name
            })

    return index_list

def get_embeddings(query, model):
    """Generate embeddings for the query using the specified model."""
    embeddings = model.get_embeddings([query])
    return embeddings[0].values

def load_shots_df(file_path):
    """Load shots_df from a local CSV file."""
    return pd.read_csv(file_path)

In [7]:


# ... [Your existing functions: list_deployed_indexes, get_embeddings, load_shots_df remain the same] ...

# --- Start of Jupyter Notebook Conversion ---

# 1. Initialization and Data Loading

# Initialize Vertex AI
vertexai.init(project=PROJECT_ID, location=LOCATION)
text_embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@latest")

# Load shots data
shots_df = load_shots_df(SHOTS_FILE_PATH)
print("Shots DataFrame loaded successfully.")  # Add a print statement for confirmation

# 2. List Deployed Indexes

deployed_indexes = list_deployed_indexes()

if deployed_indexes:
    print("Deployed indexes found:")
    for index in deployed_indexes:
        print(f"  - {index['display_name']} ({index['deployed_index_id']})")
else:
    print("No deployed indexes found. Please check your Vertex AI Matching Engine setup.")

# --- Manual Index Selection (Replace with user input in Streamlit later) ---
selected_index = deployed_indexes[0]  # Select the first index for now (or prompt the user for input)
print(f"Selected index: {selected_index['display_name']} with ID {selected_index['deployed_index_id']}")


Shots DataFrame loaded successfully.
Deployed indexes found:
  - darryl-testing-lh-pytorch112kagglewbi (darryl_testing_lh_pytorch112kagglewbi)
Selected index: darryl-testing-lh-pytorch112kagglewbi with ID darryl_testing_lh_pytorch112kagglewbi


In [41]:
# !pip show langchain_core
# !pip install --upgrade langchain_core
%pip install --upgrade langchain_google_vertexai

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [45]:
# from langchain_google_vertexai import VertexAIEmbeddings

query = "Give me the clips where there are animals present"

embeddings = TextEmbeddingModel.from_pretrained("textembedding-gecko@003")


test_embeddings = embeddings.get_embeddings(query)

AttributeError: 'TextEmbeddingModel' object has no attribute 'embed_query'

In [33]:


# Configure Vector Search client
client_options = {
  "api_endpoint": API_ENDPOINT
}
vector_search_client = aiplatform_v1.MatchServiceClient(
  client_options=client_options,
)
# Build FindNeighborsRequest object
datapoint = aiplatform_v1.IndexDatapoint(
  feature_vector=test_embeddings
)

query = aiplatform_v1.FindNeighborsRequest.Query(
  datapoint=datapoint,
  # The number of nearest neighbors to be retrieved
  neighbor_count=neighbor_count
)



request = aiplatform_v1.FindNeighborsRequest(
  index_endpoint=INDEX_ENDPOINT,
  deployed_index_id=DEPLOYED_INDEX_ID,
  # Request can have multiple queries
  queries=[query],
  return_full_datapoint=False,
)

# Execute the request
response = vector_search_client.find_neighbors(request)

df_new = pd.DataFrame()
print('neighbor_count', neighbor_count)

shots_df['distance'] = None

for i in range(0,neighbor_count):
    x=response.nearest_neighbors[0]
    
    df_match = shots_df.loc[shots_df['id'] == int(x.neighbors[i].datapoint.datapoint_id) ]
    df_match['distance'] = x.neighbors[i].distance

    # Append the matching rows to the new DataFrame
    df_new = pd.concat([df_new, df_match])
    

# Print the new DataFrame
df_sorted = df_new.sort_values(by="distance", ascending=True)
print(display(df_sorted))

NameError: name 'VertexAIEmbeddings' is not defined

In [27]:

# 3. Query and Retrieval

# --- Get user query (Replace with Streamlit input later) ---
query = "where is apples discussed"  # Get query from user input in Streamlit

# Generate query embedding
query_embedding = get_embeddings(query, text_embedding_model)
print("Query embedding generated.")

# Extract index details
deployed_index_id = selected_index['deployed_index_id']
index_endpoint_name = selected_index['index_endpoint_name']

# Build FindNeighborsRequest
datapoint = aiplatform_v1.IndexDatapoint(feature_vector=query_embedding)
find_neighbors_query = aiplatform_v1.FindNeighborsRequest.Query(datapoint=datapoint, neighbor_count=3)
find_neighbors_request = aiplatform_v1.FindNeighborsRequest(
    index_endpoint=index_endpoint_name,
    deployed_index_id=deployed_index_id,
    queries=[find_neighbors_query],
    return_full_datapoint=False
)

# Initialize MatchServiceClient
match_service_client = aiplatform_v1.MatchServiceClient(client_options={"api_endpoint": API_ENDPOINT})

# Execute the request
try:
    response = match_service_client.find_neighbors(find_neighbors_request)
    print("Find Neighbors request executed successfully.")
except Exception as e:
    print(f"Error during query execution: {e}")
    print(f"API Endpoint: {API_ENDPOINT}")
    print(f"Index Endpoint: {index_endpoint_name}")
    print(f"Deployed Index ID: {deployed_index_id}")


Query embedding generated.


AttributeError: module 'google.cloud.aiplatform' has no attribute 'MatchServiceClient'

In [None]:
print(f"Query: {query}, Type: {type(query)}") 
find_neighbors_query

In [28]:
# response = match_service_client.find_neighbors(find_neighbors_request)
# print(response)
# print(find_neighbors_query)

# Create the index endpoint instance from an existing endpoint.
my_index_endpoint_main = aiplatform.MatchingEngineIndexEndpoint(
    index_endpoint_name=index_endpoint_name
)

# Query the index endpoint for the nearest neighbors.
response = my_index_endpoint_main.find_neighbors(
    deployed_index_id=deployed_index_id,
    queries=[query_embedding],
    num_neighbors=4,
    return_full_datapoint=False,
)
print(response)

[[MatchNeighbor(id='8', distance=0.5725300908088684, sparse_distance=None, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[], sparse_embedding_values=[], sparse_embedding_dimensions=[]), MatchNeighbor(id='9', distance=0.5668851137161255, sparse_distance=None, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[], sparse_embedding_values=[], sparse_embedding_dimensions=[]), MatchNeighbor(id='7', distance=0.5605428814888, sparse_distance=None, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[], sparse_embedding_values=[], sparse_embedding_dimensions=[]), MatchNeighbor(id='4', distance=0.5574842691421509, sparse_distance=None, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[], sparse_embedding_values=[], sparse_embedding_dimensions=[])]]


In [29]:

# 4. Process and Display Results

# Prepare results DataFrame
results = []
for result in response.nearest_neighbors:
    for neighbor in result.neighbors:
        clip_id = int(neighbor.datapoint.datapoint_id)
        distance = neighbor.distance
        df_match = shots_df.loc[shots_df.index == clip_id]
        if not df_match.empty:
            match_info = df_match.iloc[0].to_dict()
            match_info['distance'] = distance
            results.append(match_info)

df_new = pd.DataFrame(results)

# Sort and display results
df_sorted = df_new.sort_values(by="distance", ascending=True)
print("Matching clips:")
print(df_sorted[["clip_name", "description", "distance"]])

# --- Display videos (Adapt for Jupyter Notebook) ---
# You might need to use a library like IPython.display to display videos in a notebook
# for index, row in df_sorted.iterrows():
#     print(f"Clip Name: {row['clip_name']}")
#     video_path = CLIPS_DIR + row['clip_name']
#     # Display video using IPython.display or another suitable method

AttributeError: 'list' object has no attribute 'nearest_neighbors'