In [16]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sentence_transformers import util

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
df = pd.read_csv('/content/drive/MyDrive/cleaned_merged_products.csv')


DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'
CLEANED_DATA_PATH = 'cleaned_merged_products.csv'
PRODUCT_EMB_PATH = 'product_embeddings.npy'
QUERY_EMB_PATH = 'query_embeddings.npy'
MERGED_OUTPUT_PATH = 'merged_with_embeddings.pkl'

model = SentenceTransformer(MODEL_NAME, device=DEVICE)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

for product embeddings

In [8]:
product_texts = df['product_text'].astype(str).tolist()

product_embeddings = model.encode(
    product_texts,
    convert_to_numpy=True,
    batch_size=16,
    show_progress_bar=True,
    normalize_embeddings=True
)



Batches:   0%|          | 0/11370 [00:00<?, ?it/s]

In [10]:
product_embeddings = np.array(product_embeddings)

# Save the embeddings to a .npy file (you can also save to Google Drive by changing the path)
np.save('/content/drive/MyDrive/product_embeddings.npy', product_embeddings)

print(f"Saved {product_embeddings.shape[0]} embeddings successfully!")

Saved 181907 embeddings successfully!


In [19]:
def evaluate_search(df, product_embeddings, query_embeddings, k=5):
    correct = 0
    total = 0

    # Get unique queries, reset index to align with query_embeddings
    query_texts = df.drop_duplicates('query_id')[['query_id', 'query']].reset_index(drop=True)

    for i, row in tqdm(query_texts.iterrows(), total=len(query_texts), desc="Evaluating"):
        query_embed = query_embeddings[i]

        # Convert product embeddings once outside the loop if possible (optional optimization)
        sims = util.cos_sim(torch.tensor(query_embed), torch.tensor(product_embeddings))[0]

        # Get top-k indices with highest similarity scores
        top_k_indices = torch.topk(sims, k=k).indices.numpy()

        # Filter valid products with label either Exact or Partial for this query
        valid_products = df[(df['query_id'] == row['query_id']) & (df['label'].isin(['Exact', 'Partial']))]
        valid_product_ids = set(valid_products['product_id'])

        # Get product IDs retrieved by top-k indices
        retrieved_product_ids = set(df.iloc[top_k_indices]['product_id'])

        # Check if there is any overlap between relevant and retrieved products
        if len(valid_product_ids & retrieved_product_ids) > 0:
            correct += 1
        total += 1

    accuracy = correct / total
    print(f"Top-{k} Accuracy: {accuracy:.4f}")

In [21]:
evaluate_search(df, product_embeddings, query_embeddings, k=5)


Evaluating: 100%|██████████| 473/473 [05:51<00:00,  1.35it/s]

Top-5 Accuracy: 0.8753



