In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import torch
import re





In [2]:
products_df = pd.read_csv('/content/product.csv', sep='\t')
queries_df = pd.read_csv('/content/query.csv',sep='\t')
labels_df = pd.read_csv('/content/label.csv',sep='\t')

In [3]:
print(products_df.info())
print(queries_df.info())
print(labels_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42994 entries, 0 to 42993
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   product_id           42994 non-null  int64  
 1   product_name         42994 non-null  object 
 2   product_class        40142 non-null  object 
 3   category hierarchy   41438 non-null  object 
 4   product_description  36986 non-null  object 
 5   product_features     42994 non-null  object 
 6   rating_count         33542 non-null  float64
 7   average_rating       33542 non-null  float64
 8   review_count         33542 non-null  float64
dtypes: float64(3), int64(1), object(5)
memory usage: 3.0+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480 entries, 0 to 479
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   query_id     480 non-null    int64 
 1   query        480 non-null    object
 2   qu

In [4]:
print(products_df.head())
print(queries_df.head())
print(labels_df.head())

   product_id                                       product_name  \
0           0                            solid wood platform bed   
1           1                        all-clad 7 qt . slow cooker   
2           2            all-clad electrics 6.5 qt . slow cooker   
3           3       all-clad all professional tools pizza cutter   
4           4  baldwin prestige alcott passage knob with roun...   

                  product_class  \
0                          Beds   
1                  Slow Cookers   
2                  Slow Cookers   
3  Slicers, Peelers And Graters   
4                    Door Knobs   

                                  category hierarchy  \
0  Furniture / Bedroom Furniture / Beds & Headboa...   
1  Kitchen & Tabletop / Small Kitchen Appliances ...   
2  Kitchen & Tabletop / Small Kitchen Appliances ...   
3                         Browse By Brand / All-Clad   
4  Home Improvement / Doors & Door Hardware / Doo...   

                                 product_de

In [5]:
merged_df = labels_df.merge(queries_df, on='query_id', how='left')

merged_df = merged_df.merge(products_df, on='product_id', how='left')

print(merged_df.head())

   id  query_id  product_id       label        query     query_class  \
0   0         0       25434       Exact  salon chair  Massage Chairs   
1   1         0       12088  Irrelevant  salon chair  Massage Chairs   
2   2         0       42931       Exact  salon chair  Massage Chairs   
3   3         0        2636       Exact  salon chair  Massage Chairs   
4   4         0       42923       Exact  salon chair  Massage Chairs   

                                        product_name  \
0       21.7 '' w waiting room chair with wood frame   
1                  22.5 '' wide polyester side chair   
2      24.4 '' w metal lounge chair with metal frame   
3  25 '' wide faux leather manual swivel standard...   
4  27.6 '' w antimicrobial leather seat waiting r...   

              product_class  \
0       Waiting Room Chairs   
1             Accent Chairs   
2  Reception Seating Chairs   
3                 Recliners   
4       Waiting Room Chairs   

                                  category 

PREPROCESSNG


In [6]:
import pandas as pd
import re

# Print the columns to see what exists
print("Columns in merged_df:", merged_df.columns.tolist())

# Check required columns exist
required_cols = ['product_name', 'product_description', 'product_features']
for col in required_cols:
    if col not in merged_df.columns:
        merged_df[col] = ''  # Create the column if it doesn't exist

# Safely create 'product_text'
merged_df['product_text'] = (
    merged_df['product_name'].fillna('') + ' ' +
    merged_df['product_description'].fillna('') + ' ' +
    merged_df['product_features'].fillna('')
)

print(merged_df.head())
print(merged_df.info())
print(merged_df.isnull().sum())
print(merged_df.isnull().sum().sum())

# Clean nulls from specific columns
cols_to_check = ['query_class', 'product_class', 'category hierarchy', 'product_description']
cleaned_df = merged_df.dropna(subset=cols_to_check).reset_index(drop=True)

print(cleaned_df.info())
print(cleaned_df.isnull().sum())
print(cleaned_df.isnull().sum().sum())
print(cleaned_df.head())

# Check if 'query' column exists before processing
if 'query' not in merged_df.columns:
    merged_df['query'] = ''

# Convert to lowercase
merged_df['query'] = merged_df['query'].str.lower()
merged_df['product_text'] = merged_df['product_text'].str.lower()

# Clean text function
def clean_text(text):
    return re.sub(r'[^a-z0-9\s]', ' ', str(text))

# Apply cleaning
merged_df['query'] = merged_df['query'].apply(clean_text)
merged_df['product_text'] = merged_df['product_text'].apply(clean_text)

# Normalize spaces
merged_df['query'] = merged_df['query'].str.strip().str.replace(r'\s+', ' ', regex=True)
merged_df['product_text'] = merged_df['product_text'].str.strip().str.replace(r'\s+', ' ', regex=True)


Columns in merged_df: ['id', 'query_id', 'product_id', 'label', 'query', 'query_class', 'product_name', 'product_class', 'category hierarchy', 'product_description', 'product_features', 'rating_count', 'average_rating', 'review_count']
   id  query_id  product_id       label        query     query_class  \
0   0         0       25434       Exact  salon chair  Massage Chairs   
1   1         0       12088  Irrelevant  salon chair  Massage Chairs   
2   2         0       42931       Exact  salon chair  Massage Chairs   
3   3         0        2636       Exact  salon chair  Massage Chairs   
4   4         0       42923       Exact  salon chair  Massage Chairs   

                                        product_name  \
0       21.7 '' w waiting room chair with wood frame   
1                  22.5 '' wide polyester side chair   
2      24.4 '' w metal lounge chair with metal frame   
3  25 '' wide faux leather manual swivel standard...   
4  27.6 '' w antimicrobial leather seat waiting r..

In [10]:
cleaned_csv_path = 'cleaned_merged_products.csv'
cleaned_df.to_csv(cleaned_csv_path, index=False)

print(f"✅ Cleaned DataFrame saved to {cleaned_csv_path}")

✅ Cleaned DataFrame saved to cleaned_merged_products.csv


In [7]:

!pip install -U sentence-transformers


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [9]:
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ✅ Load model once on GPU
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)

# ✅ Faster reference
texts = merged_df['product_text'].astype(str).tolist()

# ✅ Encode all in one go if fits in memory
print("Encoding product_texts on GPU...")
with torch.no_grad():
    embeddings = model.encode(
        texts,
        convert_to_numpy=True,
        batch_size=256,   # 🚀 Higher batch size = fewer forward passes
        show_progress_bar=True,
        normalize_embeddings=True  # Optional: helps with cosine search later
    )

# ✅ Save embeddings
np.save("product_embeddings_full.npy", embeddings)

# ✅ Add to DataFrame and save
merged_df['product_embedding'] = embeddings.tolist()
merged_df.to_pickle("merged_with_embeddings.pkl")

print("✅ Fast embeddings done & saved.")


Encoding product_texts on GPU...


Batches:   0%|          | 0/912 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [8]:
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm
import torch

# Load model on GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)

# Extract queries
query_texts = merged_df['query'].tolist()

# Encode directly with batching
print("🚀 Encoding query_texts fast...")
query_embeddings = model.encode(
    query_texts,
    convert_to_numpy=True,
    batch_size=256,  # Adjust based on GPU memory
    show_progress_bar=True,
    normalize_embeddings=True  # Optional but helpful
)

# Save
np.save("query_embeddings_full.npy", query_embeddings)
print("✅ Fast Query Embeddings saved.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🚀 Encoding query_texts fast...


Batches:   0%|          | 0/912 [00:00<?, ?it/s]

✅ Fast Query Embeddings saved.


In [None]:
# Extract query texts and convert to list
query_texts = merged_df['query'].tolist()

# Embedding loop for queries
print("Encoding query_texts on GPU...")
query_all_embeddings = []
model.eval()

with torch.no_grad():
    for i in tqdm(range(0, len(query_texts), batch_size)):
        batch = query_texts[i:i+batch_size]
        embeddings = model.encode(batch, convert_to_numpy=True, batch_size=batch_size, show_progress_bar=False)
        query_all_embeddings.append(embeddings)

# Stack all into one array
query_embeddings = np.vstack(query_all_embeddings)

# Add the query embeddings to the DataFrame
merged_df['query_embedding'] = list(query_embeddings)

# Optional: Save query embeddings
np.save("query_embeddings_full.npy", query_embeddings)

print("✅ Query Embeddings done and saved.")

Encoding query_texts on GPU...


100%|██████████| 1824/1824 [01:29<00:00, 20.40it/s]


✅ Query Embeddings done and saved.


In [None]:
# Map labels to binary for simplicity
label_map = {
    'Exact': 2,
    'Relevant': 1,
    'Irrelevant': 0
}
merged_df['label_numeric'] = merged_df['label'].map(label_map)

In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# Load model (using default precision for max compatibility)
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Replace this with your actual data loading step
# merged_df = pd.read_csv('your_data.csv')  # Load your cleaned dataframe here

product_texts = merged_df['product_text'].tolist()

chunk_size = 5000
batch_size = 128
output_dir = './emb_chunks/'
os.makedirs(output_dir, exist_ok=True)

# Detect existing chunks
existing_chunks = sorted([
    int(f.split('_')[-1].split('.')[0])
    for f in os.listdir(output_dir)
    if f.startswith('product_embeddings_chunk_')
])

all_embeddings = []

for i in tqdm(range(0, len(product_texts), chunk_size)):
    chunk_index = i // chunk_size
    chunk_file = os.path.join(output_dir, f'product_embeddings_chunk_{chunk_index}.npy')

    if chunk_index in existing_chunks:
        emb = np.load(chunk_file)
        all_embeddings.append(emb)
        continue

    chunk = product_texts[i:i + chunk_size]
    embeddings = model.encode(
        chunk,
        batch_size=batch_size,
        convert_to_numpy=True,
        show_progress_bar=False,
        normalize_embeddings=False
    )

    np.save(chunk_file, embeddings)
    all_embeddings.append(embeddings)

final_embeddings = np.vstack(all_embeddings)
np.save('product_embeddings_full.npy', final_embeddings)
print("✅ Embeddings saved to product_embeddings_full.npy")


 13%|█▎        | 6/47 [01:02<07:06, 10.40s/it]


KeyboardInterrupt: 

In [None]:
top_matches = merged_df.drop_duplicates(subset=['product_description']).sort_values(by='test_query_similarity', ascending=False).head(10)
print(top_matches[['product_name', 'product_description', 'test_query_similarity']])
top_matches = merged_df.drop_duplicates(subset=['product_id']).sort_values(by='test_query_similarity', ascending=False).head(10)
print(top_matches[['product_name', 'product_features', 'test_query_similarity']])



KeyError: 'test_query_similarity'

In [None]:
model.save('./saved_model/')


In [None]:
!zip -r saved_model.zip ./saved_model/


  adding: saved_model/ (stored 0%)
  adding: saved_model/2_Normalize/ (stored 0%)
  adding: saved_model/config.json (deflated 47%)
  adding: saved_model/vocab.txt (deflated 53%)
  adding: saved_model/modules.json (deflated 62%)
  adding: saved_model/tokenizer_config.json (deflated 75%)
  adding: saved_model/1_Pooling/ (stored 0%)
  adding: saved_model/1_Pooling/config.json (deflated 57%)
  adding: saved_model/README.md (deflated 64%)
  adding: saved_model/sentence_bert_config.json (deflated 4%)
  adding: saved_model/tokenizer.json (deflated 71%)
  adding: saved_model/special_tokens_map.json (deflated 85%)
  adding: saved_model/model.safetensors (deflated 8%)
  adding: saved_model/config_sentence_transformers.json (deflated 34%)


In [None]:

# Only keep necessary columns
saved_df = merged_df[['product_id', 'product_name', 'product_description', 'product_features', 'product_embedding']]
# Convert embeddings to list if not already
saved_df['product_embedding'] = saved_df['product_embedding'].apply(lambda x: x.tolist())

# Save as JSON or Pickle
saved_df.to_json('product_data.json', orient='records', lines=True)
# OR: saved_df.to_pickle('product_data.pkl')


NameError: name 'merged_df' is not defined

In [None]:
files.download('product_data.json')
