In [1]:
# Cell 1: Imports and Setup
import pandas as pd
import torch
import faiss
import numpy as np
import yake
from transformers import AutoTokenizer, AutoModel
from IPython.display import display, HTML

# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)
model.eval()

Using device: cuda


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [2]:
# Cell 2: Text Embedding Function
def embed_text(text_list):
    inputs = tokenizer(text_list, padding=True, truncation=True, max_length=128, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden = outputs.last_hidden_state
        mask = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden.size()).float()
        summed = (last_hidden * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1e-9)
        mean_pooled = summed / counts
    return mean_pooled.cpu()

In [3]:
# Cell 3: Load Dataset and Preprocess
df = pd.read_csv('Amazon-Products.csv').dropna(subset=['name'])
df['text_features'] = (
    df['name'].fillna('') + ' ' +
    df['main_category'].fillna('') + ' ' +
    df['sub_category'].fillna('')
)

In [None]:
# # Cell 4: Compute embeddings in batches
# batch_size = 256
# all_embeddings = []
# for i in range(0, len(df), batch_size):
#     batch_texts = df['text_features'].iloc[i:i+batch_size].tolist()
#     batch_embeds = embed_text(batch_texts)
#     all_embeddings.append(batch_embeds)
#     print(f"Processed {min(i+batch_size, len(df))}/{len(df)} samples", end='\r')
# embeddings = torch.cat(all_embeddings)
# print("\nAll embeddings computed.")

Processed 551585/551585 samples
All embeddings computed.


In [None]:
## Cell 4: Load precomputed FAISS index and embeddings or save them
## LOAD
# Load embeddings from .npy (assumes they were saved with float32 type)
emb_np = np.load("all_embeddings.npy")
embeddings = torch.from_numpy(emb_np)
# Load FAISS index
index = faiss.read_index("faiss_cosine.index")
print("✅ Loaded precomputed embeddings and FAISS index.")

## SAVE
# # Convert to numpy and normalize (L2 norm for cosine similarity)
# emb_np = embeddings.cpu().numpy().astype('float32')
# faiss.normalize_L2(emb_np)  # In-place normalization
# # Using inner product after L2 norm → equivalent to cosine similarity
# index = faiss.IndexFlatIP(emb_np.shape[1])
# index.add(emb_np)  # Add all vectors
# top_k = 5
# D, I = index.search(emb_np[:10], top_k)  # D = scores, I = indices
# for i, (scores, indices) in enumerate(zip(D, I)):
#     print(f"Query {i}:")
#     for score, idx in zip(scores, indices):
#         print(f"  Match idx: {idx}, Score: {score:.4f}")
# np.save("all_embeddings.npy", emb_np)
# faiss.write_index(index, "faiss_cosine.index")

Query 0:
  Match idx: 0, Score: 1.0000
  Match idx: 243792, Score: 0.9947
  Match idx: 69, Score: 0.9945
  Match idx: 59, Score: 0.9942
  Match idx: 6, Score: 0.9941
Query 1:
  Match idx: 233, Score: 1.0000
  Match idx: 1, Score: 1.0000
  Match idx: 3, Score: 0.9996
  Match idx: 22, Score: 0.9996
  Match idx: 35, Score: 0.9970
Query 2:
  Match idx: 2, Score: 1.0000
  Match idx: 274, Score: 0.9990
  Match idx: 22, Score: 0.9965
  Match idx: 3, Score: 0.9962
  Match idx: 233, Score: 0.9959
Query 3:
  Match idx: 3, Score: 1.0000
  Match idx: 22, Score: 0.9997
  Match idx: 233, Score: 0.9996
  Match idx: 1, Score: 0.9996
  Match idx: 35, Score: 0.9976
Query 4:
  Match idx: 4, Score: 1.0000
  Match idx: 243804, Score: 0.9931
  Match idx: 824, Score: 0.9929
  Match idx: 339950, Score: 0.9908
  Match idx: 585, Score: 0.9771
Query 5:
  Match idx: 5, Score: 1.0000
  Match idx: 243812, Score: 0.9938
  Match idx: 862, Score: 0.9935
  Match idx: 339980, Score: 0.9911
  Match idx: 23449, Score: 0.9

In [None]:
# Cell 5: Keyword extraction and category filtering
kw_extractor = yake.KeywordExtractor(top=5)

def extract_keywords_yake(text, max_keywords=5):
    keywords = kw_extractor.extract_keywords(text)
    return [kw for kw, score in keywords[:max_keywords]]

def filter_df_by_query_categories(df, query):
    keywords = extract_keywords_yake(query)
    keywords = [kw.lower() for kw in keywords]
    available_cats = df['main_category'].fillna('').str.lower().unique()
    selected_cats = [cat for cat in available_cats if any(kw in cat for kw in keywords)]
    if not selected_cats:
        selected_cats = available_cats
    filtered_df = df[df['main_category'].fillna('').str.lower().isin(selected_cats)]
    print(f"Filtered to categories: {selected_cats}")
    return filtered_df

In [None]:
from IPython.display import Image, display, HTML

# Display image from URL
def display_image(url, width=200):
    if isinstance(url, str) and url.startswith("http"):
        display(HTML(f'<img src="{url}" width="{width}"/>'))

In [None]:
# Add this function to your notebook (new cell)

def filter_df_by_query_categories(df, query, top_k=5):
    keywords = extract_keywords_yake(query, max_keywords=5)
    keywords = [kw.lower() for kw in keywords]
    
    # Get available categories in lower-case
    available_cats = df['main_category'].fillna('').str.lower().unique()
    
    # Find categories that contain any keyword
    selected_cats = [cat for cat in available_cats if any(kw in cat for kw in keywords)]
    if not selected_cats:
        selected_cats = available_cats  # fallback to all
    
    # Filter df by selected categories
    filtered_df = df[df['main_category'].fillna('').str.lower().isin(selected_cats)]
    
    print(f"Filtered to categories: {selected_cats}")
    return filtered_df

# Modify your search function to use this filter:
def user_search_and_recommend_with_filter(df, model, tokenizer, embeddings, top_k=5, threshold=0.7):
    while True:
        query = input("Enter product description to search: ").strip()
        if not query:
            print("Please enter something!")
            continue
        
        filtered_df = filter_df_by_query_categories(df, query)
        if filtered_df.empty:
            print("No matching categories found. Searching entire dataset.")
            filtered_df = df
        
        # Get indices of filtered products to slice embeddings and index search accordingly
        indices = filtered_df.index.to_list()
        filtered_embs = embeddings[indices].numpy().astype('float32')
        faiss.normalize_L2(filtered_embs)
        
        # Build temporary FAISS index on filtered embeddings
        tmp_index = faiss.IndexFlatIP(filtered_embs.shape[1])
        tmp_index.add(filtered_embs)
        
        query_emb = embed_text([query]).cpu().numpy().astype('float32')
        faiss.normalize_L2(query_emb)
        
        D, I = tmp_index.search(query_emb, top_k)
        filtered_results = [(score, indices[idx]) for score, idx in zip(D[0], I[0]) if score >= threshold]
        
        if not filtered_results:
            print(f"No products found with similarity above {threshold}. Please try again.\n")
            continue
        
        print(f"\nTop {len(filtered_results)} results for your query '{query}':\n")
        for score, idx in filtered_results:
            product = df.loc[idx]
            print(f"Score: {score:.4f}")
            print(f"Product Name: {product['name']}")
            print(f"Category: {product['main_category']} / {product['sub_category']}")
            print(f"Price: {product.get('discount_price', 'N/A')} (Original: {product.get('actual_price', 'N/A')})")
            print(f"Ratings: {product.get('ratings', 'N/A')} from {product.get('no_of_ratings', 'N/A')} users")
            print(f"Link: {product.get('link', 'N/A')}")
            display_image(product.get('image'))
            print("\n" + "-"*60 + "\n")
        
        break

In [None]:
# Cell 8: Run the search function
user_search_and_recommend_with_filter(df, model, tokenizer, embeddings)