In [None]:
import html
import itertools
import math
import os
import re
import time
import warnings
from io import BytesIO

import datasketch  # MinHash & LSH
import gradio as gr
import nltk
import numpy as np
import pandas as pd
import requests
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from PIL import Image
from scipy.sparse import csr_matrix
from sentence_transformers import SentenceTransformer
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity


warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the raw product catalogue from its JSON file into a DataFrame.
data = pd.read_json(path_or_buf='tops_fashion.json')

In [3]:
# Keep only the columns that the rest of the notebook reads.
data = data[[
    'asin',
    'brand',
    'color',
    'medium_image_url',
    'product_type_name',
    'title',
    'formatted_price',
]]

In [4]:
# Keep only the products that carry price information:
# notnull() is True exactly for the rows whose 'formatted_price' is not None/NaN.
data = data.loc[data['formatted_price'].notnull()]
print('Number of data points After eliminating price=NULL :', len(data))

Number of data points After eliminating price=NULL : 28395


In [5]:
# Keep only the products that carry color information:
# notnull() is True exactly for the rows whose 'color' is not None/NaN.
data = data.loc[data['color'].notnull()]
print('Number of data points After eliminating color=NULL :', len(data))

Number of data points After eliminating color=NULL : 28385


In [6]:
# Remove products whose title is too short to be informative (4 words or fewer).
# The word count uses the vectorised .str accessor: a missing title yields NaN,
# which compares False against 4, so such rows are dropped instead of raising.
# .copy() makes data_sorted an independent frame, so in-place operations on it
# never touch `data` and never trigger SettingWithCopyWarning.
title_word_counts = data['title'].str.split().str.len()
data_sorted = data[title_word_counts > 4].copy()
print("After removal of products with short description:", data_sorted.shape[0])

After removal of products with short description: 27949


In [7]:
# Order all products by title in descending (reverse-alphabetical) order.
data_sorted = data_sorted.sort_values(by='title', ascending=False)

In [8]:
# Index labels of data_sorted, in its current row order.
indices = list(data_sorted.index)

In [None]:

# Stage-1 de-duplication: drop products whose titles are nearly identical to
# the title of a neighbouring row.
#
# The rows of data_sorted are walked once, in order.  The current row is the
# "representative" of a group; every following row whose title differs from it
# in at most MAX_DIFFERING_WORDS word positions is treated as the same product
# (e.g. the same shirt in another size) and skipped.  The first row that
# differs in more positions becomes the next representative.
#
# The representative of every group is recorded -- including the final group,
# even when all remaining rows are duplicates of it, and including the case of
# a frame with a single row.
MAX_DIFFERING_WORDS = 2


def _titles_differ(words_a, words_b, max_differing_words=MAX_DIFFERING_WORDS):
    """Return True when two tokenised titles differ in too many word positions.

    Words are compared position by position.  itertools.zip_longest pads the
    shorter title with None, so every extra word of the longer title counts as
    a mismatch.  Example:
        a = ['a', 'b', 'c', 'd'], b = ['a', 'b', 'd']
        pairs -> ('a','a'), ('b','b'), ('c','d'), ('d', None)  => 2 mismatches
    """
    length = max(len(words_a), len(words_b))
    matches = sum(1 for x, y in itertools.zip_longest(words_a, words_b) if x == y)
    return (length - matches) > max_differing_words


# Tokenise every title once up front instead of re-splitting inside the loop.
title_words = [title.split() for title in data_sorted['title']]
sorted_asins = data_sorted['asin'].tolist()
num_data_points = len(sorted_asins)

stage1_dedupe_asins = []
i = 0
while i < num_data_points:
    # Advance j past every row that is a near-duplicate of row i.
    j = i + 1
    while j < num_data_points and not _titles_differ(title_words[i], title_words[j]):
        j += 1

    # Row i represents its whole group; keep it.
    stage1_dedupe_asins.append(sorted_asins[i])

    # Row j (if any) is the first sufficiently different title: next group.
    i = j

In [10]:
# Restrict the working set to the ASINs collected in stage1_dedupe_asins.
data = data[data['asin'].isin(stage1_dedupe_asins)]

In [11]:
# Stage-2 de-duplication: find near-duplicate titles anywhere in the frame
# using MinHash signatures and a Locality Sensitive Hashing (LSH) index.
LSH_SIMILARITY_THRESHOLD = 0.8  # 80% similarity threshold
MINHASH_NUM_PERM = 128          # shared by the LSH index and every signature

# Step 1: Build MinHash signatures and index them.
lsh = datasketch.MinHashLSH(threshold=LSH_SIMILARITY_THRESHOLD, num_perm=MINHASH_NUM_PERM)
minhashes = {}
asin_dict = {}
data = data.reset_index(drop=True)  # from here on, row label == row position

# Iterate over the two needed columns only; after the reset above, the
# enumerate() position is the same value iterrows() would give as the label.
for i, (asin, title) in enumerate(zip(data['asin'], data['title'])):
    if pd.isna(title):  # rows without a title are skipped (and so never kept)
        continue

    words = set(re.findall(r"\w+", title.lower()))  # unique lower-cased word tokens
    minhash = datasketch.MinHash(num_perm=MINHASH_NUM_PERM)

    for word in words:
        minhash.update(word.encode('utf8'))  # hash each word into the signature

    lsh.insert(i, minhash)
    minhashes[i] = minhash
    asin_dict[i] = asin

# Step 2: Keep the first row of every group of similar titles.
# NOTE: despite its name, considered_asins holds row positions (the LSH keys),
# not ASIN strings.
considered_asins = set()
stage2_dedupe_asins = []

for i, minhash in minhashes.items():  # only rows that received a signature
    if i in considered_asins:
        continue  # already absorbed into an earlier row's group

    stage2_dedupe_asins.append(asin_dict[i])
    considered_asins.add(i)

    # Everything the LSH index returns for this signature is marked as a
    # duplicate of row i, so it will be skipped when the loop reaches it.
    considered_asins.update(lsh.query(minhash))

# Output deduplicated ASINs
print("Unique ASINs count:", len(stage2_dedupe_asins))

Unique ASINs count: 16706


In [12]:
# Of all remaining products, keep only those whose ASIN is listed in
# stage2_dedupe_asins (built in the previous cell).
data = data[data['asin'].isin(stage2_dedupe_asins)]

In [13]:
# English stop words from the nltk corpus, held as a set for membership tests.
stop_words = {*stopwords.words('english')}
print('list of stop words:', stop_words)

def nlp_preprocessing(total_text, index, column):
    """Clean one text value and write it back into the global ``data`` frame.

    The text is split on whitespace.  Each token has its non-alphanumeric
    characters removed and is lower-cased; tokens that end up empty or that
    are in the global ``stop_words`` set are dropped.  The surviving tokens
    are joined with single spaces (no trailing space) and stored at
    ``data.loc[index, column]``.

    Parameters
    ----------
    total_text : str
        The raw text.  Any non-string value (None, NaN, numbers) is ignored
        and the frame is left untouched.
    index :
        Row label in ``data`` to update.
    column : str
        Column label in ``data`` to update.

    Returns
    -------
    None
    """
    if not isinstance(total_text, str):
        return

    cleaned_tokens = []
    for raw_token in total_text.split():
        # remove special chars like '"#$@!%^&*()_+-~?>< and lower-case the rest
        token = "".join(ch for ch in raw_token if ch.isalnum()).lower()
        # skip stop words and tokens that consisted only of special chars
        if token and token not in stop_words:
            cleaned_tokens.append(token)

    # Single .loc assignment: chained data[column][index] = ... may write to a
    # temporary copy and is a no-op under pandas Copy-on-Write.
    data.loc[index, column] = " ".join(cleaned_tokens)

list of stop words: {"shan't", 'be', 'so', 'through', 'here', 'what', "it's", 'that', "i'm", 'ourselves', 'has', "we've", "you're", "weren't", 'at', 'and', 'does', 're', 'those', 'his', 'yours', 'needn', 'wasn', 'her', 'your', 'i', "didn't", 'ain', 'should', "i've", 'we', 'is', "mustn't", 'ours', 'are', "they'd", "wasn't", 'shan', 'all', 'herself', 'aren', 'not', 'then', 'won', 'further', 'haven', 'most', 'about', 'off', 'on', 'why', 'yourselves', "hadn't", 'between', 'were', "we're", 's', 'do', 'will', "they'll", "don't", 'after', 'couldn', 've', "mightn't", 'an', "she's", 'for', 'only', "aren't", 'once', 'their', 'mightn', 't', 'both', "haven't", 'from', 'a', 'having', 'as', "they're", "should've", 'but', 'me', 'being', "he's", 'to', 'been', 'in', 'ma', 'until', "you'll", 'did', 'hers', 'than', "they've", "doesn't", 'hasn', 'who', "couldn't", 'them', 'above', "you'd", "we'd", "i'd", 'before', 'the', 'doing', "that'll", 'because', 'it', 'if', "she'd", 'how', 'where', "shouldn't", 'she

In [14]:
# Run the text clean-up on every product title and report the CPU time used.
start_time = time.process_time()
# The titles are snapshotted into a list first, so each call receives the
# title as it was before any clean-up was written back.
for index, title in zip(data.index, data['title'].tolist()):
    nlp_preprocessing(title, index, 'title')
elapsed = time.process_time() - start_time
print(elapsed, "seconds")

26.4375 seconds


In [32]:
# Sentence-BERT set-up: load the pre-trained encoder and embed every title once.
sbert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Image URLs and titles of the products, as plain lists in frame order.
image_urls = data['medium_image_url'].to_list()
titles = data['title'].to_list()

# One embedding per title; the progress bar shows the encoding batches.
sbert_embeddings = sbert_model.encode(titles, show_progress_bar=True)

def _format_price(price_raw):
    """Turn a raw price value into the text shown on a result card.

    Values that ``int()`` accepts are rendered with a rupee sign and thousands
    separators.  Any other non-empty string is shown unchanged (HTML-escaped);
    presumably these are already-formatted prices such as "$26.26" -- TODO
    confirm against the dataset.  Everything else (None, NaN, empty) gives
    "N/A".
    """
    try:
        return f"₹{int(price_raw):,}"
    except (TypeError, ValueError, OverflowError):
        pass
    if isinstance(price_raw, str) and price_raw.strip():
        return html.escape(price_raw.strip())
    return "N/A"


def sbert_recommender_gradio(input_title, num_results=5):
    """Return HTML cards for the products most similar to ``input_title``.

    The query is encoded with the global ``sbert_model`` and compared, by
    cosine similarity, with the global ``sbert_embeddings``; the scores are
    attached to the rows of the global ``data`` frame, so ``data`` must have
    exactly one row per embedding (a mismatch raises ValueError).

    Parameters
    ----------
    input_title : str
        Query text.  An empty or whitespace-only query returns a short notice
        instead of recommendations.
    num_results : int or float, default 5
        Number of cards to return; converted to int because UI sliders may
        deliver a float.  Negative values are treated as 0.

    Returns
    -------
    str
        One HTML card per match, best match first, joined by newlines.
    """
    if not isinstance(input_title, str) or not input_title.strip():
        return "<div>Please enter a product title.</div>"

    num_results = max(int(num_results), 0)

    # Encode input title
    query_vec = sbert_model.encode([input_title])

    # Compute cosine similarity with dataset
    sim_scores = cosine_similarity(query_vec, sbert_embeddings)[0]

    # Attach the scores to a copy of the frame and pick the top N matches;
    # nlargest avoids sorting the whole frame for every query.
    top_matches = data.assign(score=sim_scores).nlargest(num_results, 'score')

    results = []
    for _, row in top_matches.iterrows():
        # Dataset strings are escaped before being placed into markup.
        img_url = html.escape(str(row['medium_image_url']))
        brand = html.escape(str(row['brand']))
        title = html.escape(str(row['title']))
        price_str = _format_price(row['formatted_price'])
        score = row['score']

        # HTML layout
        card_html = f"""
        <div style="display:flex;align-items:center;border:1px solid #ccc;border-radius:10px;padding:10px;margin-bottom:10px;gap:20px;">
            <img src="{img_url}" alt="image" style="height:150px;border-radius:8px;transition:transform 0.3s ease-in-out;" onmouseover="this.style.transform='scale(1.1)'" onmouseout="this.style.transform='scale(1)'">
            <div>
                <div><b>{title}</b></div>
                <div>Brand: {brand}</div>
                <div>Price: {price_str}</div>
                <div>Similarity Score: {score:.2f}</div>
            </div>
        </div>
        """
        results.append(card_html)

    return "\n".join(results)






Batches: 100%|███████████████████████████████████████████████████████████████████████| 523/523 [05:42<00:00,  1.53it/s]


In [33]:
# Gradio web UI: a text box for the query title and a slider (1-10, default 5)
# for the number of results, wired to sbert_recommender_gradio; the returned
# string is rendered as HTML.
demo = gr.Interface(
    fn=sbert_recommender_gradio,
    inputs=[
        gr.Textbox(label="Enter Product Title"),
        gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Number of Recommendations"),
    ],
    outputs=gr.HTML(),
    title="Amazon Apparel Recommender (SBERT)",
    description="Enter a product title and get similar recommendations using SBERT.",
)
demo.launch()

* Running on local URL:  http://127.0.0.1:7867

To create a public link, set `share=True` in `launch()`.


