In [1]:
!pip install pandas numpy scikit-learn nltk gensim



In [33]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
import re

# Fetch the NLTK resources. 'stopwords' is required by clean_text(); 'punkt' is
# not referenced by the cells visible here but is kept in case other cells use it.
nltk.download('stopwords')
nltk.download('punkt')

# Load pre-trained GloVe embeddings (you might need to download a GloVe file)
glove_file = 'glove.6B.100d.txt'  # Or a different GloVe file (e.g., 300d for higher dimensionality)
# no_header=True: the file is read as plain "<word> <v1> ... <vN>" lines without
# the "<vocab_size> <dim>" first line of the word2vec text format.
glove_vectors = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)
# Take the dimensionality from the loaded vectors instead of hard-coding it, so
# switching `glove_file` can never leave `embedding_dim` out of sync.
embedding_dim = glove_vectors.vector_size

# Cache for the NLTK stop-word list so the corpus file is read once, not per call.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def clean_text(text, stop_words=None):
    """
    Normalise a piece of text for embedding lookup.

    Steps: cast to ``str``, strip every character that is neither a word
    character nor whitespace, lowercase, and drop stop words.

    Args:
        text: Value to clean; anything is accepted because it is passed through ``str()``.
        stop_words: Optional container of lowercase words to remove. When ``None``
            (the default) NLTK's English stop-word list is used.

    Returns:
        The remaining words joined by single spaces (``""`` if nothing is left).
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    # `flags=` must be passed by keyword: the 4th positional argument of re.sub is
    # `count`, so passing re.UNICODE (== 32) positionally capped the number of
    # removed punctuation characters at 32.
    text = re.sub(r'[^\w\s]', '', str(text), flags=re.UNICODE)  # Remove punctuation
    text = text.lower()  # Lowercase
    return " ".join(word for word in text.split() if word not in stop_words)

# Load the data (replace with your actual data loading)
MAX_ROWS = 500  # Only take a portion of the rows to make calculations manageable
TEXT_COLUMNS = ['TITLE', 'DESCRIPTION']

# Read as UTF-8. The 'unicode_escape' codec decodes bytes as Latin-1 and interprets
# backslash sequences, which garbles non-ASCII text (the recorded output of this
# cell shows mojibake in the 'query' column). Undecodable bytes are replaced
# rather than raising.
# Drop NA rows BEFORE any cast to str: astype(str) turns NaN into the literal
# string 'nan', which dropna() can no longer detect.
# .copy() makes `data` an independent frame, so later column assignments do not
# write into a slice of another DataFrame.
data = (
    pd.read_csv('Dataset.csv', encoding='utf-8', encoding_errors='replace')
    .dropna()
    .iloc[:MAX_ROWS]
    .copy()
)

# Clean each text column if it exists; otherwise report it and skip the operation.
# NOTE(review): the recorded run shows only query/product_id/esci_label/split
# columns, so both branches below printed the "not found" message - confirm that
# 'Dataset.csv' is the intended file.
for column in TEXT_COLUMNS:
    if column in data.columns:
        data[column] = data[column].astype(str).apply(clean_text)
    else:
        print(f"Column '{column}' not found in the DataFrame.")

# Preview the cleaned text columns when both exist, otherwise the whole frame.
preview = data[TEXT_COLUMNS] if all(column in data.columns for column in TEXT_COLUMNS) else data
print(preview.head())

def get_embedding(text, embedding_dim, glove_vectors):
    """
    Embed a text as the mean of the vectors of its whitespace-separated words.

    Words that are not contained in ``glove_vectors`` are skipped. If no word
    of the text is known, a zero vector of length ``embedding_dim`` is returned.
    """
    known_vectors = [glove_vectors[token] for token in text.split() if token in glove_vectors]
    if not known_vectors:
        # Nothing to average: fall back to the all-zero embedding.
        return np.zeros(embedding_dim)
    stacked = np.array(known_vectors)
    return np.mean(stacked, axis=0)

def calculate_similarity(embedding1, embedding2):
    """
    Return the cosine similarity of two embedding vectors as a scalar.

    Each vector is reshaped into a single-row matrix, as required by
    ``cosine_similarity``, and the only entry of the 1x1 result is returned.
    """
    row_a = embedding1.reshape(1, -1)
    row_b = embedding2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]


def find_similar_products(query, data, top_n=5):
    """
    Finds the top N most similar products to a given query.

    The query is cleaned with ``clean_text`` and embedded with ``get_embedding``
    (using the module-level ``embedding_dim`` and ``glove_vectors``); every title
    is embedded the same way and ranked by cosine similarity to the query.

    Args:
        query: Free-text search string.
        data: DataFrame with a 'TITLE' column of already cleaned strings.
        top_n: Number of best matches to return.

    Returns:
        DataFrame with columns 'TITLE' and 'QUERY_SIMILARITY', sorted by
        descending similarity, or an empty DataFrame if 'TITLE' is missing.

    Side effects:
        Writes/overwrites the 'TITLE_EMBEDDING' and 'QUERY_SIMILARITY' columns
        of ``data`` in place.
    """
    # Guard first so no work is done when the frame cannot be searched.
    if 'TITLE' not in data.columns:
        print("Column 'TITLE' not found in the DataFrame. Cannot generate embeddings.")
        return pd.DataFrame()  # Return empty DataFrame in case 'TITLE' is missing

    query_embedding = get_embedding(clean_text(query), embedding_dim, glove_vectors)

    # Always create/recreate 'TITLE_EMBEDDING' to ensure it's calculated on the current data
    # This is done to prevent issues when re-running the cell since `data` might be reset
    data['TITLE_EMBEDDING'] = data['TITLE'].apply(
        lambda title: get_embedding(title, embedding_dim, glove_vectors)
    )

    if len(data):
        # One vectorised call: stack all title vectors into an (n_rows, dim) matrix
        # and compare it against the single query row, instead of invoking
        # cosine_similarity once per row.
        title_matrix = np.vstack(data['TITLE_EMBEDDING'].tolist())
        scores = cosine_similarity(title_matrix, query_embedding.reshape(1, -1)).ravel()
    else:
        # np.vstack cannot stack an empty list; an empty frame has no scores.
        scores = np.empty(0)
    data['QUERY_SIMILARITY'] = scores

    similar_products = data.sort_values(by='QUERY_SIMILARITY', ascending=False).head(top_n)
    return similar_products[['TITLE', 'QUERY_SIMILARITY']]

# Example usage
query = "rtx laptop"
top_n = 5  # Used for both the search and the printed label so they cannot drift apart
similar_products = find_similar_products(query, data, top_n=top_n)
print(f"Top {top_n} products similar to '{query}':\n{similar_products}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Column 'TITLE' not found in the DataFrame.
Column 'DESCRIPTION' not found in the DataFrame.
                                 query  product_id esci_label      split
0          trellis for climbing plants  B00L5K420S          I  generated
1  bathroom countertop soap dispensers  B08FM2Q49N          E      train
2     gifts for 4 year old girls dress  B07477HKFR          I       test
3            amplifi gamerâs edition  B07HHHC8JB          E      train
4                  over watch t-shirts  B0027V2EXQ          I  generated
Column 'TITLE' not found in the DataFrame. Cannot generate embeddings.
Top 5 products similar to 'rtx laptop':
Empty DataFrame
Columns: []
Index: []


In [32]:
# Example usage
query = "rtx laptop"
top_n = 5  # Used for both the search and the printed label so they cannot drift apart
similar_products = find_similar_products(query, data, top_n=top_n)
print(f"Top {top_n} products similar to '{query}':\n{similar_products}")

Column 'TITLE' not found in the DataFrame. Cannot generate embeddings.
Top 5 products similar to 'rtx laptop':
Empty DataFrame
Columns: []
Index: []
