In [None]:
# Semantic Product Search and Ranking
# interpreting meaning of words and phrases in the context of product searches
# understand query and return meaning of query
# understand input query, search for relevant products, and rank them based on highest to lowest matches
# training data -> query-result pairs
# deploy model on web application
# put Amazon dataset in a database and create a simple web interface with a text box
# user will enter query to the model, which will find the matched products
# display these results on the web
# use query , product title and product description
# recommendation use BERT to see if this works

# Requirements
# - Load all the data set, combine product_title and product_description columns which will be used for searching.
# - Perform text preprocessing: Convert all to lower case, remove stopwords, perform Lemmatization or Stemming, remove Special Character
# - Convert text data into numerical representations. Common techniques include: TF-IDF (Term Frequency-Inverse Document Frequency), Word Embeddings (e.g., Word2Vec, GloVe, FastText) for capturing semantic meaning. You can also use Pretrained Language Models (e.g., BERT, GPT) for text representation.
# - Divide the dataset into training, validation, and testing sets. Common splits include 70- 80% for training, 10-15% for validation, and 10-15% for testing.
# - Select a suitable deep-learning model for training and testing on the given dataset.
# - Visualization of the training and testing errors
# - Fine-tune the model hyper parameters.
# - Select appropriate evaluation metrics for your problem, such as NDCG, MAP, Precision at K (P@K), Recall at K (R@K), F1 at K (R@K) or any relevant metric.
# - Deploy the model such that it can be served in the web application. Allow the user to enter a query and it should return the closest matched products.
# - Write a detailed description of your solution and the challenges that you face as a report under Experimental Results Section. Show the accuracies obtained for the task and summarize them as a table.


In [2]:
#loading data
import pandas as pd
import pandas as pd
df_examples = pd.read_parquet('./datasets/shopping_queries_dataset_examples.parquet')
df_products = pd.read_parquet('./datasets/shopping_queries_dataset_products.parquet')
df_sources = pd.read_csv("./datasets/shopping_queries_dataset_sources.csv")

In [23]:
df_examples.head(1)

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split
0,0,revent 80 cfm,0,B000MOO21W,us,I,0,1,train


In [24]:
df_products.head(1)

Unnamed: 0,product_id,product_title,product_description,product_bullet_point,product_brand,product_color,product_locale
0,B079VKKJN7,"11 Degrees de los Hombres Playera con Logo, Ne...",Esta playera con el logo de la marca Carrier d...,11 Degrees Negro Playera con logo\nA estrenar ...,11 Degrees,Negro,es


In [25]:
df_sources.head(1)

Unnamed: 0,query_id,source
0,0,other


In [30]:
print(df_examples.shape)
print(df_products.shape)
print(df_sources.shape)

(2621288, 9)
(1814924, 7)
(130652, 2)


In [31]:
df_examples_products = pd.merge(
    df_examples,
    df_products,
    how='left',
    left_on=['product_locale','product_id'],
    right_on=['product_locale', 'product_id']
)

In [32]:
df_examples_products_source = pd.merge(
    df_examples_products,
    df_sources,
    how='left',
    left_on=['query_id'],
    right_on=['query_id']
)

In [36]:
df_examples_products_source = df_examples_products_source[df_examples_products_source["esci_label"] == 'E']

In [39]:
df_examples_products_source = df_examples_products_source[df_examples_products_source["product_locale"] == 'us']

In [40]:
df_examples_products_source

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color,source
1,1,revent 80 cfm,0,B07X3Y6B1V,us,E,0,1,train,Homewerks 7141-80 Bathroom Fan Integrated LED ...,,OUTSTANDING PERFORMANCE: This Homewerk's bath ...,Homewerks,80 CFM,other
2,2,revent 80 cfm,0,B07WDM7MQQ,us,E,0,1,train,Homewerks 7140-80 Bathroom Fan Ceiling Mount E...,,OUTSTANDING PERFORMANCE: This Homewerk's bath ...,Homewerks,White,other
3,3,revent 80 cfm,0,B07RH6Z8KW,us,E,0,1,train,Delta Electronics RAD80L BreezRadiance 80 CFM ...,This pre-owned or refurbished product has been...,Quiet operation at 1.5 sones\nBuilt-in thermos...,DELTA ELECTRONICS (AMERICAS) LTD.,White,other
4,4,revent 80 cfm,0,B07QJ7WYFQ,us,E,0,1,train,Panasonic FV-08VRE2 Ventilation Fan with Reces...,,The design solution for Fan/light combinations...,Panasonic,White,other
5,5,revent 80 cfm,0,B076Q7V5WX,us,E,0,1,train,Panasonic FV-0511VQ1 WhisperCeiling DC Ventila...,,Installation: Features a 4-inch or 6-inch duct...,Panasonic,White,other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2618563,2618563,자전거트레일러,130539,B07ZFJJZLF,us,E,1,1,train,"LivTee 5 pcs Auto Trim Removal Tool Kit, Inter...","5 pcs Auto Trim Removal Tool Kit, Interior Doo...",Made of super durable plastic material for lon...,LivTee,Blue,other
2618564,2618564,자전거트레일러,130539,B07G5VLMN1,us,E,1,1,train,MAXXHAUL 50025 Hitch Mounted 2-Bike Rack-100 l...,,Improved durable hitch adapter to fit standard...,MAXXHAUL,,other
2618566,2618566,자전거트레일러,130539,B010LLGWL8,us,E,1,1,train,"Burley Honey Bee, 2 Seat Kids Bike Trailer & S...",,Sport Type: Cycling,Burley Design,Red,other
2618567,2618567,자전거트레일러,130539,B010LLGWKE,us,E,1,1,train,"BURLEY Design Bee, 2 Seat, Lightweight, Kids B...",,sport type: Cycling,Burley Design,Yellow,other


In [41]:
df_examples_products_source.shape

(1247558, 15)

In [42]:
#dropping all queries not in english
import langid
# Function to detect language using langid
def detect_language(text):
    lang, _ = langid.classify(text)
    return lang == 'en'

import swifter

df_examples_products_source['query'] = df_examples_products_source['query'].swifter.apply(detect_language)

Pandas Apply:   0%|          | 0/1247558 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_examples_products_source['query'] = df_examples_products_source['query'].swifter.apply(detect_language)


In [44]:
df_examples_products_source.to_csv('examples_products_source_dataframe.csv', index=False)
#all english sources

In [10]:
import pandas as pd
df_examples_products_source = pd.read_csv('examples_products_source_dataframe.csv')

In [14]:
df_examples_products_source.head(2)
#1247558 rows × 15 columns

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color,source
0,1,True,0,B07X3Y6B1V,us,E,0,1,train,Homewerks 7141-80 Bathroom Fan Integrated LED ...,,OUTSTANDING PERFORMANCE: This Homewerk's bath ...,Homewerks,80 CFM,other
1,2,True,0,B07WDM7MQQ,us,E,0,1,train,Homewerks 7140-80 Bathroom Fan Ceiling Mount E...,,OUTSTANDING PERFORMANCE: This Homewerk's bath ...,Homewerks,White,other


In [15]:
df_examples_products_source['product_detail'] = (
    df_examples_products_source['product_title'].fillna('') +
    " " +
    df_examples_products_source['product_description'].fillna('')
)


In [19]:
df_examples_products_source.drop('product_title', axis=1, inplace=True)

KeyError: "['product_title'] not found in axis"

In [20]:
df_examples_products_source.drop('product_description', axis=1, inplace=True)

In [21]:
df_examples_products_source.head(1)

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_bullet_point,product_brand,product_color,source,product_detail
0,1,True,0,B07X3Y6B1V,us,E,0,1,train,OUTSTANDING PERFORMANCE: This Homewerk's bath ...,Homewerks,80 CFM,other,Homewerks 7141-80 Bathroom Fan Integrated LED ...


In [22]:
# Specify the columns to drop

# example_id	query	query_id	product_id	product_locale	esci_label	small_version	large_version	split	product_title	product_description	product_bullet_point	product_brand	product_color	source	product_details
columns_to_drop = ['product_locale', 'esci_label', 'small_version', 'large_version', 'split', 'product_bullet_point', 'product_brand', 'product_color']

# Drop the specified columns
df_examples_products_source = df_examples_products_source.drop(columns=columns_to_drop)

In [26]:
df_examples_products_source.head(2)

Unnamed: 0,example_id,query,query_id,product_id,source,product_detail
0,1,True,0,B07X3Y6B1V,other,Homewerks 7141-80 Bathroom Fan Integrated LED ...
1,2,True,0,B07WDM7MQQ,other,Homewerks 7140-80 Bathroom Fan Ceiling Mount E...


In [28]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def text_preprocessing(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-z\s]', '', text)

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # Join the words back into a string
    preprocessed_text = ' '.join(words)

    return preprocessed_text



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/zeerakzubair/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zeerakzubair/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zeerakzubair/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [29]:
# applying preprocessing on query and product detail

df_examples_products_source['product_detail'] = df_examples_products_source['product_detail'].apply(text_preprocessing)


In [1]:
df_examples_products_source.to_csv('preprocessed_data.csv', index=False)

NameError: name 'df_examples_products_source' is not defined

In [4]:
import pandas as pd
df_examples_products_source = pd.read_csv('preprocessed_data.csv')

In [5]:
df_examples = pd.read_parquet('./datasets/shopping_queries_dataset_examples.parquet')


In [6]:
df_examples.shape

(2621288, 9)

In [7]:
df_examples.head(2)

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split
0,0,revent 80 cfm,0,B000MOO21W,us,I,0,1,train
1,1,revent 80 cfm,0,B07X3Y6B1V,us,E,0,1,train


In [8]:
df_examples = df_examples[df_examples['product_locale'] == 'us']

In [9]:
df_examples.shape

(1818825, 9)

In [10]:
df_examples = df_examples[df_examples['esci_label'] == 'E']

In [11]:
df_examples.shape

(1247558, 9)

In [None]:
# view the result of this representation

In [12]:
df_examples_products_source.shape

(1247558, 6)

In [13]:
df_examples_products_source['query'] = df_examples['query']

In [14]:
df_examples_products_source.head(2)

Unnamed: 0,example_id,query,query_id,product_id,source,product_detail
0,1,,0,B07X3Y6B1V,other,homewerks bathroom fan integrated led light ce...
1,2,revent 80 cfm,0,B07WDM7MQQ,other,homewerks bathroom fan ceiling mount exhaust v...


In [15]:
df_examples_products_source.to_csv('preprocessed_data.csv', index=False)

In [16]:
df_examples_products_source['query'] = df_examples_products_source['query'].fillna('')  # Replace NaN with empty string


In [None]:
# -----------------------------------HERE DUMMY-----------------------------

In [1]:
import pandas as pd
df_examples_products_source = pd.read_csv('preprocessed_data.csv')
df_examples_products_source

Unnamed: 0,example_id,query,query_id,product_id,source,product_detail
0,1,,0,B07X3Y6B1V,other,homewerks bathroom fan integrated led light ce...
1,2,revent 80 cfm,0,B07WDM7MQQ,other,homewerks bathroom fan ceiling mount exhaust v...
2,3,revent 80 cfm,0,B07RH6Z8KW,other,delta electronics radl breezradiance cfm heate...
3,4,revent 80 cfm,0,B07QJ7WYFQ,other,panasonic fvvre ventilation fan recessed led r...
4,5,revent 80 cfm,0,B076Q7V5WX,other,panasonic fvvq whisperceiling dc ventilation f...
...,...,...,...,...,...,...
1247553,2618563,loftek,130539,B07ZFJJZLF,other,livtee pc auto trim removal tool kit interior ...
1247554,2618564,loftek,130539,B07G5VLMN1,other,maxxhaul hitch mounted bike rack lb capacity
1247555,2618566,loftek,130539,B010LLGWL8,other,burley honey bee seat kid bike trailer strolle...
1247556,2618567,lofters for men,130539,B010LLGWKE,other,burley design bee seat lightweight kid bikeonl...


In [2]:
# Drop the specified columns
columns_to_drop = ['example_id','query_id','product_id','source']
df_examples_products_source = df_examples_products_source.drop(columns=columns_to_drop)
df_examples_products_source

Unnamed: 0,query,product_detail
0,,homewerks bathroom fan integrated led light ce...
1,revent 80 cfm,homewerks bathroom fan ceiling mount exhaust v...
2,revent 80 cfm,delta electronics radl breezradiance cfm heate...
3,revent 80 cfm,panasonic fvvre ventilation fan recessed led r...
4,revent 80 cfm,panasonic fvvq whisperceiling dc ventilation f...
...,...,...
1247553,loftek,livtee pc auto trim removal tool kit interior ...
1247554,loftek,maxxhaul hitch mounted bike rack lb capacity
1247555,loftek,burley honey bee seat kid bike trailer strolle...
1247556,lofters for men,burley design bee seat lightweight kid bikeonl...


In [3]:
df_examples_products_source.to_csv('query_product_detail.csv', index=False)

In [None]:
##------------------------HERE DUMMY------------------------

In [2]:
import pandas as pd
df_examples_products_source = pd.read_csv('query_product_detail.csv')
df_examples_products_source

Unnamed: 0,query,product_detail
0,,homewerks bathroom fan integrated led light ce...
1,revent 80 cfm,homewerks bathroom fan ceiling mount exhaust v...
2,revent 80 cfm,delta electronics radl breezradiance cfm heate...
3,revent 80 cfm,panasonic fvvre ventilation fan recessed led r...
4,revent 80 cfm,panasonic fvvq whisperceiling dc ventilation f...
...,...,...
1247553,loftek,livtee pc auto trim removal tool kit interior ...
1247554,loftek,maxxhaul hitch mounted bike rack lb capacity
1247555,loftek,burley honey bee seat kid bike trailer strolle...
1247556,lofters for men,burley design bee seat lightweight kid bikeonl...


In [3]:
df_examples_products_source.dropna()

Unnamed: 0,query,product_detail
1,revent 80 cfm,homewerks bathroom fan ceiling mount exhaust v...
2,revent 80 cfm,delta electronics radl breezradiance cfm heate...
3,revent 80 cfm,panasonic fvvre ventilation fan recessed led r...
4,revent 80 cfm,panasonic fvvq whisperceiling dc ventilation f...
5,revent 80 cfm,panasonic fvvsl whispervalue dc ventilation fa...
...,...,...
1247553,loftek,livtee pc auto trim removal tool kit interior ...
1247554,loftek,maxxhaul hitch mounted bike rack lb capacity
1247555,loftek,burley honey bee seat kid bike trailer strolle...
1247556,lofters for men,burley design bee seat lightweight kid bikeonl...


In [4]:
df_examples_products_source.to_csv('query_product_detail.csv', index=False)

In [2]:
# Assuming df_examples_products_source is your DataFrame
# X: 'query', y: 'product_detail'
X = df_examples_products_source['query'].fillna('')  # Handle NaN values
y = df_examples_products_source['product_detail'].fillna('')

In [3]:
print(X[1])

 revent 80 cfm


In [4]:
print(y[1])

homewerks bathroom fan ceiling mount exhaust ventilation sone cfm white


In [10]:
#this is from the first attempt led to kernel crash
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to numerical representation using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a logistic regression classifier
model = LogisticRegression(max_iter=500)
model.fit(X_train_tfidf, y_train)

# Predictions on the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Additional evaluation metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


In [12]:
# trying to use BERT 
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn.functional import softmax

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

# Encode the input query
input_query = "revent 80 cfm"
encoded_input = tokenizer(input_query, return_tensors='pt')

# Make a prediction
with torch.no_grad():
    output = model(**encoded_input)

# Get the predicted class probabilities
probabilities = softmax(output.logits, dim=1).numpy()

# Print or use the probabilities as needed
print(probabilities)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[[1.]]


In [5]:
#using TF-IDF 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# # Assuming df is your DataFrame with 'query' and 'product_detail' columns
# df = pd.DataFrame({
#     'query': ['revent 80 cfm', 'loftek', 'lofters for men'],
#     'product_detail': ['homewerks bathroom fan integrated led light',
#                        'homewerks bathroom fan ceiling mount exhaust ventilation sone cfm white',
#                        'delta electronics radl breezradiance cfm heater',
#                        'panasonic fvvre ventilation fan recessed led']
# })

# Apply TF-IDF on the 'query' column
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X)

# Convert the TF-IDF matrix to a DataFrame for better visualization
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

# Display the TF-IDF results
print(tfidf_df)


        00  000  007  00m  00pp   01  013   02  02cool   03  ...  zapper   
0      0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0     0.0  0.0  ...     0.0  \
1      0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0     0.0  0.0  ...     0.0   
2      0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0     0.0  0.0  ...     0.0   
3      0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0     0.0  0.0  ...     0.0   
4      0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0     0.0  0.0  ...     0.0   
...    ...  ...  ...  ...   ...  ...  ...  ...     ...  ...  ...     ...   
49995  0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0     0.0  0.0  ...     0.0   
49996  0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0     0.0  0.0  ...     0.0   
49997  0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0     0.0  0.0  ...     0.0   
49998  0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0     0.0  0.0  ...     0.0   
49999  0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0     0.0  0.0  ...     0.0   

       zer0  zero  zinc  zip  ziplock  zipper  zirconia  zoom  zx6r  
0       0.0   0.0

In [1]:
# Doesn't work

# import seaborn as sns
# import matplotlib.pyplot as plt

# # Subsample a smaller portion of the data
# sample_size = 1000
# sampled_tfidf_df = tfidf_df.sample(sample_size, random_state=42)

# # Create a heatmap using Seaborn
# plt.figure(figsize=(10, 6))
# sns.heatmap(sampled_tfidf_df, cmap='viridis', annot=True, fmt=".2f", xticklabels=1, yticklabels=X.iloc[sampled_tfidf_df.index])
# plt.title('TF-IDF Heatmap for Queries (Sampled)')
# plt.xlabel('Terms')
# plt.ylabel('Queries')
# plt.show()


In [5]:
num_rows = 300000
df_examples_products_source = pd.read_csv('../task2/query_product_detail.csv', nrows=num_rows)
df_examples_products_source = df_examples_products_source.dropna()
df_examples_products_source.shape

(140568, 2)

In [6]:
# Assuming df_examples_products_source is your DataFrame
X = df_examples_products_source['query']
y = df_examples_products_source['product_detail']

In [7]:
# Assuming X and y are your variables
print("Type of X:", type(X))
print("Type of y:", type(y))


Type of X: <class 'pandas.core.series.Series'>
Type of y: <class 'pandas.core.series.Series'>


In [8]:
# Assuming X and y are pandas Series
element_type_X = type(X.iloc[0])
element_type_y = type(y.iloc[0])

print("Type of elements in X:", element_type_X)
print("Type of elements in y:", element_type_y)

Type of elements in X: <class 'str'>
Type of elements in y: <class 'str'>
