# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
import seaborn as sns
import re

# Import Dataset

In [None]:
datasetFolderPath = r'/Users/boonsuenoh/Documents/Dev/product-recommender-system/dataset/'

In [None]:
products = pd.read_json(datasetFolderPath + 'subset_meta_Electronics.json', lines=True)

In [None]:
products.head(50)

In [None]:
# There are 104802 records with 19 columns
print(products.shape)

## Remove unimportant columns

In [None]:
products = products.drop(columns=['fit', 'rank', 'details', 'tech1', 'tech2', 'price', 'date', 'imageURL', 'imageURLHighRes', 'similar_item'], errors='ignore')
products.head()

### Save the cleaned data to a new JSON file

In [None]:
products.to_json(datasetFolderPath + 'subset_meta_Electronics_cleansed.json')

In [15]:
# Can Jump Start Here
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
import seaborn as sns
import re

# NOTE(review): absolute, machine-specific path — adjust it before running on another machine.
datasetFolderPath = r'/Users/boonsuenoh/Documents/Dev/product-recommender-system/dataset/'
# Reload the trimmed file written by the `to_json` cell above, so the raw metadata file does not have to be read again.
products = pd.read_json(datasetFolderPath + 'subset_meta_Electronics_cleansed.json')
# Same column list as the drop cell above; errors='ignore' skips any column that is already absent.
products = products.drop(columns=['fit', 'rank', 'details', 'tech1', 'tech2', 'price', 'date', 'imageURL', 'imageURLHighRes', 'similar_item'], errors='ignore')
# Keep only the first 30000 rows.
# Presumably this bounds the size of the pairwise similarity matrices built later — TODO confirm.
products = products.head(30000)

# Data Preprocessing

In [16]:
# List every distinct main_cat value once, numbered from 1 in order of first appearance
distinct_main_cats = dict.fromkeys(products['main_cat'].tolist())
for position, main_cat_name in enumerate(distinct_main_cats, start=1):
    print(position, main_cat_name)

1 Camera & Photo
2 Books
3 All Electronics
4 Home Audio & Theater
5 Computers
6 Portable Audio & Accessories
7 Cell Phones & Accessories
8 Office Products
9 Sports & Outdoors
10 GPS & Navigation
11 Toys & Games
12 Software
13 Amazon Home
14 Arts, Crafts & Sewing
15 Baby
16 Car Electronics
17 Health & Personal Care
18 Tools & Home Improvement
19 Musical Instruments
20 Pet Supplies
21 Amazon Fashion
22 Amazon Devices
23 Industrial & Scientific
24 Automotive
25 Movies & TV
26 Video Games
27 All Beauty
28 Appliances


### Observation
- Notice that entries No. 21, 27, 30 and 32 are shown as image HTML tags, but they all actually represent the same main category, "Amazon Fashion".
- For some main categories, the '&' character appears as the HTML entity "&amp;", so the same category is listed under two different names.
- Therefore, entries No. 21, 27, 30 and 32 will be changed to "Amazon Fashion".
- The "&amp;" entity will also be replaced with "&" in main_cat, brand, title and category.

In [17]:
# Defining text cleaning function
def text_cleaning(text):
    """Decode the HTML leftovers found in the product metadata text.

    Parameters
    ----------
    text : str
        Raw value taken from one of the text columns.

    Returns
    -------
    str
        ``text`` with ``&amp;`` turned into ``&``, ``&quot;`` into ``"``,
        ``&reg;`` into ``®`` and every ``</span>`` closing tag removed.
        A value that is not a string (for example a missing value) is
        returned unchanged.
    """
    # Missing values are not strings; pass them through instead of letting re.sub raise TypeError
    if not isinstance(text, str):
        return text
    # Only touch 'amp;' when it belongs to an '&amp;' entity, so ordinary text such as
    # "Lamp; Shade" is left intact. The '+' also collapses double-encoded '&amp;amp;'.
    text = re.sub(r'&(?:amp;)+', '&', text)
    # Runs after the '&amp;' step so that '&amp;quot;' is decoded all the way to '"'
    text = re.sub(r'&quot;', '"', text)
    text = re.sub(r'&reg;', '®', text)
    text = re.sub(r'</span>', '', text)
    return text

# Collapse every main_cat value that contains 'AMAZON FASHION' into the plain label.
# na=False treats a missing main_cat as "no match" instead of producing a NaN entry in the mask.
is_amazon_fashion = products['main_cat'].str.contains('AMAZON FASHION', na=False)
products.loc[is_amazon_fashion, 'main_cat'] = 'Amazon Fashion'

# Flatten each list of category tags into one space-separated string.
# Values that are already strings are left alone, so re-running this cell does not
# join an already-flattened string character by character.
products['category'] = products['category'].apply(
    lambda tags: ' '.join(tags) if isinstance(tags, (list, tuple)) else tags
)

# Apply the text cleaning function to every text column
for column in ['main_cat', 'brand', 'title', 'category']:
    products[column] = products[column].apply(text_cleaning)

In [18]:
# After cleansing: print every distinct category string in sorted order, numbered from 1
distinct_categories = sorted(dict.fromkeys(products['category'].tolist()))
for position, category_text in enumerate(distinct_categories, start=1):
    print(position, category_text)

1 Electronics Accessories & Supplies
2 Electronics Accessories & Supplies Audio & Video Accessories
3 Electronics Accessories & Supplies Audio & Video Accessories 3D Glasses
4 Electronics Accessories & Supplies Audio & Video Accessories Antennas
5 Electronics Accessories & Supplies Audio & Video Accessories Antennas Radio Antennas
6 Electronics Accessories & Supplies Audio & Video Accessories Antennas TV Antennas
7 Electronics Accessories & Supplies Audio & Video Accessories Cables & Interconnects
8 Electronics Accessories & Supplies Audio & Video Accessories Cables & Interconnects Audio Cables
9 Electronics Accessories & Supplies Audio & Video Accessories Cables & Interconnects Audio Cables Digital Coaxial Cables
10 Electronics Accessories & Supplies Audio & Video Accessories Cables & Interconnects Audio Cables Fiber Optic Cables
11 Electronics Accessories & Supplies Audio & Video Accessories Cables & Interconnects Audio Cables RCA Cables
12 Electronics Accessories & Supplies Audio & 

In [19]:
# df = products.copy()
# df["category"] = df["category"].str.join(" ")

# sorted(list(dict.fromkeys(df['category'].tolist())))

# Exploratory Data Analysis

In [None]:
# Top 10 most frequent brands
# Draw on an explicit Axes and label it so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(10, 7))
products['brand'].value_counts().head(10).plot(kind="bar", ax=ax)
ax.set_title("Top 10 most frequent brands")
ax.set_xlabel("Brand")
ax.set_ylabel("Number of products")
plt.show()

In [None]:
# Top 10 most frequent main categories
# Draw on an explicit Axes and label it so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(10, 7))
products['main_cat'].value_counts().head(10).plot(kind="bar", ax=ax)
ax.set_title("Top 10 most frequent main categories")
ax.set_xlabel("Main category")
ax.set_ylabel("Number of products")
plt.show()

# Content Based Filtering Recommender System

### Recommendation of products based on similar *brand* only:

In [29]:
products_brand = products[['asin', 'title', 'brand', 'category', 'main_cat']]
products_brand.head()

Unnamed: 0,asin,title,brand,category,main_cat
0,11300000,Genuine Geovision 1 Channel 3rd Party NVR IP S...,GeoVision,Electronics Camera & Photo Video Surveillance ...,Camera & Photo
1,43396828,"Books ""Handbook of Astronomical Image Processi...",33 Books Co.,Electronics Camera & Photo,Camera & Photo
2,60009810,One Hot Summer,Visit Amazon's Carolina Garcia Aguilera Page,Electronics eBook Readers & Accessories eBook ...,Books
3,60219602,Hurray for Hattie Rabbit: Story and pictures (...,Visit Amazon's Dick Gackenbach Page,Electronics eBook Readers & Accessories eBook ...,Books
4,60786817,sex.lies.murder.fame.: A Novel,Visit Amazon's Lolita Files Page,Electronics eBook Readers & Accessories eBook ...,Books


In [30]:
# Word-level TF-IDF over the brand strings, with English stop words removed
tf = TfidfVectorizer(analyzer='word', stop_words='english')
# Fit the vocabulary and build the TF-IDF matrix for the brand column in one step
tfidf_matrix = tf.fit_transform(products_brand['brand'])
# Pairwise dot products between all rows. TfidfVectorizer L2-normalises rows by default,
# so the linear kernel gives the cosine similarity here.
# NOTE(review): the result is a dense n x n array; with the 30000 rows kept when the data
# was loaded, that is several GB if stored as float64 — confirm it fits in memory.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [31]:
cosine_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [32]:
# Build 1-dimensional array with product titles
titles = products_brand[['title', 'brand', 'category', 'main_cat']]
indices = pd.Series(products_brand.index, index=products_brand['title'])

# print(indices["Microsoft Mouse (PS/2)"].iloc[0])
# print(list(enumerate(cosine_sim[indices["One Hot Summer"]])))

# Function that get product recommendations based on the cosine similarity score of brands
def brand_recommendations(title):
    """Return up to 20 products whose brand is most similar to the given product.

    ``title`` is looked up in ``indices``; every product is then ranked by its
    entry in the matching row of ``cosine_sim`` (highest score first, ties kept
    in row order) and the corresponding rows of ``titles`` are returned. The
    ranking is not filtered, so the queried product itself can be part of the result.
    """
    match = indices[title]
    # A title shared by several products yields a Series; fall back to its first entry
    if isinstance(match, (int, np.int64)):
        row_number = match
    else:
        row_number = match.iloc[0]

    similarity_row = cosine_sim[row_number]
    # Stable descending sort of the row positions by their similarity score
    ranked_positions = sorted(
        range(len(similarity_row)),
        key=lambda position: similarity_row[position],
        reverse=True,
    )
    # Keep the 20 best-scoring positions and return their rows
    return titles.iloc[ranked_positions[:20]]

In [33]:
# Getting the product title input from user
title_input = input('Enter product title: ')

Enter product title: Microsoft Mouse (PS/2)


In [34]:
brand_recommendations(title_input)

Unnamed: 0,title,brand,category,main_cat
42,Microsoft MultiMedia Keyboard 1.0A KB-0168,Microsoft,Electronics Computers & Accessories Computer A...,Computers
2816,Microsoft Mouse (PS/2),Microsoft,Electronics Computers & Accessories Computer A...,All Electronics
2823,Microsoft Basic Mouse V1.0 Series and PS/2 Win...,Microsoft,Electronics Computers & Accessories Computer A...,All Electronics
2828,Microsoft Natural Keyboard Elite,Microsoft,Electronics Computers & Accessories Computer A...,All Electronics
2832,Microsoft 673-00089 IntelliMouse,Microsoft,Electronics Computers & Accessories Computer A...,All Electronics
2863,Microsoft IntelliMouse Trackball V1.0 and PS/2,Microsoft,Electronics Computers & Accessories Computer A...,All Electronics
2864,Microsoft Wheel Mouse for Windows 98,Microsoft,Electronics Computers & Accessories Computer A...,All Electronics
3668,Microsoft B75-00001 IntelliMouse Explorer,Microsoft,Electronics Computers & Accessories Computer A...,All Electronics
3670,Microsoft Natural Keyboard Pro,Microsoft,Electronics Computers & Accessories Computer A...,All Electronics
3671,Microsoft Cordless Wheel Mouse (PS2),Microsoft,Electronics Computers & Accessories Computer A...,All Electronics


In [None]:
# print(list(dict.fromkeys(products['title'].tolist())))
print(sorted(list(dict.fromkeys(products['brand'].tolist()))))

In [None]:
print(products.iloc[8]['title'])

### Recommendation of products based on similar *main category* only:

In [None]:
products_main_cat = products[['asin', 'title', 'main_cat', 'category', 'brand']]
products_main_cat.head()

In [None]:
# Remove stop words
tf_1 = TfidfVectorizer(analyzer='word', stop_words='english')
# Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix_1 = tf_1.fit_transform(products_main_cat['main_cat'])
# Compute the cosine similarity matrix
cosine_sim_1 = linear_kernel(tfidf_matrix_1, tfidf_matrix_1)

In [None]:
cosine_sim_1

In [None]:
# Build 1-dimensional array with product titles
titles_1 = products_main_cat[['title', 'main_cat', 'category', 'brand']]
indices_1 = pd.Series(products_main_cat.index, index=products_main_cat['title'])

# Function that get product recommendations based on the cosine similarity score of main_cat
def main_cat_recommendations(title):
    """Return up to 20 products whose main category is most similar to the given product.

    ``title`` is looked up in ``indices_1``; every product is then ranked by its
    entry in the matching row of ``cosine_sim_1`` (highest score first, ties kept
    in row order) and the corresponding rows of ``titles_1`` are returned. The
    ranking is not filtered, so the queried product itself can be part of the result.
    """
    match = indices_1[title]
    # A title shared by several products yields a Series; fall back to its first entry
    if isinstance(match, (int, np.int64)):
        row_number = match
    else:
        row_number = match.iloc[0]

    similarity_row = cosine_sim_1[row_number]
    # Stable descending sort of the row positions by their similarity score
    ranked_positions = sorted(
        range(len(similarity_row)),
        key=lambda position: similarity_row[position],
        reverse=True,
    )
    # Keep the 20 best-scoring positions and return their rows
    return titles_1.iloc[ranked_positions[:20]]

In [None]:
# Getting the product title input from user
title_input = input('Enter product title: ')

In [None]:
main_cat_recommendations(title_input)

In [None]:
# list(dict.fromkeys(products.iloc[200:300]['title'].tolist()))

products[products['brand'].str.contains("Microsoft", case=False)]

### Recommendation of products based on similar *category* tags only:

In [6]:
products_category = products[['asin', 'title', 'category', 'brand', 'main_cat']]
products_category.head()

Unnamed: 0,asin,title,category,brand,main_cat
0,11300000,Genuine Geovision 1 Channel 3rd Party NVR IP S...,Electronics Camera & Photo Video Surveillance ...,GeoVision,Camera & Photo
1,43396828,"Books ""Handbook of Astronomical Image Processi...",Electronics Camera & Photo,33 Books Co.,Camera & Photo
2,60009810,One Hot Summer,Electronics eBook Readers & Accessories eBook ...,Visit Amazon's Carolina Garcia Aguilera Page,Books
3,60219602,Hurray for Hattie Rabbit: Story and pictures (...,Electronics eBook Readers & Accessories eBook ...,Visit Amazon's Dick Gackenbach Page,Books
4,60786817,sex.lies.murder.fame.: A Novel,Electronics eBook Readers & Accessories eBook ...,Visit Amazon's Lolita Files Page,Books


In [7]:
# Remove stop words
tf_2 = TfidfVectorizer(analyzer='word', stop_words='english')
# Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix_2 = tf_2.fit_transform(products_category['category'])
# Compute the cosine similarity matrix
cosine_sim_2 = linear_kernel(tfidf_matrix_2, tfidf_matrix_2)

### Recommendation of products based on categories tags, brand and main category:

In [20]:
# Creating datasoup made of selected columns
products['ensemble'] = products['category'] + ' ' + products['brand'] + ' ' + products['main_cat']

# Printing record at index 0
print(products['ensemble'].iloc[0])

Electronics Camera & Photo Video Surveillance Surveillance Systems Surveillance DVR Kits GeoVision Camera & Photo


In [21]:
# Word-level TF-IDF over the combined text, using unigrams and bigrams with English stop words removed.
# min_df=1 keeps every term. An integer min_df of 0 selects the same vocabulary, but recent
# scikit-learn releases reject it because an integer min_df must be at least 1.
tf_coll = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix_coll = tf_coll.fit_transform(products['ensemble'])
# Rows are L2-normalised by default, so the linear kernel gives the cosine similarity
cosine_sim_coll = linear_kernel(tfidf_matrix_coll, tfidf_matrix_coll)

In [25]:
# Build a 1-dimensional array with product titles
titles_3 = products[['title', 'category', 'brand', 'main_cat']]
indices_3 = pd.Series(products.index, index=products['title'])

# Function that get product recommendations based on the cosine similarity score of ensemble
def ensemble_recommendations(title):
    """Return up to 20 products most similar to the given product on the combined text.

    ``title`` is looked up in ``indices_3``; every product is then ranked by its
    entry in the matching row of ``cosine_sim_coll`` (highest score first) and the
    corresponding rows of ``titles_3`` are returned. The ranking is not filtered,
    so the queried product itself can be part of the result.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices_3``.
    """
    # Get the index of the product that matches the title
    idx = indices_3[title]

    # Use the first product if there are multiple products with the same title.
    # The fallback reads the lookup result from indices_3 itself, not the
    # main-category lookup built in another section of the notebook.
    if not isinstance(idx, (int, np.integer)):
        idx = idx.iloc[0]

    # Get the pairwise similarity scores of all products with that product
    sim_scores = list(enumerate(cosine_sim_coll[idx]))
    # Sort the products by similarity score, highest first
    sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)
    # Keep the 20 best-scoring entries and extract their row positions
    product_indices = [position for position, _ in sim_scores[:20]]
    # Return the rows of the 20 most similar products
    return titles_3.iloc[product_indices]

In [35]:
# Getting the product name from user
# Example input: Sony CFD-C1000 Compact Stereo System
title_input = input('Enter product name: ')

Enter product name: Microsoft Mouse (PS/2)


In [36]:
ensemble_recommendations(title_input)

Unnamed: 0,title,category,brand,main_cat
2816,Microsoft Mouse (PS/2),Electronics Computers & Accessories Computer A...,Microsoft,All Electronics
2823,Microsoft Basic Mouse V1.0 Series and PS/2 Win...,Electronics Computers & Accessories Computer A...,Microsoft,All Electronics
2832,Microsoft 673-00089 IntelliMouse,Electronics Computers & Accessories Computer A...,Microsoft,All Electronics
2864,Microsoft Wheel Mouse for Windows 98,Electronics Computers & Accessories Computer A...,Microsoft,All Electronics
3668,Microsoft B75-00001 IntelliMouse Explorer,Electronics Computers & Accessories Computer A...,Microsoft,All Electronics
3671,Microsoft Cordless Wheel Mouse (PS2),Electronics Computers & Accessories Computer A...,Microsoft,All Electronics
4100,Microsoft D58-00002 Intellimouse Optical,Electronics Computers & Accessories Computer A...,Microsoft,All Electronics
7248,Microsoft Wheel Mouse Optical,Electronics Computers & Accessories Computer A...,Microsoft,All Electronics
9389,Microsoft Wheel Mouse Optical,Electronics Computers & Accessories Computer A...,Microsoft,All Electronics
9390,Microsoft Intellimouse Optical Mouse,Electronics Computers & Accessories Computer A...,Microsoft,All Electronics


## Using CountVectorizer to recommend products based on brand

In [37]:
products_brand_cv = products[['asin', 'title', 'brand', 'category', 'main_cat']]
products_brand_cv.head()

Unnamed: 0,asin,title,brand,category,main_cat
0,11300000,Genuine Geovision 1 Channel 3rd Party NVR IP S...,GeoVision,Electronics Camera & Photo Video Surveillance ...,Camera & Photo
1,43396828,"Books ""Handbook of Astronomical Image Processi...",33 Books Co.,Electronics Camera & Photo,Camera & Photo
2,60009810,One Hot Summer,Visit Amazon's Carolina Garcia Aguilera Page,Electronics eBook Readers & Accessories eBook ...,Books
3,60219602,Hurray for Hattie Rabbit: Story and pictures (...,Visit Amazon's Dick Gackenbach Page,Electronics eBook Readers & Accessories eBook ...,Books
4,60786817,sex.lies.murder.fame.: A Novel,Visit Amazon's Lolita Files Page,Electronics eBook Readers & Accessories eBook ...,Books


In [78]:
# Getting the product name from user
# Example input: Sony CFD-C1000 Compact Stereo System
title_input = input('Enter product name: ')

Enter product name: One Hot Summer


In [79]:
# Initialize vectorizer
# min_df = rare words, max_df = most used words
vect1 = CountVectorizer(analyzer = 'word', ngram_range = (1,2), stop_words = 'english', min_df = 0.002)

# Fit into the brands
vect1.fit(products_brand_cv['brand'])
title_matrix = vect1.transform(products_brand_cv['brand'])

In [80]:
features = vect1.get_feature_names_out()
features

array(['acoustic', 'acoustic research', 'adorama', 'aiwa', 'amazon',
       'antec', 'apc', 'apple', 'audio', 'audiovox', 'barnes',
       'barnes noble', 'battery', 'belkin', 'brother', 'bushnell', 'c2g',
       'c2g cables', 'cables', 'canon', 'case', 'case logic', 'celestron',
       'cisco', 'cisco systems', 'cobra', 'coby', 'cokin', 'compaq',
       'creative', 'da', 'da lite', 'days', 'days tech', 'digital',
       'direct', 'electronics', 'factory', 'factory direct', 'fellowes',
       'fujifilm', 'garmin', 'ge', 'hakuba', 'harman', 'hawking',
       'hawking technology', 'hewlett', 'hewlett packard', 'hosa', 'hoya',
       'hp', 'ibm', 'ilford', 'imation', 'innovations', 'intel', 'iogear',
       'iomega', 'jbl', 'jensen', 'jvc', 'kardon', 'kensington',
       'kenwood', 'kingston', 'kodak', 'konica', 'konica minolta', 'koss',
       'labtec', 'leica', 'lenmar', 'lexar', 'link', 'linksys', 'lite',
       'logic', 'logitech', 'lowepro', 'magellan', 'maxell', 'meade',
       'mem

In [81]:
cosine_sim_titles = cosine_similarity(title_matrix, title_matrix)
cosine_sim_titles

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [82]:
# Map each product title to its index value so the user's input can be resolved to a row
indices1 = pd.Series(products_brand_cv.index, index=products_brand_cv['title'])
title_id = indices1[title_input]
# Several products can share one title, in which case the lookup returns a Series.
# Use the first match so that title_id is always a single row, as the cells below expect.
if not isinstance(title_id, (int, np.integer)):
    title_id = title_id.iloc[0]

In [83]:
# Find out what features have been considered  by the vectorizer for a given title 
feature_array = np.squeeze(title_matrix[title_id].toarray()) #squeeze activity matrix into array
idx = np.where(feature_array > 0)

In [84]:
# Rank every product by its cosine similarity to the selected one, highest first
n = 21  # how many products to be recommended
top_n_idx = np.argsort(cosine_sim_titles[title_id])[::-1][:n]
top_n_sim_values = cosine_sim_titles[title_id, top_n_idx]

In [85]:
# find top n with values > 0
top_n_idx = top_n_idx[top_n_sim_values > 0]
scores = top_n_sim_values[top_n_sim_values > 0]

In [86]:
# Title at position title_id. This bare expression is not the last statement of the
# cell, so its value is not displayed.
products_brand_cv['title'].iloc[title_id]
# Table of the selected product titles next to their similarity scores
pd.DataFrame({"products": products_brand_cv['title'].iloc[top_n_idx].values,
           "scores":scores}, columns = ["products","scores"])

Unnamed: 0,products,scores
0,Nightfall,1.0
1,Summer Breeze (Four Seasons),1.0
2,Last Tango in Aberystwyth,1.0
3,Star Gazing,1.0
4,With You And Without You,1.0
5,The Shadow Guard,1.0
6,The Wolf Prince,1.0
7,The Bite Before Christmas,1.0
8,The Touch,1.0
9,Not a Sparrow Falls,1.0


## Using CountVectorizer to recommend products based on categories tags, brand and main category

In [60]:
# Getting the product name from user
# Input Sony CFD-C1000 Compact Stereo System
title_input = input('Enter the product name: ')

Enter the product name: Microsoft Mouse (PS/2)


In [61]:
# Initialize vectorizer
# min_df = rare words, max_df = most used words
vect2 = CountVectorizer(analyzer = 'word', ngram_range = (1,2), stop_words = 'english', min_df = 0.002)

#Fit into the collection
vect2.fit(products['ensemble'])
title_matrix1 = vect2.transform(products['ensemble'])

In [62]:
features1 = vect2.get_feature_names_out()
features1

array(['5e', '5e cables', 'ac', ..., 'wireless solutions', 'yamaha',
       'zip'], dtype=object)

In [63]:
cosine_sim_titles1 = cosine_similarity(title_matrix1, title_matrix1)

In [64]:
# Map each product title to its index value so the user's input can be resolved to a row
indices2 = pd.Series(products.index, index=products['title'])
title_id1 = indices2[title_input]
# Several products can share one title, in which case the lookup returns a Series.
# Use the first match so that title_id1 is always a single row, as the cells below expect.
if not isinstance(title_id1, (int, np.integer)):
    title_id1 = title_id1.iloc[0]

In [65]:
feature_array = np.squeeze(title_matrix1[title_id1].toarray()) #squeeze activity matrix into array
idx = np.where(feature_array > 0)

In [66]:
# Rank every product by its cosine similarity to the selected one, highest first
n = 21  # how many products to be recommended
top_n_idx1 = np.argsort(cosine_sim_titles1[title_id1])[::-1][:n]
top_n_sim_values1 = cosine_sim_titles1[title_id1, top_n_idx1]

In [67]:
# find top n with values > 0
top_n_idx1 = top_n_idx1[top_n_sim_values1 > 0]
scores1 = top_n_sim_values1[top_n_sim_values1 > 0]

In [68]:
# Title at position title_id1. This bare expression is not the last statement of the
# cell, so its value is not displayed.
products['title'].iloc[title_id1]
# Table of the selected product titles next to their similarity scores
pd.DataFrame({"products": products['title'].iloc[top_n_idx1].values,
           "scores":scores1}, columns = ["products","scores"])

Unnamed: 0,products,scores
0,Microsoft Intellimouse Explorer,1.0
1,Microsoft Wireless Optical Mouse - Black,1.0
2,Microsoft IntelliMouse 3.0,1.0
3,Microsoft Wireless Intellimouse Explorer SE - ...,1.0
4,Microsoft Wireless Optical Mouse Blue,1.0
5,"Microsoft Wheel Mouse Optical PS2&USB (Beige),...",1.0
6,Microsoft D58-00002 Intellimouse Optical,1.0
7,Microsoft Bluetooth Wireless Intellimouse Expl...,1.0
8,Microsoft Wireless Intellimouse Explorer for W...,1.0
9,Microsoft INTELLIMOUSE EXPLORER (B75-00011),1.0
