# Simple Recommender System

## 1. Importing Libraries

In [288]:
#import library to be used in the project
import numpy as np
import pandas as pd
import html
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

## 2. File Reading and Data Cleansing: products

In [289]:
# Read the electronics product metadata file.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# cannot run on another machine until the dataset location is made configurable.
products_dataset_path = r'C:\Users\User\Desktop\product-recommender-system\dataset\subset_meta_Electronics.json'
# lines=True makes pandas parse the file as one JSON object per line
products = pd.read_json(products_dataset_path, lines=True)

# Output the first 10 rows
products.head(10)

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,details
0,"[Electronics, Camera &amp; Photo, Video Survei...",,[The following camera brands and models have b...,,Genuine Geovision 1 Channel 3rd Party NVR IP S...,[],,GeoVision,"[Genuine Geovision 1 Channel NVR IP Software, ...","[>#3,092 in Tools &amp; Home Improvement &gt; ...",[],Camera &amp; Photo,,"January 28, 2014",$65.00,11300000,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
1,"[Electronics, Camera &amp; Photo]",,[This second edition of the Handbook of Astron...,,"Books ""Handbook of Astronomical Image Processi...",[0999470906],,33 Books Co.,[Detailed chapters cover these fundamental top...,"[>#55,933 in Camera &amp; Photo (See Top 100 i...","[0943396670, 1138055360, 0999470906]",Camera &amp; Photo,,"June 17, 2003",,43396828,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
2,"[Electronics, eBook Readers &amp; Accessories,...",,[A zesty tale. (Publishers Weekly)<br /><br />...,,One Hot Summer,"[0425167798, 039914157X]",,Visit Amazon's Carolina Garcia Aguilera Page,[],"3,105,177 in Books (",[],Books,,,$11.49,60009810,[],[],
3,"[Electronics, eBook Readers & Accessories, eBo...",,[],,Hurray for Hattie Rabbit: Story and pictures (...,"[0060219521, 0060219580, 0060219394]",,Visit Amazon's Dick Gackenbach Page,[],"2,024,298 in Books (","[0060219521, 0060219475, 0060219394]",Books,,,.a-section.a-spacing-mini{margin-bottom:6px!im...,60219602,[],[],
4,"[Electronics, eBook Readers & Accessories, eBo...",,[&#8220;sex.lies.murder.fame. is brillllli&#82...,,sex.lies.murder.fame.: A Novel,[],,Visit Amazon's Lolita Files Page,[],"3,778,828 in Books (",[],Books,,,$13.95,60786817,[],[],
5,"[Electronics, eBook Readers &amp; Accessories,...",,"[, ]",,College Physics,"[0073049557, 0134454170, 1118142063, 007733968...",,Visit Amazon's Alan Giambattista Page,[],"3,330,771 in Books (","[0073512141, 0077339681, 0073049557, 007304956...",Books,,,,70524076,[],[],
6,"[Electronics, eBook Readers & Accessories, eBo...",,[GIRL WITH A ONE-TRACK MIND: CONFESSIONS OF TH...,,Girl with a One-track Mind: Confessions of the...,[0330509691],,ABBY LEE,[],"3,304,037 in Books (",[B0719LDQR1],Books,,,$4.76,91912407,[],[],
7,"[Electronics, Portable Audio & Video, MP3 & MP...",,[Support system: Windows XP/Vsita/7 * SNR: 85d...,,abcGoodefg&reg; 4GB USB 2.0 Mp3 Music Player w...,"[B01NAJ3KQB, B00WYSPT0C, B00AF40U5G, B00OFVNM4...",,Crazy Cart,[Package Content: 1 x Display MP3 Player 1 x E...,"[>#177,454 in Electronics (See Top 100 in Elec...","[B01NAJ3KQB, B00OFVNM4G, B00L41WY8K, B07F34PNP...",All Electronics,"class=""a-bordered a-horizontal-stripes a-spa...","December 28, 2012",,101635370,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
8,"[Electronics, Headphones, Earbud Headphones]",,"[, <b>True High Definition Sound:</b><br>With ...",,Wireless Bluetooth Headphones Earbuds with Mic...,[],,Enter The Arena,[Superb Sound Quality: Plays crystal clear aud...,[>#950 in Cell Phones & Accessories (See Top 1...,[],Home Audio & Theater,,"October 23, 2017",$7.99,132492776,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
9,"[Electronics, Computers &amp; Accessories, Com...",,[],,Kelby Training DVD: Mastering Blend Modes in A...,[],,Kelby Training,[],"[>#932,732 in Computers &amp; Accessories &gt;...",[],Computers,,"December 9, 2011",,132793040,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,


In [290]:
# Print the (rows, columns) shape of the raw products table
print(products.shape)
# 104802 rows (records), 19 columns

(104802, 19)


In [291]:
# Retrieve the column names
products.columns

Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'main_cat', 'similar_item',
       'date', 'price', 'asin', 'imageURL', 'imageURLHighRes', 'details'],
      dtype='object')

### 2.1 General Cleansing

#### 2.1.1 Drop Unnecessary columns

In [292]:
# Retain only asin (the product id), title (the product name) and the three
# fields kept for filtering: brand, main_cat and price.
# errors='ignore' skips any listed label that is not present in the frame.
products = products.drop(
    labels=[
        'category', 'tech1', 'description', 'fit', 'also_buy', 'tech2',
        'feature', 'rank', 'also_view', 'similar_item', 'date',
        'imageURL', 'imageURLHighRes', 'details',
    ],
    axis='columns',
    errors='ignore',
)

products.columns

Index(['title', 'brand', 'main_cat', 'price', 'asin'], dtype='object')

In [293]:
# Put the identifier first, then give the columns descriptive names
# ('price' keeps its name).
products = products[['asin', 'title', 'brand', 'price', 'main_cat']].rename(
    columns={
        'asin': 'product_id',
        'title': 'product_name',
        'brand': 'brand_or_author',
        'main_cat': 'main_category',
    }
)

In [294]:
# Preview the first 10 rows with the reduced, renamed column set
products.head(10)

Unnamed: 0,product_id,product_name,brand_or_author,price,main_category
0,11300000,Genuine Geovision 1 Channel 3rd Party NVR IP S...,GeoVision,$65.00,Camera &amp; Photo
1,43396828,"Books ""Handbook of Astronomical Image Processi...",33 Books Co.,,Camera &amp; Photo
2,60009810,One Hot Summer,Visit Amazon's Carolina Garcia Aguilera Page,$11.49,Books
3,60219602,Hurray for Hattie Rabbit: Story and pictures (...,Visit Amazon's Dick Gackenbach Page,.a-section.a-spacing-mini{margin-bottom:6px!im...,Books
4,60786817,sex.lies.murder.fame.: A Novel,Visit Amazon's Lolita Files Page,$13.95,Books
5,70524076,College Physics,Visit Amazon's Alan Giambattista Page,,Books
6,91912407,Girl with a One-track Mind: Confessions of the...,ABBY LEE,$4.76,Books
7,101635370,abcGoodefg&reg; 4GB USB 2.0 Mp3 Music Player w...,Crazy Cart,,All Electronics
8,132492776,Wireless Bluetooth Headphones Earbuds with Mic...,Enter The Arena,$7.99,Home Audio & Theater
9,132793040,Kelby Training DVD: Mastering Blend Modes in A...,Kelby Training,,Computers


#### 2.1.2 Remove Duplicates

In [295]:
# Shape before removing duplicate rows
print(products.shape)

(104802, 5)


In [296]:
# Remove rows that are identical across all remaining columns.
# NOTE(review): this runs before the string normalisation of section 2.1.3, so
# rows that differ only by HTML-entity encoding or letter case are not
# collapsed here. The original index labels are also kept (no reset_index).
products = products.drop_duplicates()

# Shape after removing duplicate rows
print(products.shape)

(74434, 5)


#### 2.1.3 Format Strings

In [297]:
# Defining text cleaning function

def text_cleaning(text):
    """Normalise one text cell: decode HTML entities, then lower-case it.

    Parameters
    ----------
    text : object
        The cell value. Only ``str`` values are transformed.

    Returns
    -------
    object
        The unescaped, lower-cased string. Any non-string value (for example
        ``None`` or ``NaN`` standing for a missing cell) is returned unchanged
        instead of raising.
    """
    # html.unescape() only accepts strings; pass missing/non-text cells through
    if not isinstance(text, str):
        return text

    # 1. convert any HTML entities in the text to their corresponding characters
    # e.g. &amp; to &, &quot; to ", &reg; to ®
    text = html.unescape(text)

    # 2. convert to lower case
    return text.lower()


# Quick check of the cleaning function on a few HTML entities
text = "&amp; &quot &reg;"
cleaned_text = text_cleaning(text)
print(cleaned_text)

& " ®


In [298]:
# List of text columns to be cleaned.
# 'product_id' is deliberately NOT cleaned: it is the join key against the
# ratings data, whose ids keep their original upper-case letters (e.g.
# 'B0074BW614'). Lower-casing it here would make those ids unmatchable.
cols_to_clean = ['product_name', 'brand_or_author', 'price', 'main_category']

# Apply the text cleaning function to each column
for col in cols_to_clean:
    products[col] = products[col].apply(text_cleaning)

In [299]:
# Preview the first 10 rows after text cleaning
products.head(10)

Unnamed: 0,product_id,product_name,brand_or_author,price,main_category
0,11300000,genuine geovision 1 channel 3rd party nvr ip s...,geovision,$65.00,camera & photo
1,43396828,"books ""handbook of astronomical image processi...",33 books co.,,camera & photo
2,60009810,one hot summer,visit amazon's carolina garcia aguilera page,$11.49,books
3,60219602,hurray for hattie rabbit: story and pictures (...,visit amazon's dick gackenbach page,.a-section.a-spacing-mini{margin-bottom:6px!im...,books
4,60786817,sex.lies.murder.fame.: a novel,visit amazon's lolita files page,$13.95,books
5,70524076,college physics,visit amazon's alan giambattista page,,books
6,91912407,girl with a one-track mind: confessions of the...,abby lee,$4.76,books
7,101635370,abcgoodefg® 4gb usb 2.0 mp3 music player with ...,crazy cart,,all electronics
8,132492776,wireless bluetooth headphones earbuds with mic...,enter the arena,$7.99,home audio & theater
9,132793040,kelby training dvd: mastering blend modes in a...,kelby training,,computers


### 2.2 Cleaning 'main_category' column

In [300]:
# Count the products per distinct 'main_category' value; reset_index turns the
# grouped result into a two-column frame ('main_category', 'count')
main_category_df = products.groupby('main_category').size().reset_index(name='count')
main_category_df

Unnamed: 0,main_category,count
0,"<img src=""https://images-na.ssl-images-amazon....",38
1,"<img src=""https://images-na.ssl-images-amazon....",129
2,"<img src=""https://m.media-amazon.com/images/g/...",1
3,"<img src=""https://m.media-amazon.com/images/g/...",1
4,all beauty,30
5,all electronics,23269
6,amazon devices,37
7,amazon home,428
8,appliances,2
9,"arts, crafts & sewing",110


In [301]:
# The four <img> HTML snippets all contain the text 'amazon fashion', so they
# actually belong to the 'amazon fashion' category. They are selected by
# content rather than by row position, so the cell keeps working if the row
# order of main_category_df changes.
is_amazon_fashion = main_category_df['main_category'].str.contains('amazon fashion')
for category_html in main_category_df.loc[is_amazon_fashion, 'main_category']:
    print(category_html)

# Total number of 'amazon fashion' products (169 in the recorded run)
total_af = {
    'main_category': 'Total',
    'count': len(products.loc[products['main_category'].str.contains('amazon fashion')]),
}

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to add the summary row below the matching category rows.
pd.concat(
    [main_category_df.loc[is_amazon_fashion], pd.DataFrame([total_af])],
    ignore_index=True,
)

<img src="https://images-na.ssl-images-amazon.com/images/g/01/nav2/images/gui/amazon-fashion-store-new._cb520838675_.png" class="nav-categ-image" alt="amazon fashion" />
<img src="https://images-na.ssl-images-amazon.com/images/g/01/nav2/images/gui/amazon-fashion-store-new._cb520838675_.png" class="nav-categ-image" alt="amazon fashion"/>
<img src="https://m.media-amazon.com/images/g/01/nav2/images/gui/amazon-fashion-store-new._cb520838675_.png" class="nav-categ-image" alt="amazon fashion" />
<img src="https://m.media-amazon.com/images/g/01/nav2/images/gui/amazon-fashion-store-new._cb520838675_.png" class="nav-categ-image" alt="amazon fashion"/>


Unnamed: 0,main_category,count
0,"<img src=""https://images-na.ssl-images-amazon....",38
1,"<img src=""https://images-na.ssl-images-amazon....",129
2,"<img src=""https://m.media-amazon.com/images/g/...",1
3,"<img src=""https://m.media-amazon.com/images/g/...",1
4,Total,169


In [303]:
# Relabel the four image HTML elements as 'amazon fashion' in the
# main_category_df data frame (every row whose text contains 'amazon fashion')
main_category_df.loc[main_category_df['main_category'].str.contains('amazon fashion'), 'main_category'] = 'amazon fashion'

# Re-aggregate so the relabelled rows are summed into a single 'amazon fashion' row
main_category_df.groupby('main_category').sum()

Unnamed: 0_level_0,count
main_category,Unnamed: 1_level_1
all beauty,30
all electronics,23269
amazon devices,37
amazon fashion,169
amazon home,428
appliances,2
"arts, crafts & sewing",110
automotive,417
baby,19
books,335


In [304]:
# Relabel the four image HTML elements as 'amazon fashion' in the products data frame
products.loc[products['main_category'].str.contains('amazon fashion'), 'main_category'] = 'amazon fashion'

# Recount the products per main category to check the result
products.groupby('main_category').size().reset_index(name='count')

Unnamed: 0,main_category,count
0,all beauty,30
1,all electronics,23269
2,amazon devices,37
3,amazon fashion,169
4,amazon home,428
5,appliances,2
6,"arts, crafts & sewing",110
7,automotive,417
8,baby,19
9,books,335


### 2.3 Cleaning 'brand_or_author' column

In [305]:
# Values of the form "visit amazon's <name> page" actually hold the author's name.
# The pattern is anchored at both ends, so only values that consist entirely
# of this wrapper are selected.
pattern = "^visit amazon's (.*) page$"
# Boolean mask of the rows whose brand_or_author matches the pattern
mask = products['brand_or_author'].str.contains(pattern)
selected_columns = ['brand_or_author', 'main_category']
visit_amazon_pattern = products.loc[mask, selected_columns]

visit_amazon_pattern

Unnamed: 0,brand_or_author,main_category
2,visit amazon's carolina garcia aguilera page,books
3,visit amazon's dick gackenbach page,books
4,visit amazon's lolita files page,books
5,visit amazon's alan giambattista page,books
10,visit amazon's claire messud page,books
...,...,...
1403,visit amazon's dan wells page,books
1405,visit amazon's ismael cala page,books
1439,visit amazon's maría nuñez quesada page,books
68972,visit amazon's karin slaughter page,books


In [306]:
# Count the rows matching the "visit amazon's ... page" pattern per main category
visit_amazon_pattern.groupby('main_category').size().reset_index(name='count')

Unnamed: 0,main_category,count
0,books,181


In [307]:
# Clean the brand or author column
def clean_author_string(author_string):
    """Strip the "visit amazon's <name> page" wrapper from an author value.

    The wrapper is removed only when the value both starts with
    "visit amazon's " and ends with " page". Ordinary brand names that merely
    contain " page" (e.g. "yellow pages") are therefore left untouched.

    Parameters
    ----------
    author_string : object
        The brand/author cell value (expected to be already lower-cased).

    Returns
    -------
    object
        The bare author name for wrapped values; any other value, including
        non-strings, is returned unchanged.
    """
    prefix = "visit amazon's "
    suffix = " page"

    if not isinstance(author_string, str):
        return author_string

    # The length check stops prefix and suffix from overlapping on the same
    # space character (e.g. "visit amazon's page").
    is_wrapped = (
        len(author_string) >= len(prefix) + len(suffix)
        and author_string.startswith(prefix)
        and author_string.endswith(suffix)
    )
    if is_wrapped:
        return author_string[len(prefix):-len(suffix)]
    return author_string

# Apply the author clean-up to every value of the brand_or_author column
products['brand_or_author'] = products['brand_or_author'].apply(clean_author_string)

In [308]:
# Result: first 10 rows after cleaning the brand_or_author column
products.head(10)

Unnamed: 0,product_id,product_name,brand_or_author,price,main_category
0,11300000,genuine geovision 1 channel 3rd party nvr ip s...,geovision,$65.00,camera & photo
1,43396828,"books ""handbook of astronomical image processi...",33 books co.,,camera & photo
2,60009810,one hot summer,carolina garcia aguilera,$11.49,books
3,60219602,hurray for hattie rabbit: story and pictures (...,dick gackenbach,.a-section.a-spacing-mini{margin-bottom:6px!im...,books
4,60786817,sex.lies.murder.fame.: a novel,lolita files,$13.95,books
5,70524076,college physics,alan giambattista,,books
6,91912407,girl with a one-track mind: confessions of the...,abby lee,$4.76,books
7,101635370,abcgoodefg® 4gb usb 2.0 mp3 music player with ...,crazy cart,,all electronics
8,132492776,wireless bluetooth headphones earbuds with mic...,enter the arena,$7.99,home audio & theater
9,132793040,kelby training dvd: mastering blend modes in a...,kelby training,,computers


### 2.4 Cleaning 'price' column

In [310]:
# Clean the 'price' column
def clean_price_string(price_string):
    """Convert a raw price cell into a float, using 0.0 for unusable values.

    Parameters
    ----------
    price_string : object
        Normally a string such as "$65.00" or "$1,299.00". Already-numeric
        values are accepted too, so the cleaning cell can be re-run safely.

    Returns
    -------
    float
        The parsed price. 0.0 is returned when the value is missing, does not
        start with '$' (e.g. leaked CSS/HTML text), or cannot be parsed as a
        single number (e.g. a price range).
    """
    if not isinstance(price_string, str):
        # Already numeric (column cleaned before) or missing (None/NaN)
        try:
            price_float = float(price_string)
        except (TypeError, ValueError):
            return 0.0
        # NaN is the only float that differs from itself; treat it as "no price"
        return price_float if price_float == price_float else 0.0

    price_string = price_string.strip()
    if not price_string.startswith('$'):
        return 0.0

    # Drop the currency sign and thousands separators so that "$1,299.00"
    # parses as 1299.0 instead of silently falling back to 0.0.
    price_string = price_string.replace('$', '').replace(',', '')
    try:
        price_float = float(price_string)
    except ValueError:
        price_float = 0.0
    return price_float

# Replace the raw price strings with their parsed float values
products['price'] = products['price'].apply(clean_price_string)

In [311]:
# Result: first 10 rows after converting the price column to floats
products.head(10)

Unnamed: 0,product_id,product_name,brand_or_author,price,main_category
0,11300000,genuine geovision 1 channel 3rd party nvr ip s...,geovision,65.0,camera & photo
1,43396828,"books ""handbook of astronomical image processi...",33 books co.,0.0,camera & photo
2,60009810,one hot summer,carolina garcia aguilera,11.49,books
3,60219602,hurray for hattie rabbit: story and pictures (...,dick gackenbach,0.0,books
4,60786817,sex.lies.murder.fame.: a novel,lolita files,13.95,books
5,70524076,college physics,alan giambattista,0.0,books
6,91912407,girl with a one-track mind: confessions of the...,abby lee,4.76,books
7,101635370,abcgoodefg® 4gb usb 2.0 mp3 music player with ...,crazy cart,0.0,all electronics
8,132492776,wireless bluetooth headphones earbuds with mic...,enter the arena,7.99,home audio & theater
9,132793040,kelby training dvd: mastering blend modes in a...,kelby training,0.0,computers


## 3. File Reading and Data Cleansing: ratings

In [312]:
# Read the ratings file.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# cannot run on another machine until the dataset location is made configurable.
ratings_dataset_path = r'C:\Users\User\Desktop\product-recommender-system\dataset\electronic_user.csv'
# Column names are supplied explicitly via names= (presumably because the CSV
# has no header row; confirm against the file). index_col=False keeps the
# first column as data instead of using it as the index.
ratings = pd.read_csv(ratings_dataset_path, names=['user_id', 'product_id','rating','timestamp'], index_col=False)

# Output the first 10 rows
ratings.head(10)

Unnamed: 0,user_id,product_id,rating,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200
5,A1QGNMC6O1VW39,511189877,5.0,1397433600
6,A3J3BRHTDRFJ2G,511189877,2.0,1397433600
7,A2TY0BTJOTENPG,511189877,5.0,1395878400
8,A34ATBPOK6HCHY,511189877,5.0,1395532800
9,A89DO69P0XZ27,511189877,5.0,1395446400


In [317]:
# TODO(review): open questions left by the author: is the merge needed at this
# point for the simple recommender, and should duplicates be dropped first?

# Merge the products and ratings dataframes on their shared 'product_id' column.
# No how= is passed, so pandas' default inner join applies: only product ids
# present in both frames are kept.
products_merge = products.merge(ratings, on='product_id')

# Output the first 10 rows
products_merge.head(10)

Unnamed: 0,product_id,product_name,brand_or_author,price,main_category,user_id,rating,timestamp
0,132793040,kelby training dvd: mastering blend modes in a...,kelby training,0.0,computers,AKM1MP6P0OYPR,5.0,1365811200
1,321732944,kelby training dvd: adobe photoshop cs5 crash ...,kelby training,0.0,computers,A2CX7LUOHB2NDG,5.0,1341100800
2,511189877,clikr-5 time warner cable remote control ur5u-...,urc,0.0,all electronics,A1QGNMC6O1VW39,5.0,1397433600
3,511189877,clikr-5 time warner cable remote control ur5u-...,urc,0.0,all electronics,A3J3BRHTDRFJ2G,2.0,1397433600
4,511189877,clikr-5 time warner cable remote control ur5u-...,urc,0.0,all electronics,A2TY0BTJOTENPG,5.0,1395878400
5,511189877,clikr-5 time warner cable remote control ur5u-...,urc,0.0,all electronics,A34ATBPOK6HCHY,5.0,1395532800
6,511189877,clikr-5 time warner cable remote control ur5u-...,urc,0.0,all electronics,A89DO69P0XZ27,5.0,1395446400
7,511189877,clikr-5 time warner cable remote control ur5u-...,urc,0.0,all electronics,AZYNQZ94U6VDB,5.0,1401321600
8,528881469,rand mcnally 528881469 7-inch intelliroute tnd...,rand mcnally,0.0,all electronics,A1DA3W4GTFXP6O,5.0,1405641600
9,528881469,rand mcnally 528881469 7-inch intelliroute tnd...,rand mcnally,0.0,all electronics,A29LPQQDG7LD5J,1.0,1352073600


In [17]:
# NOTE(review): the imports and warning filter below repeat those of the first
# code cell of this notebook; keeping all imports in one cell at the top would
# remove the duplication.
# Import the libraries used in this part of the project
import numpy as np
import pandas as pd
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Read the file.
# NOTE(review): despite its name, datasetFolderPath holds the path of a single
# CSV file, the same file the earlier ratings cell reads via ratings_dataset_path.
datasetFolderPath = r'C:\Users\User\Desktop\product-recommender-system\dataset\electronic_user.csv'
product_ratings = pd.read_csv(datasetFolderPath, names=['user_id', 'product_id','rating','timestamp'], index_col=False)

# Set the index start from 1 instead of 0
product_ratings.index = product_ratings.index + 1

# Output the first 10 rows
product_ratings.head(10)

Unnamed: 0,user_id,product_id,rating,timestamp
1,AKM1MP6P0OYPR,132793040,5.0,1365811200
2,A2CX7LUOHB2NDG,321732944,5.0,1341100800
3,A2NWSAGRHCP8N5,439886341,1.0,1367193600
4,A2WNBOD3WNDNKT,439886341,3.0,1374451200
5,A1GI0U4ZRJA8WN,439886341,1.0,1334707200
6,A1QGNMC6O1VW39,511189877,5.0,1397433600
7,A3J3BRHTDRFJ2G,511189877,2.0,1397433600
8,A2TY0BTJOTENPG,511189877,5.0,1395878400
9,A34ATBPOK6HCHY,511189877,5.0,1395532800
10,A89DO69P0XZ27,511189877,5.0,1395446400


In [20]:
# Show the first row by position; its index label is 1 because the index was shifted above
product_ratings.iloc[0:1]

Unnamed: 0,user_id,product_id,rating,timestamp
1,AKM1MP6P0OYPR,132793040,5.0,1365811200


In [27]:
# Count the ratings per product, most-rated first. After count() each
# remaining column holds the per-product number of non-null entries; the
# 'user_id' column is used as the popularity measure and renamed to 'count',
# and it is the only column kept.
product_popularity = (
    product_ratings
    .groupby('product_id')
    .count()
    .sort_values(by='user_id', ascending=False)
    .rename(columns={'user_id': 'count'})
    [['count']]
)

# Show the ten most-rated products
product_popularity.head(10)



Unnamed: 0_level_0,count
product_id,Unnamed: 1_level_1
B0074BW614,18244
B00DR0PDNE,16454
B007WTAJTO,14172
B0019EHU8G,12285
B006GWO5WK,12226
B003ELYQGG,11617
B003ES5ZUU,10276
B007R5YDYA,9907
B00622AG6S,9823
B0002L5R78,9487


In [28]:
# (rows, columns) of the ratings table
product_ratings.shape

(7824482, 4)