# Product Recommender System
- **Module 1**: Simple Recommender System (Chai Wei Qi)
- **Module 2**: Content-Based Filtering Recommender System (Oh Boon Suen)
- **Module 3**: Collaborative Filtering Recommender System (Tan Cherng Ming)

This project uses datasets of Amazon electronic products.<br>
Source: https://cseweb.ucsd.edu/~jmcauley/datasets/amazon_v2/ 

## Importing Libraries

In [None]:
# Import library to be used in the project
import pandas as pd
import numpy as np
import html
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
import seaborn as sns
import re
import warnings
warnings.filterwarnings('ignore')

## Importing Dataset

1. electronic_products.json 
2. user_ratings.csv

In [None]:
# Read the electronic products file
products_dataset_path = 'dataset/electronic_products.json'
global_products = pd.read_json(products_dataset_path, lines=True)

# Read the ratings file
ratings_dataset_path = 'dataset/user_ratings.csv'
global_ratings = pd.read_csv(ratings_dataset_path, names=['user_id', 'product_id','rating','timestamp'], index_col=False)

# Simple Recommender System
Done by Chai Wei Qi

## 2. File Reading and Features Engineering: products

In [None]:
# Read the electronic products file
products = global_products

# Output the first 10 rows
products.head(10)

In [None]:
print(products.shape)
# output: (rows, columns)

In [None]:
# retrieving column name
products.columns

### 2.1 General Cleansing

#### 2.1.1 Drop Unnecessary columns

In [None]:
# keep: asin, title, brand, main_cat, and price (some columns are for filtering usage)
products = products.drop(columns=['category', 'tech1', 'description', 'fit', 'also_buy', 'tech2',
       'feature', 'rank', 'also_view', 'similar_item', 'date', 'imageURL', 'imageURLHighRes', 'details'], errors='ignore')

products.columns

In [None]:
products = products[['asin', 'title', 'brand', 'price', 'main_cat']]
products.columns = ['product_id', 'product_name', 'brand_or_author', 'price', 'main_category']

In [None]:
products.head(10)

#### 2.1.2 Remove Duplicates

In [None]:
products.shape

In [None]:
products.duplicated().sum()

In [None]:
products = products.drop_duplicates()

products.shape

#### 2.1.3 Format Strings

In [None]:
# Defining text cleaning function

def text_cleaning(text):
    """Decode HTML entities in ``text`` and return the result in lower case.

    For example ``&amp;`` becomes ``&``, ``&quot;`` becomes ``"`` and
    ``&reg;`` becomes ``®``.
    """
    decoded = html.unescape(text)
    return decoded.lower()

text = "&amp; &quot &reg;"
cleaned_text = text_cleaning(text)
print(cleaned_text)

In [None]:
# List of columns to be cleaned
cols_to_clean = ['product_name', 'brand_or_author', 'price', 'main_category']

# Apply the text cleaning function to each column
for col in cols_to_clean:
    products[col] = products[col].apply(lambda x: text_cleaning(x))

In [None]:
products

### 2.2 Cleaning 'main_category' column

In [None]:
main_category_df = products.groupby('main_category').size().reset_index(name='count')
main_category_df

In [None]:
# The four image HTML elements actually belong to 'amazon fashion'
for i in (main_category_df.loc[0:3, 'main_category']):
    print(i)

# There are total 169 products for amazon fashion
# DataFrame.append was removed in pandas 2.0, so the summary row is added with pd.concat.
is_amazon_fashion = main_category_df['main_category'].str.contains('amazon fashion')
total_af = pd.DataFrame([{
    'main_category': 'Total',
    'count': len(products.loc[products['main_category'].str.contains('amazon fashion')]),
}])
pd.concat([main_category_df.loc[is_amazon_fashion], total_af], ignore_index=True)

In [None]:
# assign the four image HTML elements to 'amazon fashion' in main_category_df data frame
main_category_df.loc[main_category_df['main_category'].str.contains('amazon fashion'), 'main_category'] = 'amazon fashion'

main_category_df.groupby('main_category').sum()

In [None]:
# assign the four image HTML elements to 'amazon fashion' in products data frame
products.loc[products['main_category'].str.contains('amazon fashion'), 'main_category'] = 'amazon fashion'

products.groupby('main_category').size().reset_index(name='count')

### 2.3 Cleaning 'brand_or_author' column

In [None]:
# The visit amazon's .... page pattern is actually the author.
pattern = "^visit amazon's (.*) page$"
mask = products['brand_or_author'].str.contains(pattern)
selected_columns = ['brand_or_author', 'main_category']
visit_amazon_pattern = products.loc[mask, selected_columns]

visit_amazon_pattern

In [None]:
visit_amazon_pattern.groupby('main_category').size().reset_index(name='count')

In [None]:
# Clean the brand or author column
def clean_author_string(author_string):
    """Strip the "visit amazon's <name> page" wrapper from a brand/author value.

    The wrapper is removed only when the string both starts with
    "visit amazon's " and ends with " page". Any other value is returned
    unchanged, so a brand that merely contains " page" (for example
    "front page media") is no longer corrupted.

    Args:
        author_string: Brand or author text (already lower-cased).

    Returns:
        The bare name when the wrapper is present, otherwise the input as-is.
    """
    prefix = "visit amazon's "
    suffix = " page"
    # The length check stops the prefix and suffix from overlapping on
    # degenerate input such as "visit amazon's page".
    is_wrapped = (
        len(author_string) >= len(prefix) + len(suffix)
        and author_string.startswith(prefix)
        and author_string.endswith(suffix)
    )
    if not is_wrapped:
        return author_string
    return author_string[len(prefix):-len(suffix)]

products['brand_or_author'] = products['brand_or_author'].apply(clean_author_string)

In [None]:
# result
products.head(10)

### 2.4 Cleaning 'price' column

In [None]:
# Rows whose price text does not start with a dollar sign.
# A raw string is required: '\$' in a normal string literal is an invalid escape sequence.
dirty_price_df = products[~products['price'].str.contains(r'^\$')]

dirty_price_type_df = dirty_price_df.groupby('price').size().reset_index(name='count')
dirty_price_type_df

In [None]:
for i in (dirty_price_type_df.loc[1:6, 'price']):
    print(i)

In [None]:
# Since all the dirty price data have no price information (are all css or html code)
# We can set them all to '0'
def clean_price_string(price_string):
    """Convert a raw price string to a float, using 0.0 when no price is present.

    Values that do not start with ``$`` (the dirty CSS/HTML fragments), values
    that are not strings at all (e.g. NaN) and values that cannot be parsed as
    a single number (e.g. a price range) all become 0.0.

    Args:
        price_string: Raw price text such as "$12.50" or "$1,299.99".

    Returns:
        The price as a float, or 0.0 when it cannot be determined.
    """
    if not isinstance(price_string, str):
        return 0.0

    price_string = price_string.strip()
    if not price_string.startswith('$'):
        return 0.0

    # Drop the currency sign and thousands separators so that "$1,299.99"
    # parses instead of silently collapsing to 0.0.
    numeric_text = price_string.replace('$', '').replace(',', '')
    try:
        return float(numeric_text)
    except ValueError:
        return 0.0

products['price'] = products['price'].apply(clean_price_string)

In [None]:
# result
products.head(10)

## 3. File Reading and Features Engineering: ratings

In [None]:
# Read the ratings file
ratings = global_ratings

In [None]:
# Output the first 10 rows
ratings.head(10)

In [None]:
ratings.columns

### 3.1 Features Engineering

#### 3.1.1 Check Duplicates ( no duplicated rows )

In [None]:
ratings.shape

In [None]:
# must check duplicates first before dropping the columns
# because all the four columns, especially the timestamp, are determining the duplication

ratings.duplicated().sum()

# no need to drop duplicates
# no need: ratings = ratings.drop_duplicates()

#### 3.1.2 Drop Unnecessary Columns

In [None]:
# keep: product_id and rating  (each rating is valuable only)
# drop: user_id and timestamp
ratings = ratings.drop(columns=['user_id', 'timestamp'], errors='ignore')

ratings.columns

In [None]:
ratings.head(10)

#### 3.1.3 Create  'rating_average' and 'rating_count' from 'rating'

In [None]:
# Calculate the average rating and count of ratings for each product_id
ratings = ratings.groupby('product_id').agg(rating_average=('rating', 'mean'), rating_count=('rating', 'count'))

# use (0,1,2,3,4,5,6 as row indication instead of product_id)
ratings = ratings.reset_index()

In [None]:
ratings.head(10)

In [None]:
ratings.shape

In [None]:
# Top 10 products with the highest rating_count first then highest rating_average
ratings.sort_values(by=['rating_count','rating_average'], ascending=[False, False]).head(10)

## 4. Merge 'products' and 'ratings' into 'products_merge'

In [None]:
# Merge the products and ratings dataframes (keep all the products records)
products_merge = pd.merge(products, ratings, on='product_id', how='left')

# Output the first 10 rows
products_merge.head(10)

In [None]:
products_merge.shape

### 4.1 Replace NaN

In [None]:
products_merge.isnull().sum()

In [None]:
products_merge[products_merge['main_category'] == 'all electronics'].count()

In [None]:
products_merge[products_merge['main_category'] == 'all electronics'].isnull().sum()

In [None]:
products_merge.fillna({'rating_average': 0.0, 'rating_count': 0}, inplace=True)

In [None]:
products_merge.isnull().sum()

## 5. Simple Recommender System

### 5.1 Simple Rating Sort

▪ Sorting 'products_merge' by multiple columns.

▪ Issue: A product with a very high **rating_count** can still have a low **rating_average**.

In [None]:
# Top 20 products with the highest rating_count first then highest rating_average
products_merge.sort_values(by=['rating_count','rating_average'], ascending=[False, False]).head(20)

### 5.2 Weighted Rating

▪ A *weighted rating* that takes into account the **rating_average** and the **rating_count** it has accumulated.

▪ We can calculate the Weighted Rating Score into a new 'score' column.

▪ The formula of weighted rating is as follows:

<img src="weighted_rating.png" width="600">

\>>> **v** is the number of ratings for the product (represented by **rating_count**)

\>>> **m** is the **minimum rating count** required to be listed in the chart (to be calculated)

\>>> **R** is the average rating of the product (represented by **rating_average**)

\>>> **C** is the **mean of rating average** across the whole dataframe (to be calculated)

In [None]:
# describe() - gives an overall statistical summary of the dataframe
# C can be viewed using describe().

products_merge.describe()

# From the output:
# get C: mean of **rating_average** of product

In [None]:
# C, the mean of rating average across the whole dataframe
C = products_merge['rating_average'].mean()
C

In [None]:
# m, minimum rating count required to be listed in the chart

# consider the 90th percentile.

# for a product to be recommended, it must have more votes than at least 90% of the products.

m = products_merge['rating_count'].quantile(0.90) 
m

In [None]:
q_products = products_merge.copy().loc[products_merge['rating_count'] >= m]

q_products

In [None]:
# Function that computes the weighted rating of each product
def weighted_rating(x, m=m, C=C):
    """Compute the IMDB-style weighted rating for one product row.

    Args:
        x: Row providing 'rating_count' (v) and 'rating_average' (R).
        m: Minimum rating count required to be listed; defaults to the
            module-level ``m`` at the time this function is defined.
        C: Mean rating average; defaults to the module-level ``C`` at the
            time this function is defined.

    Returns:
        ``v / (v + m) * R + m / (v + m) * C``, or ``C`` when ``v + m`` is zero.
    """
    v = x['rating_count']
    R = x['rating_average']

    total = v + m
    # With no ratings and a zero threshold the formula is 0/0; fall back to
    # the mean C instead of producing NaN or a ZeroDivisionError.
    if total == 0:
        return C

    # Calculation based on the IMDB formula
    return (v / total * R) + (m / total * C)

In [None]:
q_products['score'] = q_products.apply(weighted_rating, axis=1)

q_products.head(20)

### 5.3 Result using Weighted Rating

#### 5.3.1 Recommending Top 20 Products

▪ Sort q_products in descending order based on the score feature column.

▪ Output the product_name, main_category, brand_or_author, price, rating_average, rating_count, and weighted rating (score) of the top 20 products.

In [None]:
# Sort products based on 'score' and recommend the top 20 products
top_20_proucts = q_products.sort_values('score', ascending = False).head(20).reset_index()

columns = ['product_id', 'product_name', 'main_category','brand_or_author', 'price',  'rating_average', 'rating_count', 'score']
top_20_proucts = top_20_proucts [columns]

top_20_proucts.index = top_20_proucts.index + 1

top_20_proucts

#### 5.3.2 Recommending Top 20 Products According to product_name

In [None]:
# E.g. speaker
product_name = input("Enter the product name : ")

In [None]:
# Match the user's text as a literal substring: with the default regex=True an
# input such as "c++" or "(" raises re.error or matches unintended rows.
name_matches = q_products['product_name'].str.contains(product_name.lower(), regex=False)
top_20_product_name = q_products[name_matches]

# Rank by weighted score and renumber the rows from 1 for display
top_20_product_name = top_20_product_name.sort_values('score', ascending=False).reset_index(drop=True)[columns]
top_20_product_name.index = top_20_product_name.index + 1

top_20_product_name.head(20)

#### 5.3.3 Recommending Top 20 Products According to main_category

In [None]:
# e.g camera
main_category = input("Enter the main category : ")

In [None]:
# Match the user's text as a literal substring rather than a regular expression
category_matches = q_products['main_category'].str.contains(main_category.lower(), regex=False)
top_20_main_category = q_products[category_matches]

# drop=True keeps the old row labels from leaking into the output as an extra 'index' column
top_20_main_category = top_20_main_category[columns].sort_values('score', ascending=False).reset_index(drop=True)
top_20_main_category.index = top_20_main_category.index + 1

top_20_main_category.head(20)

#### 5.3.4 Recommending Top 20 Products According to brand_or_author

In [None]:
# e.g. microsoft
brand_or_author = input("Enter the brand or author : ")

In [None]:
# Match the user's text as a literal substring rather than a regular expression
brand_matches = q_products['brand_or_author'].str.contains(brand_or_author.lower(), regex=False)
top_20_brand_or_author = q_products[brand_matches]

# drop=True keeps the old row labels from leaking into the output as an extra 'index' column
top_20_brand_or_author = top_20_brand_or_author[columns].sort_values('score', ascending=False).reset_index(drop=True)
top_20_brand_or_author.index = top_20_brand_or_author.index + 1

top_20_brand_or_author.head(20)

#### 5.3.5 Recommending Top 20 Products According to price range

In [None]:
# e.g. 10, 100
# Prices are stored as floats, so accept decimal bounds such as 9.99 as well
min_price = float(input("Enter the minimum price : "))
max_price = float(input("Enter the maximum price : "))

In [None]:
# Keep the products whose price lies inside the inclusive [min_price, max_price] range
in_price_range = (q_products['price'] >= min_price) & (q_products['price'] <= max_price)
top_20_within_price_range = q_products[in_price_range]

# drop=True keeps the old row labels from leaking into the output as an extra 'index' column
top_20_within_price_range = top_20_within_price_range[columns].sort_values('score', ascending=False).reset_index(drop=True)
top_20_within_price_range.index = top_20_within_price_range.index + 1

top_20_within_price_range.head(20)

# Content-Based Filtering Recommender System
Done by Oh Boon Suen

## 2. File Reading and Features Engineering: products

In [None]:
# Reuse the products dataframe already loaded in the "Importing Dataset" cell
# (global_products) instead of parsing the JSON file a second time. A copy is
# taken so that this module's cleaning steps cannot modify the shared dataframe.
products = global_products.copy()

# Output the first 10 rows
products.head(10)

In [None]:
print(products.shape)
# output: (rows, columns)

In [None]:
# retrieving column name
products.columns

In [None]:
# There are 104802 records with 19 columns
print(products.shape)

### 2.1 General Cleansing

#### 2.1.1 Drop Unnecessary columns

In [None]:
# keep: asin, title, brand, main_cat, category
products = products.drop(columns=['fit', 'description', 'rank', 'details', 'also_buy', 
                                  'tech1', 'tech2', 'feature', 'price', 'date', 'imageURL', 
                                  'also_view', 'imageURLHighRes', 'similar_item'], errors='ignore')
products.columns

In [None]:
products = products[['asin', 'title', 'brand', 'main_cat', 'category']]
products.columns = ['product_id', 'product_name', 'brand_or_author', 'main_category', 'category_tags']

In [None]:
products.head(10)

#### 2.1.2 Remove Duplicates

In [None]:
products.shape

In [None]:
products["category_tags"] = products["category_tags"].str.join(" ")
products.duplicated().sum()

In [None]:
products = products.drop_duplicates()

products.shape

#### 2.1.3 Format Strings

In [None]:
# Defining text cleaning function
def text_cleaning(text):
    """Return ``text`` with HTML entities decoded and all letters lower-cased.

    Entities such as ``&amp;``, ``&quot;`` and ``&reg;`` are turned into the
    characters they stand for (``&``, ``"``, ``®``) before lower-casing.
    """
    return html.unescape(text).lower()

text = "&amp; &quot &reg;"
cleaned_text = text_cleaning(text)
print(cleaned_text)

In [None]:
# List of columns to be cleaned
cols_to_clean = ['product_name', 'brand_or_author', 'main_category', 'category_tags']

# Apply the text cleaning function to each column
for col in cols_to_clean:
    products[col] = products[col].apply(lambda x: text_cleaning(x))

In [None]:
products

### 2.2 Cleaning 'main_category' column

In [None]:
main_category_df = products.groupby('main_category').size().reset_index(name='count')
main_category_df

In [None]:
# The four image HTML elements actually belong to 'amazon fashion'
for i in (main_category_df.loc[0:3, 'main_category']):
    print(i)

# There are total 169 products for amazon fashion
# DataFrame.append was removed in pandas 2.0, so the summary row is added with pd.concat.
is_amazon_fashion = main_category_df['main_category'].str.contains('amazon fashion')
total_af = pd.DataFrame([{
    'main_category': 'Total',
    'count': len(products.loc[products['main_category'].str.contains('amazon fashion')]),
}])
pd.concat([main_category_df.loc[is_amazon_fashion], total_af], ignore_index=True)

In [None]:
# assign the four image HTML elements to 'amazon fashion' in main_category_df data frame
main_category_df.loc[main_category_df['main_category'].str.contains('amazon fashion'), 'main_category'] = 'amazon fashion'

main_category_df.groupby('main_category').sum()

In [None]:
# assign the four image HTML elements to 'amazon fashion' in products data frame
products.loc[products['main_category'].str.contains('amazon fashion'), 'main_category'] = 'amazon fashion'

products.groupby('main_category').size().reset_index(name='count')

### 2.3 Cleaning 'brand_or_author' column

In [None]:
# The visit amazon's .... page pattern is actually the author.
pattern = "^visit amazon's (.*) page$"
mask = products['brand_or_author'].str.contains(pattern)
selected_columns = ['brand_or_author', 'main_category']
visit_amazon_pattern = products.loc[mask, selected_columns]

visit_amazon_pattern

In [None]:
visit_amazon_pattern.groupby('main_category').size().reset_index(name='count')

In [None]:
# Clean the brand or author column
def clean_author_string(author_string):
    """Strip the "visit amazon's <name> page" wrapper from a brand/author value.

    The wrapper is removed only when the string both starts with
    "visit amazon's " and ends with " page". Any other value is returned
    unchanged, so a brand that merely contains " page" (for example
    "front page media") is no longer corrupted.

    Args:
        author_string: Brand or author text (already lower-cased).

    Returns:
        The bare name when the wrapper is present, otherwise the input as-is.
    """
    prefix = "visit amazon's "
    suffix = " page"
    # The length check stops the prefix and suffix from overlapping on
    # degenerate input such as "visit amazon's page".
    is_wrapped = (
        len(author_string) >= len(prefix) + len(suffix)
        and author_string.startswith(prefix)
        and author_string.endswith(suffix)
    )
    if not is_wrapped:
        return author_string
    return author_string[len(prefix):-len(suffix)]

products['brand_or_author'] = products['brand_or_author'].apply(clean_author_string)

In [None]:
# result
products.head(10)

### 2.4 Creating 'ensemble' column

In [None]:
# Creating datasoup made of the selected 4 columns
products['ensemble'] = products['product_name'] + ' ' + products['brand_or_author'] + ' ' + products['main_category'] + ' ' + products['category_tags']

# Printing record at index 0
print(products['ensemble'].iloc[0])

## 3. Content-Based Filtering Recommender System

In [None]:
# Only get the first 30k records due to memory limitation
# when computing the cosine similarity matrix.
# drop_duplicates() left gaps in the index labels; reset them so that a row's
# label equals its position, which is how the similarity matrices are indexed.
products = products.head(30000).reset_index(drop=True)

### 3.1 Exploratory Data Analysis

In [None]:
# Top most frequent brands
plt.subplots(figsize=(10,7))
products.brand_or_author.value_counts()[:10].plot(kind="bar")
plt.show()

In [None]:
# Top 10 most frequent main categories
plt.subplots(figsize=(10,7))
products.main_category.value_counts()[:10].plot(kind="bar")
plt.show()
# print(products.main_cat.value_counts()[:10])

### 3.2 Using TfidfVectorizer

#### 3.2.1 Reusable Function Definitions

In [None]:
# Function that compute the cosine similarity matrix for an attribute
def compute_cosine_sim(attribute):
    """Return the pairwise similarity matrix for one text column of ``products``.

    The column ``products[attribute]`` is vectorised with TF-IDF (word
    analyzer, English stop words removed) and the resulting matrix is compared
    with itself using ``linear_kernel``.
    """
    vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
    weights = vectorizer.fit_transform(products[attribute])
    return linear_kernel(weights, weights)

# Function that get product recommendations based on the 
# cosine similarity score of a certain attribute passed in
def get_recommendations(product_name, n, cosine_sim):
    """Return the ``n`` products most similar to the product called ``product_name``.

    Args:
        product_name: Exact value to look up in ``products['product_name']``.
        n: Maximum number of recommendations to return.
        cosine_sim: Square similarity matrix; row/column ``i`` is assumed to
            describe the ``i``-th row of ``products`` (as built by
            ``compute_cosine_sim``).

    Returns:
        The matching rows of ``products``, ordered by decreasing similarity.

    Raises:
        KeyError: If no product has the given name.
    """
    # Look the product up by row POSITION, not index label: after
    # drop_duplicates() the labels can have gaps and then no longer line up
    # with the rows of the similarity matrix.
    positions = np.flatnonzero(products['product_name'].values == product_name)
    if positions.size == 0:
        raise KeyError(product_name)
    # Use the first product if there are multiple products with the same name
    idx = positions[0]

    # Pairwise similarity scores of all products with that product
    scores = np.asarray(cosine_sim[idx]).ravel()
    # Highest score first; the stable sort keeps row order among equal scores
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the entered product, then keep the N most similar ones
    ranked = ranked[ranked != idx][:n]
    return products.iloc[ranked]

# Function that get product name input from user
def getProductNameInput():
    """Prompt for a product name and return it lower-cased.

    A warning is printed when the name does not match any value in
    ``products['product_name']``; the typed name is returned either way.
    """
    name_input = input('Enter product name: ').lower()

    # Compare against the column values directly. The previous
    # products['product_name'][i] lookups were label-based and raised KeyError
    # as soon as the index had gaps (e.g. after drop_duplicates()).
    if not (products['product_name'] == name_input).any():
        print("Product not found, please enter again!")

    return name_input

#### 3.2.2 Recommending products based on "product_name"

In [None]:
# Get the cosine similarity matrix for product_name attribute
cosine_sim_product_name = compute_cosine_sim('product_name')

In [None]:
# Getting the product name input from user
name_input = getProductNameInput()

In [None]:
# Top 20 most similar products
get_recommendations(name_input, 20, cosine_sim_product_name)

#### 3.2.3 Recommending products based on "brand_or_author"

In [None]:
# Get the cosine similarity matrix for brand_or_author attribute
cosine_sim_brand_or_author = compute_cosine_sim('brand_or_author')

In [None]:
# Getting the product name input from user
name_input = getProductNameInput()

In [None]:
# Top 20 most similar products
get_recommendations(name_input, 20, cosine_sim_brand_or_author)

#### 3.2.4 Recommending products based on "main_category"

In [None]:
# Get the cosine similarity matrix for main_category attribute
cosine_sim_main_category = compute_cosine_sim('main_category')

In [None]:
# Getting the product name input from user
name_input = getProductNameInput()

In [None]:
# Top 20 most similar products
get_recommendations(name_input, 20, cosine_sim_main_category)

#### 3.2.5 Recommending products based on "category_tags"

In [None]:
# Get the cosine similarity matrix for category_tags attribute
cosine_sim_category_tags = compute_cosine_sim('category_tags')

In [None]:
# Getting the product name input from user
name_input = getProductNameInput()

In [None]:
# Top 20 most similar products
get_recommendations(name_input, 20, cosine_sim_category_tags)

#### 3.2.6 Recommending products based on a mixture of "product_name", "brand_or_author", "main_category" and "category_tags"

In [None]:
# Get the cosine similarity matrix for ensemble attribute
cosine_sim_ensemble = compute_cosine_sim('ensemble')

In [None]:
# Getting the product name input from user
name_input = getProductNameInput()

In [None]:
# Top 20 most similar products
get_recommendations(name_input, 20, cosine_sim_ensemble)

### 3.3 Using CountVectorizer

#### 3.3.1 Reusable Function Definitions

In [1]:
# Function that compute the cosine similarity matrix for an attribute
def get_cosine_sim_and_matrix_cv(attribute):
    """Build the count matrix and cosine similarity matrix for a text column.

    ``products[attribute]`` is vectorised with a ``CountVectorizer`` (word
    analyzer, English stop words removed).

    Returns:
        A two-element list ``[cosine_sim, cv_matrix]``: the pairwise cosine
        similarity of the count matrix with itself, and the count matrix.
    """
    column = products[attribute]
    counter = CountVectorizer(analyzer='word', stop_words='english')
    # fit() returns the vectorizer itself, so the two steps can be chained
    counts = counter.fit(column).transform(column)
    return [cosine_similarity(counts, counts), counts]
    
def get_recommendations_cv(product_name, n, cosine_sim, matrix):
    """Return up to ``n`` products similar to ``product_name`` with their scores.

    Args:
        product_name: Exact value to look up in ``products['product_name']``.
        n: Maximum number of recommendations to return.
        cosine_sim: Square similarity matrix; row/column ``i`` is assumed to
            describe the ``i``-th row of ``products`` (as built by
            ``get_cosine_sim_and_matrix_cv``).
        matrix: Count matrix returned alongside ``cosine_sim``. Kept for
            backward compatibility with existing callers; it is not needed to
            rank the results.

    Returns:
        A DataFrame with columns product_name, brand_or_author, main_category,
        category_tags and score, ordered by decreasing score. Only products
        with a similarity greater than 0 are included.

    Raises:
        KeyError: If no product has the given name.
    """
    # Look the product up by row POSITION, not index label: after
    # drop_duplicates() the labels can have gaps and then no longer line up
    # with the rows of the similarity matrix.
    positions = np.flatnonzero(products['product_name'].values == product_name)
    if positions.size == 0:
        raise KeyError(product_name)
    # Use the first product if there are multiple products with the same name
    name_idx = positions[0]

    all_scores = np.asarray(cosine_sim[name_idx]).ravel()
    # Highest score first; the stable sort keeps row order among equal scores
    ranked = np.argsort(-all_scores, kind='stable')
    # Exclude the entered product BEFORE truncating, so that at most n rows are
    # returned even when other products tie with it on similarity
    top_n_idx = ranked[ranked != name_idx][:n]
    # Keep only the candidates that share at least one feature
    top_n_idx = top_n_idx[all_scores[top_n_idx] > 0]
    scores = all_scores[top_n_idx]

    return pd.DataFrame(
        {
            "product_name": products['product_name'].iloc[top_n_idx].values,
            "brand_or_author": products['brand_or_author'].iloc[top_n_idx].values,
            "main_category": products['main_category'].iloc[top_n_idx].values,
            "category_tags": products['category_tags'].iloc[top_n_idx].values,
            "score": scores
        },
        columns = ["product_name", "brand_or_author", "main_category", "category_tags", "score"]
    )

# Function that get product name input from user
def getProductNameInput():
    """Prompt for a product name and return it lower-cased.

    A warning is printed when the name does not match any value in
    ``products['product_name']``; the typed name is returned either way.
    """
    name_input = input('Enter product name: ').lower()

    # Compare against the column values directly. The previous
    # products['product_name'][i] lookups were label-based and raised KeyError
    # as soon as the index had gaps (e.g. after drop_duplicates()).
    if not (products['product_name'] == name_input).any():
        print("Product not found, please enter again!")

    return name_input

#### 3.3.2 Recommending products based on "product_name"

In [None]:
# Get the cosine similarity matrix for product_name attribute
result = get_cosine_sim_and_matrix_cv('product_name')
cosine_sim_product_name_cv = result[0]
matrix_product_name_cv = result[1]

In [None]:
# Getting the product name input from user
name_input = getProductNameInput()

In [None]:
# Top 20 most similar products
get_recommendations_cv(name_input, 20, cosine_sim_product_name_cv, matrix_product_name_cv)

#### 3.3.3 Recommending products based on "brand_or_author"

In [None]:
# Get the cosine similarity matrix for brand_or_author attribute
result = get_cosine_sim_and_matrix_cv('brand_or_author')
cosine_sim_brand_or_author_cv = result[0]
matrix_brand_or_author_cv = result[1]

In [None]:
# Getting the product name input from user
name_input = getProductNameInput()

In [None]:
# Top 20 most similar products
get_recommendations_cv(name_input, 20, cosine_sim_brand_or_author_cv, matrix_brand_or_author_cv)

#### 3.3.4 Recommending products based on "main_category"

In [None]:
# Get the cosine similarity matrix for main_category attribute
result = get_cosine_sim_and_matrix_cv('main_category')
cosine_sim_main_category_cv = result[0]
matrix_main_category_cv = result[1]

In [None]:
# Getting the product name input from user
name_input = getProductNameInput()

In [None]:
# Top 20 most similar products
get_recommendations_cv(name_input, 20, cosine_sim_main_category_cv, matrix_main_category_cv)

#### 3.3.5 Recommending products based on "category_tags"

In [None]:
# Get the cosine similarity matrix for category_tags attribute
result = get_cosine_sim_and_matrix_cv('category_tags')
cosine_sim_category_tags_cv = result[0]
matrix_category_tags_cv = result[1]

In [None]:
# Getting the product name input from user
name_input = getProductNameInput()

In [None]:
# Top 20 most similar products
get_recommendations_cv(name_input, 20, cosine_sim_category_tags_cv, matrix_category_tags_cv)

#### 3.3.6 Recommending products based on a mixture of "product_name", "brand_or_author", "main_category" and "category_tags"

In [None]:
# Get the cosine similarity matrix for ensemble attribute
result = get_cosine_sim_and_matrix_cv('ensemble')
cosine_sim_ensemble_cv = result[0]
matrix_ensemble_cv = result[1]

In [None]:
# Getting the product name input from user
name_input = getProductNameInput()

In [None]:
# Top 20 most similar products
get_recommendations_cv(name_input, 20, cosine_sim_ensemble_cv, matrix_ensemble_cv)

# Collaborative Filtering Product Recommendation System
Done by Tan Cherng Ming

In [None]:
#Read the json file into a dataframe
# A forward-slash path works on Windows, Linux and macOS; the previous
# backslash form could only be resolved on Windows. read_json already returns
# a DataFrame, so no extra pd.DataFrame(...) wrapping is needed.
product_title = pd.read_json('datasets/subset_meta_Electronics.json', lines=True)
product_title

In [None]:
#Print column
product_title.columns

In [None]:
#Drop unneccesary column
product_title = product_title.drop(['category', 'tech1', 'description', 'fit', 'also_buy', 'tech2', 'brand', 'feature', 'rank', 
              'also_view', 'similar_item', 'date', 'price', 'imageURL','imageURLHighRes', 'details'], axis=1)
product_title

In [None]:
#Check for missing values
print('Number of missing values across columns:')
print(product_title.isnull().sum())

In [None]:
#Rename the column from default column name
product_title.rename(columns = {'asin':'productId'}, inplace = True)
product_title

In [None]:
#Read the ratings csv file (no header row, so column names are supplied) and show the first rows
# NOTE(review): the path below is an absolute, machine-specific Windows path, so this
# cell only runs where that exact folder exists. TODO confirm whether this file is the
# same data as dataset/user_ratings.csv and switch to a relative path.
user_product_ratings = pd.read_csv(r'C:\Users\Wilson Tan\Downloads\AI assignment\ratings_Electronics.csv', names=['userId', 'productId','Rating','timestamp'])
user_product_ratings.head()

In [None]:
#Shape of the data
user_product_ratings.shape

In [None]:
#Extract 100000 data from the dataset(7824482)
user_product_ratings = user_product_ratings.iloc[:100000,0:]

In [None]:
#Drop the unnecessary column
user_product_ratings = user_product_ratings.drop('timestamp', axis=1)

In [None]:
#Shape of the data
user_product_ratings.shape

In [None]:
user_product_ratings.info()

In [None]:
#Check the datatypes
user_product_ratings.dtypes

In [None]:
#Check for missing values
print('Number of missing values across columns:')
print(user_product_ratings.isnull().sum())

In [None]:
# Check with the ratings distribution
with sns.axes_style('white'):
    g = sns.catplot(x ="Rating", data = user_product_ratings, kind ='count')
    g.set_ylabels("Total number of ratings")

In [None]:
#Print the number of ratings, user and product
print("Total number of ratings  :",user_product_ratings.shape[0])
print("Total number of user     :", len(np.unique(user_product_ratings.userId)))
print("Total number of products :", len(np.unique(user_product_ratings.productId)))

In [None]:
#Merge two files based on productId then drop the duplication
product_ratings = pd.merge(user_product_ratings, product_title, on='productId').drop_duplicates()
product_ratings

In [None]:
#Check for missing values
print('Number of missing values across columns:')
print(product_ratings.isnull().sum())

In [None]:
#Show the product with the highest number of rating
total_rate_of_a_product = product_ratings.groupby(by='title')['Rating'].count().sort_values(ascending=False)
total_rate_of_a_product.head(10)

In [None]:
#Summarize data to userID and title with Pivot Table
user_product_matrix = pd.pivot_table(product_ratings, index='userId', columns='title', values ='Rating').fillna(0)
user_product_matrix

## Item-based Filtering

In [None]:
#One item is selected
users_ratings = user_product_matrix['Koss Porta Pro On Ear Headphones with Case, Black / Silver']
users_ratings.head(10)

In [None]:
#Calculate the correlation
similar_product = user_product_matrix.corrwith(users_ratings)
similar_product

In [None]:
# Create a dataframe
similar_product = pd.DataFrame(similar_product, columns = ['Correlation'])
similar_product.head(10)

In [None]:
#Sort the product with correlation descendingly
similar_product.sort_values(by = 'Correlation', ascending = False).head(10)

In [None]:
#Count number of rating for the title
df_rating = pd.DataFrame(product_ratings.groupby('title')['Rating'].count())

In [None]:
recommend_product = similar_product.join(df_rating['Rating']).sort_values(by = 'Correlation', ascending = False)
recommend_product

In [None]:
# Recommend top 20 product that has > 50 ratings
recommend_product = recommend_product[recommend_product['Rating'] > 50].sort_values(by = 'Correlation', ascending = False)
recommend_product.head(20)

In [None]:
#Extract 20 products and make the recommended items a list
# Remove the selected item by its title instead of blindly skipping row 0: after
# the "> 50 ratings" filter the selected item is not guaranteed to be the first row.
recommend_product = recommend_product.drop(index=users_ratings.name, errors='ignore').head(20)
# Use a dedicated name: assigning this list to `products` would overwrite the
# products dataframe that the content-based module's functions rely on.
recommended_titles = recommend_product.index.values.tolist()
recommended_titles

## User-based Filtering

In [None]:
#Transpose the pivot table
product_user_matrix = user_product_matrix.transpose()
product_user_matrix.head()

In [None]:
# One user is selected: A231WM2Z2JL0U3
user_title_ratings = product_user_matrix['A231WM2Z2JL0U3']
user_title_ratings.head(5)

In [None]:
#Calculate the correlation
similar_users = product_user_matrix.corrwith(user_title_ratings)

# Create a dataframe
similar_users = pd.DataFrame(similar_users, columns = ['Correlation'])
similar_users.head(10)

In [None]:
#Sort the users by correlation in descending order and keep the 20 most similar
# The selected user is removed by id rather than by skipping row 0, because
# another user with an equal correlation could sort ahead of them.
most_similar_users = (
    similar_users.drop(index=user_title_ratings.name, errors='ignore')
    .sort_values(by='Correlation', ascending=False)
    .head(20)
)
most_similar_users

In [None]:
#Extract the first most similar user 
user_list = most_similar_users.index.values.tolist()
user_list[0]

In [None]:
#Products that are rated by the most similar user
recommendation = product_ratings[product_ratings['userId'] == user_list[0]]
recommendation

In [None]:
#DataFrame slicing : product with the rating > 3.0
recommendation = product_ratings.loc[(product_ratings['userId'] == user_list[0]) & 
                                   (product_ratings['Rating'] > 3), 
                                   ['title', 'Rating']]
recommendation

In [None]:
recommendation = recommendation.set_index('title')
recommendation_list = recommendation.index.values.tolist()
print('List to recommend')
recommendation_list