# Simple Recommender System - Product Recommender

## 1. Importing Libraries

In [None]:
# Import library to be used in the project
import pandas as pd
import html
import warnings
warnings.filterwarnings('ignore')

## 2. File Reading and Features Engineering: products

In [None]:
# Read the electronic products file
products_dataset_path = '../dataset/electronic_products.json'
products = pd.read_json(products_dataset_path, lines=True)

# Output the first 10 rows
products.head(10)

In [None]:
print(products.shape)
# output: (rows, columns)

In [None]:
# retrieving column name
products.columns

### 2.1 General Cleansing

#### 2.1.1 Drop Unnecessary columns

In [None]:
# keep: asin, title, brand, main_cat, and price (some columns are for filtering usage)
products = products.drop(columns=['category', 'tech1', 'description', 'fit', 'also_buy', 'tech2',
       'feature', 'rank', 'also_view', 'similar_item', 'date', 'imageURL', 'imageURLHighRes', 'details'], errors='ignore')

products.columns

In [None]:
# Keep only the columns the recommender needs and give them descriptive names
# ('price' already has its final name, so it is not part of the mapping)
products = products[['asin', 'title', 'brand', 'price', 'main_cat']].rename(columns={
    'asin': 'product_id',
    'title': 'product_name',
    'brand': 'brand_or_author',
    'main_cat': 'main_category',
})

In [None]:
products.head(10)

#### 2.1.2 Remove Duplicates

In [None]:
products.shape

In [None]:
products.duplicated().sum()

In [None]:
products = products.drop_duplicates()

products.shape

#### 2.1.3 Format Strings

In [None]:
# Defining text cleaning function

def text_cleaning(text):
    """Decode HTML entities in ``text`` and return the result in lower case.

    For example ``&amp;`` becomes ``&``, ``&quot;`` becomes ``"`` and
    ``&reg;`` becomes ``®``.
    """
    unescaped = html.unescape(text)
    return unescaped.lower()

text = "&amp; &quot &reg;"
cleaned_text = text_cleaning(text)
print(cleaned_text)

In [None]:
# Text columns that need HTML-entity decoding and lower-casing
cols_to_clean = ['product_name', 'brand_or_author', 'price', 'main_category']

# Run text_cleaning over every value of each listed column
for col in cols_to_clean:
    products[col] = products[col].map(text_cleaning)

In [None]:
products

### 2.2 Cleaning 'main_category' column

In [None]:
main_category_df = products.groupby('main_category').size().reset_index(name='count')
main_category_df

In [None]:
# The first four 'main_category' values are image HTML elements that actually belong to 'amazon fashion'
for i in (main_category_df.loc[0:3, 'main_category']):
    print(i)

# Count every product whose main_category mentions 'amazon fashion'
# (169 products in the original run)
total_af = {'main_category': 'Total', 'count': len(products.loc[products['main_category'].str.contains('amazon fashion')])}

# DataFrame.append was removed in pandas 2.0, so the 'Total' row is added with pd.concat
amazon_fashion_rows = main_category_df.loc[main_category_df['main_category'].str.contains('amazon fashion')]
pd.concat([amazon_fashion_rows, pd.DataFrame([total_af])], ignore_index=True)

In [None]:
# assign the four image HTML elements to 'amazon fashion' in main_category_df data frame
main_category_df.loc[main_category_df['main_category'].str.contains('amazon fashion'), 'main_category'] = 'amazon fashion'

main_category_df.groupby('main_category').sum()

In [None]:
# assign the four image HTML elements to 'amazon fashion' in products data frame
products.loc[products['main_category'].str.contains('amazon fashion'), 'main_category'] = 'amazon fashion'

products.groupby('main_category').size().reset_index(name='count')

### 2.3 Cleaning 'brand_or_author' column

In [None]:
# Brand values of the form "visit amazon's <name> page" actually hold the author's name.
# Select the rows whose brand_or_author matches that pattern, keeping only the
# brand_or_author and main_category columns for inspection.
pattern = "^visit amazon's (.*) page$"
mask = products['brand_or_author'].str.contains(pattern)
selected_columns = ['brand_or_author', 'main_category']
visit_amazon_pattern = products.loc[mask, selected_columns]

visit_amazon_pattern

In [None]:
visit_amazon_pattern.groupby('main_category').size().reset_index(name='count')

In [None]:
# Clean the brand or author column
def clean_author_string(author_string):
    """Return the author name from a "visit amazon's <name> page" string.

    Only values that both start with "visit amazon's " and end with " page"
    are rewritten. Any other value is returned unchanged, so a brand that
    merely contains " page" somewhere (e.g. "front page media") is not
    corrupted.
    """
    prefix = "visit amazon's "
    suffix = " page"
    is_wrapped = (
        # The length check stops the prefix and suffix from sharing one space
        len(author_string) >= len(prefix) + len(suffix)
        and author_string.startswith(prefix)
        and author_string.endswith(suffix)
    )
    if not is_wrapped:
        return author_string
    return author_string[len(prefix):-len(suffix)]

products['brand_or_author'] = products['brand_or_author'].apply(clean_author_string)

In [None]:
# result
products.head(10)

### 2.4 Cleaning 'price' column

In [None]:
# Rows whose price does not start with a dollar sign.
# Raw string so that \$ reaches the regex engine as an escaped literal '$'
# instead of being an invalid string escape sequence.
dirty_price_df = products[~products['price'].str.contains(r'^\$')]

dirty_price_type_df = dirty_price_df.groupby('price').size().reset_index(name='count')
dirty_price_type_df

In [None]:
for i in (dirty_price_type_df.loc[1:6, 'price']):
    print(i)

In [None]:
# The dirty price values carry no price information (they are all css or html code),
# so every value that does not start with '$' is set to 0.0
def clean_price_string(price_string):
    """Convert a price string such as "$1,299.99" to a float.

    Returns 0.0 when the string does not start with '$' or when the remaining
    text is not a single number (for example a range like "$10.00 - $20.00").
    """
    if not price_string.startswith('$'):
        return 0.0

    # Remove the currency symbol and the thousands separators before parsing,
    # otherwise prices of $1,000 and above fail float() and collapse to 0.0
    numeric_text = price_string.replace('$', '').replace(',', '')
    try:
        return float(numeric_text)
    except ValueError:
        return 0.0

products['price'] = products['price'].apply(clean_price_string)

In [None]:
# result
products.head(10)

## 3. File Reading and Features Engineering: ratings

In [None]:
# Read the ratings file
ratings_dataset_path = '../dataset/user_ratings.csv'

ratings = pd.read_csv(ratings_dataset_path, names=['user_id', 'product_id','rating','timestamp'], index_col=False)

In [None]:
# Output the first 10 rows
ratings.head(10)

In [None]:
ratings.columns

### 3.1 Features Engineering

#### 3.1.1 Check Duplicates ( no duplicated rows )

In [None]:
ratings.shape

In [None]:
# must check duplicates first before dropping the columns
# because all the four columns, especially the timestamp, are determining the duplication

ratings.duplicated().sum()

# no need to drop duplicates
# no need: ratings = ratings.drop_duplicates()

#### 3.1.2 Drop Unnecessary Columns

In [None]:
# keep: product_id and rating (the only two columns used by the aggregation below)
# drop: user_id and timestamp
ratings = ratings.drop(columns=['user_id', 'timestamp'], errors='ignore')

ratings.columns

In [None]:
ratings.head(10)

#### 3.1.3 Create  'rating_average' and 'rating_count' from 'rating'

In [None]:
# Per product_id: mean of 'rating' -> rating_average, number of ratings -> rating_count.
# reset_index() turns product_id back into a regular column so rows are labelled 0, 1, 2, ...
ratings = (
    ratings.groupby('product_id')
    .agg(rating_average=('rating', 'mean'), rating_count=('rating', 'count'))
    .reset_index()
)

In [None]:
ratings.head(10)

In [None]:
ratings.shape

In [None]:
# Top 10 products with the highest rating_count first then highest rating_average
ratings.sort_values(by=['rating_count','rating_average'], ascending=[False, False]).head(10)

## 4. Merge 'products' and 'ratings' into 'products_merge'

In [None]:
# Merge the products and ratings dataframes (keep all the products records)
products_merge = pd.merge(products, ratings, on='product_id', how='left')

# Output the first 10 rows
products_merge.head(10)

In [None]:
products_merge.shape

### 4.1 Replace NaN

In [None]:
products_merge.isnull().sum()

In [None]:
products_merge[products_merge['main_category'] == 'all electronics'].count()

In [None]:
products_merge[products_merge['main_category'] == 'all electronics'].isnull().sum()

In [None]:
products_merge.fillna({'rating_average': 0.0, 'rating_count': 0}, inplace=True)

In [None]:
products_merge.isnull().sum()

## 5. Simple Recommender System

### 5.1 Simple Rating Sort

▪ Sorting 'products_merge' by multiple columns.

▪ Issue: A product with a very high **rating_count** can still have a low **rating_average**.

In [None]:
# Top 20 products with the highest rating_count first then highest rating_average
products_merge.sort_values(by=['rating_count','rating_average'], ascending=[False, False]).head(20)

### 5.2 Weighted Rating

▪ A *weighted rating* that takes into account the **rating_average** and the **rating_count** it has accumulated.

▪ We can calculate the Weighted Rating Score into a new 'score' column.

▪ The formula of weighted rating is as follows:

<img src="weighted_rating.png" width="600">

\>>> **v** is the number of ratings for the product (represented by **rating_count**)

\>>> **m** is the **minimum rating count** required to be listed in the chart (to be calculated)

\>>> **R** is the average rating of the product (represented by **rating_average**)

\>>> **C** is the **mean of rating average** across the whole dataframe (to be calculated)

In [None]:
# describe() - summarises the dataframe as a whole
# C can be read from the describe() output.

products_merge.describe()

# From the output:
# get C: the mean of the product **rating_average** column

In [None]:
# C, the mean of rating average across the whole dataframe
C = products_merge['rating_average'].mean()
C

In [None]:
# m, minimum rating count required to be listed in the chart

# use the 90th percentile of rating_count as the cut-off.

# i.e. a listed product needs a rating count at or above that of 90% of the products.

m = products_merge['rating_count'].quantile(0.90) 
m

In [None]:
# Qualified products: rating_count at or above the threshold m.
# Filter first and copy afterwards: only the qualifying rows are duplicated, and
# q_products is an explicit independent frame, so adding the 'score' column later
# neither touches products_merge nor triggers a chained-assignment warning.
q_products = products_merge.loc[products_merge['rating_count'] >= m].copy()

q_products

In [None]:
# Function that computes the weighted rating of each product
def weighted_rating(x, m = m, C = C):
    """Return the weighted rating score for one product row.

    x -- row providing 'rating_count' (v) and 'rating_average' (R)
    m -- minimum rating count; defaults to the value of m when this cell ran
    C -- mean rating average; defaults to the value of C when this cell ran

    Score = v / (v + m) * R + m / (v + m) * C  (IMDB weighted rating formula)
    """
    votes = x['rating_count']
    average = x['rating_average']
    total = votes + m

    product_part = votes / total * average
    prior_part = m / total * C
    return product_part + prior_part

In [None]:
q_products['score'] = q_products.apply(weighted_rating, axis=1)

q_products.head(20)

### 5.3 Result using Weighted Rating

#### 5.3.1 Recommending Top 20 Products

▪ Sort q_products in descending order based on the score feature column.

▪ Output the product_name, main_category, brand_or_author, price, rating_average, rating_count, and weighted rating (score) of the top 20 products.

In [None]:
# Columns shown in every recommendation table
columns = ['product_id', 'product_name', 'main_category','brand_or_author', 'price',  'rating_average', 'rating_count', 'score']

# Rank by 'score' (highest first), keep the best 20 and show only the listed columns
top_20_proucts = q_products.sort_values('score', ascending = False).head(20).reset_index()[columns]

# Number the rows from 1 so the index reads as the rank
top_20_proucts.index = top_20_proucts.index + 1

top_20_proucts

#### 5.3.2 Recommending Top 20 Products According to product_name

In [None]:
# E.g. speaker
product_name = input("Enter the product name : ")

In [None]:
# Literal (non-regex) substring match, so input such as "c++" or "(" is searched
# for as typed instead of being interpreted as a regular expression
top_20_product_name = q_products[q_products['product_name'].str.contains(product_name.lower(), regex=False)]

# Rank by score; selecting [columns] after reset_index() drops the old row labels
top_20_product_name = top_20_product_name.sort_values('score', ascending = False).reset_index()[columns]
top_20_product_name.index = top_20_product_name.index + 1

top_20_product_name.head(20)

#### 5.3.3 Recommending Top 20 Products According to main_category

In [None]:
# e.g camera
main_category = input("Enter the main category : ")

In [None]:
# Literal (non-regex) substring match, so special characters in the input are searched for as typed
top_20_main_category = q_products[q_products['main_category'].str.contains(main_category.lower(), regex=False)]

# drop=True discards the old row labels instead of adding them as a stray 'index' column
top_20_main_category = top_20_main_category[columns].sort_values('score', ascending = False).reset_index(drop=True)
# Number the rows from 1 so the index reads as the rank
top_20_main_category.index = top_20_main_category.index + 1

top_20_main_category.head(20)

#### 5.3.4 Recommending Top 20 Products According to brand_or_author

In [None]:
# e.g. microsoft
brand_or_author = input("Enter the brand or author : ")

In [None]:
# Literal (non-regex) substring match, so special characters in the input are searched for as typed
top_20_brand_or_author = q_products[q_products['brand_or_author'].str.contains(brand_or_author.lower(), regex=False)]

# drop=True discards the old row labels instead of adding them as a stray 'index' column
top_20_brand_or_author = top_20_brand_or_author[columns].sort_values('score', ascending = False).reset_index(drop=True)
# Number the rows from 1 so the index reads as the rank
top_20_brand_or_author.index = top_20_brand_or_author.index + 1

top_20_brand_or_author.head(20)

#### 5.3.5 Recommending Top 20 Products According to price range

In [None]:
# e.g. 10, 100
# float() accepts whole numbers as well as decimal prices such as 19.99,
# which int() would reject with a ValueError
min_price = float(input("Enter the minimum price : "))
max_price = float(input("Enter the maximum price : "))

In [None]:
# Keep the qualified products whose price lies in [min_price, max_price] (both ends inclusive)
top_20_within_price_range = q_products[(q_products['price'] >= min_price) & (q_products['price'] <= max_price)]

# drop=True discards the old row labels instead of adding them as a stray 'index' column
top_20_within_price_range = top_20_within_price_range[columns].sort_values('score', ascending = False).reset_index(drop=True)
# Number the rows from 1 so the index reads as the rank
top_20_within_price_range.index = top_20_within_price_range.index + 1

top_20_within_price_range.head(20)