# Import necessary libraries

In [236]:
import pandas as pd
import numpy as np
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import KNNWithMeans, accuracy, Prediction
from collections import defaultdict

# Upload the csv files into individual dataframes

In [184]:
user_data_1_df = pd.read_csv("phone_user_review_file_1.csv", encoding="latin-1")

In [185]:
user_data_2_df = pd.read_csv("phone_user_review_file_2.csv", encoding="latin-1")

In [186]:
user_data_3_df = pd.read_csv("phone_user_review_file_3.csv", encoding="latin-1")

In [187]:
user_data_4_df = pd.read_csv("phone_user_review_file_4.csv", encoding="latin-1")

In [188]:
user_data_5_df = pd.read_csv("phone_user_review_file_5.csv", encoding="latin-1")

In [189]:
user_data_6_df = pd.read_csv("phone_user_review_file_6.csv", encoding="latin-1")

# Merge all dataframes into one dataframe

In [190]:
user_data_cons_df = pd.concat([user_data_1_df, user_data_2_df, user_data_3_df, user_data_4_df, user_data_5_df, user_data_6_df], 
axis=0)

# Check for any null / missing values

In [191]:
user_data_cons_df.isnull().any()

phone_url    False
date         False
lang         False
country      False
source       False
domain       False
score         True
score_max     True
extract       True
author        True
product       True
dtype: bool

# Conclusion: Null values are in score, score_max, extract, author and product columns

# Replace null value in score column with mean value

In [192]:
user_data_cons_df["score"] = np.where(user_data_cons_df["score"].isnull(), user_data_cons_df["score"].mean(), 
                                      user_data_cons_df["score"])

# Round of scores to nearest integers

In [193]:
user_data_cons_df["score"] = user_data_cons_df["score"].apply(np.int64)

# Check for duplicate values and remove them if there are any

In [194]:
user_data_cons_df.shape

(1415133, 11)

In [195]:
user_data_cons_df = pd.DataFrame.drop_duplicates(user_data_cons_df)

In [196]:
user_data_cons_df.shape

(1408713, 11)

# 6420 duplicate rows removed

# Keep only 10,000 data samples (and not 10,00,000) as getting out of memory error below while using CF model with kNNMeans (user based or item based) from surprise package

In [216]:
user_data_cons_df = user_data_cons_df.head(10000)

In [217]:
user_data_cons_df.head()

Unnamed: 0,phone_url,date,lang,country,source,domain,score,score_max,extract,author,product
0,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Verizon Wireless,verizonwireless.com,10,10.0,As a diehard Samsung fan who has had every Sam...,CarolAnn35,Samsung Galaxy S8
1,/cellphones/samsung-galaxy-s8/,4/28/2017,en,us,Phone Arena,phonearena.com,10,10.0,Love the phone. the phone is sleek and smooth ...,james0923,Samsung Galaxy S8
2,/cellphones/samsung-galaxy-s8/,5/4/2017,en,us,Amazon,amazon.com,6,10.0,Adequate feel. Nice heft. Processor's still sl...,R. Craig,"Samsung Galaxy S8 (64GB) G950U 5.8"" 4G LTE Unl..."
3,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Samsung,samsung.com,9,10.0,Never disappointed. One of the reasons I've be...,Buster2020,Samsung Galaxy S8 64GB (AT&T)
4,/cellphones/samsung-galaxy-s8/,5/11/2017,en,us,Verizon Wireless,verizonwireless.com,4,10.0,I've now found that i'm in a group of people t...,S Ate Mine,Samsung Galaxy S8


# Keep features like Author, Product, and Score

In [218]:
user_data_rs_df = user_data_cons_df[["author", "product", "score"]]

In [219]:
user_data_rs_df.shape

(10000, 3)

In [220]:
user_data_rs_df.head()

Unnamed: 0,author,product,score
0,CarolAnn35,Samsung Galaxy S8,10
1,james0923,Samsung Galaxy S8,10
2,R. Craig,"Samsung Galaxy S8 (64GB) G950U 5.8"" 4G LTE Unl...",6
3,Buster2020,Samsung Galaxy S8 64GB (AT&T),9
4,S Ate Mine,Samsung Galaxy S8,4


# Identify the most rated features / product with most number of reviews

In [96]:
user_data_rs_df.groupby('product')['score'].count().sort_values(ascending=False).head()  

product
Samsung Galaxy S6 edge+ 32GB (T-Mobile)    1040
Samsung Galaxy S7 edge 32GB (Verizon)       855
Samsung Galaxy S7 edge 32GB (T-Mobile)      758
Samsung Galaxy S6 edge+ 32GB (Verizon)      693
Samsung Galaxy S7 edge 32GB (AT&T)          616
Name: score, dtype: int64

# Above is a list of top 5 most rated features / products

# Identify the users with most number of reviews

In [97]:
user_data_rs_df.groupby('author')['score'].count().sort_values(ascending=False).head()  

author
Anonymous          284
Amazon Customer    270
Client d'Amazon     57
Cliente Amazon      45
einer Kundin        19
Name: score, dtype: int64

# Above is a list of top 5 users with most number of reviews given

# Select the data with products having more than 50 ratings and users who have given more than 50 ratings. Report the shape of the final dataset

In [181]:
final_data = user_data_rs_df.groupby(['product','author'])['score'].count() > 50

In [182]:
final_data.shape

(9305,)

# Conclusion: There are 9305 rows in the dataset of 10,000 rows having products with more than 50 ratings and users who have given more than 50 ratings

# Build a popularity based model and recommend top 5 mobile phones

In [109]:
# Select those mobile phones having highest mean / average score and sort the resulting list in descending order. 
# Display the top 5 from this list

user_data_rs_df.groupby('product')['score'].mean().sort_values(ascending=False).head(5)  

product
Samsung Galaxy S6 EDGE PLUS 32GB Black (SM-G928W8) Unlocked                                                                                                                       10.0
Samsung Galaxy S8 smartphone (grÃ¥)                                                                                                                                               10.0
Samsung Galaxy S7 Edge Dual Sim Factory Unlocked Phone 32 GB - Internationally Sourced (Middle East/Africa/Asia) Version G935FD- Black Oynx                                       10.0
Samsung Galaxy S6 Edge Plus - TelÃ©fono mÃ³vil (4 GB de RAM, 64 GB de almacenamiento, WiFi, Bluetooth, 3G)                                                                        10.0
Samsung Galaxy S8 Plus - Smartphone libre Android (6.2", 4 GB RAM, 4G, 12 MP), color plata. [VersiÃ³n alemana: no incluye Samsung Pay ni acceso a promociones Samsung members]    10.0
Name: score, dtype: float64

# Build a collaborative filtering model using SVD

In [225]:
reader = Reader(rating_scale=(1,10))

In [226]:
score_data = Dataset.load_from_df(user_data_rs_df[['author','product','score']], reader)

In [227]:
trainset = score_data.build_full_trainset()

In [229]:
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x20416736580>

In [230]:
testset = trainset.build_anti_testset()

In [232]:
predictions = algo.test(testset)

In [234]:
# function to get top 10 score predictions

def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [237]:
# To get top 10 predictions of the score, call the above function 

top_n = get_top_n(predictions, n=10)

In [None]:
# Print the recommended items for each user

for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

# Build a collaborative filtering model using kNNWithMeans from surprise. You can try both user-based and item-based model

In [134]:
trainset, testset = train_test_split(score_data, test_size=.25, random_state=123)

In [135]:
# Using CF user-based model with kNNWithMeans

algo_user_based = KNNWithMeans(k=51, sim_options={'name': 'pearson' , 'user_based' : False})
algo_user_based.fit(trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x20416aaa700>

In [149]:
test_pred_user_based = algo_user_based.test(testset)

In [137]:
# Using CF item-based model with kNNWithMeans

algo_item_based = KNNWithMeans(k=51, sim_options={'name': 'pearson' , 'item_based' : False})
algo_item_based.fit(trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x20416aa4460>

In [148]:
test_pred_item_based = algo_item_based.test(testset)

test_pred_item_based = algo.test(testset)

# Evaluate the collaborative model. Print RMSE value

In [140]:
# RMSE for user-based CF model
accuracy.rmse(test_pred_user_based)

RMSE: 1.7888


1.7888482638111611

In [141]:
# RMSE for item-based CF model
accuracy.rmse(test_pred_item_based)

RMSE: 1.8218


1.8217900558527378

# Predict score (average rating) for test users

In [142]:
test_pred_user_based_df = pd.DataFrame(test_pred_user_based)

In [170]:
# Average / Estimated rating for test users under user-based CF model
test_pred_user_based_df.head(5)

Unnamed: 0,uid,iid,r_ui,est,details
0,WildYoungCharm,Samsung Galaxy S6 edge+ 32GB (T-Mobile),10.0,8.909733,"{'was_impossible': True, 'reason': 'User and/o..."
1,boinero,Samsung Galaxy S8,10.0,8.909733,"{'was_impossible': True, 'reason': 'User and/o..."
2,dog,Samsung Galaxy S7 Edge G935P 32GB Gold- Sprint...,10.0,8.909733,"{'was_impossible': True, 'reason': 'User and/o..."
3,Memee77,Samsung Galaxy S6 edge+ 32GB (T-Mobile),10.0,8.909733,"{'was_impossible': True, 'reason': 'User and/o..."
4,Squeakla29,Samsung Galaxy S6 edge+ 32GB (Sprint),10.0,8.909733,"{'was_impossible': True, 'reason': 'User and/o..."


In [144]:
test_pred_item_based_df = pd.DataFrame(test_pred_item_based)

In [160]:
# Average / Estimated rating for test users under item-based CF model
test_pred_item_based_df.head(5)

Unnamed: 0,uid,iid,r_ui,est,details
0,WildYoungCharm,Samsung Galaxy S6 edge+ 32GB (T-Mobile),10.0,8.909733,"{'was_impossible': True, 'reason': 'User and/o..."
1,boinero,Samsung Galaxy S8,10.0,8.909733,"{'was_impossible': True, 'reason': 'User and/o..."
2,dog,Samsung Galaxy S7 Edge G935P 32GB Gold- Sprint...,10.0,8.909733,"{'was_impossible': True, 'reason': 'User and/o..."
3,Memee77,Samsung Galaxy S6 edge+ 32GB (T-Mobile),10.0,8.909733,"{'was_impossible': True, 'reason': 'User and/o..."
4,Squeakla29,Samsung Galaxy S6 edge+ 32GB (Sprint),10.0,8.909733,"{'was_impossible': True, 'reason': 'User and/o..."


# Report your findings and inferences

# There is no significant difference in the RMSE accuracy score between user-based and item-based CF models. As a result the average ratings for the test users is same across these two models, as shown in the two above tables having average / estimated ratings

# Try and recommend top 5 products for test users

In [168]:
top_5_prod_recos = test_pred_user_based_df.groupby('uid').head(5).reset_index(drop=True)

In [169]:
top_5_prod_recos

Unnamed: 0,uid,iid,r_ui,est,details
0,WildYoungCharm,Samsung Galaxy S6 edge+ 32GB (T-Mobile),10.0,8.909733,"{'was_impossible': True, 'reason': 'User and/o..."
1,boinero,Samsung Galaxy S8,10.0,8.909733,"{'was_impossible': True, 'reason': 'User and/o..."
2,dog,Samsung Galaxy S7 Edge G935P 32GB Gold- Sprint...,10.0,8.909733,"{'was_impossible': True, 'reason': 'User and/o..."
3,Memee77,Samsung Galaxy S6 edge+ 32GB (T-Mobile),10.0,8.909733,"{'was_impossible': True, 'reason': 'User and/o..."
4,Squeakla29,Samsung Galaxy S6 edge+ 32GB (Sprint),10.0,8.909733,"{'was_impossible': True, 'reason': 'User and/o..."
...,...,...,...,...,...
2363,Ana paula,Sim Free Samsung Galaxy S7 Edge Mobile Phone -...,10.0,8.909733,"{'was_impossible': True, 'reason': 'User and/o..."
2364,carlitosways,Samsung Galaxy S8,10.0,8.909733,"{'was_impossible': True, 'reason': 'User and/o..."
2365,Dadoo50002000,Samsung Galaxy S7 edge,8.0,8.909733,"{'was_impossible': True, 'reason': 'User and/o..."
2366,Beels,Sim Free Samsung Galaxy S7 Edge Mobile Phone -...,10.0,8.909733,"{'was_impossible': True, 'reason': 'User and/o..."


# In what business scenario you should use popularity based Recommendation Systems

# Popularity based recommendation systems work by recommending items viewed/purchased by most people and rated high. They are used when context is available / relevant (product / item features) and recommendations cannot be personalised (user data is not available). They are used in the B2C (Business to Customer) domains like recommending most popular books, movies, jobs, citations / research papers, courses (online / offline),  advertisements, restaurants, music tracks, etc. These recommendation systems are highly scalable.

# In what business scenario you should use CF based Recommendation Systems?

# Collaborative Filtering (CF) based Recommendation Systems are entirely based on the user’s past behaviour and not on the context. The basic assumption here is that customers who had similar tastes in the past, will have similar tastes in the future. They can be either User-User based - Recommend items to the user that are similar to the the users that have bought the same item; or Item-Item based - Recommend items to the user that are similar to the items the user has bought. They are used in the B2C (Business to Customer) domains like recommending books, movies, jobs, citations / research papers, courses (online / offline),  advertisements, restaurants, music tracks, etc. These recommendation systems are not highly scalable and suffer from cold start problem (require prior user data to be available)