In [1]:
import pandas as pd 
import numpy as np 
import gc

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error

In [2]:
# Read the two cleaned parquet tables; the reads are independent of each other.
books = pd.read_parquet("Datasets/books_data_cleaning.parquet.gzip")
users = pd.read_parquet("Datasets/users_data_cleaning.parquet.gzip")
# Preview the first rows of the books table.
books.head()

Unnamed: 0,Id,Name,Authors,ISBN,Rating,PublishYear,Publisher,CountsOfReview,NumberOfPages,Description
0,4000063,The Flintstones in Viva Rock Vegas,Ellen Miles,439173035,3.82,2000,Scholastic Inc.,1,136.0,spring mark release new flintstone film flints...
1,4000100,Little Rhody,Neta Lohnes Frazier,679250808,4.33,2000,David McKay Company,1,152.0,spunky ten year old girl move family farm invo...
2,4000228,Finance And Investments Using The Wall Street ...,Peter R. Crabb,72829362,0.0,2002,Irwin/McGraw-Hill,0,402.0,peter crabbs wall street journal workbook inve...
3,4000366,Shorty McCabe Looks 'Em Over,Sewell Ford,836935357,0.0,1970,Books for Libraries,0,344.0,exact reproduction book publish not ocrd book ...
4,4000441,Plant Pathology,George N. Agrios,120445646,4.52,1997,Academic Press,0,635.0,classic textbook plant disease recognize treat...


## 1) User-based collaborative filtering

### 1) Simple recommender

Reference tutorial: https://www.datacamp.com/community/tutorials/recommender-systems-python

In [3]:
# Keep only the columns the simple recommender uses. Selecting the columns
# first and copying afterwards avoids duplicating the whole `books` table
# just to discard most of it, and still leaves `simple_reco` independent
# of `books`.
simple_reco = books[["Id", "Name", "Authors",
                     "Rating", "CountsOfReview"]].copy()
simple_reco.head()

Unnamed: 0,Id,Name,Authors,ISBN,Rating,CountsOfReview
0,4000063,The Flintstones in Viva Rock Vegas,Ellen Miles,439173035,3.82,1
1,4000100,Little Rhody,Neta Lohnes Frazier,679250808,4.33,1
2,4000228,Finance And Investments Using The Wall Street ...,Peter R. Crabb,72829362,0.0,0
3,4000366,Shorty McCabe Looks 'Em Over,Sewell Ford,836935357,0.0,0
4,4000441,Plant Pathology,George N. Agrios,120445646,4.52,0


In [4]:
# C: mean of the 'Rating' column over all rows of simple_reco.
# It is the reference rating used by weighted_rating().
C = simple_reco.loc[:, "Rating"].mean()
print(C)

3.7320950652695215


In [5]:
# m: 90th percentile of 'CountsOfReview'. It is used below both as the
# cut-off for selecting books and as the weight given to C in weighted_rating().
m = simple_reco.loc[:, "CountsOfReview"].quantile(q=0.90)
print(m)

41.0


In [6]:
# Books with at least `m` reviews. Filter first, then copy: this avoids
# duplicating the full table before discarding most rows, and makes q_books
# an explicitly independent frame that is safe to add columns to.
q_books = simple_reco.loc[simple_reco["CountsOfReview"] >= m].copy()
q_books.shape

(12116, 6)

In [7]:
def weighted_rating(x, m=m, C=C):
    """Return the IMDB-style weighted rating for ``x``.

    The result blends ``x['Rating']`` with the reference rating ``C``.
    With ``v = x['CountsOfReview']``, the rating gets weight ``v / (v + m)``
    and ``C`` gets weight ``m / (m + v)``.

    Parameters
    ----------
    x : object indexable by ``'CountsOfReview'`` and ``'Rating'``
        Only arithmetic is performed on the two looked-up values.
    m : weight given to ``C``.
        The default is whatever the notebook-level ``m`` was when this
        ``def`` ran; re-run this cell after changing ``m``.
    C : reference rating.
        The default is bound the same way as ``m``.

    Returns
    -------
    The weighted rating, ``v/(v+m) * R + m/(m+v) * C``.
    """
    review_count = x['CountsOfReview']
    own_rating = x['Rating']
    # Calculation based on the IMDB formula
    own_weight = review_count / (review_count + m)
    reference_weight = m / (m + review_count)
    return (own_weight * own_rating) + (reference_weight * C)

In [8]:
# weighted_rating() only does element-wise arithmetic on the 'CountsOfReview'
# and 'Rating' values, so it can be evaluated on the whole frame at once
# instead of row by row through DataFrame.apply(axis=1).
q_books['score'] = weighted_rating(q_books)

In [9]:
# Order the books by the weighted score computed above, highest first.
q_books = q_books.sort_values(by="score", ascending=False)

# Display the first 20 rows of the ranking with the columns of interest.
q_books.loc[:, ["Id", "Name", "Authors", "CountsOfReview", "Rating", "score"]].head(20)

Unnamed: 0,Id,Name,Authors,CountsOfReview,Rating,score
44289,862041,"Harry Potter Series Box Set (Harry Potter, #1-7)",J.K. Rowling,6522,4.74,4.733703
40413,818056,Harry Potter and the Deathly Hallows (Harry Po...,J.K. Rowling,952,4.62,4.583339
115327,1215032,"The Wise Man's Fear (The Kingkiller Chronicle,...",Patrick Rothfuss,16523,4.57,4.567926
61446,3165162,Percy Jackson and the Olympians (Percy Jackson...,Rick Riordan,546,4.59,4.530078
72565,1025685,"The Absolute Sandman, Volume Two",Neil Gaiman,198,4.69,4.525673
91715,2495562,The Wise Man's Fear (The Kingkiller Chronicle...,Patrick Rothfuss,488,4.56,4.495833
82953,2186848,"The Absolute Sandman, Volume Three",Neil Gaiman,140,4.71,4.488486
98149,2767793,"The Hero of Ages (Mistborn, #3)",Brandon Sanderson,10101,4.49,4.486936
30441,1179967,"A Storm of Swords (A Song of Ice and Fire, #3)",George R.R. Martin,583,4.54,4.486917
66035,3362870,"The Hero of Ages (Mistborn, #3)",Brandon Sanderson,1289,4.49,4.466636


In [None]:
# Root-mean-squared difference between the 'Rating' and 'score' columns.
print(
    np.sqrt(
        mean_squared_error(y_true=q_books["Rating"], y_pred=q_books["score"])
    )
)