# Recommender system for books: Modelisation

In [None]:
# Data manipulation
import pandas as pd 
import numpy as np 
import gc

# Graphical libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Modelisation libraries
from surprise import BaselineOnly, SVD, NMF, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import accuracy
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import Dataset
from surprise import Reader

from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
path = 'Datasets/'

In [None]:
# Importing data about users and books 
users = pd.read_parquet(path+"users_data_cleaning.parquet.gzip")
books = pd.read_parquet(path+"books_data_cleaning.parquet.gzip")

It exists different kinds of recommender engines. In this proof of concept, simple recommender, user based collaborative filtering, content-based recommender and hybrid methods will be testing to find the best recommender engines. 

## 1) Simple recommender

Simple recommender is the most basic recommender engines. It based on popularity. To calcule it, rating and number of reviews are used. The formula used in the following is the formula used by the IMDB website to calcule.  

*Source:* https://www.datacamp.com/community/tutorials/recommender-systems-python

In [None]:
# creating a new dataset for the calculus
simple_reco = books.copy()
simple_reco = simple_reco[["Id", "Name", "Authors",
                           "Rating", "CountsOfReview"]]
simple_reco.head()

Unnamed: 0,Id,Name,Authors,Rating,RatingDistTotal
0,4000063,The Flintstones in Viva Rock Vegas,Ellen Miles,3.82,11
1,4000100,Little Rhody,Neta Lohnes Frazier,4.33,9
2,4000228,Finance And Investments Using The Wall Street ...,Peter R. Crabb,0.0,0
3,4000366,Shorty McCabe Looks 'Em Over,Sewell Ford,0.0,0
4,4000441,Plant Pathology,George N. Agrios,4.52,84


In [None]:
# calculating the average rating
C = simple_reco['Rating'].mean()
print(C)

3.7320950652695215


In [None]:
# calculating and keeping
# books with the 90th percentile
m = simple_reco['CountsOfReview'].quantile(0.90)
print(m)

8029.0


In [None]:
# selecting only books with at least 41 reviews
q_books = simple_reco.copy().loc[simple_reco['CountsOfReview'] >= m]
q_books.shape

(11959, 5)

In [None]:
def weighted_rating(x, m=m, C=C):
    v = x['CountsOfReview']
    R = x['Rating']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [None]:
q_books['score'] = q_books.apply(weighted_rating, axis=1)

In [None]:
#Sort movies based on score calculated above
q_books = q_books.sort_values('score', ascending=False)

#Print the top 15 books
q_books[["Id",'Name', "Authors", 'CountsOfReview', 'Rating', 'score']].head(20)

Unnamed: 0,Id,Name,Authors,RatingDistTotal,Rating,score
81070,2132220,"Harry Potter Audio Collection (Harry Potter, #...",J.K. Rowling,247506,4.74,4.708331
18260,1668766,"Harry Potter Boxed Set (Harry Potter, #1-7)",J.K. Rowling,245916,4.74,4.708133
18259,1668764,The Complete Harry Potter Collection Box Set (...,J.K. Rowling,245916,4.74,4.708133
56556,988373,Complete Harry Potter Boxed Set,J.K. Rowling,244668,4.74,4.707976
44289,862041,"Harry Potter Series Box Set (Harry Potter, #1-7)",J.K. Rowling,244331,4.74,4.707933
7450,4744747,"Harry Potter Adult Boxed Set (Harry Potter, #1-7)",J.K. Rowling,253300,4.73,4.699341
82708,2181379,Harry Potter and the Deathly Hallows (Harry Po...,J.K. Rowling,2758202,4.62,4.617423
51691,912224,Harry Potter and the Deathly Hallows (Harry Po...,J.K. Rowling,2708152,4.62,4.617375
40413,818056,Harry Potter and the Deathly Hallows (Harry Po...,J.K. Rowling,2704487,4.62,4.617372
60053,3106176,Harry Potter and the Prisoner of Azkaban,J.K. Rowling,2774928,4.57,4.567583


In [None]:
print(np.sqrt(mean_squared_error(q_books['Rating'],
                                 q_books["score"])))

0.09449245640671797


In [None]:
del q_books, simple_reco
gc.collect()

162

## 2) User-based collaborative filtering

### a) Preparing the data for Surprise library

In [None]:
reader = Reader(rating_scale=(1, 5))
data_surprise = Dataset.load_from_df(users[['User_Id', 'Id', 'Rating']], reader)

In [None]:
trainset, testset = train_test_split(data_surprise, test_size=.2)

In [None]:
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
base_als = BaselineOnly(bsl_options=bsl_options)
predictions = base_als.fit(trainset).test(testset)

Estimating biases using als...


In [None]:
del predictions
gc.collect()

50

In [None]:
bsl_options = {'method': 'sgd',
               'learning_rate': .00005,
               }
base_sgd = BaselineOnly(bsl_options=bsl_options)
predictions = base_sgd.fit(trainset).test(testset)

Estimating biases using sgd...


In [None]:
del predictions
gc.collect()

0

### b) Matrix Factorization based algorithms

In [None]:
svd = SVD()
nmf = NMF()

In [None]:
model = [svd, nmf]
dict_model = {}

for model in model:
    y_pred = model.fit(trainset).test(testset)
    rmse = accuracy.rmse(y_pred)
    dict_model[model] = rmse.round(3)

print(dict_model)

RMSE: 0.7623
RMSE: 0.8669
{<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7f3539e38650>: 0.762, <surprise.prediction_algorithms.matrix_factorization.NMF object at 0x7f3539e38710>: 0.867}


In [None]:
param_grid = {'n_factors': [50, 75, 100, 125],
              'n_epochs': [5, 10, 15, 20, 25], 
              'lr_all': [0.001, 0.002, 0.005, 0.1],
              'reg_all': [0.1, 0.2, 0.4, 0.6],
              'random_state' : [42]
}

gs = GridSearchCV(SVD,
                  param_grid,
                  measures=['rmse', 'mae'],
                  cv=5)

gs.fit(data_surprise)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

0.7483085077382944
{'n_factors': 125, 'n_epochs': 25, 'lr_all': 0.1, 'reg_all': 0.1, 'random_state': 42}


In [None]:
svd_gs = gs.best_estimator['rmse']
predictions = svd_gs.fit(trainset).test(testset)
accuracy.rmse(predictions)

RMSE: 0.7488


0.7488222550781085

In [None]:
del predictions, gs, dict_model
gc.collect()

50

### c) K-NN based models

In [None]:
# Initializing similarities options
sim_options = {'name': 'cosine',
               'user_based': True  # compute  similarities between users
               }

In [None]:
knnbasic = KNNBasic()
knnmeans = KNNWithMeans()
knnzscore = KNNWithZScore()

In [None]:
model = [knnbasic, knnmeans, knnzscore]
dict_model = {}

for model in model:
    y_pred = model.fit(trainset).test(testset)
    rmse = accuracy.rmse(y_pred)
    dict_model[model] = rmse.round(3)

print(dict_model)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7675
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7682
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7616
{<surprise.prediction_algorithms.knns.KNNBasic object at 0x7f3523b7b350>: 0.768, <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x7f3523b7b590>: 0.768, <surprise.prediction_algorithms.knns.KNNWithZScore object at 0x7f3523b7b1d0>: 0.762}


In [None]:
param_grid = {'k': [20, 30, 40, 50, 60, 70, 80]}

gs = GridSearchCV(KNNWithZScore,
                  param_grid,
                  measures=['rmse', 'mae'],
                  cv=5)

gs.fit(data_surprise)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [None]:
knn_gs = gs.best_estimator['rmse']
predictions = knn_gs.fit(trainset).test(testset)
accuracy.rmse(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7441


0.7441271782314157

In [None]:
del gs,\
    dict_model,\
    predictions
gc.collect()

## 3) Content-based algorithms

https://medium.com/analytics-vidhya/content-based-recommender-systems-in-python-2b330e01eb80

In [None]:
desc = books.copy()
desc = desc[["Id", "Name", "Authors", "Description"]]

In [None]:
desc = desc.sample(frac=.3,
                   random_state=42)

In [None]:
desc.reset_index(inplace=True)
desc.drop("index", axis=1, inplace=True)

In [None]:
tfidf = TfidfVectorizer(stop_words='english')
desc['Description'] = desc['Description'].fillna("")

descr_matrix = tfidf.fit_transform(desc['Description'])
descr_matrix.shape

(35874, 105600)

In [None]:
similarity_matrix = linear_kernel(descr_matrix, descr_matrix)

In [None]:
mapping = pd.Series(desc.index,
          index = desc['Name'])
print(mapping)

Name
On the Other Side of Mount Ararat: A Story of a Vanished City        0
The Tall Uncut: Stories                                              1
Heart Essence of the Vast Expanse: A Story of Transmission           2
Don't Look a Ghost Horse in the Mouth                                3
Turpentine                                                           4
                                                                 ...  
Junie B. Jones Is Not a Crook (Junie B. Jones, #9)               35869
The Rabbits' Wedding                                             35870
Calling the Shots (Angels Unlimited, #4)                         35871
Crimson: Loyalty and Loss - Tome 1 (Crimson, #1)                 35872
The Letterboxer's Companion                                      35873
Length: 35874, dtype: int64


In [None]:
def recommend_books_based_on_linear_kernel(book_input):
    book_index = mapping[book_input]
    #get similarity values with other books
    #similarity_score is the list of index and similarity matrix
    similarity_score = list(enumerate(similarity_matrix[book_index]))
    #sort in descending order the similarity score of movie inputted with all the other books
    similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 20 most similar books. Ignore the first book.
    similarity_score = similarity_score[1:20]
    
    #return book names using the mapping series
    book_indices = [i[0] for i in similarity_score]
    return (desc['Name'].iloc[book_indices])

In [None]:
recommend_books_based_on_linear_kernel('Pet Sematary')

3230                                  Black Cat, Volume 11
15651    Glimpses of Maine's Angling Past (Images of Am...
18878                        Oh, the Thinks You Can Think!
29986                                   Flags of the World
14474                           Go to the Room of the Eyes
663      Boston and the American Revolution: Boston Nat...
9972                                                   Ash
10932               True Blue (Sweet Valley Jr. High, #18)
8047     African American Life in the Rural South, 1900...
18362                                            Bold Wolf
24630                                       Cats from Away
25142                                        Isis Unveiled
7174                                       White House Q&A
34346                                  Night Winds Calling
30806            Among Other Things, I've Taken Up Smoking
31803    Changing Family Values: Difference, Diversity ...
29656                   The Funeral Makers (Mattagash, #