# Recommender system for books: Modelisation

In [1]:
# Data manipulation
import pandas as pd 
import numpy as np 
import gc

# Graphical libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Modelisation libraries
from surprise import BaselineOnly, SVD, NMF, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import accuracy
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import Dataset
from surprise import Reader

from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Importing data about users and books 
users = pd.read_parquet("Datasets/users_data_cleaning.parquet.gzip")
books = pd.read_parquet("Datasets/books_data_cleaning.parquet.gzip")

It exists different kinds of recommender engines. In this proof of concept, simple recommender, user based collaborative filtering, content-based recommender and hybrid methods will be testing to find the best recommender engines. 

## 1) Simple recommender

Simple recommender is the most basic recommender engines. It based on popularity. To calcule it, rating and number of reviews are used. The formula used in the following is the formula used by the IMDB website to calcule.  

*Source:* https://www.datacamp.com/community/tutorials/recommender-systems-python

In [3]:
# creating a new dataset for the calculus
simple_reco = books.copy()
simple_reco = simple_reco[["Id", "Name", "Authors",
                           "Rating", "CountsOfReview"]]
simple_reco.head()

Unnamed: 0,Id,Name,Authors,Rating,CountsOfReview
0,4000063,The Flintstones in Viva Rock Vegas,Ellen Miles,3.82,1
1,4000100,Little Rhody,Neta Lohnes Frazier,4.33,1
2,4000228,Finance And Investments Using The Wall Street ...,Peter R. Crabb,0.0,0
3,4000366,Shorty McCabe Looks 'Em Over,Sewell Ford,0.0,0
4,4000441,Plant Pathology,George N. Agrios,4.52,0


In [4]:
# calculating the average rating
C = simple_reco['Rating'].mean()
print(C)

3.7320950652695215


In [5]:
# calculating and keeping
# books with the 90th percentile
m = simple_reco['CountsOfReview'].quantile(0.90)
print(m)

41.0


In [6]:
# selecting only books with at least 41 reviews
q_books = simple_reco.copy().loc[simple_reco['CountsOfReview'] >= m]
q_books.shape

(12116, 5)

In [7]:
def weighted_rating(x, m=m, C=C):
    v = x['CountsOfReview']
    R = x['Rating']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [8]:
q_books['score'] = q_books.apply(weighted_rating, axis=1)

In [9]:
#Sort movies based on score calculated above
q_books = q_books.sort_values('score', ascending=False)

#Print the top 15 books
q_books[["Id",'Name', "Authors", 'CountsOfReview', 'Rating', 'score']].head(20)

Unnamed: 0,Id,Name,Authors,CountsOfReview,Rating,score
44289,862041,"Harry Potter Series Box Set (Harry Potter, #1-7)",J.K. Rowling,6522,4.74,4.733703
40413,818056,Harry Potter and the Deathly Hallows (Harry Po...,J.K. Rowling,952,4.62,4.583339
115327,1215032,"The Wise Man's Fear (The Kingkiller Chronicle,...",Patrick Rothfuss,16523,4.57,4.567926
61446,3165162,Percy Jackson and the Olympians (Percy Jackson...,Rick Riordan,546,4.59,4.530078
72565,1025685,"The Absolute Sandman, Volume Two",Neil Gaiman,198,4.69,4.525673
91715,2495562,The Wise Man's Fear (The Kingkiller Chronicle...,Patrick Rothfuss,488,4.56,4.495833
82953,2186848,"The Absolute Sandman, Volume Three",Neil Gaiman,140,4.71,4.488486
98149,2767793,"The Hero of Ages (Mistborn, #3)",Brandon Sanderson,10101,4.49,4.486936
30441,1179967,"A Storm of Swords (A Song of Ice and Fire, #3)",George R.R. Martin,583,4.54,4.486917
66035,3362870,"The Hero of Ages (Mistborn, #3)",Brandon Sanderson,1289,4.49,4.466636


In [10]:
print(np.sqrt(mean_squared_error(q_books['Rating'],
                                 q_books["score"])))

0.10848119654754877


In [11]:
del q_books, simple_reco
gc.collect()

0

## 2) User-based collaborative filtering

### a) Preparing the data for Surprise library

In [12]:
reader = Reader(rating_scale=(1, 5))
data_surprise = Dataset.load_from_df(users[['User_Id', 'Id', 'Rating']], reader)

In [13]:
trainset, testset = train_test_split(data_surprise, test_size=.2)

In [14]:
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
base_als = BaselineOnly(bsl_options=bsl_options)
predictions = base_als.fit(trainset).test(testset)

Estimating biases using als...


In [15]:
del predictions
gc.collect()

0

In [16]:
bsl_options = {'method': 'sgd',
               'learning_rate': .00005,
               }
base_sgd = BaselineOnly(bsl_options=bsl_options)
predictions = base_sgd.fit(trainset).test(testset)

Estimating biases using sgd...


In [17]:
del predictions
gc.collect()

0

### b) Matrix Factorization based algorithms

In [18]:
svd = SVD()
nmf = NMF()

In [19]:
model = [svd, nmf]
dict_model = {}

for model in model:
    y_pred = model.fit(trainset).test(testset)
    rmse = accuracy.rmse(y_pred)
    dict_model[model] = rmse.round(3)

print(dict_model)

RMSE: 0.7692
RMSE: 0.8741
{<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7faec4da7ac0>: 0.769, <surprise.prediction_algorithms.matrix_factorization.NMF object at 0x7faec4da7910>: 0.874}


In [None]:
param_grid = {'n_factors': [25, 50, 75, 100, 125],
              'n_epochs': [5, 10, 15, 20, 25], 
              'lr_all': [0.001, 0.002, 0.005, 0.1],
              'reg_all': [0.1, 0.2, 0.4, 0.6],
              'random_state' : [42]
}

gs = GridSearchCV(SVD,
                  param_grid,
                  measures=['rmse', 'mae'],
                  cv=5)

gs.fit(data_surprise)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

In [None]:
svd_gs = gs.best_estimator['rmse']
predictions = svd_gs.fit(trainset).test(testset)
accuracy.rmse(predictions)

In [None]:
del predictions, gs, dict_model
gc.collect()

### c) K-NN based models

In [20]:
# Initializing similarities options
sim_options = {'name': 'cosine',
               'user_based': True  # compute  similarities between users
               }

In [21]:
knnbasic = KNNBasic()
knnmeans = KNNWithMeans()
knnzscore = KNNWithZScore()

In [22]:
model = [knnbasic, knnmeans, knnzscore]
dict_model = {}

for model in model:
    y_pred = model.fit(trainset).test(testset)
    rmse = accuracy.rmse(y_pred)
    dict_model[model] = rmse.round(3)

print(dict_model)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7747
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7740
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7670
{<surprise.prediction_algorithms.knns.KNNBasic object at 0x7faec4da7640>: 0.775, <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x7faec4da7370>: 0.774, <surprise.prediction_algorithms.knns.KNNWithZScore object at 0x7faec4da7dc0>: 0.767}


In [23]:
param_grid = {'k': [20, 30, 40, 50, 60, 70, 80]}

gs = GridSearchCV(KNNWithZScore,
                  param_grid,
                  measures=['rmse', 'mae'],
                  cv=5)

gs.fit(data_surprise)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [25]:
del gs, dict_model
gc.collect()

24

## 3) Content-based algorithms

https://medium.com/analytics-vidhya/content-based-recommender-systems-in-python-2b330e01eb80

In [26]:
desc = books.copy()
desc = desc[["Id", "Name", "Authors", "Description"]]

In [27]:
tfidf = TfidfVectorizer(stop_words='english')
desc['Description'] = desc['Description'].fillna("")

descr_matrix = tfidf.fit_transform(desc['Description'])
descr_matrix.shape

(119581, 215465)

In [None]:
similarity_matrix = linear_kernel(descr_matrix, descr_matrix)

In [30]:
mapping = pd.Series(desc.index,
          index = desc['Name'])
print(mapping)

In [None]:
def recommend_books_based_on_description(book_input):
    book_index = mapping[book_input]
    #get similarity values with other movies
    #similarity_score is the list of index and similarity matrix
    similarity_score = list(enumerate(similarity_matrix[book_index]))
    #sort in descending order the similarity score of movie inputted with all the other movies
    similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 15 most similar movies. Ignore the first movie.
    similarity_score = similarity_score[1:15]
    
    #return movie names using the mapping series
    book_indices = [i[0] for i in similarity_score]
    return (descr['Title'].iloc[book_indices])

In [None]:
recommend_books_based_on_description("")