# Recommender system for books: Modelisation

In [1]:
# Data manipulation
import pandas as pd 
import numpy as np 
import gc

# Graphical libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Modelisation libraries
from surprise import BaselineOnly, SVD, NMF, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import accuracy
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import Dataset
from surprise import Reader

from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
path = 'Datasets/'

In [3]:
# Importing data about users and books 
users = pd.read_parquet(path+"users_data_cleaning.parquet.gzip")
books = pd.read_parquet(path+"books_data_cleaning.parquet.gzip")

It exists different kinds of recommender engines. In this proof of concept, simple recommender, user based collaborative filtering, content-based recommender and hybrid methods will be testing to find the best recommender engines. 

## 1) Simple recommender

Simple recommender is the most basic recommender engines. It based on popularity. To calcule it, rating and number of reviews are used. The formula used in the following is the formula used by the IMDB website to calcule.  

*Source:* https://www.datacamp.com/community/tutorials/recommender-systems-python

In [4]:
# creating a new dataset for the calculus
simple_reco = books.copy()
simple_reco = simple_reco[["Id", "Name", "Authors",
                           "Rating", "CountsOfReview"]]
simple_reco.head()

Unnamed: 0,Id,Name,Authors,Rating,CountsOfReview
0,4000063,The Flintstones in Viva Rock Vegas,Ellen Miles,3.82,1
1,4000100,Little Rhody,Neta Lohnes Frazier,4.33,1
2,4000228,Finance And Investments Using The Wall Street ...,Peter R. Crabb,0.0,0
3,4000366,Shorty McCabe Looks 'Em Over,Sewell Ford,0.0,0
4,4000441,Plant Pathology,George N. Agrios,4.52,0


In [5]:
# calculating the average rating
C = simple_reco['Rating'].mean()
print(C)

3.7320950652695215


In [6]:
# calculating and keeping
# books with the 90th percentile
m = simple_reco['CountsOfReview'].quantile(0.90)
print(m)

41.0


In [7]:
# selecting only books with at least 41 reviews
q_books = simple_reco.copy().loc[simple_reco['CountsOfReview'] >= m]
q_books.shape

(12116, 5)

In [8]:
def weighted_rating(x, m=m, C=C):
    v = x['CountsOfReview']
    R = x['Rating']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [9]:
q_books['score'] = q_books.apply(weighted_rating, axis=1)

In [10]:
#Sort movies based on score calculated above
q_books = q_books.sort_values('score', ascending=False)

#Print the top 15 books
q_books[["Id",'Name', "Authors", 'CountsOfReview', 'Rating', 'score']].head(20)

Unnamed: 0,Id,Name,Authors,CountsOfReview,Rating,score
44289,862041,"Harry Potter Series Box Set (Harry Potter, #1-7)",J.K. Rowling,6522,4.74,4.733703
40413,818056,Harry Potter and the Deathly Hallows (Harry Po...,J.K. Rowling,952,4.62,4.583339
115327,1215032,"The Wise Man's Fear (The Kingkiller Chronicle,...",Patrick Rothfuss,16523,4.57,4.567926
61446,3165162,Percy Jackson and the Olympians (Percy Jackson...,Rick Riordan,546,4.59,4.530078
72565,1025685,"The Absolute Sandman, Volume Two",Neil Gaiman,198,4.69,4.525673
91715,2495562,The Wise Man's Fear (The Kingkiller Chronicle...,Patrick Rothfuss,488,4.56,4.495833
82953,2186848,"The Absolute Sandman, Volume Three",Neil Gaiman,140,4.71,4.488486
98149,2767793,"The Hero of Ages (Mistborn, #3)",Brandon Sanderson,10101,4.49,4.486936
30441,1179967,"A Storm of Swords (A Song of Ice and Fire, #3)",George R.R. Martin,583,4.54,4.486917
66035,3362870,"The Hero of Ages (Mistborn, #3)",Brandon Sanderson,1289,4.49,4.466636


In [11]:
print(np.sqrt(mean_squared_error(q_books['Rating'],
                                 q_books["score"])))

0.10848119654754877


In [12]:
del q_books, simple_reco
gc.collect()

0

## 2) User-based collaborative filtering

### a) Preparing the data for Surprise library

In [13]:
reader = Reader(rating_scale=(1, 5))
data_surprise = Dataset.load_from_df(users[['User_Id', 'Id', 'Rating']], reader)

In [14]:
trainset, testset = train_test_split(data_surprise, test_size=.2)

In [15]:
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
base_als = BaselineOnly(bsl_options=bsl_options)
predictions = base_als.fit(trainset).test(testset)

Estimating biases using als...


In [16]:
del predictions
gc.collect()

0

In [17]:
bsl_options = {'method': 'sgd',
               'learning_rate': .00005,
               }
base_sgd = BaselineOnly(bsl_options=bsl_options)
predictions = base_sgd.fit(trainset).test(testset)

Estimating biases using sgd...


In [18]:
del predictions
gc.collect()

0

### b) Matrix Factorization based algorithms

In [19]:
svd = SVD()
nmf = NMF()

In [20]:
model = [svd, nmf]
dict_model = {}

for model in model:
    y_pred = model.fit(trainset).test(testset)
    rmse = accuracy.rmse(y_pred)
    dict_model[model] = rmse.round(3)

print(dict_model)

RMSE: 0.7643
RMSE: 0.8703
{<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7fdde012b5b0>: 0.764, <surprise.prediction_algorithms.matrix_factorization.NMF object at 0x7fdde012b910>: 0.87}


In [21]:
param_grid = {'n_factors': [50, 75, 100, 125],
              'n_epochs': [5, 10, 15, 20, 25], 
              'lr_all': [0.001, 0.002, 0.005, 0.1],
              'reg_all': [0.1, 0.2, 0.4, 0.6],
              'random_state' : [42]
}

gs = GridSearchCV(SVD,
                  param_grid,
                  measures=['rmse', 'mae'],
                  cv=5)

gs.fit(data_surprise)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

0.7496259504461186
{'n_factors': 125, 'n_epochs': 25, 'lr_all': 0.1, 'reg_all': 0.1, 'random_state': 42}


In [22]:
svd_gs = gs.best_estimator['rmse']
predictions = svd_gs.fit(trainset).test(testset)
accuracy.rmse(predictions)

RMSE: 0.7515


0.7514988478795563

In [23]:
del predictions, gs, dict_model
gc.collect()

0

### c) K-NN based models

In [24]:
# Initializing similarities options
sim_options = {'name': 'cosine',
               'user_based': True  # compute  similarities between users
               }

In [25]:
knnbasic = KNNBasic()
knnmeans = KNNWithMeans()
knnzscore = KNNWithZScore()

In [26]:
model = [knnbasic, knnmeans, knnzscore]
dict_model = {}

for model in model:
    y_pred = model.fit(trainset).test(testset)
    rmse = accuracy.rmse(y_pred)
    dict_model[model] = rmse.round(3)

print(dict_model)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7693
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7678
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7617
{<surprise.prediction_algorithms.knns.KNNBasic object at 0x7fdd3e9c5220>: 0.769, <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x7fdd3e9c5250>: 0.768, <surprise.prediction_algorithms.knns.KNNWithZScore object at 0x7fdd3e9c5c10>: 0.762}


In [27]:
param_grid = {'k': [20, 30, 40, 50, 60, 70, 80]}

gs = GridSearchCV(KNNWithZScore,
                  param_grid,
                  measures=['rmse', 'mae'],
                  cv=5)

gs.fit(data_surprise)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [28]:
knn_gs = gs.best_estimator['rmse']
predictions = knn_gs.fit(trainset).test(testset)
accuracy.rmse(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7438


0.7438439694579394

In [29]:
del gs,\
    dict_model,\
    predictions
gc.collect()

0

## 3) Content-based algorithms

### a) With Tfidf

https://medium.com/analytics-vidhya/content-based-recommender-systems-in-python-2b330e01eb80

In [4]:
desc = books.copy()
desc = desc[["Id", "Name", "Authors", "Description"]]

In [5]:
desc = desc.sample(frac=.2,
                   random_state=42)

In [6]:
desc.reset_index(inplace=True)
desc.drop("index", axis=1, inplace=True)

In [7]:
tfidf = TfidfVectorizer(stop_words='english')
desc['Description'] = desc['Description'].fillna("")

descr_matrix = tfidf.fit_transform(desc['Description'])
descr_matrix.shape

(23916, 82723)

In [8]:
similarity_matrix = linear_kernel(descr_matrix, descr_matrix)

In [9]:
mapping = pd.Series(desc.index,
          index = desc['Name'])
print(mapping)

Name
On the Other Side of Mount Ararat: A Story of a Vanished City        0
The Tall Uncut: Stories                                              1
Heart Essence of the Vast Expanse: A Story of Transmission           2
Don't Look a Ghost Horse in the Mouth                                3
Turpentine                                                           4
                                                                 ...  
Infernal Revenue (The Destroyer, #96)                            23911
Die with Me                                                      23912
Kingyo: The Artistry of the Japanese Goldfish                    23913
The Shadow Roads (The Swans' War, book 3)                        23914
Roommates                                                        23915
Length: 23916, dtype: int64


In [10]:
def recommend_books(book_input):
    book_index = mapping[book_input]
    #get similarity values with other books
    #similarity_score is the list of index and similarity matrix
    similarity_score = list(enumerate(similarity_matrix[book_index]))
    #sort in descending order the similarity score of movie inputted with all the other books
    similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 20 most similar books. Ignore the first book.
    similarity_score = similarity_score[1:20]
    
    #return book names using the mapping series
    book_indices = [i[0] for i in similarity_score]
    return (desc['Name'].iloc[book_indices])

In [11]:
recommend_books('Pet Sematary')

3230                                  Black Cat, Volume 11
15651    Glimpses of Maine's Angling Past (Images of Am...
18878                        Oh, the Thinks You Can Think!
14474                           Go to the Room of the Eyes
663      Boston and the American Revolution: Boston Nat...
9972                                                   Ash
10932               True Blue (Sweet Valley Jr. High, #18)
18362                                            Bold Wolf
8047     African American Life in the Rural South, 1900...
7174                                       White House Q&A
17721                                        The Long Life
8842     The Horizontal World: Growing Up Wild in the M...
15942                      That Yankee Cat: The Maine Coon
6775                                        The Best Story
22298                                       Mansfield Park
9071                                    The Good Neighbour
21028                                             The We

### b) Using Transformers

https://towardsdatascience.com/hands-on-content-based-recommender-system-using-python-1d643bf314e4

In [4]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

In [6]:
data = pd.read_parquet(path+"books_with_complete_description.parquet.gzip")
data = data[["Id", "Name", "Authors", "Description"]]
data.head()

Unnamed: 0,Id,Name,Authors,Description
0,4000063,The Flintstones in Viva Rock Vegas,Ellen Miles,Spring 2000 marks the release of the new Flint...
1,4000100,Little Rhody,Neta Lohnes Frazier,A spunky ten-year-old girl moves with her fami...
2,4000228,Finance And Investments Using The Wall Street ...,Peter R. Crabb,Peter Crabb's The Wall Street Journal Workbook...
3,4000366,Shorty McCabe Looks 'Em Over,Sewell Ford,This is an EXACT reproduction of a book publis...
4,4000441,Plant Pathology,George N. Agrios,"This is a classic textbook on plant diseases, ..."


In [7]:
X = np.array(data["Description"])

In [8]:
text_data = X
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
embeddings = model.encode(text_data, show_progress_bar=True)

Downloading: 100%|██████████| 690/690 [00:00<00:00, 529kB/s]
Downloading: 100%|██████████| 3.99k/3.99k [00:00<00:00, 3.41MB/s]
Downloading: 100%|██████████| 550/550 [00:00<00:00, 372kB/s]
Downloading: 100%|██████████| 122/122 [00:00<00:00, 48.2kB/s]
Downloading: 100%|██████████| 229/229 [00:00<00:00, 147kB/s]
Downloading: 100%|██████████| 265M/265M [02:18<00:00, 1.91MB/s] 
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 56.5kB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 108kB/s]
Downloading: 100%|██████████| 466k/466k [00:00<00:00, 899kB/s] 
Downloading: 100%|██████████| 450/450 [00:00<00:00, 486kB/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 757kB/s] 
Downloading: 100%|██████████| 190/190 [00:00<00:00, 192kB/s]
NVIDIA GeForce RTX 3070 Laptop GPU with CUDA capability sm_86 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.
If you want to use the NVIDIA GeForce RTX 3070 Lap

RuntimeError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
X = np.array(embed_data)
n_comp = 5
pca = PCA(n_components=n_comp)
pca.fit(X)
pca_data = pd.DataFrame(pca.transform(X))
pca_data.head()

In [None]:
sns.pairplot(pca_data)

In [None]:
cos_sim_data = pd.DataFrame(cosine_similarity(X))
def give_recommendations(index,print_recommendation = False,print_recommendation_plots= False,print_genres =False):
  index_recomm =cos_sim_data.loc[index].sort_values(ascending=False).index.tolist()[1:6]
  movies_recomm =  data['Series_Title'].loc[index_recomm].values
  result = {'Movies':movies_recomm,'Index':index_recomm}
  if print_recommendation==True:
    print('The watched movie is this one: %s \n'%(data['Series_Title'].loc[index]))
    k=1
    for movie in movies_recomm:
      print('The number %i recommended movie is this one: %s \n'%(k,movie))
  if print_recommendation_plots==True:
    print('The plot of the watched movie is this one:\n %s \n'%(data['Overview'].loc[index]))
    k=1
    for q in range(len(movies_recomm)):
      plot_q = data['Overview'].loc[index_recomm[q]]
      print('The plot of the number %i recommended movie is this one:\n %s \n'%(k,plot_q))
      k=k+1
    for q in range(len(movies_recomm)):
      plot_q = data['Genre'].loc[index_recomm[q]]
      print('The plot of the number %i recommended movie is this one:\n %s \n'%(k,plot_q))
      k=k+1
  return result

In [None]:
plt.figure(figsize=(20,20))
for q in range(1,5):
  plt.subplot(2,2,q)
  index = np.random.choice(np.arange(0,len(X)))
  to_plot_data = cos_sim_data.drop(index,axis=1)
  plt.plot(to_plot_data.loc[index],'.',color='firebrick')
  recomm_index = give_recommendations(index)
  x = recomm_index['Index']
  y = cos_sim_data.loc[index][x].tolist()
  m = recomm_index['Movies']
  plt.plot(x,y,'.',color='navy',label='Recommended Movies')
  plt.title('Movie Watched: '+data['Series_Title'].loc[index])
  plt.xlabel('Movie Index')
  k=0
  for x_i in x:
    plt.annotate('%s'%(m[k]),(x_i,y[k]),fontsize=10)
    k=k+1

  plt.ylabel('Cosine Similarity')
  plt.ylim(0,1)