In [1]:
!pip install sentence_transformers
!pip install surprise
!pip install codecarbon

Collecting sentence_transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 3.6 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 34.1 MB/s 
[?25hCollecting tokenizers>=0.10.3
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 41.8 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 40.7 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 43.5 MB/s 
Coll

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Recommender system for books: Modelisation

In [2]:
# Data manipulation
import pandas as pd 
import numpy as np 
import gc

# Graphical libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Modelisation libraries
from surprise import BaselineOnly, SVD, NMF, KNNBasic, KNNWithMeans, KNNWithZScore, CoClustering, SlopeOne
from surprise import accuracy
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import Dataset
from surprise import Reader

from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer

# GHG Emissions tracking
from codecarbon import EmissionsTracker

In [3]:
path = '/content/drive/My Drive/Ingénieur ML - OC/P7/Datasets/'

In [4]:
# Importing data about users and books 
users = pd.read_parquet(path+"users_data_cleaning.parquet.gzip")
books = pd.read_parquet(path+"books_data_cleaning.parquet.gzip")

In [5]:
books.rename(columns={"Rating" : "Average_Rating"},
             inplace=True)

It exists different kinds of recommender engines. In this proof of concept, simple recommender, user based collaborative filtering, content-based recommender and hybrid methods will be testing to find the best recommender engines. 

## 1) Popularity based

Simple recommender is the most basic recommender engines. It based on popularity. To calcule it, rating and number of reviews are used. The formula used is detailled in Bhowmick et al. (2021).

In [None]:
# creating a new dataset for the calculus
simple_reco = books.copy()
simple_reco = simple_reco[["Id", "Name", "Authors",
                           "Average_Rating", "CountsOfReview"]]
simple_reco.head()

Unnamed: 0,Id,Name,Authors,Average_Rating,CountsOfReview
0,4000063,The Flintstones in Viva Rock Vegas,Ellen Miles,3.82,1
1,4000100,Little Rhody,Neta Lohnes Frazier,4.33,1
2,4000228,Finance And Investments Using The Wall Street ...,Peter R. Crabb,0.0,0
3,4000366,Shorty McCabe Looks 'Em Over,Sewell Ford,0.0,0
4,4000441,Plant Pathology,George N. Agrios,4.52,0


In [None]:
# calculating the average rating
C = simple_reco['Average_Rating'].mean()
print(C)

3.7320950652695215


In [None]:
# calculating and keeping
# books with the 90th percentile
m = simple_reco['CountsOfReview'].quantile(0.90)
print(m)

41.0


In [None]:
# selecting only books with at least 41 reviews
q_books = simple_reco.copy().loc[simple_reco['CountsOfReview'] >= m]
q_books.shape

(12116, 5)

In [None]:
def weighted_rating(x, m=m, C=C):
    v = x['CountsOfReview']
    R = x['Average_Rating']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [None]:
q_books['score'] = q_books.apply(weighted_rating, axis=1)

In [None]:
#Sort movies based on score calculated above
q_books = q_books.sort_values('score',
                              ascending=False)

#Print the top 5 books
q_books[["Id",
         "Name",
         "Authors",
         "CountsOfReview", 
         "Average_Rating", 
         "score"]].head(5)

Unnamed: 0,Id,Name,Authors,CountsOfReview,Average_Rating,score
44289,862041,"Harry Potter Series Box Set (Harry Potter, #1-7)",J.K. Rowling,6522,4.74,4.733703
40413,818056,Harry Potter and the Deathly Hallows (Harry Po...,J.K. Rowling,952,4.62,4.583339
115327,1215032,"The Wise Man's Fear (The Kingkiller Chronicle,...",Patrick Rothfuss,16523,4.57,4.567926
61446,3165162,Percy Jackson and the Olympians (Percy Jackson...,Rick Riordan,546,4.59,4.530078
72565,1025685,"The Absolute Sandman, Volume Two",Neil Gaiman,198,4.69,4.525673


In [None]:
print(round(np.sqrt(mean_squared_error(q_books['Average_Rating'],
                                 q_books["score"])), 3))

0.108


In [None]:
del q_books, simple_reco
gc.collect()

0

This method had disadvantage to make recommendation based on the most popular books. Books with few ratings have less change to be recommend. 

## 2) User-based collaborative filtering

### a) Preparing the data for Surprise library

In [None]:
data = pd.merge(users, books, on = "Id")
data.rename(columns={"Name" : "Title",
                     "Id" : "Book_Id"}, 
                     inplace=True)

In [None]:
reader = Reader(rating_scale=(1, 5))
data_surprise = Dataset.load_from_df(data[['User_Id', 'Book_Id', 'Rating']], reader)

In [None]:
trainset, testset = train_test_split(data_surprise, test_size=.2)

In [None]:
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
base_als = BaselineOnly(bsl_options=bsl_options)
predictions = base_als.fit(trainset).test(testset)

Estimating biases using als...


In [None]:
del predictions
gc.collect()

0

In [None]:
bsl_options = {'method': 'sgd',
               'learning_rate': .00005,
               }
base_sgd = BaselineOnly(bsl_options=bsl_options)
predictions = base_sgd.fit(trainset).test(testset)

Estimating biases using sgd...


In [None]:
del predictions
gc.collect()

0

### b) Collaborative filtering Algorithms

In [None]:
# Initializing similarities options
sim_options = {'name': 'cosine',
               'user_based': True
               }

In [None]:
svd = SVD()
nmf = NMF()
knn = KNNBasic(random_state=42)
slope = SlopeOne()
cocluster = CoClustering(random_state=42)

In [None]:
model = [svd, nmf, knn, slope, cocluster]
dict_model = {}

for model in model:
    tracker = EmissionsTracker()
    tracker.start()
    y_pred = model.fit(trainset).test(testset)
    emissions = tracker.stop()
    rmse = accuracy.rmse(y_pred)
    dict_model[model] = (rmse.round(3), emissions)

eval_model = pd.DataFrame(dict_model, index=["RMSE", "GHG Emissions"])
eval_model.columns = ["SVD", "NMF", "KNN", "SlopeOne", "Co-Clustering"]

eval_model

  if obj.zone == 'local':
  return self.timezone.normalize(next_fire_time)


RMSE: 0.7652
RMSE: 0.8745
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7700


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_pred = model.fit(trainset).test(testset)


RMSE: 0.8908
RMSE: 0.8908


Unnamed: 0,SVD,NMF,KNN,SlopeOne,Co-Clustering
RMSE,0.765,0.875,0.77,0.891,0.891
GHG Emissions,2.7e-05,2.9e-05,1.5e-05,3.4e-05,1.5e-05


In [None]:
del dict_model,\
    sim_options
gc.collect()

0

### c) Focus on K-NN based models

In [None]:
# Initializing similarities options
sim_options = {'name': 'cosine',
               'user_based': True
               }

In [None]:
knnbasic = KNNBasic()
knnmeans = KNNWithMeans()
knnzscore = KNNWithZScore()

In [None]:
model = [knnbasic, knnmeans, knnzscore]
dict_model = {}

for model in model:
  model = [knnbasic, knnmeans, knnzscore]
dict_model = {}

for model in model:
    tracker = EmissionsTracker()
    tracker.start()
    y_pred = model.fit(trainset).test(testset)
    emissions = tracker.stop()
    rmse = accuracy.rmse(y_pred)
    dict_model[model] = (rmse.round(3), emissions)

eval_model = pd.DataFrame(dict_model, index=["RMSE", "GHG Emissions"])
eval_model.columns = ["KNN Basic", "KNN Means", "KNN Z-Score"]

eval_model

  if obj.zone == 'local':
  return self.timezone.normalize(next_fire_time)


Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7700
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7694
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7620


Unnamed: 0,KNN Basic,KNN Means,KNN Z-Score
RMSE,0.77,0.769,0.762
GHG Emissions,2.2e-05,1.3e-05,1.4e-05


In [None]:
param_grid = {'k': [20, 30, 40, 50, 60, 70, 80]}

gs = GridSearchCV(KNNWithZScore,
                  param_grid,
                  measures=['rmse', 'mae'],
                  cv=5)

gs.fit(data_surprise)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [None]:
knn_gs = gs.best_estimator['rmse']

tracker = EmissionsTracker()

tracker.start()
predictions = knn_gs.fit(trainset).test(testset)
emissions = tracker.stop()


accuracy.rmse(predictions)
print("GHG emission: " + str(emissions) + " kg/CO2")

  if obj.zone == 'local':
  return self.timezone.normalize(next_fire_time)


Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7443
GHG emission: 2.7803216914243218e-05 kg/CO2


In [None]:
del gs,\
    dict_model,\
    predictions,\
    tracker,\
    emissions
gc.collect()

88

## 3) Content-based algorithms

### a) With Tfidf

https://medium.com/analytics-vidhya/content-based-recommender-systems-in-python-2b330e01eb80

In [6]:
desc = books.copy()
desc = desc[["Id", "Name", "Authors", "Description"]]

In [7]:
desc = desc.sample(frac=.2,
                   random_state=42)

In [8]:
desc.reset_index(inplace=True)
desc.drop("index", axis=1, inplace=True)

In [9]:
tracker = EmissionsTracker()

tracker.start()
tfidf = TfidfVectorizer(stop_words='english')
desc['Description'] = desc['Description'].fillna("")
descr_matrix = tfidf.fit_transform(desc['Description'])
similarity_matrix = linear_kernel(descr_matrix, descr_matrix)
emissions = tracker.stop()

print("GHG Emissions: " + str(emissions) + ' kg/CO2')

CODECARBON : No CPU tracking mode found. Falling back on CPU constant mode.
CODECARBON : Failed to match CPU TDP constant. Falling back on a global constant.
  if obj.zone == 'local':
  return self.timezone.normalize(next_fire_time)
  return self.timezone.normalize(next_fire_time)


GHG Emissions: 0.00010607751017515084 kg/CO2


In [10]:
mapping = pd.Series(desc.index,
          index = desc['Name'])
print(mapping)

Name
On the Other Side of Mount Ararat: A Story of a Vanished City        0
The Tall Uncut: Stories                                              1
Heart Essence of the Vast Expanse: A Story of Transmission           2
Don't Look a Ghost Horse in the Mouth                                3
Turpentine                                                           4
                                                                 ...  
Infernal Revenue (The Destroyer, #96)                            23911
Die with Me                                                      23912
Kingyo: The Artistry of the Japanese Goldfish                    23913
The Shadow Roads (The Swans' War, book 3)                        23914
Roommates                                                        23915
Length: 23916, dtype: int64


In [13]:
def recommend_books(book_input):
    print("Book Read: ", book_input)
    book_index = mapping[book_input]
    #get similarity values with other books
    #similarity_score is the list of index and similarity matrix
    similarity_score = list(enumerate(similarity_matrix[book_index]))
    #sort in descending order the similarity score of movie inputted with all the other books
    similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 5 most similar books. Ignore the first book.
    similarity_score = similarity_score[1:6]
    
    #return book names using the mapping series
    book_indices = [i[0] for i in similarity_score]
    return (desc['Name'].iloc[book_indices])

In [15]:
recommend_books('The Shining')

Book Read:  The Shining


7724                 Room with a Clue (Pennyfoot Hotel #1)
20610                                  Buzby to the Rescue
2395                  Maid to Murder (Pennyfoot Hotel #12)
12329                                               Double
6688     Custer: The Controversial Life of George Armst...
Name: Name, dtype: object

In [16]:
del tracker, emissions
gc.collect()

88

### b) Using Transformers

https://towardsdatascience.com/hands-on-content-based-recommender-system-using-python-1d643bf314e4

In [17]:
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
data = pd.read_parquet(path+"books_with_complete_description.parquet.gzip")
data = data[["Name", "Authors", "Description"]].sample(frac=.3, random_state=42)
data.reset_index(inplace=True)
data.drop("index", axis=1, inplace=True)
data.head()

Unnamed: 0,Name,Authors,Description
0,On the Other Side of Mount Ararat: A Story of ...,Mariam Manoukian,On the Other Side of Mount Ararat is the story...
1,The Tall Uncut: Stories,Pete Fromm,"In this honest, contemporary collection of sho..."
2,Heart Essence of the Vast Expanse: A Story of ...,Anne Carolyn Klein,"Beautiful, evocative, and eminently useful, th..."
3,Don't Look a Ghost Horse in the Mouth,George E. Stanley,The horse Emily brags about to her classmates ...
4,Turpentine,Spring Warren,A comic glance at the old American West and a ...


In [19]:
X = np.array(data["Description"])

In [20]:
text_data = X

tracker = EmissionsTracker()
tracker.start()

model = SentenceTransformer('distilbert-base-nli-mean-tokens')
embeddings = model.encode(text_data, show_progress_bar=True)
X = np.array(embeddings)
cos_sim_data = pd.DataFrame(cosine_similarity(X))

emissions = tracker.stop()
print("GHG Emissions: " + str(emissions) + " kg/CO2")

CODECARBON : No CPU tracking mode found. Falling back on CPU constant mode.
CODECARBON : Failed to match CPU TDP constant. Falling back on a global constant.
  if obj.zone == 'local':
  return self.timezone.normalize(next_fire_time)


Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1122 [00:00<?, ?it/s]

  return self.timezone.normalize(next_fire_time)


GHG Emissions: 0.0014541530105872161 kg/CO2


In [22]:
def give_recommendations(index, print_recommendation = False,print_recommendation_plots= False):
  index_recomm =cos_sim_data.loc[index].sort_values(ascending=False).index.tolist()[1:6]
  books_recomm =  data['Name'].loc[index_recomm].values
  result = {'Books':books_recomm,'Index':index_recomm}
  if print_recommendation==True:
    print('Book Read: %s \n'%(data['Name'].loc[index]))
    k=1
    for book in books_recomm:
      print('Recommended book #%i : %s \n'%(k, book))
      k = k+1
  if print_recommendation_plots==True:
    print('Book Read - Description:\n %s \n'%(data['Description'].loc[index]))
    k=1
    for q in range(len(books_recomm)):
      plot_q = data['Description'].loc[index_recomm[q]]
      print('Recommended Book #%i - Description:\n %s \n'%(k,plot_q))
      k=k+1
  return result

In [23]:
give_recommendations(1952, True, True)

Book Read: The Shining 

Recommended book #1 : Fellowship of Fear (Gideon Oliver #1) 

Recommended book #2 : Fellowship of Fear (Gideon Oliver #1) 

Recommended book #3 : The House At Midnight 

Recommended book #4 : Life Expectancy 

Recommended book #5 : Life Expectancy 

Book Read - Description:
 Jack Torrance's new job at the Overlook Hotel is the perfect chance for a fresh start. As the off-season caretaker at the atmospheric old hotel, he'll have plenty of time to spend reconnecting with his family and working on his writing. But as the harsh winter weather sets in, the idyllic location feels ever more remote...and more sinister. And the only one to notice the strange and terrible forces gathering around the Overlook is Danny Torrance, a uniquely gifted five-year-old. 

Recommended Book #1 - Description:
 Meet Professor Gideon Oliver, full-time anthropologist, part-time detective, and brand-new visiting fellow at Heidelberg University. Even though the two previous occupants of hi

{'Books': array(['Fellowship of Fear (Gideon Oliver #1)',
        'Fellowship of Fear (Gideon Oliver #1)', 'The House At Midnight',
        'Life Expectancy', 'Life Expectancy'], dtype=object),
 'Index': [22624, 27409, 17143, 1579, 4574]}

In [None]:
recomm_list = []

for i in range(len(X)):
  recomm_i = give_recommendations(i)
  recomm_list.append(recomm_i['Books'])
recomm_data = pd.DataFrame(recomm_list,columns=['First Recommendation',
                                                'Second Recommendation',
                                                'Third Recommendation',
                                                'Fourth Recommendation',
                                                'Fifth Recommendation'])
recomm_data['Books Read'] = data['Name']
recomm_data = recomm_data[['Books Read',
                           'First Recommendation',
                           'Second Recommendation',
                           'Third Recommendation',
                           'Fourth Recommendation',
                           'Fifth Recommendation']]

In [None]:
recomm_data.sample(frac=1).head()

Unnamed: 0,Books Read,First Recommendation,Second Recommendation,Third Recommendation,Fourth Recommendation,Fifth Recommendation
11212,The First Dog,The Berenstain Bears Blaze a Trail,The Magic Gourd,"Yum, Yum, Yummy! (The Giggle Club)",The Great Tug of War,"What Time Is It, Mr. Crocodile?"
2548,Revelations Cycle II: The Marches (In Nomine: ...,Bone Volume 6: Old Man's Cave,Gregor and the Code of Claw (Underland Chronic...,Midnight (Midnight),"Winterbirth (The Godless World, #1)","Destiny's Truth (Deathlands, #60)"
6010,"Methods in Cell Biology, Volume 63: Cytometry,...","Methods in Cell Biology, Volume 63: Cytometry,...",Knowledge of Life,Avery's Diseases Of The Newborn,Borderline Personality Disorder: Clinical and ...,Case Studies in Immunology: A Clinical Companion
25380,Oxford Atlas of the World,Emerging Powers (The Illustrated History of th...,A History of the World in the Twentieth Century,An Incomplete Education,An Incomplete Education,Roots: The Saga of an American Family
10621,Just Us Girls: Secrets to Feeling Good About Y...,GirlForce: Vibe: A Girl's Guidebook to Confide...,My Own Thoughts and Feelings (for Girls): A Yo...,Guys Like Girls Who . . .,Changing Your World One Diaper at a Time: A Re...,It's a Girl: Women Writers on Raising Daughters


In [None]:
del users,\
    books,\
    data
gc.collect()

557