# Project 4 - Books Recommendation using SVD
Collaborative based filtering->Item based

In [1]:
# !pip install scikit-surprise

# Import Dependencies

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from surprise import accuracy, Dataset, Reader, SVD, BaselineOnly, PredictionImpossible
from surprise.model_selection import train_test_split, cross_validate, KFold, GridSearchCV
import random
import numpy as np
import statistics as st

# Explore the data

In [3]:
#Creating dataframes from csv files to read the data
books_df_original = pd.read_csv('./Resources/Books.csv')
users_df = pd.read_csv('./Resources/Users.csv')
ratings_df_original = pd.read_csv('./Resources/Ratings.csv')

  books_df_original = pd.read_csv('./Resources/Books.csv')


In [4]:
# Filter out data with no publication year
books_df = books_df_original[books_df_original['Year-Of-Publication'] != 0]

In [5]:
# remove duplicated books records if any by looking at ISBN
books_df=books_df.drop_duplicates(subset=['ISBN'])

In [6]:
books_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [7]:
books_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 267790 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 267790 non-null  object
 1   Book-Title           267790 non-null  object
 2   Book-Author          267788 non-null  object
 3   Year-Of-Publication  267790 non-null  object
 4   Publisher            267788 non-null  object
 5   Image-URL-S          267790 non-null  object
 6   Image-URL-M          267790 non-null  object
 7   Image-URL-L          267787 non-null  object
dtypes: object(8)
memory usage: 18.4+ MB


In [8]:
duplicated_titles=books_df[books_df.duplicated(subset=['Book-Title'],keep=False)].sort_values(by='Book-Title')
duplicated_titles.head()
# ????????how to handle those duplicates???????? drop those with earlier year of publication

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
75637,1565920465,!%@ (A Nutshell handbook),Donnalyn Frey,1994,O'Reilly,http://images.amazon.com/images/P/1565920465.0...,http://images.amazon.com/images/P/1565920465.0...,http://images.amazon.com/images/P/1565920465.0...
156341,1565920317,!%@ (A Nutshell handbook),Donnalyn Frey,1993,O'Reilly,http://images.amazon.com/images/P/1565920317.0...,http://images.amazon.com/images/P/1565920317.0...,http://images.amazon.com/images/P/1565920317.0...
140618,792276833,'A Hell of a Place to Lose a Cow': An American...,Tim Brookes,2000,National Geographic,http://images.amazon.com/images/P/0792276833.0...,http://images.amazon.com/images/P/0792276833.0...,http://images.amazon.com/images/P/0792276833.0...
158204,792277295,'A Hell of a Place to Lose a Cow': An American...,Tim Brookes,2001,National Geographic,http://images.amazon.com/images/P/0792277295.0...,http://images.amazon.com/images/P/0792277295.0...,http://images.amazon.com/images/P/0792277295.0...
10438,451168089,'Salem's Lot,Stephen King,1990,Signet Book,http://images.amazon.com/images/P/0451168089.0...,http://images.amazon.com/images/P/0451168089.0...,http://images.amazon.com/images/P/0451168089.0...


In [9]:
ratings_df=ratings_df_original.copy()
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [10]:
# update the datatype of a 'Book-Rating' field to numeric one
ratings_df['Book-Rating']=pd.to_numeric(ratings_df['Book-Rating'],errors='coerce')
# and check the result
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [11]:
############## users with solid number of reviews shortens a lot!
# find the list of ISBN that we have book info for them
isbn=books_df['ISBN'].unique()

# adjust ratings  dataframe to match with those books
ratings_df=ratings_df[ratings_df['ISBN'].isin(isbn)]
############## users with solid number of reviews shortens a lot!

In [12]:
#check for n/a values
ratings_df[ratings_df['Book-Rating'].isna()|ratings_df['ISBN'].isna()|ratings_df['User-ID'].isna()]

Unnamed: 0,User-ID,ISBN,Book-Rating


In [13]:
# drop rows with N/As if any
ratings_df=ratings_df[~(ratings_df['Book-Rating'].isna()|ratings_df['ISBN'].isna()|ratings_df['User-ID'].isna())]
ratings_df

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149774,276704,0876044011,0
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10


In [14]:
# Check distribution by scores 
groupped_r_scores=ratings_df.groupby(['Book-Rating']).size()
groupped_r_scores

Book-Rating
0     639805
1       1461
2       2339
3       5040
4       7504
5      44793
6      31236
7      65439
8      90494
9      59965
10     70316
dtype: int64

#### Update 0 scores with weighted averages

In [16]:
avg_ratings_scored = ratings_df[ratings_df['Book-Rating'] > 0].groupby('ISBN')['Book-Rating'].mean()
avg_ratings_scored

ISBN
0000913154     8.0
000104687X     6.0
0001047213     9.0
0001047973     9.0
000104799X     7.5
              ... 
B0001FZGPI     7.0
B0001FZGRQ     9.0
B0001GMSV2     8.0
B0001I1KOG    10.0
B000234N3A     9.0
Name: Book-Rating, Length: 147598, dtype: float64

In [17]:
count_ratings_scored = ratings_df[ratings_df['Book-Rating'] > 0].groupby('ISBN')['Book-Rating'].count()
count_ratings_scored

ISBN
0000913154    1
000104687X    1
0001047213    1
0001047973    2
000104799X    2
             ..
B0001FZGPI    1
B0001FZGRQ    1
B0001GMSV2    2
B0001I1KOG    1
B000234N3A    1
Name: Book-Rating, Length: 147598, dtype: int64

In [18]:
average_weighted_df=pd.DataFrame(avg_ratings_scored).rename(columns={'Book-Rating':'avg_book_rating'})
count_ratings_scored_df=pd.DataFrame(count_ratings_scored).rename(columns={'Book-Rating':'count_book_rating'})
average_weighted_df=pd.merge(average_weighted_df,count_ratings_scored_df,  on='ISBN', how='inner')
average_weighted_df=average_weighted_df.sort_values(by='count_book_rating', ascending=False)
average_weighted_df.head()

Unnamed: 0_level_0,avg_book_rating,count_book_rating
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
316666343,8.18529,707
971880107,4.390706,581
385504209,8.435318,487
312195516,8.182768,383
60928336,7.8875,320


In [19]:
def average_weighted(row):
    min_th=25 #min number of rating received by the book
    neutral_score=5
    avg_w = ((row['avg_book_rating'] * row['count_book_rating']) + 
      (min_th * neutral_score))/(row['count_book_rating'] + min_th)
    return avg_w

In [20]:
average_weighted_df['avg_weighted']=average_weighted_df.apply(average_weighted, axis=1)
average_weighted_df

Unnamed: 0_level_0,avg_book_rating,count_book_rating,avg_weighted
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0316666343,8.185290,707,8.076503
0971880107,4.390706,581,4.415842
0385504209,8.435318,487,8.267578
0312195516,8.182768,383,7.987745
0060928336,7.887500,320,7.678261
...,...,...,...
0553013769,5.000000,1,5.000000
0553013653,8.000000,1,5.115385
0553013602,5.000000,1,5.000000
0553013394,6.000000,1,5.038462


In [21]:
ratings_df_adj=ratings_df.copy()

In [22]:
ratings_df_adj=ratings_df_adj.set_index('ISBN')
ratings_df_adj

Unnamed: 0_level_0,User-ID,Book-Rating
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
034545104X,276725,0
0155061224,276726,5
0446520802,276727,0
052165615X,276729,3
0521795028,276729,6
...,...,...
0876044011,276704,0
1563526298,276704,9
0679447156,276706,0
0515107662,276709,10


In [23]:
ratings_df_adj.loc[ratings_df_adj['Book-Rating'] == 0, 'Book-Rating'] = ratings_df_adj.loc[ratings_df_adj['Book-Rating'] == 0].index.map(average_weighted_df['avg_weighted'])
ratings_df_adj

Unnamed: 0_level_0,User-ID,Book-Rating
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
034545104X,276725,5.679245
0155061224,276726,5.000000
0446520802,276727,7.358025
052165615X,276729,3.000000
0521795028,276729,6.000000
...,...,...
0876044011,276704,
1563526298,276704,9.000000
0679447156,276706,6.050000
0515107662,276709,10.000000


In [24]:
# Filter out data with n/a rating score after mapping, as there could be books with only 0 scores
ratings_df_adj=ratings_df_adj.dropna(subset=['Book-Rating'])
ratings_df_adj

Unnamed: 0_level_0,User-ID,Book-Rating
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
034545104X,276725,5.679245
0155061224,276726,5.000000
0446520802,276727,7.358025
052165615X,276729,3.000000
0521795028,276729,6.000000
...,...,...
0806917695,276704,5.000000
1563526298,276704,9.000000
0679447156,276706,6.050000
0515107662,276709,10.000000


In [25]:
ratings_df_adj['Book-Rating']=round(ratings_df_adj['Book-Rating'],0)
ratings_df_adj.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_df_adj['Book-Rating']=round(ratings_df_adj['Book-Rating'],0)


Unnamed: 0_level_0,User-ID,Book-Rating
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
034545104X,276725,6.0
0155061224,276726,5.0
0446520802,276727,7.0
052165615X,276729,3.0
0521795028,276729,6.0


# Train the model

# Training: data preparation

In [27]:
# Change ISBN with Titles
# Merge ratings with books data in order to change isbn with title and leave only those ratings data for which we have title info
ratings_df_adj=pd.merge(books_df,ratings_df_adj,on='ISBN', how = 'inner')
ratings_df_adj=ratings_df_adj.drop(['ISBN','Book-Author','Year-Of-Publication','Publisher','Image-URL-S','Image-URL-M','Image-URL-L'], axis=1)
ratings_df_adj=ratings_df_adj.dropna()
ratings_df_adj

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
0,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8,5.0
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11400,6.0
2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11676,8.0
3,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,41385,6.0
4,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,67544,8.0
...,...,...,...,...,...,...,...,...,...,...
840843,0395264707,Dreamsnake,Vonda N. McIntyre,1978,Houghton Mifflin,http://images.amazon.com/images/P/0395264707.0...,http://images.amazon.com/images/P/0395264707.0...,http://images.amazon.com/images/P/0395264707.0...,275318,10.0
840844,1845170423,Cocktail Classics,David Biggs,2004,Connaught,http://images.amazon.com/images/P/1845170423.0...,http://images.amazon.com/images/P/1845170423.0...,http://images.amazon.com/images/P/1845170423.0...,275970,7.0
840845,0449906736,Flashpoints: Promise and Peril in a New World,Robin Wright,1993,Ballantine Books,http://images.amazon.com/images/P/0449906736.0...,http://images.amazon.com/images/P/0449906736.0...,http://images.amazon.com/images/P/0449906736.0...,276313,5.0
840846,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,276463,7.0


In [30]:
# check if there are duplicated records when same user rated book(s) with same title several times
ratings_df_adj[ratings_df_adj.duplicated(subset=['Book-Title', 'User-ID'],keep=False)].sort_values(by=['Book-Title','User-ID'])

Unnamed: 0,Book-Title,User-ID,Book-Rating
478255,10 Lb. Penalty,94923,5.0
574558,10 Lb. Penalty,94923,6.0
478259,10 Lb. Penalty,128835,5.0
574562,10 Lb. Penalty,128835,9.0
478264,10 Lb. Penalty,198711,5.0
...,...,...,...
275040,"\O\"" Is for Outlaw""",155147,7.0
106005,"\O\"" Is for Outlaw""",158295,7.0
275043,"\O\"" Is for Outlaw""",158295,6.0
106030,"\O\"" Is for Outlaw""",196077,7.0


In [31]:
# Use avg rate per duplicates set
ratings_df_adj=ratings_df_adj.groupby(['Book-Title','User-ID'])['Book-Rating'].mean().reset_index()
ratings_df_adj

Unnamed: 0,Book-Title,User-ID,Book-Rating
0,Clara Callan,8,5.0
1,Clara Callan,11400,6.0
2,Clara Callan,11676,8.0
3,Clara Callan,41385,6.0
4,Clara Callan,67544,8.0
...,...,...,...
840843,Dreamsnake,275318,10.0
840844,Cocktail Classics,275970,7.0
840845,Flashpoints: Promise and Peril in a New World,276313,5.0
840846,There's a Bat in Bunk Five,276463,7.0


### Only leave statistically signifacant data

In [33]:
# Define whar are the number of rates per book and books rated by user we treat as statistically significant
min_books_rated_by_user=5
min_rates_received_by_book=5

In [34]:
groupped_r_users=ratings_df_adj.groupby('User-ID')['Book-Rating'].count()
groupped_r_users[:5]

User-ID
8     14
9      3
10     1
12     1
14     4
Name: Book-Rating, dtype: int64

In [35]:
groupped_r_books=ratings_df_adj.groupby('Book-Title')['User-ID'].count()
groupped_r_books[:5]

Book-Title
 A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)     4
 Ask Lily (Young Women of Faith: Lily Series, Book 5)                                                          1
 Dark Justice                                                                                                  1
 Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth              10
 Final Fantasy Anthology: Official Strategy Guide (Brady Games)                                                4
Name: User-ID, dtype: int64

In [36]:
#select only those books which were rated more than min_rates_received_by_book
titles_with_acceptable_rates_count=list(groupped_r_books[groupped_r_books>min_rates_received_by_book].index)
titles_with_acceptable_rates_count[:5]

[' Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth',
 ' Good Wives: Image and Reality in the Lives of Women in Northern New England, 1650-1750',
 ' Goosebumps Monster Edition 1: Welcome to Dead House, Stay Out of the Basement, and Say Cheese and Die!',
 ' Murder of a Sleeping Beauty (Scumble River Mysteries (Paperback))',
 ' Q-Space (Star Trek The Next Generation, Book 47)']

In [37]:
#select only those users (user_id) who rated more than min_books_rated_by_user books
user_ids_with_acceptable_books_count_rated=list(groupped_r_users[groupped_r_users>min_books_rated_by_user].index)
user_ids_with_acceptable_books_count_rated[:5]

[8, 17, 99, 114, 242]

In [38]:
# filter rating-user data to have only books/users of interest (which have highest rates count and rated highest number of books respectively)
rating_input_df=ratings_df_adj[ratings_df_adj['Book-Title'].isin(titles_with_acceptable_rates_count)&ratings_df_adj['User-ID'].isin(user_ids_with_acceptable_books_count_rated)]
rating_input_df

Unnamed: 0,Book-Title,User-ID,Book-Rating
0,Clara Callan,8,5.0
1,Clara Callan,11400,6.0
2,Clara Callan,11676,8.0
3,Clara Callan,41385,6.0
4,Clara Callan,67544,8.0
...,...,...,...
840606,Hush Little Baby,245088,8.0
840711,The Twelve Days of Christmas,252676,10.0
840777,"The Two Towers (The Lord of the Rings, Part 2)",259901,10.0
840843,Dreamsnake,275318,10.0


### Prepare data for parsing by 'surprise algorithms' 

In [39]:
# Even having dataframe we need to create a 'reader' with the 'rating_scale' parameter to let know that our dataset has ratings from 1 to 10
# https://surprise.readthedocs.io/en/stable/getting_started.html#load-custom
reader = Reader(rating_scale=(1,10))

# and create respective surprise.dataset object, so that our data are in a proper format for the recommendation algorithms
data_surprise_o = Dataset.load_from_df(rating_input_df[['User-ID','Book-Title','Book-Rating']], reader)

data_surprise_o

<surprise.dataset.DatasetAutoFolds at 0x147e46e50>

#### Split our data into train and test in ratio 8:2
Use random_state = 42 so that we receive reproducable output

In [40]:
random_state = 42

In [41]:
# Option1:
# Reference: https://surprise.readthedocs.io/en/stable/getting_started.html?highlight=train_test_split#train-test-split-and-the-fit-method

train_set, test_set = train_test_split(data_surprise_o, test_size=0.2, shuffle=True, random_state=random_state)

train_data = list(train_set.all_ratings())

print(f'Size of train_set: {len(train_data)}')
print(f'Number of users in train_set: {train_set.n_users}')
print(f'Number of books in train_set: {train_set.n_items}')

train_set

Size of train_set: 440879
Number of users in train_set: 16053
Number of books in train_set: 28539


<surprise.trainset.Trainset at 0x297d05490>

In [42]:
train_set.all_items()

range(0, 28539)

In [43]:
# Option2:
# reference: https://datascience.stackexchange.com/questions/73583/how-to-train-test-split-and-cross-validate-in-surprise
# get already parsed data in its raw form
# check here for raw vs inner https://surprise.readthedocs.io/en/stable/FAQ.html#what-are-raw-and-inner-ids
train_dataset_obj = Dataset.load_from_df(rating_input_df[['User-ID','Book-Title','Book-Rating']], reader) # we kind of copying the original dataset
raw_data = train_dataset_obj.raw_ratings

# shuffle ratings to ensure that the data is randomized
np.random.seed(random_state)
np.random.shuffle(raw_data)
display (f'Size of raw_data: {len(raw_data)}')
display(f'Few elements of raw_data: {raw_data[:5]}')

#???????????????
# WHY DO WE HAVE 'None' here as 4-th element?
#???????????????


'Size of raw_data: 551099'

'Few elements of raw_data: [(153723, \'Kiss Me, Cowboy!\', 7.0, None), (163128, \'Caribbean\', 7.0, None), (264996, "The Death of Innocence : The Untold Story of JonBenet\'s Murder and How Its Exploitation Compromised the Pursuit of Truth", 6.0, None), (16916, \'Welcome to the World, Baby Girl!\', 7.0, None), (59172, \'Firefly\', 5.0, None)]'

In [44]:
# find the lenght of train set having it should be 80% of all data
train_lenght = int(0.8 * len(raw_data))
# split the whole set of raw data into train and test one
train_part_raw = raw_data[:train_lenght]
test_part_raw = raw_data[train_lenght:]

# create respective train object and test set
train_dataset_obj.raw_ratings=train_part_raw  #trick to receive a dataset object from raw data 'train_part_raw'
train_set = train_dataset_obj.build_full_trainset()
test_set = train_dataset_obj.construct_testset(test_part_raw)
print(f'Type of train_set: {type(train_set)}')
print(f'Type of test_set: {type(test_set)}')

Type of train_set: <class 'surprise.trainset.Trainset'>
Type of test_set: <class 'list'>


In [45]:
# check train and test data
train_data = list(train_set.all_ratings())

display('------Train Set------')
print(f'Size of train_set: {len(train_data)}')
print(f'Number of users in train_set: {train_set.n_users}')
print(f'Number of books in train_set: {train_set.n_items}')
display(f'Few elements of train_set (user, book, rating): {train_data[:15]}')
display('------Test Set------')
print(f'Size of test_set: {len(test_set)}')
display(f'Few elements of test_set (user, book, rating): {test_set[:15]}')

'------Train Set------'

Size of train_set: 440879
Number of users in train_set: 16053
Number of books in train_set: 28539


'Few elements of train_set (user, book, rating): [(0, 0, 7.0), (0, 895, 5.0), (0, 2181, 6.0), (0, 11031, 5.0), (0, 12833, 5.0), (0, 11873, 5.0), (0, 8019, 7.0), (0, 5643, 5.0), (0, 15765, 5.0), (0, 316, 7.0), (0, 23358, 5.0), (0, 24214, 5.0), (0, 3926, 7.0), (0, 4802, 7.0), (0, 23272, 6.0)]'

'------Test Set------'

Size of test_set: 110220


"Few elements of test_set (user, book, rating): [(52584, 'DEATH PERFECT MOTHER', 5.0), (220867, 'Driving over Lemons: An Optimist in Spain (Vintage Departures)', 6.0), (102967, 'Shadowfires', 6.0), (265313, 'Riptide', 6.0), (153662, 'The Majors (Brotherhood of War)', 5.0), (235173, 'The Fig Eater : A Novel', 6.0), (269352, 'A Clockwork Orange (Norton Paperback Fiction)', 10.0), (158295, 'Junie B. Jones Is a Graduation Girl', 5.0), (87707, 'Le Grand Secret', 7.0), (244428, 'These High, Green Hills (The Mitford Years)', 7.0), (20995, 'The Cobra Event', 10.0), (252392, 'One Flew Over the Cuckoos Nest', 10.0), (258185, 'Beautiful Dreamer', 5.0), (162639, 'The Miss America Family : A Novel', 5.0), (131046, 'Making over Mike (Regency Contemporary Romance)', 5.0)]"

In [46]:
# not really sure I understand the difference here between options (in general it's stated that the second one works if we want to cross validate)
# can we receive more input here?????

## Training: cross validate the model
Make preliminary estimation of error in predictions using RMSE metric for SVD() model and the whole set of data split into 5 folds


In [65]:
# reference: https://surprise.readthedocs.io/en/stable/getting_started.html?highlight=cross_validate#use-cross-validation-iterators

# define number of subsets the dataset will be partitioned for cross validations
folds_n=5

# define a cross-validation iterator
kf = KFold(n_splits=folds_n,random_state=42)

algo = [BaselineOnly(),SVD()]

for i in algo:
    mean_rmse=[]
    for trainset, testset in kf.split(data_surprise_o):

        # train and test algorithm.
        i.fit(trainset)
        predictions = i.test(testset)

        # Compute and print Root Mean Squared Error
        mean_rmse.append(accuracy.rmse(predictions))

    # Find an average rmse for all the folds
    mean_rmse=st.mean(mean_rmse)
    display(f'Average Root Mean Square Error (RMSE) for {i.__class__.__name__} is {round(mean_rmse,2)}')

Estimating biases using als...
RMSE: 1.2912
Estimating biases using als...
RMSE: 1.2903
Estimating biases using als...
RMSE: 1.2905
Estimating biases using als...
RMSE: 1.2875
Estimating biases using als...
RMSE: 1.2877


'Average Root Mean Square Error (RMSE) for BaselineOnly is 1.29'

RMSE: 1.3087
RMSE: 1.3047
RMSE: 1.3069
RMSE: 1.3029
RMSE: 1.3043


'Average Root Mean Square Error (RMSE) for SVD is 1.31'

## Training: identify best parameters

In [48]:
# # reference: https://medium.com/p/61c269402919
# # within the dictionary param_grid set ranges for parameters to try out, where 
# #  - key: parameter name, 
# #  - value:  list of parameter values to try
# param_grid = {'n_factors':[10,20,50,100,150], 'n_epochs':[10,20,30,50, 100], 'lr_all':[0.002,0.005,0.01,0.3],'reg_all':[0.02, 0.1, 0.2]}
# # n_factors - number of latent factors (Default is 100)
# # n_epochs - number of iterations (Default is 20)
# # lr_all - step size for the gradient descent optimization (learning rate, Default is 0.005)
# # reg_all -  regularization term for all parameter, used to prevent overfitting (Default is 0.02)

# # Select parameters for our SVD algorithm by cross validation and looking for rmse metric
# # GridSearchCV() calculates a score for each combination of hyperparameters on a k-fold cross validated dataset 
# # and returns the set of parameters that minimises the mean score across folds
# gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5)

# # train the model for all the combinations of parameters
# gs.fit(data_surprise_o)

# # select the set of parameters that produce lowest value for rmse metric
# params = gs.best_params['rmse']

# params


## Training: Build models

In [49]:
# Build SVD with default parameters
svd_default = SVD()

In [50]:
# Build SVD, using hyperparameters recevied in the result of hypertuning
# svd_best_parameters = SVD(n_factors=params['n_factors'], n_epochs=params['n_epochs'],lr_all=params['lr_all'], reg_all=params['reg_all'])

In [51]:
# TO DELETE IN CASE WE RUN PARAMETERS TUNING
svd_best_parameters=svd_default
# TO DELETE IN CASE WE RUN PARAMETERS TUNING

## Training: Fit models

In [52]:
# Train the models on trainset
svd_best_parameters_model = svd_best_parameters.fit(train_set)
svd_default_model = svd_default.fit(train_set)

# Recommendation generation

In [53]:
# Define the number of books to recommend by algorithm
recommendations_count=5

### Find list of books (titles) among which recommendations will be selected

In [54]:
# let us select dummy user   e.g. 25919, 
user_id=6251

# find all the books (titles) in ratings_df. 
# Note, this only include those books which both: 
# (1) were rated at least min_rates_received_by_book times by users, who rated at least min_books_rated_by_user
# and (2) we have titles for those books
# ??????????We may need to remove (1) condition and look for books, which were rated at least once??????????
all_titles=ratings_df_adj['Book-Title'].unique()
all_titles

# find the books (titles) that were rated and presumably read by a user
rated_titles=[i for i in ratings_df_adj.loc[ratings_df_adj['User-ID']==user_id,'Book-Title']]
rated_titles


# find the books (titles) that were not rated and presumably not read by a user
titles_input_to_recommend=[i for i in all_titles if i not in rated_titles]
titles_input_to_recommend[:5]

['Clara Callan',
 'Decision in Normandy',
 'Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It',
 "What If?: The World's Foremost Military Historians Imagine What Might Have Been",
 'PLEADING GUILTY']

In [55]:
rated_book_title_rate =[(row['Book-Title'],row['Book-Rating']) for i,row in ratings_df_adj[ratings_df_adj['User-ID']==user_id].iterrows()]
rated_book_title_rate

[('Wild Animus', 4.0),
 ('Timeline', 8.0),
 ('Isle of Dogs', 5.0),
 ('A Soldier of the Great War', 7.0),
 ('The Joy Luck Club', 8.0),
 ('The Tao of Pooh', 10.0),
 ('Seabiscuit', 7.0),
 ("Life's Little Instruction Book (Life's Little Instruction Books (Paperback))",
  7.0),
 ('Starship Troopers', 7.0),
 ("Corelli's Mandolin : A Novel", 7.0),
 ('The Five People You Meet in Heaven', 8.0),
 ('The Sum of All Fears', 7.0),
 ('Little Altars Everywhere: A Novel', 7.0),
 ('Body of Evidence (Kay Scarpetta Mysteries (Paperback))', 7.0),
 ('The Rescue', 7.0),
 ("The Kalahari Typing School for Men (No. 1 Ladies' Detective Agency)", 7.0),
 ('Wicked: The Life and Times of the Wicked Witch of the West', 7.0),
 ('Lies and the Lying Liars Who Tell Them: A Fair and Balanced Look at the Right',
  8.0),
 ("On the Bright Side, I'm Now the Girlfriend of a Sex God: Further Confessions of Georgia Nicolson",
  6.0),
 ('Eva Luna', 7.0),
 ('A Man in Full', 6.0),
 ("Vinegar Hill (Oprah's Book Club (Paperback))", 6

### Find predicted ratings for the user for all the books, which were not rated by him/her

In [56]:
# reference: https://surprise.readthedocs.io/en/stable/algobase.html?highlight=predict
# uid – (Raw) id of the user. 
# iid – (Raw) id of the item.
# verbose (bool) – Whether to print details of the prediction. Default is False.

predictions=[svd_default_model.predict(uid=user_id, iid=i) for i in titles_input_to_recommend]
predictions[:5]

[Prediction(uid=6251, iid='Clara Callan', r_ui=None, est=6.622579770799428, details={'was_impossible': False}),
 Prediction(uid=6251, iid='Decision in Normandy', r_ui=None, est=6.169392752492404, details={'was_impossible': False}),
 Prediction(uid=6251, iid='Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It', r_ui=None, est=5.550156902757992, details={'was_impossible': False}),
 Prediction(uid=6251, iid="What If?: The World's Foremost Military Historians Imagine What Might Have Been", r_ui=None, est=5.851956054006721, details={'was_impossible': False}),
 Prediction(uid=6251, iid='PLEADING GUILTY', r_ui=None, est=6.169392752492404, details={'was_impossible': False})]

In [57]:
# check of there are any predictions with 'was_impossible': True
predictions_impossible=[i for i in predictions if i.details['was_impossible']==True]
predictions_impossible

[]

In [58]:
# get ratings estimate for books by the user
ratings=[i.est for i in predictions]
ratings[:5]

[6.622579770799428,
 6.169392752492404,
 5.550156902757992,
 5.851956054006721,
 6.169392752492404]

In [59]:
# convert predicted estimates by the user for not read books into df
pred_dict={
    'Book_Title':titles_input_to_recommend,
    'Estimated_Rate':ratings}
predictions_book=pd.DataFrame(pred_dict).sort_values('Estimated_Rate',ascending = False)
predictions_book

Unnamed: 0,Book_Title,Estimated_Rate
27,To Kill a Mockingbird,9.338332
878,Dragonfly in Amber,9.014975
2643,The Hobbit : The Enchanting Prelude to The Lor...,8.994067
603,"The Two Towers (The Lord of the Rings, Part 2)",8.936750
3009,Charlotte's Web (Trophy Newbery),8.886899
...,...,...
16685,The Celestine Prophecy : AN EXPERIENTIAL GUIDE,4.507433
33304,Black Coffee Blues,4.488741
4894,The Tailor of Panama,4.473716
39219,The Burning Times: A Novel,4.452318


In [60]:
print(f'for a user {user_id}, who read the following book(s), with ratings shown:') 
display(rated_book_title_rate)
print('The recommendation is following:')
display(predictions_book.head(recommendations_count))

for a user 6251, who read the following book(s), with ratings shown:


[('Wild Animus', 4.0),
 ('Timeline', 8.0),
 ('Isle of Dogs', 5.0),
 ('A Soldier of the Great War', 7.0),
 ('The Joy Luck Club', 8.0),
 ('The Tao of Pooh', 10.0),
 ('Seabiscuit', 7.0),
 ("Life's Little Instruction Book (Life's Little Instruction Books (Paperback))",
  7.0),
 ('Starship Troopers', 7.0),
 ("Corelli's Mandolin : A Novel", 7.0),
 ('The Five People You Meet in Heaven', 8.0),
 ('The Sum of All Fears', 7.0),
 ('Little Altars Everywhere: A Novel', 7.0),
 ('Body of Evidence (Kay Scarpetta Mysteries (Paperback))', 7.0),
 ('The Rescue', 7.0),
 ("The Kalahari Typing School for Men (No. 1 Ladies' Detective Agency)", 7.0),
 ('Wicked: The Life and Times of the Wicked Witch of the West', 7.0),
 ('Lies and the Lying Liars Who Tell Them: A Fair and Balanced Look at the Right',
  8.0),
 ("On the Bright Side, I'm Now the Girlfriend of a Sex God: Further Confessions of Georgia Nicolson",
  6.0),
 ('Eva Luna', 7.0),
 ('A Man in Full', 6.0),
 ("Vinegar Hill (Oprah's Book Club (Paperback))", 6

The recommendation is following:


Unnamed: 0,Book_Title,Estimated_Rate
27,To Kill a Mockingbird,9.338332
878,Dragonfly in Amber,9.014975
2643,The Hobbit : The Enchanting Prelude to The Lor...,8.994067
603,"The Two Towers (The Lord of the Rings, Part 2)",8.93675
3009,Charlotte's Web (Trophy Newbery),8.886899


In [67]:
# prediction_full_info=pd.merge(predictions_book, books_df, left_on='Book_Title',right_on='Book-Title', how='left')
# prediction_full_info=prediction_full_info[['Book_Title','Book-Author','Year-Of-Publication','Publisher', 'Estimated_Rate']]
# prediction_full_info.head(20)

Unnamed: 0,Book_Title,Book-Author,Year-Of-Publication,Publisher,Estimated_Rate
0,To Kill a Mockingbird,Harper Lee,1988,Little Brown &amp; Company,9.338332
1,To Kill a Mockingbird,Harper Lee,1985,Warner Books,9.338332
2,To Kill a Mockingbird,Harper Lee,2002,Perennial Classics,9.338332
3,To Kill a Mockingbird,Harper Lee,1960,Harpercollins,9.338332
4,To Kill a Mockingbird,Harper Lee,1995,HarperCollins Publishers,9.338332
5,To Kill a Mockingbird,Harper Lee,1982,Not Avail,9.338332
6,To Kill a Mockingbird,Harper Lee,1997,Audio Partners,9.338332
7,To Kill a Mockingbird,Harper Lee,1991,Buccaneer Books Inc,9.338332
8,Dragonfly in Amber,DIANA GABALDON,1993,Dell,9.014975
9,Dragonfly in Amber,DIANA GABALDON,1992,Delacorte Press,9.014975


In [66]:
df1=rating_input_df[rating_input_df['Book-Title'].str.contains('Oprah', case=False, na=False)]
df1['Book-Rating'].value_counts()

Book-Rating
7.0     1702
6.0      913
8.0      829
9.0      264
10.0     260
5.0      214
4.0       29
3.0       28
2.0       13
1.0        8
Name: count, dtype: int64

# Check Accuracy

In [63]:
# Check the accuracy for both default vs tuned method
# display(f'Accuracy, based on Root Mean Square Error, of a Tuned Model: {accuracy.rmse(svd_best_parameters_model.test(test_set), verbose=False)}')
print(f'''Accuracy, based on Root Mean Square Error, of a Model with Default Parameters:
{accuracy.rmse(svd_default_model.test(test_set), verbose=False)}''')

Accuracy, based on Root Mean Square Error, of a Model with Default Parameters:
1.3041447719825463


In [64]:
print(f'''Accuracy, based on Mean Absolute Error, of a Model with Default Parameters:
{accuracy.mae(svd_default_model.test(test_set), verbose=False)}''')

Accuracy, based on Mean Absolute Error, of a Model with Default Parameters:
0.9522857634201722
