# Books Recommendations using SVD vs SVD Func

In [1]:
# !pip install scikit-surprise

# Import Dependencies

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from surprise import accuracy, Dataset, Reader, SVD, BaselineOnly, PredictionImpossible
from surprise.model_selection import train_test_split, cross_validate, KFold, GridSearchCV
import random
import numpy as np
import statistics as st
from scipy.sparse.linalg import svds
import math 

In [4]:
# having min_books_rated_by_user and min_rates_received_by_book to define what we treat as statistically significant
# remove those records from ratings_df, which have those books with less than min_rates_received_by_book reviews and those users who have left less than min_books_rated_by_user reviews
def leave_stat_sign_data(ratings_df, min_books_rated_by_user=5,min_rates_received_by_book=5):
    #select only those books which were rated more than min_rates_received_by_book
    groupped_r_books=ratings_df.groupby('Book-Title')['User-ID'].count()
    titles_with_acceptable_rates_count=list(groupped_r_books[groupped_r_books>min_rates_received_by_book].index)
    #select only those users (user_id) who rated more than min_books_rated_by_user books
    groupped_r_users=ratings_df.groupby('User-ID')['Book-Rating'].count()
    user_ids_with_acceptable_books_count_rated=list(groupped_r_users[groupped_r_users>min_books_rated_by_user].index)
    # filter rating-user data to have only books/users of interest (which have highest rates count and rated highest number of books respectively)
    rating_final_df=ratings_df[ratings_df['Book-Title'].isin(titles_with_acceptable_rates_count)&ratings_df['User-ID'].isin(user_ids_with_acceptable_books_count_rated)]
    return rating_final_df

In [5]:
# define formula for calculation of rmse, having actual and predicted rates lists
def rmse(actual_rates, predicted_rates):
    error = actual_rates - predicted_rates
    mean_square_error=sum([i*i for i in error])/len(error)
    return math.sqrt(mean_square_error)

In [6]:
# split data in such a ways that we have data for all the users in both train and test sets - to be used by pure SVD
def train_test_split_SVD (rating_final_df, test_ratio = 0.1):
    user_list = rating_final_df['User-ID'].unique() #list of all users
    test_set = pd.DataFrame(columns=rating_final_df.columns) # reserve df for a train set
    train_set = pd.DataFrame(columns=rating_final_df.columns) # reserve df for a test set
    for user in user_list:
        # for each user take their book/rating data 
        user_data_all = rating_final_df[rating_final_df['User-ID'] == user]
        n = len(user_data_all)
        user_data_all = user_data_all.reset_index()
        user_data_all.drop('index', axis=1, inplace=True)
        # split user data into train and test 
        test_size = int(test_ratio*n)

        # randomly select roughtly 10% of rows for test set per user using random_state=1, so that result is reproducible
        test = user_data_all.sample(n=test_size, random_state=1)  

        # rows not selected for test set, assigned to train one
        train = user_data_all.drop(test.index)

        test_set = pd.concat([test_set, test], ignore_index=True)
        train_set = pd.concat([train_set, train], ignore_index=True)
    return (train_set, test_set)

In [38]:
# test the performance over a different number of laatent factors k_set
def test_rmse_SVD(train_set, test_set, k_set = [8, 20, 50, 100, 150]):
    rmse_scores={}
    over_mean=train_set['Book-Rating'].mean()
    for l_f in k_set: 
        # Build the prediction matrix using the train_set
        all_predictions_df = build_prediction_matrix_SVD(train_set, l_f)
        
#          / experiment if time allows
#         merged_test_set = test_set.set_index(['User-ID', 'Book-Title'])
#         predicted_ratings = all_predictions_df.stack().reindex(merged_test_set.index).fillna(over_mean)

#         # Calculate RMSE for the current number of features
#         current_rmse = rmse(merged_test_set['Book-Rating'], predicted_ratings.values)
#         / experiment if time allows

        # reserve a list for predicted ratings
        pred = []
        for i, row in test_set.iterrows():
            user_id = row['User-ID']
            book_title = row['Book-Title']  
            if user_id in all_predictions_df.index.values and book_title in all_predictions_df.columns:
                pred_rating = all_predictions_df.loc[user_id, book_title]
            else:
                # If the book or user is not in the train_set, use a default prediction
                # which is the average of all ratings in the training set as a simple approach
                pred_rating = over_mean   
            pred.append(pred_rating)

        # Calculate RMSE for the current number of features
        current_rmse = rmse(test_set['Book-Rating'], pred)
        rmse_scores[l_f]=current_rmse
    return rmse_scores

In [9]:
# building pure SVD model
def build_prediction_matrix_SVD(rating_input_df, latent_factors=70):
# SVD finds a hidden feature space where the users and books they like have feature vectors that are closely aligned.
# Build the model based on decomposing 'user-rates' matrix df_books_ratigs_user into 3 matrices U×sigma×Vt:
# U matrix - represents the feature vectors corresponding to the users in the hidden feature space 
# sigma - represents
# Vt matrix - represents the feature vectors corresponding to the books in the hidden feature space 
    # Pivot to obtain a matrix that stores original ratings given by users for books and fill sparse values with 0-s
    df_books_ratigs_user=rating_input_df.pivot_table(index='User-ID', columns='Book-Title', values='Book-Rating').fillna(0)
    # Normilize the data, using mean normalization.
    data_original = df_books_ratigs_user.to_numpy() # vectorize the data
    ratings_mean = np.mean(data_original, axis = 1) # find a mean per each vector (user)
    normalized_data = data_original - ratings_mean.reshape(-1, 1) #subtract mean for each user from their ratings, which centers the ratings around 0 for each user
    # Decompose the normilized matrix into 3, with k = latent_factors (70 default) largest singular values in sigma
    U, sigma, Vt = svds(normalized_data, k = latent_factors)
    # Convert vector to a diagonal matrix
    sigma = np.diag(sigma)
    # Compose matrix with predictions, reversing data normalization
    # e.g. having two vectors from the same feature space if we want to find if they are similar we need to find a Dot product.
    # and to find out that user i likes book j, we would take the dot product of the i-th entry in U and j-th entry in Vt.
    all_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + ratings_mean.reshape(-1, 1)
    # convert numpy array into dataframe
    all_predictions_df = pd.DataFrame(all_predicted_ratings, columns=df_books_ratigs_user.columns, index=df_books_ratigs_user.index)
    return all_predictions_df

In [10]:
## Find books prediction for a specific user and recommend top recommendations_count books
def recommend_books_for_user (user_id, ratings_df, books_df, recommendations_count=5, model=None, all_predictions=None):
# find those titles that we consider for predictions (e.g. not read by a user)
    # find the books (titles) that were rated and presumably read by a user
    rated_titles=[i for i in ratings_df.loc[ratings_df['User-ID']==user_id,'Book-Title']]
    # find all the titles within the matrix
    if model is None:
        all_titles=all_predictions.columns
    else:
        all_titles=ratings_df['Book-Title'].unique()
    # separate those titles that were not read
    titles_input_to_recommend=[i for i in all_titles if i not in rated_titles]
    
# find predictions for a user
    if model is None:
        user_predictions_all=all_predictions.loc[user_id]
        # sort predictions and select top recommendations_count
        user_predictions_all=pd.DataFrame(user_predictions_all)
        user_recommendation= user_predictions_all.loc[titles_input_to_recommend].sort_values(by=user_id, ascending=False)
        top_recommendations=user_recommendation[:recommendations_count].rename(columns={user_id:'estimated rate'})
    else:
        # find predictions for a user
        # reference: https://surprise.readthedocs.io/en/stable/algobase.html?highlight=predict
        # uid – (Raw) id of the user. 
        # iid – (Raw) id of the item.
        # verbose (bool) – Whether to print details of the prediction. Default is False.
        predictions=[model.predict(uid=user_id, iid=i) for i in titles_input_to_recommend]
        # get ratings estimate for books by the user
        ratings=[i.est for i in predictions]
        # convert predicted estimates by the user for not read books into df
        pred_dict={
            'Book-Title':titles_input_to_recommend,
            'Estimated_Rate':ratings}
        predictions_book=pd.DataFrame(pred_dict).sort_values('Estimated_Rate',ascending = False)
        top_recommendations=predictions_book.head(recommendations_count)
        
# populate books with full info, selecting those books with the most recent year of publication
    recommendations_full_info=pd.merge(top_recommendations, books_df, left_on='Book-Title',right_on='Book-Title', how='left')
    dict_years=dict(recommendations_full_info.groupby('Book-Title')['Year-Of-Publication'].max())
    for i, row in recommendations_full_info.iterrows():
        if row['Year-Of-Publication']!=dict_years[row['Book-Title']]:
            recommendations_full_info.loc[i,'Year-Of-Publication']=0
    recommendations_full_info=recommendations_full_info[recommendations_full_info['Year-Of-Publication'] != 0]
    recommendations_full_info=recommendations_full_info.drop_duplicates(subset=['Book-Title'])
    return recommendations_full_info

# Explore the data and Prepare for Train 

In [13]:
#Creating dataframes from csv files to read the data
books_df_original = pd.read_csv('./Resources/Books.csv')
ratings_df_original = pd.read_csv('./Resources/Ratings.csv')

  books_df_original = pd.read_csv('./Resources/Books.csv')


In [14]:
# remove duplicated books records if any by looking at ISBN
books_df=books_df_original.copy()
books_df=books_df.drop_duplicates(subset=['ISBN'])

In [15]:
duplicated_titles=books_df[books_df.duplicated(subset=['Book-Title'],keep=False)].sort_values(by='Book-Title')
duplicated_titles.head(2)
# so far we leave those titles as is to not lost ratings

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
75637,1565920465,!%@ (A Nutshell handbook),Donnalyn Frey,1994,O'Reilly,http://images.amazon.com/images/P/1565920465.0...,http://images.amazon.com/images/P/1565920465.0...,http://images.amazon.com/images/P/1565920465.0...
156341,1565920317,!%@ (A Nutshell handbook),Donnalyn Frey,1993,O'Reilly,http://images.amazon.com/images/P/1565920317.0...,http://images.amazon.com/images/P/1565920317.0...,http://images.amazon.com/images/P/1565920317.0...


In [16]:
# update the datatype of a 'Year-Of-Publication' field to numeric one
books_df['Year-Of-Publication']=pd.to_numeric(books_df['Year-Of-Publication'],errors='coerce')
# Filter out data with no publication year
books_df = books_df[books_df['Year-Of-Publication'] > 0]
books_df['Year-Of-Publication']=books_df['Year-Of-Publication'].astype(int)
# and check the result
books_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 266739 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 266739 non-null  object
 1   Book-Title           266739 non-null  object
 2   Book-Author          266737 non-null  object
 3   Year-Of-Publication  266739 non-null  int64 
 4   Publisher            266737 non-null  object
 5   Image-URL-S          266739 non-null  object
 6   Image-URL-M          266739 non-null  object
 7   Image-URL-L          266739 non-null  object
dtypes: int64(1), object(7)
memory usage: 18.3+ MB


In [17]:
ratings_df=ratings_df_original.copy()
# update the datatype of a 'Book-Rating' field to numeric one
ratings_df['Book-Rating']=pd.to_numeric(ratings_df['Book-Rating'],errors='coerce')
# and check the result
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


### Change ISBN with Titles
Merge ratings with books data in order to change isbn with title and leave only those ratings data for which we have title info


In [18]:
ratings_df=pd.merge(books_df,ratings_df,on='ISBN', how = 'inner')
ratings_df.head(2)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,2,0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8,5


In [19]:
# delete those rows with no book titles or no book rates if any (even though there should no be such as we used inner join above)
ratings_df=ratings_df.dropna()
ratings_df_all_cols=ratings_df.copy()
# delete those columns, which we are not going to use within machine learning algos
ratings_df=ratings_df.drop(['ISBN','Book-Author','Year-Of-Publication','Publisher','Image-URL-S','Image-URL-M','Image-URL-L'], axis=1)
ratings_df.head(2)

Unnamed: 0,Book-Title,User-ID,Book-Rating
0,Classical Mythology,2,0
1,Clara Callan,8,5


In [20]:
# Filter out data with zero ratings
ratings_df_no_zeros = ratings_df[ratings_df['Book-Rating'] != 0]

In [24]:
# Use avg rate per duplicates set for three: ratings_df, ratings_df_adj and ratings_df_no_zeros
ratings_df_no_zeros=ratings_df_no_zeros.groupby(['Book-Title','User-ID'])['Book-Rating'].mean().reset_index()
ratings_df_no_zeros.head(2)

Unnamed: 0,Book-Title,User-ID,Book-Rating
0,A Light in the Storm: The Civil War Diary of ...,96448,9.0
1,"Ask Lily (Young Women of Faith: Lily Series, ...",269557,8.0


In [25]:
# leave only statistically significant data for both ratings_df_adj and ratings_df_no_zeros
rating_final_no_zeros_df=leave_stat_sign_data(ratings_df_no_zeros)

# Pure SVD model

### Train/Test split

In [26]:
train_set_no_zeros, test_set_no_zeros = train_test_split_SVD(rating_final_no_zeros_df)

##  Generation of a matrix with predicted rates per book

In [27]:
all_predictions_no_zeros=build_prediction_matrix_SVD(rating_final_no_zeros_df)

In [28]:
all_predictions_no_zeros

Book-Title,"Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth",!Yo!,'Salem's Lot,01-01-00: The Novel of the Millennium,10 Lb. Penalty,"10,000 dreams interpreted: A dictionary of dreams",100 Best-Loved Poems (Dover Thrift Editions),100 Selected Poems by E. E. Cummings,1001 Things Everyone Should Know About Science,1001 Ways to Be Romantic,...,"\O\"" Is for Outlaw""","\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""","\The Happy Prince\"" and Other Stories (Penguin Popular Classics)""","\What Do You Care What Other People Think?\"": Further Adventures of a Curious Character""",e,iI Paradiso Degli Orchi,murder@maggody.com : An Arly Hanks Mystery (Arly Hanks Mysteries (Paperback)),one hundred years of solitude,stardust,why I'm like this : True Stories
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.000245,0.000219,0.000056,0.002000,-0.000336,0.000394,0.000052,0.000047,-0.000200,0.000140,...,-0.001038,0.001280,0.000396,0.000330,-0.000644,0.000353,0.000510,0.000039,0.002024,0.000403
99,0.002913,0.003248,0.001333,-0.001974,0.002214,-0.000002,0.004381,0.001519,0.005974,0.004444,...,-0.003512,0.003680,0.002735,0.002110,-0.006801,0.003581,0.006024,-0.000692,0.003367,0.005465
114,0.008808,0.000331,-0.035543,0.056729,-0.069261,-0.004213,-0.017281,-0.038169,-0.009262,-0.017620,...,0.026166,-0.008591,0.045725,0.002359,0.084810,0.001898,-0.011920,-0.037713,-0.021000,-0.015035
242,-0.008163,0.002806,0.098813,-0.010835,-0.032621,-0.001764,0.027848,0.026195,-0.014049,-0.000306,...,-0.094758,0.051785,-0.000270,-0.002696,-0.036294,0.000138,0.000584,0.000879,0.008610,-0.009145
243,-0.014596,0.012220,-0.086185,0.010529,-0.020236,0.017999,-0.026686,0.084581,0.043281,0.061944,...,0.477174,0.006837,-0.021648,-0.008918,-0.029811,-0.009990,0.006611,-0.012845,-0.077676,0.009178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278633,0.016941,0.003876,-0.037586,-0.017051,-0.066762,0.008461,-0.022465,0.025970,0.041588,0.060244,...,-0.240852,-0.059897,-0.012479,-0.003743,0.042439,0.004137,0.008666,0.017967,-0.154048,0.055585
278694,0.002407,0.002294,-0.002261,0.008578,0.001928,0.002335,0.000164,0.003819,0.003144,0.003134,...,0.047935,0.003106,0.002439,0.002280,0.001762,0.002784,0.001542,0.002566,0.005401,0.001730
278843,-0.011425,-0.027014,0.051353,-0.000474,0.025538,0.002019,-0.041905,0.053412,0.000979,0.002522,...,0.107071,0.039368,-0.010670,-0.004503,0.190347,0.002539,0.005742,0.106894,0.067311,-0.024405
278851,0.004294,-0.000494,0.027033,0.007322,-0.000608,0.000668,0.000108,0.000076,-0.006946,0.001850,...,0.053799,-0.001106,0.000382,0.001553,-0.003663,0.000477,-0.000195,0.001884,0.002360,0.003435


## Testing Accuracy

In [None]:
rmse_scores_no_zeros = test_rmse_SVD(train_set_no_zeros, test_set_no_zeros)
print('Accuracy for data with no 0 ratings (number of latent factors, RMSE):')
display(rmse_scores_no_zeros)

# Check predictions for specific user

In [None]:
# find original user ratings
u_data=ratings_df_all_cols[ratings_df_all_cols['User-ID']==252676].sort_values(by='Book-Rating', ascending=False)
u_data[['User-ID','Book-Title','Book-Rating','Book-Author','Year-Of-Publication','Publisher']]

In [None]:
# preview recommendations by pure SVD (mean)
recommend_books_for_user(252676, ratings_df, books_df, all_predictions=all_predictions_no_zeros)