In [1]:
import os, io
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# Read the books dataset and explore it

In [11]:
bx_books = pd.read_csv('BX-Books.csv',
                       encoding = "ISO-8859-1")
bx_books.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [12]:
bx_books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271379 entries, 0 to 271378
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   isbn                 271379 non-null  object
 1   book_title           271379 non-null  object
 2   book_author          271378 non-null  object
 3   year_of_publication  271379 non-null  object
 4   publisher            271377 non-null  object
dtypes: object(5)
memory usage: 10.4+ MB


In [16]:
bx_books.isbn.nunique()

271379

**271379 unique isbn values**

# Clean up NaN values

In [19]:
bx_books.isnull().sum()

isbn                   0
book_title             0
book_author            1
year_of_publication    0
publisher              2
dtype: int64

## Drop the null values

In [20]:
bx_books = bx_books.dropna()
bx_books.isnull().sum()

isbn                   0
book_title             0
book_author            0
year_of_publication    0
publisher              0
dtype: int64

# Read the data where ratings are given by users

In [13]:
bx_book_ratings = pd.read_csv('BX-Book-Ratings.csv',
                              encoding = "ISO-8859-1")
bx_book_ratings.head()

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6


In [14]:
bx_book_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   user_id  1048575 non-null  int64 
 1   isbn     1048575 non-null  object
 2   rating   1048575 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 24.0+ MB


In [22]:
bx_book_ratings.isnull().sum()

user_id    0
isbn       0
rating     0
dtype: int64

**No missing values in the dataset**

# Take a quick look at the number of unique users and books

In [23]:
# Code for checking number of unique users and books. 
n_users = bx_book_ratings.user_id.nunique()
n_books = bx_book_ratings.isbn.nunique()

print('Num. of Users: '+ str(n_users))
print('Num of Books: '+str(n_books))

Num. of Users: 95513
Num of Books: 322102


# Convert ISBN variables to numeric numbers in the correct order

In [26]:
bx_books.isbn.value_counts().nlargest(10)

195153448     1
785339876     1
077108482X    1
207124310     1
439172543     1
590408518     1
902375512     1
1885222831    1
732909449     1
330337408     1
Name: isbn, dtype: int64

In [28]:
bx_book_ratings.isbn.value_counts().nlargest(10)

971880107     2264
316666343     1164
385504209      813
312195516      668
60928336       662
044023722X     595
679781587      578
142001740      555
067976402X     552
671027360      536
Name: isbn, dtype: int64

## Merge the bx_books and bx_books_rating datasets

In [29]:
final_df_books_ratings = pd.merge(bx_books,
                                  bx_book_ratings,
                                  on='isbn')
final_df_books_ratings.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,user_id,rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,2,0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,5
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11400,0
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676,8
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,41385,0


In [30]:
final_df_books_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 941145 entries, 0 to 941144
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   isbn                 941145 non-null  object
 1   book_title           941145 non-null  object
 2   book_author          941145 non-null  object
 3   year_of_publication  941145 non-null  object
 4   publisher            941145 non-null  object
 5   user_id              941145 non-null  int64 
 6   rating               941145 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 57.4+ MB


In [31]:
# Code for checking number of unique users and books. 
n_users = final_df_books_ratings.user_id.nunique()
n_books = final_df_books_ratings.isbn.nunique()

print('Num. of Users: '+ str(n_users))
print('Num of Books: '+str(n_books))

Num. of Users: 83644
Num of Books: 257829


## Convert ISBN variables to numeric numbers in the correct order¶

In [32]:
# Convert and print length of isbn list
isbn_list = final_df_books_ratings.isbn.unique()
print(" Length of isbn List:", len(isbn_list))

def get_isbn_numeric_id(isbn):
    #print ("  isbn is:" , isbn)
    itemindex = np.where(isbn_list==isbn)
    return itemindex[0][0]

 Length of isbn List: 257829


In [33]:
final_df_books_ratings['isbn_id'] = final_df_books_ratings['isbn'].apply(get_isbn_numeric_id)
final_df_books_ratings.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,user_id,rating,isbn_id
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,2,0,0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,5,1
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11400,0,1
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676,8,1
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,41385,0,1


In [34]:
final_df_books_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 941145 entries, 0 to 941144
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   isbn                 941145 non-null  object
 1   book_title           941145 non-null  object
 2   book_author          941145 non-null  object
 3   year_of_publication  941145 non-null  object
 4   publisher            941145 non-null  object
 5   user_id              941145 non-null  int64 
 6   rating               941145 non-null  int64 
 7   isbn_id              941145 non-null  int64 
dtypes: int64(3), object(5)
memory usage: 64.6+ MB


In [35]:
final_df_books_ratings.isnull().sum()

isbn                   0
book_title             0
book_author            0
year_of_publication    0
publisher              0
user_id                0
rating                 0
isbn_id                0
dtype: int64

**No missing values in the data**

# Convert the user_id variable to numeric numbers in the correct order

In [36]:
userid_list = final_df_books_ratings.user_id.unique()
print(" Length of user_id List:", len(userid_list))

def get_user_id_numeric_id(user_id):
    #print ("  isbn is:" , isbn)
    itemindex = np.where(userid_list == user_id)
    return itemindex[0][0]

 Length of user_id List: 83644


In [37]:
final_df_books_ratings['user_id_order'] = final_df_books_ratings['user_id'].apply(get_user_id_numeric_id)
final_df_books_ratings.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,user_id,rating,isbn_id,user_id_order
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,2,0,0,0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,5,1,1
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11400,0,1,2
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676,8,1,3
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,41385,0,1,4


In [38]:
final_df_books_ratings.user_id_order.value_counts()

3        11147
365       6456
547       5817
3960      5777
120       5646
         ...  
45140        1
45141        1
45143        1
45144        1
83643        1
Name: user_id_order, Length: 83644, dtype: int64

In [39]:
final_df_books_ratings.isbn_id.value_counts()

26        2264
408       1164
748        813
522        668
1105       662
          ... 
140173       1
140174       1
140175       1
140180       1
257828       1
Name: isbn_id, Length: 257829, dtype: int64

# Convert both user_id and ISBN to the ordered list, i.e., from 0...n-1

Completed above

# Re-index the columns to build a matrix

In [40]:
# Reindexing the columns
new_col_order = ['user_id_order', 
                 'isbn_id', 
                 'rating',
                 'book_title', 
                 'book_author',
                 'year_of_publication',
                 'publisher',
                 'isbn',
                 'user_id']

final_df_books_ratings_1 = final_df_books_ratings.reindex(columns= new_col_order)
final_df_books_ratings_1.head()

Unnamed: 0,user_id_order,isbn_id,rating,book_title,book_author,year_of_publication,publisher,isbn,user_id
0,0,0,0,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,195153448,2
1,1,1,5,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,2005018,8
2,2,1,0,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,2005018,11400
3,3,1,8,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,2005018,11676
4,4,1,0,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,2005018,41385


# Split your data into two sets (training and testing)

In [86]:
# Importing train_test_split model for splittig the data into train and test set.
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(final_df_books_ratings_1,
                                         test_size=0.30)

In [87]:
print("Shape of training data",train_data.shape)
print("Shape of test data",test_data.shape)

Shape of training data (658801, 9)
Shape of test data (282344, 9)


# Make predictions based on user and item variables

In [88]:
print("n_users",n_users)
print("n_books",n_books)

n_users 83644
n_books 257829


In [91]:
# Create user-book matrix for training 
train_data_matrix = np.zeros((n_users, n_books))

for line in train_data.itertuples():
    print("line", line)
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

In [92]:
print(train_data_matrix.shape)
train_data_matrix

(83644, 257829)


array([[5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [8., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [93]:
# Create user-book matrix for testing
test_data_matrix = np.zeros((n_users, n_books))

for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [None]:
# Importing pairwise_distances function
from sklearn.metrics.pairwise import pairwise_distances

user_similarity = pairwise_distances(train_data_matrix, 
                                     metric='cosine')

item_similarity = pairwise_distances(train_data_matrix.T, 
                                     metric='cosine')

In [None]:
print("user_similarity",user_similarity)
print("item_similarity",item_similarity)

In [None]:
# Defining custom function to make predictions
def predict(ratings, similarity, type='user'):
    
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        
        # We will use np.newaxis so that mean_user_rating has same format as ratings.
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    
    return pred

# Use RMSE to evaluate the predictions the predictions

In [None]:
# Importing RMSE function 
from sklearn.metrics import mean_squared_error
from math import sqrt

# Defining custom function to filter out elements with ground_truth.nonzero
def rmse(prediction, ground_truth):
    
    prediction = prediction[ground_truth.nonzero()].flatten() 
    
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    
    return sqrt(mean_squared_error(prediction, ground_truth))

In [None]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))