In [3]:
import pandas as pd
import numpy as np
import time
from sklearn.metrics import mean_squared_error, mean_absolute_error
from surprise import Dataset, Reader, accuracy
from surprise.prediction_algorithms import knns, matrix_factorization, slope_one, co_clustering
from surprise.model_selection import train_test_split, split, cross_validate, search

In [4]:
user_cols = ['user_id', 'location', 'age']
books_cols = ['isbn', 'book_title' ,'book_author','year_of_publication', 'publisher', 'img_s ', 'img_m', 'img_l']
ratings_cols = ['user_id', 'isbn', 'rating']

users = pd.read_csv('data/BX-Users.csv', sep=';', encoding='latin-1', names = user_cols, skiprows=1)
books = pd.read_csv('data/BX-Books.csv', sep=';', encoding='latin-1', names = books_cols, skiprows=1, on_bad_lines='skip')
ratings = pd.read_csv('data/BX-Book-Ratings.csv', sep=';', encoding='latin-1', names = ratings_cols, skiprows=1)

  books = pd.read_csv('data/BX-Books.csv', sep=';', encoding='latin-1', names = books_cols, skiprows=1, on_bad_lines='skip')


In [5]:
# Filtering books with at least 5 ratings
book_ids, book_counts = np.unique(ratings.isbn, return_counts=True)
books_threshold = book_ids[[np.where(book_counts >= 5)[0]]]
new_ratings = ratings[ratings.isbn.isin(books_threshold)]

  books_threshold = book_ids[[np.where(book_counts >= 5)[0]]]


In [6]:
# Filtering users with at least 5 ratings
user_ids, user_counts = np.unique(new_ratings.user_id, return_counts=True)
users_threshold = user_ids[[np.where(user_counts >= 5)[0]]]
new_ratings = new_ratings[new_ratings.user_id.isin(users_threshold)]

  users_threshold = user_ids[[np.where(user_counts >= 5)[0]]]


In [8]:
# Filtering books with at most 15 zero ratings
zero_ratings = new_ratings.iloc[np.where(new_ratings.rating == 0)[0], :]
zero_book_ids, zero_book_counts = np.unique(zero_ratings.isbn, return_counts=True)
new_book_ids = zero_book_ids[np.where(zero_book_counts <= 15)[0]]
new_rates = new_ratings[new_ratings.isbn.isin(new_book_ids)]

In [9]:
user_ids = np.random.choice(user_ids, size=6000, replace=False)
book_ids = np.unique(new_rates.isbn)
new_users = users[users.user_id.isin(user_ids)]
new_rates = new_rates[new_rates.user_id.isin(user_ids)]
new_books = books[books.isbn.isin(book_ids)]

In [30]:
new_rates.to_csv('Book_ratings.csv', index=False)
new_books.to_csv('Books.csv', index=False)
new_users.to_csv('Users.csv', index=False)

In [31]:
new_books.shape

(35443, 8)

In [7]:
print(f'Original length: {len(ratings)},\nAt least 5 ratings books & users length: {len(new_ratings)},\nAt least 5 ratings books & users and at most 15 zero ratings length: {len(new_rates)}')

Original length: 1149780,
At least 5 ratings books & users length: 607651,
At least 5 ratings books & users and at most 15 zero ratings length: 21661


In [8]:
np.unique(new_rates.rating, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([14298,    23,    43,    91,   139,   660,   687,  1436,  1833,
         1215,  1236]))

# Final CF Model

In [10]:
# Transforming the data into surprise library format
reader = Reader(rating_scale=(0,10))
data = Dataset.load_from_df(new_rates, reader)
train, test = train_test_split(data, test_size=.3, random_state=42)

In [11]:
# Training SVD model
start = time.time()
model = matrix_factorization.SVD()
model.fit(train)
predictions = model.test(test)
print(f'Execution time: {time.time() - start} seconds')
predictions = pd.DataFrame(predictions)
predictions.drop("details", inplace=True, axis=1)
predictions.columns = ['userId', 'movieId', 'actual', 'cf_predictions']
predictions.head()

Execution time: 0.8772296905517578 seconds


Unnamed: 0,userId,movieId,actual,cf_predictions
0,238120,345277600,0.0,0.569685
1,250407,618150730,7.0,2.364364
2,17247,671780689,3.0,3.885848
3,78834,345386663,0.0,1.563082
4,238120,316540838,0.0,0.696585


In [12]:
import recmetrics
print("MSE: ", recmetrics.mse(predictions.actual, predictions.cf_predictions))
print("RMSE: ", recmetrics.rmse(predictions.actual, predictions.cf_predictions))

MSE:  10.991525727272462
RMSE:  3.315346999526967


In [13]:
#create model (matrix of predicted values)
cf_model = predictions.pivot_table(index='userId', columns='movieId', values='cf_predictions').fillna(0)

In [15]:
def get_users_predictions(user_id, n, model):
    recommended_items = pd.DataFrame(model.loc[user_id])
    recommended_items.columns = ["predicted_rating"]
    recommended_items = recommended_items.sort_values('predicted_rating', ascending=False)    
    recommended_items = recommended_items.head(n)
    return recommended_items.index.tolist()

#get example prediction
get_users_predictions(78834, 10, cf_model)

['067102437X',
 '0590457837',
 '014027166X',
 '0786004509',
 '0786808772',
 '0743211189',
 '0552149020',
 '0553207830',
 '0380731398',
 '0375726012']

In [16]:
isbn_list = get_users_predictions(78834, 10, cf_model)

In [21]:
new_books[new_books.isbn.isin(isbn_list)].drop(['img_s ', 'img_m', 'img_l'], axis=1)

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
1949,0786808772,Summerland,Michael Chabon,2002,Miramax Kids
17944,0375726012,The Whore's Child: And Other Stories (Vintage ...,Richard Russo,2003,Vintage Books USA
18850,0380731398,Blu's Hanging,Lois-Ann Yamanaka,1998,William Morrow
22597,0786004509,All Fall Down,Zachary Fox,1997,Pinnacle Books
32452,0743211189,Electric God,Catherine Ryan Hyde,2000,Simon &amp; Schuster
57710,014027166X,Riven Rock,T. Coraghessan Boyle,1999,Penguin Books
69575,0590457837,"More Spaghetti, I Say! (Hello Reader, Level 2)",Rita Golden Gelman,1993,Scholastic
85809,0553207830,Holcroft Covenant,Robert Ludlum,1981,Bantam Doubleday Dell
112074,067102437X,JUST CHECKING,Emily Colas,1998,Atria


In [22]:
predictions = predictions.copy().groupby('userId', as_index=False)['movieId'].agg({'actual': (lambda x: list(set(x)))})
predictions = predictions.set_index("userId")
predictions.head(10)

Unnamed: 0_level_0,actual
userId,Unnamed: 1_level_1
408,[1561561428]
929,"[0445202718, 0880384514, 0345301870, 0440154731]"
1733,"[0373079621, 0743246918, 0373835426, 037370654..."
2288,"[1551666685, 0812589831]"
2326,"[0807281948, 0786905239, 0786918055]"
2855,[0375704906]
3167,"[0553574353, 0345290240, 0345257170, 067172175..."
3346,"[0345319656, 0836218256, 042507448X, 088677782..."
3437,[0380820870]
3814,[0684829975]


In [23]:
# make recommendations for all members in the test data
cf_recs = [] = []
for user in predictions.index:
    cf_predictions = get_users_predictions(user, 10, cf_model)
    cf_recs.append(cf_predictions)
        
predictions['cf_predictions'] = cf_recs
predictions.head()


Unnamed: 0_level_0,actual,cf_predictions
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
408,[1561561428],"[1561561428, 0002239213, 0671024809, 067103656..."
929,"[0445202718, 0880384514, 0345301870, 0440154731]","[0345301870, 0445202718, 0880384514, 044015473..."
1733,"[0373079621, 0743246918, 0373835426, 037370654...","[1551666030, 0373168438, 0886777755, 037370654..."
2288,"[1551666685, 0812589831]","[1551666685, 0812589831, 0002239213, 067103494..."
2326,"[0807281948, 0786905239, 0786918055]","[0786905239, 0786918055, 0807281948, 067103946..."


# Popularity Based Recommender System

In [18]:
users = new_users
books = new_books
ratings = new_ratings

# Cleaning the dataset 
books = books.drop(['img_s ', 'img_m', 'img_l'], axis=1)
books.head()

# Merge the ratings dataset with the books dataset so that we can get access to the names and authors
combined_df = pd.merge(ratings, books, on='isbn')
all_cols = [col for col in combined_df.columns]
all_cols.remove('rating')
all_cols.append('rating') # So that we have ratings has the last column in the dataframe
combined_df = combined_df[all_cols]
combined_df.head()

Unnamed: 0,user_id,isbn,book_title,book_author,year_of_publication,publisher,rating
0,276746,786013990,At the Edge,David Dun,2002,Pinnacle Books,0
1,98391,786013990,At the Edge,David Dun,2002,Pinnacle Books,8
2,152835,786013990,At the Edge,David Dun,2002,Pinnacle Books,7
3,159033,786013990,At the Edge,David Dun,2002,Pinnacle Books,0
4,265115,786013990,At the Edge,David Dun,2002,Pinnacle Books,0


In [19]:
# Calculate the total number of ratings, average ratings, number of ratings for each book
filtered_ratings = combined_df.copy()
grouped_data = filtered_ratings.groupby(['isbn']).agg({'user_id': 'count', 'rating':'sum'}).reset_index()
grouped_data.rename(columns = {'rating': 'total_ratings'},inplace=True)
avg_ratings = filtered_ratings.groupby('isbn').agg({'rating':'mean'}).reset_index()
avg_ratings.rename(columns = {'rating': 'avg_rating'},inplace=True)
merged_data = pd.merge(grouped_data, avg_ratings, on='isbn')
merged_data.head()

Unnamed: 0,isbn,user_id,total_ratings,avg_rating
0,2005018,12,52,4.333333
1,2190915,4,10,2.5
2,2239213,5,0,0.0
3,2240114,4,21,5.25
4,2243954,5,0,0.0


In [20]:
# Calculating the parameters for the weighted average
C = merged_data['avg_rating'].mean()
m = merged_data['total_ratings'].quantile(0.70) # m is the minimum number of ratings required to be listed in the list;
print(f'C = {C}\nm = {m}\n')

# Filter out all qualified books into a new DataFrame
min_count_books = merged_data.copy().loc[merged_data['total_ratings'] > m]
print(f'The shape of the new dataset = {min_count_books.shape}')

C = 2.4867456428440153
m = 26.0

The shape of the new dataset = (10156, 4)


In [21]:
# Function that computes the weighted rating of each book
def weighted_rating(df, m=m, C=C):
    v = df['total_ratings']
    R = df['avg_rating']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

# Define a new feature 'score' and calculate its value with `weighted_rating()`
min_count_books['score'] = min_count_books.apply(weighted_rating, axis=1)
top10_books = min_count_books.merge(books, on='isbn', how='left').sort_values('score', ascending=False).head(10)
all_cols = top10_books.columns.to_list()
all_cols = all_cols[0:2] + all_cols[5:] + all_cols[2:5] # Changing the order of columns 
top10_books[all_cols]

Unnamed: 0,isbn,user_id,book_title,book_author,year_of_publication,publisher,total_ratings,avg_rating,score
10013,3522128001,11,Die unendliche Geschichte: Von A bis Z,Michael Ende,1979,Thienemann,94,8.545455,7.232734
10023,3551551936,10,Harry Potter Und Der Feuerkelch,Joanne K. Rowling,1999,Carlsen Verlag GmbH,86,8.6,7.180852
10021,3551551685,11,Harry Potter und die Kammer des Schreckens,Joanne K. Rowling,2000,Carlsen Verlag GmbH,92,8.363636,7.068728
197,60256672,28,Where the Sidewalk Ends : Poems and Drawings,Shel Silverstein,1974,HarperCollins,210,7.5,6.947692
9468,1577780728,9,Jesus Freaks: DC Talk and The Voice of the Mar...,DC Talk,1999,Bethany House Publishers,76,8.444444,6.925815
3205,385324138,10,The Outlandish Companion,DIANA GABALDON,1999,Delacorte Press,83,8.3,6.913352
10114,8478886451,14,Harry Potter y el cÃ¡liz de fuego,J. K. Rowling,2001,Lectorum Publications,111,7.928571,6.895816
7255,684842319,8,Our Bodies Ourselves For The New Century (A To...,Boston Women's Health Book Collective,1998,Touchstone,68,8.5,6.836759
8518,836213122,15,There's Treasure Everywhere--A Calvin and Hobb...,Bill Watterson,1996,Andrews McMeel Publishing,116,7.733333,6.772691
3475,394823370,11,The Lorax,Dr. Seuss,1971,Random House Children's Books,88,8.0,6.742591


In [22]:
#make recommendations for all members in the test data
popularity_recs = top10_books.isbn.tolist()

pop_recs = []
for user in predictions.index:
    pop_predictions = popularity_recs
    pop_recs.append(pop_predictions)
        
predictions['pop_predictions'] = pop_recs
predictions.head()


Unnamed: 0_level_0,actual,cf_predictions,pop_predictions
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
243,"[0345433173, 0349109117, 0385316895]","[0349109117, 0385316895, 0345433173, 000000000...","[3522128001, 3551551936, 3551551685, 006025667..."
643,"[2253001457, 2253049417, 0140366857]","[0140366857, 2253049417, 2253001457, 000000000...","[3522128001, 3551551936, 3551551685, 006025667..."
651,"[0679801146, 0553574175, 0553579673, 0345395379]","[0345395379, 0679801146, 0553579673, 055357417...","[3522128001, 3551551936, 3551551685, 006025667..."
709,"[0590129015, 0064405052]","[0590129015, 0064405052, 000000000, 0671737775...","[3522128001, 3551551936, 3551551685, 006025667..."
850,"[3442248922, 3518371002]","[3518371002, 3442248922, 000000000, 067173590X...","[3522128001, 3551551936, 3551551685, 006025667..."


# Content Based Recommender System

In [30]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
books.dropna(inplace=True)
books.reset_index(inplace=True)

In [37]:
# Creating the model input feature (author + publisher + year of publication)
books['features'] = ''
for row in range(len(books)):
    author = str(books.book_author[row])
    publisher = str(books.publisher[row])
    year = str(books.year_of_publication[row])
    
    feature = ' '.join([author, publisher, year])
    split = re.sub(r'[^\w]', ' ', feature).split()
    feature = ' '.join([i for i in split if len(i) > 1])
    books.features.at[row] = feature.lower()

In [38]:
books.head()

Unnamed: 0,index,isbn,book_title,book_author,year_of_publication,publisher,features
0,1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,richard bruce wright harperflamingo canada 2001
1,3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,gina bari kolata farrar straus giroux 1999
2,5,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,amy tan putnam pub group 1991
3,6,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,robert cowley berkley publishing group 2000
4,10,771074670,Nights Below Station Street,David Adams Richards,1988,Emblem Editions,david adams richards emblem editions 1988


In [39]:
books.features

0        richard bruce wright harperflamingo canada 2001
1             gina bari kolata farrar straus giroux 1999
2                          amy tan putnam pub group 1991
3            robert cowley berkley publishing group 2000
4              david adams richards emblem editions 1988
                              ...                       
35438                 walt disney random house childrens
35439            raymond chandler vintage books usa 1992
35440                        penny jordan harlequin 1995
35441                   lucy maud montgomery bantam 1983
35442                      cathy williams harlequin 1999
Name: features, Length: 35443, dtype: object

In [None]:
# Transforming our text feature into a vector feature
tf_idf = TfidfVectorizer(stop_words='english')
tf_idf_matrix = tf_idf.fit_transform(books.features)
# Computing the cosine similarity matrix
cosine_similarity_matrix = cosine_similarity(tf_idf_matrix, tf_idf_matrix)

In [None]:
# Book recommender based on similarity score
def recommend_books(book_title, cosine_similarity_matrix, n_recommendations):
    index = books[books['book_title']==book_title].index.values[0]
    similarity_scores = list(enumerate(cosine_similarity_matrix[index]))
    similarity_scores_sorted = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    recommendations_indices = [t[0] for t in similarity_scores_sorted[1:(n_recommendations+1)]]

    return similarity_scores_sorted, books.book_title.iloc[recommendations_indices]

def recommender(user_id, cosine_similarity_matrix, n_recommendations):
    scores = list()
    names = list()
    user_books = ratings[ratings.user_id == user_id].isbn
    book_names = books[books.isbn.isin(user_books)].book_title
    for book in book_names:
        similarity_scores, books_recommended = recommend_books(book, cosine_similarity_matrix, n_recommendations)
        scores.extend(similarity_scores)
        names.extend(books_recommended.to_list())
    ids, sim_score = zip(*scores)
    results = dict(zip(names, sim_score))
    results = pd.DataFrame(results, index=range(len(results))).T[0]
    results.columns = ['Score']
    final = results.sort_values(ascending=False).head(n_recommendations)
    
    return final

In [None]:
random_user_idx = np.random.choice(range(len(users)))
recommendations = recommender(users.user_id[random_user_idx], cosine_similarity_matrix, 10)

In [None]:
ratings[ratings.user_id == users.user_id[random_user_idx]]

In [None]:
books[books.book_title.isin(list(recommendations.index))]

# Evaluating the model

In [26]:
actual = predictions.actual.values.tolist()
cf_predictions = predictions.cf_predictions.values.tolist()
pop_predictions = predictions.pop_predictions.values.tolist()


In [24]:
import recmetrics
pop_mark = []
for K in np.arange(1, 11):
    pop_mark.extend([recmetrics.mark(actual, pop_predictions, k=K)])
pop_mark

[0.0,
 0.0,
 0.0,
 2.4331546070676505e-05,
 2.542731639721606e-05,
 2.542731639721606e-05,
 4.884140231130197e-05,
 4.884140231130197e-05,
 6.318534026635104e-05,
 6.318534026635104e-05]

In [27]:
cf_mark = []
for K in np.arange(1, 11):
    cf_mark.extend([recmetrics.mark(actual, cf_predictions, k=K)])
cf_mark

[0.5008503663066604,
 0.7057823652663819,
 0.7989796703485523,
 0.8442177917572535,
 0.872619178472077,
 0.892245054982819,
 0.9060886185683908,
 0.9173082754484233,
 0.926232013961109,
 0.9337952082561074]