In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd 

import os

# Preprocess

In [3]:
ratings = pd.read_csv('/content/drive/MyDrive/COMP9900 Project/Machine Learning/Dataset/Books/ratings.csv')
books = pd.read_csv('/content/drive/MyDrive/COMP9900 Project/Machine Learning/Dataset/Books/books_cleaned.csv')

df=ratings.merge(books[['book_id','original_title']],how='left', on='book_id')
df.head()

Unnamed: 0,book_id,user_id,rating,original_title
0,1,314,5,Harry Potter and the Half-Blood Prince
1,1,439,3,Harry Potter and the Half-Blood Prince
2,1,588,5,Harry Potter and the Half-Blood Prince
3,1,1169,4,Harry Potter and the Half-Blood Prince
4,1,1185,4,Harry Potter and the Half-Blood Prince


In [4]:
# drop empty book id
df = df[df['book_id'].notna()]

# discard books with less than 5 ratings
rating_count = pd.DataFrame(df['book_id'].value_counts())
rare_books = rating_count[rating_count['book_id'] < 1].index
df_train = df[~df['book_id'].isin(rare_books)]
df_train = df_train.dropna()

# pivot the table (for each user, which books they have rated)
user_ratings_df = df_train.pivot_table(index=['user_id'], columns=['book_id'], values='rating')

In [5]:
# rating_count.iloc[[3, 41865, 5, 5901, 34, 960]]
rating_count.loc[[3, 5, 5901, 34, 960], :]

Unnamed: 0,book_id
3,100
5,100
5901,100
34,100
960,100


In [6]:
# user_ratings_df.to_csv('user_ratings.csv', index=False)
# # !cp /content/user_ratings.csv /content/drive/My\ Drive/COMP9900\ Project/Machine\ Learning/Dataset/Books

# Generate Recommendation

In [7]:
# user_ratings_df = pd.read_csv('/content/drive/MyDrive/COMP9900 Project/Machine Learning/Dataset/Books/user_ratings.csv')
# ratings = pd.read_csv('/content/drive/MyDrive/COMP9900 Project/Machine Learning/Dataset/Books/ratings.csv')

In [8]:
user_ratings_df.head()

book_id,1,2,3,5,6,10,11,13,21,24,...,9844,9864,9865,9912,9913,9914,9915,9943,9957,9998
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
11,,,,,,,,,,,...,,,,,,,,,,


In [9]:
from sklearn.impute import SimpleImputer
from sklearn.metrics import pairwise_distances
import math

def generate_recommendation_collaborative(user_ratings_df, ratings, user_id, rated_books):

  # finds all books that has been rated by the user
  if (len(user_ratings_df[user_ratings_df.index == user_id]) > 0):
    user_ratings_df = user_ratings_df[user_ratings_df.index != user_id]
  curr_user_ratings = {}
  for el in rated_books:
    if (el['book_id'] in user_ratings_df.columns):
      curr_user_ratings[el['book_id']] = el['rating']
  new_row = pd.Series(curr_user_ratings, name = user_id)
  user_ratings_df = user_ratings_df.append(new_row, ignore_index=False)
  user_df = user_ratings_df[user_ratings_df.index == user_id]
  rated_books_id = user_df.columns[user_df.notna().any()].tolist()

  # count how many similar books other users had rated
  books_read_df = user_ratings_df[rated_books_id]
  user_book_count = books_read_df.T.notnull().sum()
  user_book_count = user_book_count.reset_index()
  user_book_count.columns = ['user_id', 'book_count']

  # finds all users who had rated at least n% of the books the current user rated
  count_threshold = int(0.1 * len(rated_books_id))   # only users who had rated half of similar books
  if (count_threshold < 1):
    count_threshold = 1
  user_book_count = user_book_count[user_book_count['book_count'] >= count_threshold]['user_id']
  filtered_users = pd.concat([books_read_df[books_read_df.index.isin(user_book_count)], user_df[rated_books_id]]).drop_duplicates().T
  users_list = filtered_users.columns
  books_list = filtered_users.index
  imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
  ratings_no_nan = imp_mean.fit_transform(filtered_users)
  filtered_users = pd.DataFrame(data = ratings_no_nan, index = books_list, columns = users_list).astype('float64')

  # Take the rating df of the filtered users, then find the correlation between the users
  sim_matrix = 1-pairwise_distances(filtered_users.T, metric='cosine')
  user_dist = pd.DataFrame(data = sim_matrix, index = users_list, columns = users_list).astype('float64')
  # print(user_dist)
  user_dist = user_dist.unstack()
  user_dist = user_dist.sort_values().drop_duplicates()
  user_dist = pd.DataFrame(user_dist, columns=['dist'])
  user_dist.index.names = ['uid1', 'uid2']
  user_dist = user_dist.reset_index()
  # print(user_dist)

  # filter the users that are relatively similar to the current user
  similar_users = user_dist[(user_dist['uid1'] == user_id) & (user_dist['dist'] >= 0.50)][['uid2', 'dist']].reset_index(drop=True)
  similar_users = similar_users.sort_values(by='dist', ascending=False)
  similar_users.rename(columns={'uid2': 'user_id'}, inplace=True)
  similar_users['dist'] = (similar_users['dist']-similar_users['dist'].min())/(similar_users['dist'].max()-similar_users['dist'].min())
  print(similar_users)
  print(similar_users['dist'])

  # list all the books rated by similar users, and calculate its average score
  user_books = similar_users.merge(ratings[['user_id', 'book_id', 'rating']], how='inner')
  user_books = user_books[user_books['user_id'] != user_id]
  user_books['score'] = user_books['dist'] * user_books['rating']   #use exp for better distinguish
  user_books.groupby('book_id').agg({'score': 'mean'})
  
  recommendation_df = user_books.groupby('book_id').agg({'score': 'mean'})
  recommendation_df = recommendation_df.reset_index()

  recommendation_df = recommendation_df[recommendation_df['score'] > 3.0].sort_values(
      "score", ascending=False).head(100)
  print(recommendation_df)
  return recommendation_df['book_id'].tolist()

In [10]:
from pandas.core.dtypes.missing import isna
# user_id=123456989
user_id = 11

# rated_books = [
#   {
#     'book_id' : 3,
#     'rating'  : 5.0
#   },
#   {
#     'book_id' : 41865,
#     'rating'  : 1.0
#   },
#   {
#     'book_id' : 5	,
#     'rating'  : 5.0
#   },
#   {
#     'book_id' : 5907,
#     'rating'  : 5.0
#   },
#   {
#     'book_id' : 34,
#     'rating'  : 5.0
#   },
#   {
#     'book_id' : 960,
#     'rating'  : 3.0
#   }
# ]

# copy_df = user_ratings_df[user_ratings_df.index == 12381]
rated_books = []
for col in user_ratings_df:
  if (not(isna(user_ratings_df.at[user_id, col]))):
    rated_books.append(
        {
          'book_id' : col,
          'rating'  : user_ratings_df.at[user_id, col]
        }
    )

recommendations_id = generate_recommendation_collaborative(user_ratings_df, ratings, user_id, rated_books)
print(recommendations_id)

Empty DataFrame
Columns: [user_id, dist]
Index: []
Series([], Name: dist, dtype: float64)
Empty DataFrame
Columns: [book_id, score]
Index: []
[]


In [11]:
books = pd.read_csv('/content/drive/MyDrive/COMP9900 Project/Machine Learning/Dataset/Books/books.csv')

books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [12]:
rated_book_id = []
for review in rated_books:
  rated_book_id.append(review['book_id'])
print(rated_book_id)

books.loc[books['book_id'].isin(rated_book_id)]

[9717]


Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
322,323,9717,9717,4489585,274,571224385,9780571000000.0,"Milan Kundera, Michael Henry Heim",1984.0,Nesnesitelná lehkost bytí,...,205279,247980,10682,4894,12964,42199,84519,103404,https://images.gr-assets.com/books/1265401884m...,https://images.gr-assets.com/books/1265401884s...


In [13]:
books.loc[books['book_id'].isin(recommendations_id)]

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
