In [1]:
import gzip
import json
import re
import os
import sys
import numpy as np
import pandas as pd
from scipy.spatial import distance

In [2]:
# Mount Google Drive at /content/drive so the project files can be read.
# NOTE: google.colab is a Colab-runtime module; this cell will not run elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Project folder, given relative to the current working directory
# (the output below shows it resolving under /content).
path = 'drive/MyDrive/CSE 6240/Project'
# IPython magic: make the project folder the working directory so the
# relative file names used in later cells resolve against it.
%cd $path

/content/drive/MyDrive/CSE 6240/Project


In [4]:
# Directory (relative to the working directory) that holds the .json.gz dumps.
DIR = './'

def load_data(file_name):
    """Load a gzip-compressed JSON Lines file into a list.

    Args:
        file_name: Path of a gzip file holding one JSON document per line.

    Returns:
        list: The decoded value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with gzip.open(file_name) as fin:
        for line in fin:
            # Skip blank lines (e.g. a trailing newline at the end of the
            # dump) instead of failing inside json.loads.
            if line.strip():
                data.append(json.loads(line))

    return data

In [5]:
# Locate the three Goodreads poetry dumps inside DIR ...
books_file = os.path.join(DIR, 'goodreads_books_poetry.json.gz')
interactions_file = os.path.join(DIR, 'goodreads_interactions_poetry.json.gz')
reviews_file = os.path.join(DIR, 'goodreads_reviews_poetry.json.gz')

# ... and read each one into a list of decoded JSON records.
books = load_data(books_file)
interactions = load_data(interactions_file)
reviews = load_data(reviews_file)

In [6]:
# Turn each list of JSON records into a flat DataFrame.
df_books = pd.json_normalize(books)
df_reviews = pd.json_normalize(reviews)
df_interactions = pd.json_normalize(interactions)

In [7]:
# Lookup tables that pair the raw Goodreads ids with the `*_csv` ids
# (they contribute the `book_id_csv` / `user_id_csv` columns after merging).
book_map = pd.read_csv('book_id_map.csv')
user_map = pd.read_csv('user_id_map.csv')

In [8]:
# Join book metadata, reviews and both id maps into one frame.
# book_id is cast to int64 before the last join so that it can be matched
# against book_map. Ratings are left on their raw scale (no normalisation).
df_merged = (
    df_books
    .merge(df_reviews, left_on='book_id', right_on='book_id')
    .merge(user_map, on='user_id')
    .astype({'book_id': 'int64'})
    .merge(book_map, on='book_id')
)
df_merged.columns

Index(['isbn', 'text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'asin', 'is_ebook', 'average_rating', 'kindle_asin',
       'similar_books', 'description', 'format', 'link', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year', 'url',
       'image_url', 'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series', 'user_id', 'review_id', 'rating', 'review_text',
       'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes',
       'n_comments', 'user_id_csv', 'book_id_csv'],
      dtype='object')

In [9]:
# Keep only the (user, book, rating) triple used by the recommender.
column = ['user_id_csv', 'book_id_csv', 'rating']
df = df_merged.loc[:, column]
df.head()

Unnamed: 0,user_id_csv,book_id_csv,rating
0,256407,158412,4
1,371125,1224532,3
2,371125,337477,5
3,42094,337477,4
4,40607,337477,4


In [10]:
# Spot check: index label of the first row whose book id is 1241.
df.loc[df['book_id_csv'] == 1241].index[0]

12091

In [11]:
# Hold out one row per book for testing: the first row of every book that
# appears in at least two rows, so each held-out book still has training data.
c = df['book_id_csv'].value_counts()
s = c[c == 1].reset_index()
# Read the id column by position: reset_index() names it 'index' on
# pandas < 2.0 but 'book_id_csv' on pandas >= 2.0, so s['index'] is not portable.
single_books = list(s.iloc[:, 0])

# Vectorised selection (one pass) instead of filtering the whole frame once
# per book. The labels come out in row order rather than set-iteration order;
# the selected rows are the same.
is_first_row = ~df['book_id_csv'].duplicated(keep='first')
is_repeated_book = df['book_id_csv'].duplicated(keep=False)
test_index = df.index[(is_first_row & is_repeated_book).to_numpy()].tolist()

len(test_index)

16878

In [12]:
# test_index holds index *labels* (they were collected through `.index`), so
# select by label. Positional `.iloc` only returned the same rows while df
# carried a default 0..n-1 index.
test_df = df.loc[test_index, :]
train_df = df[~df.index.isin(test_index)]

In [13]:
# Sanity check: the split must partition df, with every row in exactly one frame.
assert len(train_df) + len(test_df) == len(df), "train/test split lost or duplicated rows"
train_df.shape, test_df.shape

((137677, 3), (16878, 3))

# Generate vectors

In [14]:
from collections import defaultdict

def preprocessing(df):
    """Build nested rating lookups from a ratings frame.

    Args:
        df (pandas.DataFrame): Frame with the columns 'user_id_csv',
            'book_id_csv' and 'rating'.

    Returns:
        tuple: ``(R_ui, R_iu)`` where ``R_ui[user][book]`` and
        ``R_iu[book][user]`` both give the rating of that (user, book) pair.
        If a pair occurs more than once, the last row wins.

    Raises:
        KeyError: If one of the three required columns is missing.
    """
    R_ui, R_iu = {}, {}
    # A single pass over the rows replaces the two groupby/apply calls. Their
    # tuple-style column selection (groupby(...)['a', 'b']) was deprecated and
    # is rejected by pandas >= 2.0.
    rows = zip(df['user_id_csv'], df['book_id_csv'], df['rating'])
    for user, book, rating in rows:
        R_ui.setdefault(user, {})[book] = rating
        R_iu.setdefault(book, {})[user] = rating

    return R_ui, R_iu

# Prints the first 20 items in our dictionary
# n = 20
# {key:value for key,value in list(R_ui.items())[0:n]}

In [15]:
# Build the user->book and book->user rating lookups from the training split only.
R_ui, R_iu = preprocessing(train_df)

  after removing the cwd from sys.path.
  """


In [16]:
# Spot check: training rating stored for user 46779 on book 953479.
R_ui[46779][953479]

4

In [17]:
def cosine_similarity(d, item1, item2):
    """Cosine similarity between two rating vectors of a nested lookup.

    Each entry ``d[x]`` is read as a sparse vector ``{key: rating}``. The dot
    product runs over the keys both entries share, while each norm uses all
    ratings of its entry (i.e. a missing rating counts as 0).

    Args:
        d (dict): Nested lookup ``{id: {other_id: rating}}``.
        item1: First id to compare.
        item2: Second id to compare.

    Returns:
        float: The cosine similarity, or 0 when either id is absent from
        ``d``, the two entries share no key, or one of the norms is zero.
    """
    ratings1 = d.get(item1)
    ratings2 = d.get(item2)
    # An id without any ratings (e.g. never seen in training) has a zero
    # vector, so it is similar to nothing; this avoids a KeyError.
    if not ratings1 or not ratings2:
        return 0

    shared = ratings1.keys() & ratings2.keys()
    if not shared:
        return 0

    num = sum(ratings1[key] * ratings2[key] for key in shared)

    sq_norm1 = sum(rating * rating for rating in ratings1.values())
    sq_norm2 = sum(rating * rating for rating in ratings2.values())
    den = np.sqrt(sq_norm1 * sq_norm2)
    if den == 0:
        return 0

    return num / den

In [18]:
def item_item_collaborative_filtering(R_ui, R_iu, user_u, item_i):
    """Predict a user's rating of an item with item-item collaborative filtering.

    The prediction is the similarity-weighted average of the user's other
    ratings:

        P_{u,i} = sum_j sim(j, i) * R_{u,j} / sum_j |sim(j, i)|

    where j ranges over the items rated by ``user_u`` other than ``item_i``
    and ``sim`` is ``cosine_similarity`` computed on ``R_iu``.

    Args:
        R_ui (dict): ``{user: {item: rating}}`` lookup.
        R_iu (dict): ``{item: {user: rating}}`` lookup.
        user_u: Id of the user to predict for.
        item_i: Id of the item to predict.

    Returns:
        float: The predicted rating, or 0 when the user has no ratings in
        ``R_ui`` or when every similarity is zero.
    """
    rated_items = R_ui.get(user_u)
    # Unknown user: nothing to average over (replaces a bare `except:` that
    # also swallowed unrelated errors).
    if not rated_items:
        return 0

    numerator, denominator = 0, 0
    for item, rating in rated_items.items():
        if item == item_i:
            continue
        # Compute each similarity once and reuse it for both sums.
        sim = cosine_similarity(R_iu, item, item_i)
        numerator += sim * rating
        denominator += abs(sim)

    if denominator == 0:
        return 0
    return numerator / denominator

In [19]:
# Spot check: predicted rating for user 46779 on book 953479.
item_item_collaborative_filtering(R_ui, R_iu, 46779, 953479)

4.018325222508087

In [20]:
def test_of_collaborative_filtering(test_df, R_iu):
  """
  Arguments: 
  test data (str-type numpy.array): the test data containing user_id, movie_id, and normalized rating (0-1) information.
  item_emb (dictionary of numpy.arraay): pre-trained embeddings of items. 
   - The key is item_id (string), and the value is the corresponding item embedding (numpy.array; dim=32).

  Returns:
  test_RMSE (float): the test RMSE of item-item collaborative filtering model.

  Steps:
  1. for each test example in the test data, compute P_{u,i} using the item_item_collaborative_filtering function.
  2. compute the error (R_{u,i}^{test} - P_{u,i}) for the current test example.
  3. sum the square of the error for all test examples.
  4. divide the sum by the number of test examples and compute the root of it.
  """
  test_RMSE = 0
  ## Add code below [0.5 points] ##
  N = len(test_df)
  R_ui_, R_iu_ = preprocessing(test_df)

  temp_mse = [(R_ui_[test_df.iloc[i, 0]][test_df.iloc[i, 1]] - item_item_collaborative_filtering(R_ui, R_iu, test_df.iloc[i, 0], test_df.iloc[i, 1]))**2 for i in range(len(test_df))]
  temp_mae = [abs(R_ui_[test_df.iloc[i, 0]][test_df.iloc[i, 1]] - item_item_collaborative_filtering(R_ui, R_iu, test_df.iloc[i, 0], test_df.iloc[i, 1])) for i in range(len(test_df))]

  test_MSE = sum(temp_mse)/N
  test_MAE = sum(temp_mae)/N
  #################################
  return test_MAE, test_MSE, np.sqrt(test_MSE)

In [21]:
# Score the held-out split. Besides its arguments, the function also reads
# the notebook-level R_ui built from the training split.
mae, mse, rmse = test_of_collaborative_filtering(test_df, R_iu)

  after removing the cwd from sys.path.
  """


In [22]:
# Report the three error metrics with four decimals each.
report_header = "==== Item-Item Collaborative Filtering Evaluation Metrics: ====\n"
report_template = 'MAE (L1) = {:.4f} \nMSE (L2) = {:.4f} \nRMSE     = {:.4f}'

print(report_header)
print(report_template.format(mae, mse, rmse))

==== Item-Item Collaborative Filtering Evaluation Metrics: ====

MAE (L1) = 2.8516 
MSE (L2) = 11.5179 
RMSE     = 3.3938
