In [None]:
# Mount Google Drive into the Colab VM at /content/drive so the project data
# files referenced by the cells below are reachable as ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd /content/drive/MyDrive/6240_project/project_data/

/content/drive/MyDrive/6240_project/project_data


In [None]:
import gzip
import json
import re
import os
import sys
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import time
from collections import defaultdict
from tqdm import tqdm
import pickle
from sklearn.model_selection import train_test_split
import ast
from scipy.spatial import distance
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import ast
import matplotlib.pyplot as plt

In [None]:
import gzip
def load_data(file_name):
    """Read a gzip-compressed JSON-lines file into a list of parsed records.

    Arguments:
    file_name (str or path-like): path of a gzip file holding one JSON document per line.

    Returns:
    list: the decoded JSON value of every non-blank line, in file order
      (an empty list when the file has no records).

    Raises:
    json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    data = []
    # Binary mode is kept on purpose: json.loads accepts bytes and detects the
    # UTF encoding itself.
    with gzip.open(file_name) as fin:
        for line in fin:
            # Blank lines (e.g. a trailing newline) carry no record and would
            # make json.loads raise, so skip them.
            if line.strip():
                data.append(json.loads(line))
    return data


Steps to generate the interaction.csv and interaction_full.csv files

Load the user-item interaction matrix 

In [None]:
# Parse the gzipped, line-delimited JSON interactions dump into a list of records
# (one parsed JSON value per line, see load_data).
interactions = load_data('/content/drive/MyDrive/6240_project/project_data/goodreads_interactions_poetry.json.gz')

In [None]:
# Flatten the parsed JSON records into a tabular DataFrame (one row per interaction).
interactions_df = pd.json_normalize(interactions)

In [None]:
# Sanity check: number of interaction rows loaded.
len(interactions_df)

2734350

user_id_map.csv - Dataset to map the unique user ids to integers 
book_id_map.csv - Dataset to map the unique book ids to integers 

'user_id' - the original user ids 
'user_id_csv' - the integer-mapped user ids 

In [None]:
# Load the id-mapping tables shipped with the dataset.
# NOTE: `user_map` is a DataFrame here but is rebound to a plain dict a few cells
# below, so this cell must be re-run before the merge cell is re-run.
# NOTE: `book_map` is read again in a later cell and is not otherwise used in the
# cells visible in this notebook.
user_map = pd.read_csv('/content/drive/MyDrive/6240_project/project_data/user_id_map.csv')
book_map = pd.read_csv('/content/drive/MyDrive/6240_project/project_data/book_id_map.csv')

Join the interactions matrix and user_map to link user_ids to their mapped integer values for easier handling

In [None]:
# Join on 'user_id' (merge's default inner join, so interactions whose user_id is
# absent from user_map are dropped) and keep only the integer user id, the book id
# and the rating.
rating_df = interactions_df.merge(user_map,on='user_id')[['user_id_csv','book_id','rating']]

Filter out zero-rated interactions

In [None]:
# Keep only the interactions whose rating is not 0 and renumber the rows from 0,
# then display the result.
non_zero = rating_df.loc[rating_df['rating'].ne(0)].reset_index(drop=True)
non_zero

Unnamed: 0,user_id_csv,book_id,rating
0,0,1384,4
1,0,1376,4
2,0,30119,5
3,1,30119,3
4,2,240007,4
...,...,...,...
1229054,876132,2547,5
1229055,876133,11047097,4
1229056,876144,7433930,5
1229057,876144,16170625,5


In [None]:
# Dense re-indexing: map every distinct id (in np.unique's sorted order) to 0..n-1.
# np.unique sorts the whole column, so compute it once per column instead of twice.
# NOTE: this rebinds `user_map` from the DataFrame loaded earlier to a dict.
unique_user_ids = np.unique(non_zero['user_id_csv'])
unique_book_ids = np.unique(non_zero['book_id'])
user_map = dict(zip(unique_user_ids,np.arange(len(unique_user_ids))))
item_map = dict(zip(unique_book_ids,np.arange(len(unique_book_ids))))

In [None]:
# Attach the dense integer ids. Series.map with a dict does the same per-value
# lookup as the former row-wise lambda, without a Python call per row; every id is
# a key of the map because the maps were built from these same columns.
non_zero['uid'] = non_zero['user_id_csv'].map(user_map)
non_zero['bid'] = non_zero['book_id'].map(item_map)
# Persist the compact (uid, bid, rating) triples; the DataFrame index is written
# as the first, unnamed CSV column.
non_zero[['uid','bid','rating']].to_csv('interaction.csv')

In [None]:
# Persist all columns of non_zero (original ids, rating and the dense uid/bid);
# the DataFrame index is written as the first, unnamed CSV column.
non_zero.to_csv('interaction_full.csv')

In [None]:
# Display the table to eyeball the newly added uid / bid columns.
non_zero

Unnamed: 0,user_id_csv,book_id,rating,uid,bid
0,0,1384,4,0,5949
1,0,1376,4,0,5831
2,0,30119,5,0,21931
3,1,30119,3,1,21931
4,2,240007,4,2,17215
...,...,...,...,...,...
1229054,876132,2547,5,267818,18493
1229055,876133,11047097,4,267819,1380
1229056,876144,7433930,5,267820,32082
1229057,876144,16170625,5,267820,8714


Steps to generate the matrix factorization embeddings

In [None]:
def preprocessing(training_data):
  """
  Build user->item and item->user rating lookups from an interactions table.

  Arguments:
  training_data (pandas.DataFrame): one row per interaction; must provide the
    columns 'user_id_csv', 'book_id' and 'rating'. Other columns are ignored.

  Returns:
  R_ui (defaultdict of dictionaries): R_ui[str(user_id)][str(book_id)] = float(rating).
   - Thus, for a row (user 0, book 1384, rating 4), R_ui['0']['1384'] = 4.0.
  R_iu (defaultdict of dictionaries): the same ratings keyed the other way round.
   - Thus, R_iu['1384']['0'] = 4.0.
  If a (user, book) pair occurs in several rows, the last row wins.
  """
  R_ui,R_iu = defaultdict(dict),defaultdict(dict)

  # Walk the three columns in lockstep instead of materialising every row with
  # `.loc[index]` three times: that was the dominant cost on ~1.2M rows and was
  # only correct for a default 0..n-1 index. Reading per column also keeps the
  # string form of an id independent of the dtypes of the other columns.
  rows = zip(training_data['user_id_csv'],training_data['book_id'],training_data['rating'])
  for user,book,rating in tqdm(rows,total=len(training_data),desc="Complete..."):
    u_id = str(user)
    it_id = str(book)
    r = float(rating)

    R_ui[u_id][it_id] = r
    R_iu[it_id][u_id] = r

  return R_ui,R_iu

In [None]:
# Build the module-level rating lookups. gradient_descent_update and
# matrix_factorization read R_ui / R_iu as globals, so this cell must run first.
R_ui,R_iu = preprocessing(non_zero)

Complete...: 100%|██████████| 1229059/1229059 [11:55<00:00, 1718.31it/s]


In [None]:
# Spot-check both lookups: R_ui is keyed user -> item, R_iu is keyed item -> user.
# The second label previously had user and item swapped.
print('R_ui of user 0 and item 1384 = {}'.format(R_ui["0"]['1384']))
print('R_iu of item 30119 and user 1 = {}'.format(R_iu['30119']['1']))

R_ui of user 0 and item 1384 = 4.0
R_iu of user 30119 and item 1 = 3.0


In [None]:
def gradient_descent_update(U,V,K):
  """
  Do not modify this function. There is a -2.0 point penalty if you modify this function.

  Run one sweep of updates over every user profile, then over every item profile.

  Arguments:
  U,V (dictionary of numpy.array): current user and item profile dictionaries.
   - The key is either user_id or item_id, and the value is the corresponding user or item profile (numpy.array; dim:K).
  K (int): the number of latent factors.

  Returns:
  Updated U,V (dictionary of numpy.array): updated user and item profile dictionaries.
   - The key is either user_id or item_id, and the value is the corresponding user or item profile (numpy.array; dim:K).
   - These are the same dictionary objects that were passed in; the profile arrays are updated with `+=`.

  Notes:
  - The ratings are NOT passed in: the function reads the module-level R_ui and R_iu, which must exist
    and contain an entry for every key of U / V and every item / user they reference.
  """
  # Step size applied to the accumulated error term.
  mu = 0.00001
  # Weight of the shrinkage term that is proportional to the profile itself.
  lambda_value = 0.00001
  # Pass 1: update every user profile from the items that user rated.
  for user in U.keys():
    updates = np.zeros(K)
    for item in R_ui[user].keys():
      pred = np.inner(U[user],V[item])
      error = R_ui[user][item] - pred
      updates += error*V[item]
    final_updates = 2*mu*updates - 2*lambda_value*U[user]
    U[user] += final_updates

  # Pass 2: update every item profile. This pass reads the user profiles that
  # pass 1 has already updated, so the order of the two loops matters.
  for item in V.keys():
    updates = np.zeros(K)
    for user in R_iu[item].keys():
      pred = np.inner(U[user],V[item])
      error = R_iu[item][user] - pred
      updates += error*U[user]
    final_updates = 2*mu*updates - 2*lambda_value*V[item]
    V[item] += final_updates
  return U,V

In [None]:
def matrix_factorization (training_data, K=15, epochs = 100):
  """
  Learn K-dimensional user and item profiles with repeated gradient_descent_update sweeps.

  Arguments:
  training_data (2-D numpy.array): interaction rows; only column index 2 is read, and it is cast to float and treated as the rating.
  K (int): number of latent factors, i.e. the length of every profile vector.
  epochs (int): number of times gradient_descent_update is applied to U and V.

  Returns:
  U,V (defaultdict of float-type numpy.array): learned user and item profile dictionaries.
   - U has one entry per key of the module-level R_ui, V one entry per key of the module-level R_iu, in that key order.

  Notes:
  - R_ui and R_iu are read from module scope, not derived from training_data.
  - NumPy's global random state is reseeded with 0 on every call.
  - Every profile is initialised with K uniform draws between 0 and sqrt(mean rating / K).
  """
  np.random.seed(0)

  ratings = training_data[:,2].astype(float)
  init_high = (np.mean(ratings)/K)**0.5

  U = defaultdict(np.array)
  V = defaultdict(np.array)

  # Users are initialised before items, one draw per key, which fixes the order
  # in which the seeded random stream is consumed.
  for profiles, keys in ((U, list(R_ui.keys())), (V, list(R_iu.keys()))):
    for key in keys:
      profiles[key] = np.random.uniform(low=0,high=init_high,size=K).astype(float)

  # Each sweep returns the dictionaries to carry into the next one.
  for _ in tqdm(range(epochs),desc='Completed......'):
    U,V = gradient_descent_update(U,V,K)

  return U,V

In [None]:
# matrix_factorization reads the rating from column index 2, so pin the three
# columns and their order explicitly. Converting the whole frame made the cell
# depend on whatever other columns non_zero happens to carry, and it fails as soon
# as one of them cannot be converted to float.
training_data = np.array(non_zero[['user_id_csv','book_id','rating']],dtype=np.float64)

Generate the user and item embeddings using matrix factorization 

In [None]:
# W: user profiles, H: item profiles (defaults: K=15 latent factors, 100 epochs).
# Long-running cell: the recorded run below took about 35 minutes.
(W,H) = matrix_factorization(training_data)

Completed......: 100%|██████████| 100/100 [35:30<00:00, 21.30s/it]


In [None]:
# Persist the learned user profiles (W) to user.pickle in the working directory.
with open('user.pickle', 'wb') as handle:
    pickle.dump(W, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Persist the learned item profiles (H) to item.pickle in the working directory.
with open('item.pickle', 'wb') as handle:
    pickle.dump(H, handle, protocol=pickle.HIGHEST_PROTOCOL)

Steps to generate the interactions_genres.csv

In [None]:
# Work on a copy: a plain `df = non_zero` only aliases the frame, so columns added
# to df further down would silently appear on non_zero as well and break cells
# that are re-run on it.
df = non_zero.copy()

In [None]:
# Paths here are relative, so the working directory must be the project data folder.
# NOTE: book_map is re-read here but not used by any cell visible in this notebook.
book_map = pd.read_csv('book_id_map.csv')
# Flatten the gzipped JSON-lines genre dump into a table and cache it as CSV
# (the DataFrame index is written as the first, unnamed column).
book_review = pd.json_normalize(load_data('goodreads_book_genres_initial.json.gz'))
book_review.to_csv('book_genres.csv')

In [None]:
# Reload the cached genre table and replace every missing value with 0.
book_genres = pd.read_csv('/content/drive/MyDrive/6240_project/project_data/book_genres.csv').fillna(value=0)

In [None]:
# Drop the first column (the index column written by to_csv) and index the table
# by 'book_id' so rows can be selected by book id with .loc.
# NOTE: not idempotent - re-running this cell drops one more column each time.
cols = list(book_genres.columns)
book_genres = book_genres.loc[:, cols[1:]].set_index('book_id')

In [None]:
# Inspect the remaining (genre) column names.
book_genres.columns

In [None]:
def get_genre_id(book_id):
  """
  Return the `book_genres` row for one book as a plain list.

  Arguments:
  book_id: key looked up in the module-level mapping `g`; the cell that calls this
    function passes values taken from df['bid'].

  Returns:
  list: the values of the `book_genres` row selected by the label `g[book_id]`.

  NOTE(review): `g` is not defined anywhere in the visible notebook, so this raises
  NameError on a fresh kernel. Presumably it maps the dense `bid` index back to the
  book id used as the `book_genres` index - TODO confirm and define it explicitly.
  """

  a = g[book_id]

  return list(book_genres.loc[a].values)



In [None]:
# Start with an empty list of per-interaction genre vectors.
genres = []

In [None]:
# One genre vector per interaction, in row order. Iterating the 'bid' column
# directly avoids materialising every row with `.loc[i]` (slow, and only correct
# for a default 0..n-1 index). Rebuilding the list here, rather than appending to
# one created in another cell, keeps the cell safe to re-run.
genres = [get_genre_id(bid) for bid in tqdm(df['bid'])]

In [None]:
# Attach the genre vectors as a new column; genres must hold exactly one entry per row of df.
df['genres'] = genres

In [None]:
# Persist the interactions together with their genre vectors; each list in the
# 'genres' column is written to the CSV as its string representation.
df.to_csv('interactions_genres.csv')