In [0]:
# Google Cloud things
# Colab-only setup cell: it configures gcloud for the project below and copies
# the four input CSVs from the Cloud Storage bucket into /tmp on the runtime.

project_id = 'cs229-project-258114'
bucket_name = 'goodreads_reviews'

# Colab auth helper; presumably required so the gcloud/gsutil commands below
# can access the bucket -- TODO confirm.
from google.colab import auth
auth.authenticate_user()

# `{project_id}` / `{bucket_name}` are interpolated by IPython before the
# shell command is executed.
!gcloud config set project {project_id}

# Download the file from a given Google Cloud Storage bucket.
# Note: finalratings.csv is stored locally under the shorter name ratings.csv.
!gsutil cp gs://{bucket_name}/finalbooks.csv /tmp/finalbooks.csv
!gsutil cp gs://{bucket_name}/finalratings.csv /tmp/ratings.csv 
!gsutil cp gs://{bucket_name}/train.csv /tmp/train.csv 
!gsutil cp gs://{bucket_name}/test.csv /tmp/test.csv 


# basics
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# tools
import scipy
import math
import random
import sklearn
import string
from sklearn.model_selection import train_test_split


# Load the CSVs from the exact absolute paths the gsutil commands wrote to.
# The previous '../tmp/...' form only resolved to /tmp when the working
# directory happened to sit one level below the filesystem root.
finalbooks = pd.read_csv('/tmp/finalbooks.csv')
ratings = pd.read_csv('/tmp/ratings.csv')
train = pd.read_csv('/tmp/train.csv')
test = pd.read_csv('/tmp/test.csv')


In [0]:
## NDCG & RMSE

def dcg_k(r, k):
    """ Discounted Cumulative Gain (DCG)

    Computes sum_i 2**r_i / log2(i + 1) over the first k entries of r, where
    i = 1 is the top recommendation. Note that the gain used here is 2**r
    (not the 2**r - 1 variant), so a rating of 0 still contributes.

    Args:
        r: True Ratings in Predicted Rank Order (1st element is top recommendation)
        k: Number of results to consider
    Returns:
        DCG (0.0 for an empty r)
    """
    # np.asfarray was removed in NumPy 2.0; asarray(..., dtype=float) is the
    # equivalent that works on both old and new NumPy.
    r = np.asarray(r, dtype=float)[:k]
    # Position i (1-based) is discounted by log2(i + 1): log2(2), log2(3), ...
    discounts = np.log2(np.arange(2, r.size + 2))
    return np.sum(2**r / discounts)



def ndcg_k(r, k):
    """Normalized Discounted Cumulative Gain (NDCG)

    Divides the DCG of the given ordering by the DCG of the ideal ordering
    (the same ratings sorted from highest to lowest).

    Args:
        r: True Ratings in Predicted Rank Order (1st element is top recommendation)
        k: Number of results to consider
    Returns:
        NDCG, or 0. when the ideal DCG is zero (e.g. r is empty)
    """
    ideal_order = sorted(r, reverse=True)
    best_possible = dcg_k(ideal_order, k)
    if best_possible:
        return dcg_k(r, k) / best_possible
    # Nothing to normalize against.
    return 0.

def mean_ndcg(rs):
    """Mean NDCG for all users

    Each user's NDCG is evaluated over that user's full list (k = len(r)).

    Args:
        rs: Iterator / For each user: True Ratings in Predicted Rank Order
    Returns:
        Mean NDCG
    """
    per_user_scores = []
    for user_ratings in rs:
        per_user_scores.append(ndcg_k(user_ratings, len(user_ratings)))
    return np.mean(per_user_scores)

def rmse(y,h):
    """RMSE

    Root of the mean squared difference between y and h. The computation is
    symmetric in its two arguments.

    Args:
        y: real y (NumPy array or pandas Series)
        h: predicted y (NumPy array or pandas Series)
    Returns:
        RMSE as a NumPy float (for 2-D input: one value per column)
    Raises:
        ZeroDivisionError: if the inputs are empty
    """
    # Subtract first so pandas/NumPy keep their own alignment rules, then drop
    # to a plain float array so the reduction below is fully vectorized (the
    # builtin sum() iterates element by element in the interpreter).
    a = np.atleast_1d(np.asarray(y - h, dtype=float))
    if a.shape[0] == 0:
        # Same exception type the implicit 0 / 0 used to produce.
        raise ZeroDivisionError("rmse is undefined for empty input")
    return np.sqrt(np.mean(a**2, axis=0))

In [0]:
# MATRIX FACTORIZATION
def new_R(data, U, B):
    """Predicted rating for every (user, book) row of data.

    For each row the prediction is the dot product of the user's row in U and
    the book's column in B. Ids are 1-based, so id n lives at index n - 1.

    Rows are processed by position, so data may carry any index (it does not
    have to be reset to 0..n-1).

    Args:
        data: DataFrame with integer columns `newuser_id` and `newbookid`
        U: user factor matrix, indexed as U[user_id - 1, :]
        B: book factor matrix, indexed as B[:, book_id - 1]
    Returns:
        1-D float array with one prediction per row of data, in row order
    """
    book_idx = np.asarray(data.newbookid) - 1
    user_idx = np.asarray(data.newuser_id) - 1
    # Row-wise dot products: sum over the k latent factors of
    # B[k, book_of_row] * U[user_of_row, k].
    predictions = np.einsum('kn,nk->n', B[:, book_idx], U[user_idx, :])
    return np.asarray(predictions, dtype=float)



In [0]:
## Alternating Least Squares

def _rows_by_id(ids):
    """Map each distinct value in ids to the row positions where it occurs (in row order)."""
    order = np.argsort(ids, kind='stable')
    distinct, starts = np.unique(ids[order], return_index=True)
    return dict(zip(distinct, np.split(order, starts[1:])))


def ALS(train, k, lamu = 0.1, lamb = 0.1, stop = 0.0001, max_iter = 24):
    """Fit a rank-k matrix factorization with Alternating Least Squares.

    Alternates between two closed-form ridge-regression updates:
      * for every user i, with the book factors held fixed:
            U[i] = pinv(B_i B_i^T + lamu * I) @ (B_i r_i)
      * for every book j, with the user factors held fixed:
            B[:, j] = pinv(U_j^T U_j + lamb * I) @ (U_j^T r_j)
    where B_i / U_j are the factors of the books rated by user i / the users
    who rated book j, and r_i / r_j the corresponding observed ratings.

    The training RMSE is recorded after each half-step (user update, then book
    update). Iteration stops once the improvement achieved by the book
    half-step drops to `stop` or below, or the half-step counter reaches
    `max_iter`.

    Args:
        train: DataFrame with columns `newuser_id`, `newbookid` (1-based
            integer ids) and `rating`. The RMSE tracking goes through
            new_R(train, ...), so train must satisfy whatever that helper
            requires of its input.
        k: number of latent factors
        lamu: ridge penalty for the user updates
        lamb: ridge penalty for the book updates
        stop: minimum RMSE improvement of a book half-step required to keep
            iterating
        max_iter: upper bound on the half-step counter. The counter starts at
            1 and advances twice per sweep, so the default of 24 allows at
            most 12 full user+book sweeps.
    Returns:
        dict with
            'rms': list of training RMSE values, one per half-step
            'U':   user factors, shape (max user id, k)
            'B':   book factors, shape (k, max book id)
    """
    users = np.unique(train.newuser_id)
    books = np.unique(train.newbookid)

    # Constant initialization: every factor equals 1/sqrt(k), so every initial
    # prediction is exactly 1.
    U = np.ones((users.max(), k)) / np.sqrt(k)
    B = np.ones((k, books.max())) / np.sqrt(k)

    # Pull the columns out once as plain arrays and pre-compute, for every
    # user and book, which rows belong to it. This replaces a full boolean
    # scan of the frame for each user and each book in every sweep.
    observed = np.asarray(train.rating, dtype=float)
    user_idx = np.asarray(train.newuser_id) - 1
    book_idx = np.asarray(train.newbookid) - 1
    rows_of_user = _rows_by_id(user_idx)
    rows_of_book = _rows_by_id(book_idx)
    ridge_u = lamu * np.identity(k)
    ridge_b = lamb * np.identity(k)

    step = 1      # half-step counter (not named `iter`, which shadows the builtin)
    RMSE = 3      # placeholder; overwritten by the first user half-step
    dRMSE = 1     # placeholder so the loop runs at least once when stop < 1
    rms = []

    while (dRMSE > stop) and (step < max_iter):
        # --- user half-step: solve for every user with B fixed ---
        for i in users:
            rows = rows_of_user[i - 1]
            sub_B = B[:, book_idx[rows]]
            Ai = sub_B @ sub_B.T + ridge_u
            Vi = sub_B @ observed[rows]
            # pinv rather than solve: stays defined even when lamu == 0 makes
            # Ai singular.
            U[i - 1, :] = np.linalg.pinv(Ai) @ Vi
        RMSE = rmse(new_R(train, U, B), train.rating)
        rms.append(RMSE)
        step += 1
        print("step: ", step)

        # --- book half-step: solve for every book with U fixed ---
        for i in books:
            rows = rows_of_book[i - 1]
            sub_U = U[user_idx[rows], :]
            Ai = sub_U.T @ sub_U + ridge_b
            Vi = sub_U.T @ observed[rows]
            B[:, i - 1] = np.linalg.pinv(Ai) @ Vi
        new_RMSE = rmse(new_R(train, U, B), train.rating)
        # Convergence is judged on the improvement of this book half-step only.
        dRMSE = RMSE - new_RMSE
        RMSE = new_RMSE
        print("step: ", step)
        rms.append(RMSE)
        step += 1

    w = {}
    w['rms'] = rms
    w['U'] = U
    w['B'] = B

    return w

In [0]:
 traint, traincv = train_test_split(train,
                               stratify=train['newuser_id'], 
                               test_size=0.1875,
                               random_state=42)
traint = traint.reset_index(drop=True)
traincv = traincv.reset_index(drop=True)

In [0]:
# Grid search over the ALS hyper-parameters, scored on the held-out CV split.
ks = []        # NOTE: despite the name, this records the betab value of each run
trains = []    # final training RMSE of each run
cvs = []       # CV RMSE of each run
ndgs = []      # CV NDCG of each run

for k in [3]:
    for alphau in [0.125]:
        for betab in [0.075, 0.1, 0.2, 1]:
            print("running for... alphau = ", alphau, " and betab = ", betab)
            w = ALS(traint, k, alphau, betab)
            CVpred = new_R(traincv, w['U'], w['B'])
            RMSE_CV = np.sqrt(np.mean((CVpred - traincv.rating) ** 2))

            # NDCG over the whole CV set pooled into a single ranking (not per
            # user). Sort once and reuse the result instead of sorting the
            # frame twice.
            ranked = traincv.filter(['rating'])
            ranked['pred'] = CVpred
            ranked_ratings = ranked.sort_values(by=['pred'], ascending = False).rating
            ndgcv = ndcg_k(ranked_ratings, len(ranked_ratings))

            ndgs.append(ndgcv)
            ks.append(betab)
            trains.append(w['rms'][-1])
            cvs.append(RMSE_CV)

            # Each metric is reported once per configuration (the train/CV
            # RMSE lines used to be printed twice).
            print("done for: k= ", k, "alphau= ", alphau, "betab= ", betab)
            print("RMSEtrain: ", w['rms'][-1])
            print("RMSECV: ", RMSE_CV)
            print("NDG: ", ndgcv)
            print ("w rms: ", w['rms'])




In [0]:
# Dump the sweep results: swept values, final train RMSEs, CV RMSEs.
for history in (ks, trains, cvs):
    print(history)


In [0]:
# Refit on the full training set with 3 latent factors and both penalties at 0.1.
w = ALS(train, k=3, lamu=0.1, lamb=0.1)

In [0]:
# Dense prediction matrix: one row per user index, one column per book index.
R = w['U'] @ w['B']

In [0]:
# Flatten R row by row and build the matching id vectors directly from R's
# shape. ALS stores user id u in row u - 1 and book id b in column b - 1, so
# entry (u - 1, b - 1) of R belongs to the pair (u, b). Deriving the labels
# this way removes the hard-coded 8000 / 15000 and the dependence on the row
# order of `finalbooks` and of `train.newuser_id.unique()`.
rflat = R.ravel()
n_users, n_books = R.shape
testy = np.repeat(np.arange(1, n_users + 1), n_books)   # 1,1,...,1,2,2,...
booky = np.tile(np.arange(1, n_books + 1), n_users)     # 1,2,...,n_books,1,2,...
booky


In [0]:
# Put the user labels in ascending order so that they line up with the
# row-major flattening of R (all of one user's books are contiguous).
testy = np.sort(testy)
testy

In [0]:
# Build the long-format prediction table column by column. Going through
# np.column_stack first forced the integer ids to float64 and materialized an
# extra full copy of all three columns; a dict keeps each column's own dtype.
predictions = pd.DataFrame({'newuser_id': testy, 'newbookid': booky, 'pred': rflat})

In [0]:
# Display the assembled predictions table as the cell output.
predictions

In [0]:
from google.colab import drive
drive.mount('/content/drive')
predictions.to_csv( '../tmp/predictions.csv' , index = False )
!cp /tmp/predictions.csv drive/My\ Drive/

In [0]:
## DEFINING THE TAIL
# Count ratings per book and order books from most to least rated.
tailcomp = (
    ratings
    .groupby(by='newbookid', as_index=False)
    .agg({'rating': pd.Series.count})
    .sort_values(by='rating', ascending=False)
)
tot = sum(tailcomp['rating'])

# Cumulative share of all ratings covered when walking down the popularity
# ranking; books reached before 95% of the ratings are "Head", the rest "Tail".
tailcomp['popshare'] = (tailcomp['rating'] / tot).cumsum()
tailcomp['category'] = np.where(tailcomp['popshare'] < 0.95, 'Head', 'Tail')

tail = tailcomp.loc[tailcomp.popshare >= 0.95]
tail

In [0]:
# Attach the model's prediction to every test rating, then order the rows by
# user and by predicted score, both descending.
mfrank = (
    test
    .merge(predictions, on=['newbookid', 'newuser_id'])
    .sort_values(by=['newuser_id', 'pred'], ascending=False)
)
mfrank.head(5)

In [0]:
#train['conc']=train['newuser_id'].map(str)+train['newbookid'].map(str)
#pred['conc']=pred['newuser_id'].map(str)+pred['newbookid'].map(str)
#predfin = pred[~pred.conc.isin(train.conc)]
#predfin.describe()

In [0]:
# Per-user lists of true test ratings, in predicted rank order.
# One groupby pass replaces 15000 full scans of `mfrank`; row order within
# each group is the order of `mfrank`, i.e. descending prediction.
ratings_by_user = {
    user: group.tolist()
    for user, group in mfrank.groupby('newuser_id', sort=False)['rating']
}
# Entry i belongs to user id i + 1; users without any test rating get [].
mflist = [ratings_by_user.get(i + 1, []) for i in range(15000)]

In [0]:
# NDCG of every user over that user's complete test list.
b = np.array([ndcg_k(r, len(r)) for r in mflist])

# Plot on the Axes that was just created instead of going through the pyplot
# state machine; the title previously misspelled NDCG as "NDGC".
facet, axes = plt.subplots(1, 1, figsize=(10, 3))
n, bins, patches = axes.hist(b, 200, facecolor='blue', alpha=0.5)
axes.set_title('Distribution of NDCG among Users for the MF model')
plt.show()

In [0]:
# Users whose NDCG is exactly 1; every kept entry equals 1, so their sum is
# the number of such users. Dividing by 15000 gives the share.
d = b[np.equal(b, 1)]
sum(d) / 15000

In [0]:
# Rank all predictions once and take each user's top 10 / top 50 from the
# same ordering (the full table used to be sorted twice).
ranked_predictions = predictions.sort_values('pred', ascending=False)
top10 = ranked_predictions.groupby('newuser_id').head(10)
top50 = ranked_predictions.groupby('newuser_id').head(50)

# Diversity score: fraction of recommended books that are tail books.
# np.isin replaces the deprecated np.in1d, and count_nonzero replaces a
# Python-level sum over the boolean array.
div10 = np.count_nonzero(np.isin(top10.newbookid, tail.newbookid)) / len(top10)
div50 = np.count_nonzero(np.isin(top50.newbookid, tail.newbookid)) / len(top50)

print('(1) MF Model RMSE: ', np.round(rmse(mfrank['pred'],mfrank['rating']), decimals=3))
print('(2) MF Model NDCG: ', np.round(mean_ndcg(mflist), decimals=3))
print("(2) Median NDCG: ", np.round(np.median(b), decimals=3))
print("(2) Share of NDCG =1 among Users: ", np.round(sum(d)/15000, decimals=3))
print('(3) MF Model Div10 Score: ',np.round(div10, decimals=3))
print('(3) MF Model Div50 Score: ',np.round(div50, decimals=3))

In [0]:
# Same join as for the test set, but on the training ratings: attach the
# prediction, then order by user and predicted score (both descending).
mfranktrain = (
    train
    .merge(predictions, on=['newbookid', 'newuser_id'])
    .sort_values(by=['newuser_id', 'pred'], ascending=False)
)

In [0]:
# Per-user lists of true training ratings, in predicted rank order.
# One groupby pass replaces 15000 full scans of `mfranktrain`; row order
# within each group is the order of `mfranktrain`, i.e. descending prediction.
train_ratings_by_user = {
    user: group.tolist()
    for user, group in mfranktrain.groupby('newuser_id', sort=False)['rating']
}
# Entry i belongs to user id i + 1; users without any training rating get [].
mflisttrain = [train_ratings_by_user.get(i + 1, []) for i in range(15000)]

In [0]:
# Training-set counterparts of the test metrics, rounded to 3 decimals.
train_rmse = np.round(rmse(mfranktrain['pred'], mfranktrain['rating']), 3)
train_ndcg = np.round(mean_ndcg(mflisttrain), 3)
print('(1) MF Train Model RMSE: ', train_rmse)
print('(2) MF Train Model NDCG: ', train_ndcg)

In [0]:
# Last training RMSE recorded by ALS for the model currently held in `w`.
w['rms'][-1]