In [1]:
!pwd

/mnt/c/Google drive/columbia sem2/AML/dataset


In [1]:
import pandas as pd
import numpy as np
import scipy.sparse

def itemize(data):
    """
    Re-label the values of a 1-D sequence with consecutive integer ids.

    Ids are assigned in order of first appearance: the first distinct value
    seen gets 0, the next new value gets 1, and so on.

    Parameters
    ----------
    data : 1-D array-like of sortable values (e.g. id strings or ints).

    Returns
    -------
    data_indx : np.ndarray of int32, same shape as ``data``
        The id assigned to each element of ``data``.
    n_data : int
        Number of distinct values found in ``data``.
    """
    data = np.asarray(data)
    # first_pos[k] = position where the k-th (sorted) unique value first occurs,
    # inverse[i]   = index of data[i] among the sorted unique values.
    _, first_pos, inverse = np.unique(data, return_index=True, return_inverse=True)
    n_data = first_pos.shape[0]
    # Convert "sorted order" into "first-appearance order": the unique value
    # that appears earliest in data receives id 0, the next one id 1, ...
    appearance_rank = np.empty(n_data, dtype=np.int32)
    appearance_rank[np.argsort(first_pos)] = np.arange(n_data, dtype=np.int32)
    # One vectorized lookup replaces the per-element Python dict loop.
    data_indx = appearance_rank[inverse.reshape(-1)].reshape(data.shape)
    return data_indx, n_data

def load_data(path='review.csv'):
    """
    Load the review table and build a sparse user x business star matrix.

    Parameters
    ----------
    path : str, optional
        CSV file with at least the columns 'user_id', 'business_id' and
        'stars'. Defaults to 'review.csv' in the working directory.

    Returns
    -------
    scipy.sparse.csr_matrix of uint8, shape (n_users, n_items)
        Entry (u, b) holds the star rating user u gave business b; rows and
        columns are numbered in order of first appearance in the file.
    """
    data = pd.read_csv(path)
    data = data[['user_id', 'business_id', 'stars']]
    # Keep exactly one rating per (user, business) pair. The sparse constructor
    # SUMS values that share a coordinate, so a user who reviewed the same
    # business twice with different stars would otherwise end up with a
    # rating above 5 (and could overflow uint8). The first row in file order wins.
    data = data.drop_duplicates(subset=['user_id', 'business_id'], keep='first')
    rows = np.array(data['user_id'])
    cols = np.array(data['business_id'])
    stars = np.array(data['stars'], dtype=np.uint8)
    # itemize users and items
    row_indx, n_users = itemize(rows)
    col_indx, n_items = itemize(cols)
    return scipy.sparse.csr_matrix((stars, (row_indx, col_indx)), dtype=np.uint8, shape=(n_users, n_items))


# Build the sparse user x business star-rating matrix from review.csv
R = load_data()

In [16]:
# l = number of rows (users) and w = number of columns (businesses) of R;
# split_train_test below sizes its matrices from these module-level names
l,w  = R.shape

In [17]:
type(R)

scipy.sparse.csr.csr_matrix

In [18]:
# Submitted by Isht Dwivedi, UNI id2303
# Advanced Machine Learning for Personalization, HomeWork 1

import numpy as np
from scipy import sparse
from scipy.sparse import lil_matrix
from random import shuffle
import pickle
import timeit
from scipy.sparse.linalg import norm
import os



# Directory in which save_obj / load_obj keep their pickle files
save_dir = 'objF'

# exist_ok avoids the check-then-create race and keeps the cell safe to re-run
os.makedirs(save_dir, exist_ok=True)

def save_obj(obj, name ):
    """
    Pickle `obj` to '<save_dir>/<name>.pkl' with the highest pickle protocol
    (used to dump the grid search results to file)
    """
    target = '/'.join((save_dir, name + '.pkl'))
    with open(target, 'wb') as out:
        pickle.dump(obj, out, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    """
    Read back the object stored in '<save_dir>/<name>.pkl'
    (used to load the grid search results from file)
    """
    source = '/'.join((save_dir, name + '.pkl'))
    # NOTE: unpickling can execute arbitrary code - only load files you trust
    with open(source, 'rb') as src:
        loaded = pickle.load(src)
    return loaded



def read_file(filename):
    """
    Read a ratings csv file and index the distinct ids of its second column.

    The first line is treated as a header and is skipped when collecting ids.

    Parameters
    ----------
    filename : path of a comma-separated file whose second field on every
        data line is an integer id (the movieId in a
        userId,movieId,rating,timestamp layout).

    Returns
    -------
    m2i : dict
        Maps every distinct id of the second column to an index in
        0..n_ids-1 (indices follow the iteration order of the id set).
    ratings_lines : list of str
        All raw lines of the file, header included.
    """
    # The context manager closes the handle; it used to be left open
    with open(filename) as handle:
        ratings_lines = handle.readlines()
    movie_ids = set([int(line.split(',')[1]) for line in ratings_lines[1:]])
    m2i = {movie_id: position for position, movie_id in enumerate(movie_ids)}

    return m2i,ratings_lines


def split_train_test(ratings_lines):
    """
    Split the ratings per user into a train half and a test half.

    Every user's ratings are shuffled; the first len//2 of them go to the
    train set and the remaining ones (the larger part when the count is odd)
    go to the test set.

    Reads the module-level names m2i (movie id -> column index) and l, w
    (matrix dimensions).

    Returns train_list, test_list (lists of (user, item) index pairs),
    R_train, R_test (lil matrices of shape (l, w)) and user_ratings
    (dict: user index -> shuffled list of (item index, rating)).
    """
    user_ratings = {}
    for row in ratings_lines[1:]:
        # userId,movieId,rating,timestamp
        fields = row.split(',')
        uid = int(fields[0]) - 1
        entry = (m2i[int(fields[1])], float(fields[2]))
        user_ratings.setdefault(uid, []).append(entry)

    for rated in user_ratings.values():
        shuffle(rated)

    R_train = lil_matrix((l, w))
    R_test = lil_matrix((l, w))
    train_list, test_list = [], []
    for uid, rated in user_ratings.items():
        half = len(rated) // 2
        for item, rating in rated[:half]:
            R_train[uid, item] = rating
            train_list.append((uid, item))
        for item, rating in rated[half:]:
            R_test[uid, item] = rating
            test_list.append((uid, item))

    return train_list,test_list,R_train,R_test,user_ratings


In [19]:
# Train directly on the rating matrix R (R_train is another name for R, not a copy)
R_train = R

In [40]:
def train(R_train,train_list,r,max_iter,lam,lr):
    """
    Factorize R_train ~ p.dot(q) with stochastic gradient descent.

    Parameters
    ----------
    R_train : 2-D matrix (sparse or dense) supporting R_train[i, j] and .shape
    train_list : sequence of (row, col) index pairs of the observed entries;
        it is not modified (a private copy is shuffled every epoch)
    r : rank of the factorization (number of latent factors)
    max_iter : number of epochs (full passes over train_list)
    lam : regularization weight
    lr : learning rate

    Returns
    -------
    p : ndarray of shape (R_train.shape[0], r)
    q : ndarray of shape (r, R_train.shape[1])

    After every epoch the loss on train_list is printed through print_loss.
    """
    # Size the factors from the matrix actually being trained on instead of
    # from module-level globals.
    n_rows, n_cols = R_train.shape
    # np.longdouble is the portable name of the extended-precision float
    # (np.float128 does not exist on every platform).
    p = np.random.normal(0,1./r, (n_rows,r)).astype(np.longdouble)
    q = np.random.normal(0,1./r, (r,n_cols)).astype(np.longdouble)

    # Shuffle a private copy so the caller's list keeps its order
    samples = list(train_list)
    for iter_ in range(max_iter):
        shuffle(samples)
        for element in samples:
            i,j = element[0],element[1]
            e = R_train[i,j] - np.dot(p[i,:], q[:,j])
            # Keep the pre-update row so that both factors take their step
            # from the same point (simultaneous gradient update).
            p_old = p[i,:].copy()
            p[i,:] += lr*(2*e*q[:,j]-lam*p_old)
            q[:,j] += lr*(2*e*p_old-lam*q[:,j])
        R_pred = get_pred(p,q,samples)
        print_loss(R_pred,R_train,samples,'train')
    return p,q



def get_pred(p,q,test_list):
    """
    Get the predictions of the trained model on the given index pairs.

    Parameters
    ----------
    p : ndarray of shape (n_rows, r), row factors
    q : ndarray of shape (r, n_cols), column factors
    test_list : iterable of (row, col) index pairs to predict

    Returns
    -------
    lil_matrix of shape (n_rows, n_cols) holding p[i,:].q[:,j] at every
    requested (i, j) and nothing elsewhere.
    """
    # Shape comes from the factors themselves, not from module-level globals
    R_pred = lil_matrix((p.shape[0], q.shape[1]))
    for element in test_list:
        i,j = element[0],element[1]
        R_pred[i,j] = np.dot(p[i,:],(q[:,j]))
    return R_pred




def getMRR(user_ratings,R_test,R_pred,test_or_train,relevance_threshold=3.0):
    """
    Get the mean reciprocal rank (MRR) of the relevant items.

    For every user the items of the chosen half are ranked by predicted score,
    best first. The reciprocal ranks of the relevant items (true rating >=
    relevance_threshold) are averaged per user, and those per-user values are
    averaged over all users. Users without any relevant item contribute 0.

    Parameters
    ----------
    user_ratings : dict mapping user index -> list of (item index, rating);
        the first len//2 entries are the train half, the rest the test half
    R_test : 2-D matrix of true ratings, indexed as R_test[user, item]
    R_pred : 2-D matrix of predicted ratings, indexed as R_pred[user, item]
    test_or_train : 'test' or 'train', selects which half is evaluated
    relevance_threshold : minimum true rating for an item to count as relevant

    Returns
    -------
    float, the MRR (0.0 when user_ratings is empty). The value is also printed.

    Raises
    ------
    ValueError if test_or_train is neither 'test' nor 'train'.
    """
    # Validate up front so a bad mode is reported even with no users
    if test_or_train not in ('test', 'train'):
        raise ValueError('please try either train or test mode')

    total = 0.0
    for userID0, rated in user_ratings.items():
        k = len(rated) // 2
        selected = rated[k:] if test_or_train == 'test' else rated[:k]
        movie_list = [(entry[0], R_pred[userID0, entry[0]]) for entry in selected]
        # Highest predicted score first, so that rank 1 is the top recommendation
        movie_list.sort(key=lambda tup: tup[1], reverse=True)
        reciprocal_sum = 0.0
        n_relevant = 0
        for rank, scored in enumerate(movie_list, start=1):
            if R_test[userID0, scored[0]] >= relevance_threshold:
                reciprocal_sum += 1.0 / rank
                n_relevant += 1
        if n_relevant:
            total += reciprocal_sum / n_relevant
    # Guard against dividing by zero when there are no users at all
    if user_ratings:
        total = total / len(user_ratings)
    print(test_or_train,' mrr ',total)
    return total


def print_loss(R_test_pred,R_test,data_list,mode):
    """
    Print and return the root mean square error over the given entries.

    Parameters
    ----------
    R_test_pred : 2-D matrix of predictions, indexed as R_test_pred[i, j]
    R_test : 2-D matrix of true ratings, indexed as R_test[i, j]
    data_list : sequence of (row, col) index pairs to evaluate
    mode : label printed in front of the value, 'train' or 'test'

    Returns
    -------
    The Frobenius norm of the error over data_list divided by
    sqrt(len(data_list)), i.e. the RMSE. Note that the printed label
    reads 'mse' although the value is a root mean square error.
    """
    # Shape is taken from the rating matrix itself, not from module-level globals
    R_e = lil_matrix(R_test.shape)
    for element in data_list:
        i,j = element[0],element[1]
        R_e[i,j] = R_test[i,j]-R_test_pred[i,j]
    loss = norm(R_e)/np.sqrt(len(data_list))
    print(mode,' mse ',loss)
    return loss

In [41]:
# Start again from the full rating matrix (R_train is an alias of R, not a copy)
R_train = R

In [42]:
# Work on a subset: drop the last 1,000,000 users (rows) and the last 100,000
# businesses (columns). Slicing R instead of R_train keeps the cell idempotent:
# re-running it no longer shrinks the matrix a second time.
R_train = R[:-1000000,:-100000]

In [43]:
# (row indices, column indices) of the non-zero entries of R_train
tr_list = R_train.nonzero()

In [44]:
len(tr_list[1])

1107133

In [45]:
# Pair up the row and column indices: one (user, item) tuple per stored rating
train_list = list(zip(tr_list[0], tr_list[1]))

In [47]:
### specify training parameters here
# NOTE(review): id_no and pflag are not read by any active code in this cell
id_no = '1'
pflag = '1'
lr = 0.01  # learning rate
max_iter= 15 # maximum number of epochs to train for
# Result tables keyed by (r, lam); they stay empty while the assignments at the
# end of the loop body are commented out
results_train = {}
results_test = {}
complete =0 

r_list = [1]  # ranks (numbers of latent factors) to try
lam_list = [0.01]  # regularization weights to try
#lam_list = [lam_list_full[int(id_no)-1]]
# Disabled: loading the ml-20m ratings file and splitting it into train/test
# m2i,ratings_lines = read_file('ml-20m/ratings.csv')
# train_list,test_list,R_train,R_test,user_ratings = split_train_test(ratings_lines)


# 2 for loops for the grid search: one loop over rank values, one loop over lambda values
for r in r_list:
    for lam in lam_list:
        complete+=1
        print(complete,'starting of ',len(r_list)*len(lam_list),'r = ',r,'lam = ',lam)
        p,q = train(R_train,train_list,r,max_iter,lam,lr)
#         R_test_pred = get_pred(p,q,test_list)
        R_train_pred = get_pred(p,q,train_list)
#         loss_test = print_loss(R_test_pred,R_test,test_list,'test')
        # Recomputes the loss of the final p, q, so it repeats the value that
        # train() printed after its last epoch
        loss_train = print_loss(R_train_pred,R_train,train_list,'train')
        # Test-set evaluation and MRR are disabled (no test split exists in this run)
#         MRR_test = getMRR(user_ratings,R_test,R_test_pred,'test')
#         MRR_train = getMRR(user_ratings,R_train,R_train_pred,'train')

#         results_train[(r,lam)] = (MRR_train,loss_train)
#         results_test[(r,lam)] = (MRR_test,loss_test)

1 starting of  1 r =  1 lam =  0.01
train  mse  3.96441889007
train  mse  3.75092736479
train  mse  3.25721965995
train  mse  2.83475250833
train  mse  2.50723405369
train  mse  2.2474575931
train  mse  2.04044072815
train  mse  1.87736634987
train  mse  1.74713272078
train  mse  1.6427837007
train  mse  1.55819789924
train  mse  1.489732365
train  mse  1.4328451562
train  mse  1.38574134735
train  mse  1.34751766201
train  mse  1.34751766201
