In [5]:
# coding: utf-8
"""
Source code for matrix completion
Author: Abhinav Sharma (as5414)
The details of the code and dev environment including descriptions of variables and functions is provided in the report pdf
"""
import scipy.sparse as sp
import scipy.sparse.linalg as sla
import numpy as np
import pandas as pd
import time
import math

# Function for loading and pre-processing data
def get_data(path):
    np_frame = pd.read_csv(path,usecols=[0,1,2]).as_matrix()

    users = np_frame[:,0].astype(dtype = 'uint32')-1
    items = np_frame[:,1].astype(dtype = 'uint32')-1
    ratings = np_frame[:,2].astype(dtype = 'float64')

    items_unique, items_cleaned = np.unique(items,return_inverse=True)

    num_users = np.max(users)+1
    num_items = items_unique.shape[0]

    user_dict = {i: [] for i in range(num_users)}

    for i in range(len(users)):
        user_dict[users[i]].append([items_cleaned[i],ratings[i]])

    R_train = sp.lil_matrix((num_users,num_items))
    R_test = sp.lil_matrix((num_users,num_items))
    num_train = 0
    num_test = 0

    for i in user_dict.keys():
        l = len(user_dict[i])
        indx = np.arange(l)
        np.random.shuffle(indx)
        temp = np.asarray(user_dict[i])
        R_test[i,temp[indx[:l/2],0]] = temp[indx[:l/2],1]
        R_train[i,temp[indx[l/2:],0]] = temp[indx[l/2:],1]
        num_train += len(indx[l/2:])
        num_test += len(indx[:l/2])

    return R_train,R_test,num_users,num_items,num_train,num_test

def itemize(data):
    _,indx = np.unique(data,return_index=True)
    u = data[np.sort(indx)]
    n_data = u.shape[0]
    new_indx = np.arange(n_data)
    d = dict(zip(u,new_indx))
    data_indx = np.zeros(data.shape, dtype=np.int32)
    for i in range(data_indx.shape[0]):
        data_indx[i] = d[data[i]]
    return data_indx, n_data

def load_data(path):
    data = pd.read_csv(path)
    data = data.drop(['funny', 'review_id', 'text', 'date', 'useful', 'cool'], axis=1)
    data.drop_duplicates(inplace=True)
    rows, cols, stars = np.array(data['user_id']), np.array(data['business_id']), np.array(data['stars'],dtype='float32')
    # itemize users and items
    row_indx, n_users = itemize(rows)
    col_indx, n_items = itemize(cols)
    return sp.csr_matrix((stars,(row_indx, col_indx)), dtype='float32', shape=(n_users,n_items))

# Class for matrix completion
class mat_comp():
    
    def __init__(self,R_train,R_test,num_users,num_items,num_train,num_test,r,lamda):
        self.r = r
        self.alpha = 0.01
        #self.beta = 0.01
        self.lamda = lamda
        self.iters = 15
        self.R_train = R_train
        self.R_test = R_test
        self.num_users = num_users
        self.num_items = num_items
        self.num_train = num_train
        self.num_test = num_test
        self.V = np.random.normal(scale=1.0/self.r,size=(self.num_users,self.r))
        self.W = np.random.normal(scale=1.0/self.r,size=(self.num_items,self.r))
        #self.b_u = np.zeros(self.num_users)
        #self.b_i = np.zeros(self.num_items)
        #self.b = np.sum(self.R_train)/self.R_train.nonzero()[0].shape[0]

    # Perfrom SGD on train set
    def sgd(self):
        ind = np.asarray(self.R_train.nonzero()).T
        np.random.shuffle(ind)
        for i in range(ind.shape[0]):
            err = self.R_train[ind[i,0],ind[i,1]] - (np.dot(self.V[ind[i,0],:],self.W[ind[i,1],:]))#+self.b+self.b_u[ind[i,0]]+self.b_i[ind[i,1]])
            #self.b_u[ind[i,0]] += self.alpha*(err - self.beta * self.b_u[ind[i,0]])
            #self.b_i[ind[i,1]] += self.alpha*(err - self.beta * self.b_i[ind[i,1]])
            self.V[ind[i,0],:] += self.alpha*(err*self.W[ind[i,1],:] - self.lamda*self.V[ind[i,0],:])
            self.W[ind[i,1],:] += self.alpha*(err*self.V[ind[i,0],:] - self.lamda*self.W[ind[i,1],:])

    # Compute RMSE on train / test sets
    def rmse(self,R,num):
        term = sp.lil_matrix(R.shape)
        ind = R.nonzero()
        i_prev=0
        k=0
        for i in range(len(ind[0])-1):
            if ind[0][i]==ind[0][i+1]:
                i=i+1
            else:
                term[k,ind[1][i_prev:i+1]] = np.asarray([np.dot(self.V[ind[0][i],:],self.W[j,:])for j in ind[1][i_prev:i+1]])#+self.b+self.b_u[ind[0][i]]+self.b_i[j] 
                i_prev=i+1
                k=k+1
        term[k,ind[1][i_prev:i+1]] = np.asarray([np.dot(self.V[ind[0][i],:],self.W[j,:]) for j in ind[1][i_prev:i+1]])
        term = R - term
        rmse = math.sqrt(sla.norm(term)**2/num)
        return rmse
    
    # Compute MRR on test set
    def mrr(self):
        mrr = 0
        num = self.num_users
        for u in range(self.num_users):
            ind = self.R_test[u].nonzero()[1]
            pred = (np.asarray([np.dot(self.V[u,:],self.W[j,:])+self.b+self.b_u[u]+self.b_i[j] for j in ind]),)
            pred = np.argsort(pred)
            arginv = np.zeros((len(ind)))
            arginv[pred] = np.arange(len(ind))
            indg = (self.R_test[u,ind]>=3.0).toarray()[0,:]
            if np.sum(indg)>0:
                mrr += np.sum((1.0/(arginv+1))[indg]) / np.sum(indg)
            else:
                num -=1
        mrr /= num
        return mrr
    
    # Train model
    def train(self):
        for i in range(self.iters):
            start = time.time()
            rmse = self.rmse(self.R_train,self.num_train)
            self.sgd()
            end = time.time()
            print('iter: ',i, ' rmse: ', rmse,' time: ',end-start)
        #rmse_test = self.rmse(R_test,num_test)
        #mrr_test = self.mrr()
        #return rmse_test,mrr_test,rmse

In [2]:
# Load and pre-process data
path = '../../dataset/review.csv'
R_train = load_data(path)
R_train = R_train[:-1000000,:-100000]
R_test = None
num_users = R_train.shape[0]
num_items = R_train.shape[1]
num_train = R_train.count_nonzero()
num_test = None


# path = '../../../HW1/ml-20m/ratings.csv'
# R_train,R_test,num_users,num_items,num_train,num_test = get_data(path)

# Define ranges for grid search
# l_range = [0.01,0.05,0.1,0.2,0.4,0.8]
# r_range = [2,4,8,16,32,64]

# Compute and store results
# for lamda in l_range:
#     for r in r_range:

#np.save('resl'+str(lamda)+'r'+str(r),res)

In [6]:
r = 1
lamda = 0.01
MC = mat_comp(R_train,R_test,num_users,num_items,num_train,num_test,r,lamda)
MC.train()

('iter: ', 0, ' rmse: ', 4.126663507028058, ' time: ', 107.9257869720459)
('iter: ', 1, ' rmse: ', 4.045225200557062, ' time: ', 111.27004790306091)
('iter: ', 2, ' rmse: ', 4.033651689096913, ' time: ', 108.97926211357117)
('iter: ', 3, ' rmse: ', 4.047891371171192, ' time: ', 107.79443907737732)
('iter: ', 4, ' rmse: ', 4.103824780763576, ' time: ', 107.57871794700623)


KeyboardInterrupt: 

In [59]:
MC.rmse(R_train,num_train)

4.891958036776596

In [58]:
MC.sgd()

In [26]:
num_users

326101

In [27]:
num_items

74567

In [1]:
import graphlab
import pandas as pd
from sklearn.metrics import mean_squared_error
import numpy as np

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1524851030.log


This non-commercial license of GraphLab Create for academic use is assigned to as5414@columbia.edu and will expire on April 26, 2019.


In [2]:
df = pd.read_csv('../../dataset/review.csv')

In [None]:
# Matrix Factorization recommender using Graphlab
sf = graphlab.SFrame(df[['user_id', 'business_id', 'stars']])

train, test = graphlab.recommender.util.random_split_by_user(sf,user_id='user_id',item_id='business_id',max_num_users=None)


rec = graphlab.recommender.factorization_recommender.create(
            train,
            user_id='user_id',
            item_id='business_id',
            target='stars',
            solver='als',
            max_iterations=10,
            regularization=0.000001,
            num_factors=50,
            side_data_factorization=False)

predictions = rec.predict(train)
rmse = np.sqrt(mean_squared_error(train['stars'], predictions))

print "graphlab's reported rmse:", rec['training_rmse']
print "calculated rmse:", rmse  

In [16]:
prec_rec = rec.evaluate_precision_recall(test,cutoffs=[100])

[ERROR] graphlab.toolkits._main: Toolkit error: Cancelled by user.


ToolkitError: Cancelled by user.

In [17]:
prec_rec

{'precision_recall_by_user': Columns:
 	user_id	str
 	cutoff	int
 	precision	float
 	recall	float
 	count	int
 
 Rows: 9108054
 
 Data:
 +------------------------+--------+-----------+--------+-------+
 |        user_id         | cutoff | precision | recall | count |
 +------------------------+--------+-----------+--------+-------+
 | ---1lKK3aKOuomHnwAkAow |   1    |    0.0    |  0.0   |   17  |
 | ---1lKK3aKOuomHnwAkAow |   2    |    0.0    |  0.0   |   17  |
 | ---1lKK3aKOuomHnwAkAow |   3    |    0.0    |  0.0   |   17  |
 | ---1lKK3aKOuomHnwAkAow |   4    |    0.0    |  0.0   |   17  |
 | ---1lKK3aKOuomHnwAkAow |   5    |    0.0    |  0.0   |   17  |
 | ---1lKK3aKOuomHnwAkAow |   6    |    0.0    |  0.0   |   17  |
 | ---1lKK3aKOuomHnwAkAow |   7    |    0.0    |  0.0   |   17  |
 | ---1lKK3aKOuomHnwAkAow |   8    |    0.0    |  0.0   |   17  |
 | ---1lKK3aKOuomHnwAkAow |   9    |    0.0    |  0.0   |   17  |
 | ---1lKK3aKOuomHnwAkAow |   10   |    0.0    |  0.0   |   17  |
 +----

In [33]:
lat = rec['coefficients']
U = np.asarray(lat['user_id']['factors'])
V = np.asarray(lat['business_id']['factors'])