In [5]:
# coding: utf-8
"""
Source code for matrix completion
Author: Abhinav Sharma (as5414)
The details of the code and dev environment including descriptions of variables and functions is provided in the report pdf
"""
import scipy.sparse as sp
import scipy.sparse.linalg as sla
import numpy as np
import pandas as pd
import time
import math

# Function for loading and pre-processing data
def get_data(path):
    np_frame = pd.read_csv(path,usecols=[0,1,2]).as_matrix()

    users = np_frame[:,0].astype(dtype = 'uint32')-1
    items = np_frame[:,1].astype(dtype = 'uint32')-1
    ratings = np_frame[:,2].astype(dtype = 'float64')

    items_unique, items_cleaned = np.unique(items,return_inverse=True)

    num_users = np.max(users)+1
    num_items = items_unique.shape[0]

    user_dict = {i: [] for i in range(num_users)}

    for i in range(len(users)):
        user_dict[users[i]].append([items_cleaned[i],ratings[i]])

    R_train = sp.lil_matrix((num_users,num_items))
    R_test = sp.lil_matrix((num_users,num_items))
    num_train = 0
    num_test = 0

    for i in user_dict.keys():
        l = len(user_dict[i])
        indx = np.arange(l)
        np.random.shuffle(indx)
        temp = np.asarray(user_dict[i])
        R_test[i,temp[indx[:l/2],0]] = temp[indx[:l/2],1]
        R_train[i,temp[indx[l/2:],0]] = temp[indx[l/2:],1]
        num_train += len(indx[l/2:])
        num_test += len(indx[:l/2])

    return R_train,R_test,num_users,num_items,num_train,num_test

def itemize(data):
    _,indx = np.unique(data,return_index=True)
    u = data[np.sort(indx)]
    n_data = u.shape[0]
    new_indx = np.arange(n_data)
    d = dict(zip(u,new_indx))
    data_indx = np.zeros(data.shape, dtype=np.int32)
    for i in range(data_indx.shape[0]):
        data_indx[i] = d[data[i]]
    return data_indx, n_data

def load_data(path):
    data = pd.read_csv(path)
    data = data.drop(['funny', 'review_id', 'text', 'date', 'useful', 'cool'], axis=1)
    data.drop_duplicates(inplace=True)
    rows, cols, stars = np.array(data['user_id']), np.array(data['business_id']), np.array(data['stars'],dtype='float32')
    # itemize users and items
    row_indx, n_users = itemize(rows)
    col_indx, n_items = itemize(cols)
    return sp.csr_matrix((stars,(row_indx, col_indx)), dtype='float32', shape=(n_users,n_items))

# Class for matrix completion
class mat_comp():
    
    def __init__(self,R_train,R_test,num_users,num_items,num_train,num_test,r,lamda):
        self.r = r
        self.alpha = 0.01
        #self.beta = 0.01
        self.lamda = lamda
        self.iters = 15
        self.R_train = R_train
        self.R_test = R_test
        self.num_users = num_users
        self.num_items = num_items
        self.num_train = num_train
        self.num_test = num_test
        self.V = np.random.normal(scale=1.0/self.r,size=(self.num_users,self.r))
        self.W = np.random.normal(scale=1.0/self.r,size=(self.num_items,self.r))
        #self.b_u = np.zeros(self.num_users)
        #self.b_i = np.zeros(self.num_items)
        #self.b = np.sum(self.R_train)/self.R_train.nonzero()[0].shape[0]

    # Perfrom SGD on train set
    def sgd(self):
        ind = np.asarray(self.R_train.nonzero()).T
        np.random.shuffle(ind)
        for i in range(ind.shape[0]):
            err = self.R_train[ind[i,0],ind[i,1]] - (np.dot(self.V[ind[i,0],:],self.W[ind[i,1],:]))#+self.b+self.b_u[ind[i,0]]+self.b_i[ind[i,1]])
            #self.b_u[ind[i,0]] += self.alpha*(err - self.beta * self.b_u[ind[i,0]])
            #self.b_i[ind[i,1]] += self.alpha*(err - self.beta * self.b_i[ind[i,1]])
            self.V[ind[i,0],:] += self.alpha*(err*self.W[ind[i,1],:] - self.lamda*self.V[ind[i,0],:])
            self.W[ind[i,1],:] += self.alpha*(err*self.V[ind[i,0],:] - self.lamda*self.W[ind[i,1],:])

    # Compute RMSE on train / test sets
    def rmse(self,R,num):
        term = sp.lil_matrix(R.shape)
        ind = R.nonzero()
        i_prev=0
        k=0
        for i in range(len(ind[0])-1):
            if ind[0][i]==ind[0][i+1]:
                i=i+1
            else:
                term[k,ind[1][i_prev:i+1]] = np.asarray([np.dot(self.V[ind[0][i],:],self.W[j,:])for j in ind[1][i_prev:i+1]])#+self.b+self.b_u[ind[0][i]]+self.b_i[j] 
                i_prev=i+1
                k=k+1
        term[k,ind[1][i_prev:i+1]] = np.asarray([np.dot(self.V[ind[0][i],:],self.W[j,:]) for j in ind[1][i_prev:i+1]])
        term = R - term
        rmse = math.sqrt(sla.norm(term)**2/num)
        return rmse
    
    # Compute MRR on test set
    def mrr(self):
        mrr = 0
        num = self.num_users
        for u in range(self.num_users):
            ind = self.R_test[u].nonzero()[1]
            pred = (np.asarray([np.dot(self.V[u,:],self.W[j,:])+self.b+self.b_u[u]+self.b_i[j] for j in ind]),)
            pred = np.argsort(pred)
            arginv = np.zeros((len(ind)))
            arginv[pred] = np.arange(len(ind))
            indg = (self.R_test[u,ind]>=3.0).toarray()[0,:]
            if np.sum(indg)>0:
                mrr += np.sum((1.0/(arginv+1))[indg]) / np.sum(indg)
            else:
                num -=1
        mrr /= num
        return mrr
    
    # Train model
    def train(self):
        for i in range(self.iters):
            start = time.time()
            rmse = self.rmse(self.R_train,self.num_train)
            self.sgd()
            end = time.time()
            print('iter: ',i, ' rmse: ', rmse,' time: ',end-start)
        #rmse_test = self.rmse(R_test,num_test)
        #mrr_test = self.mrr()
        #return rmse_test,mrr_test,rmse

In [2]:
# Load and pre-process data
path = '../../dataset/review.csv'
R_train = load_data(path)
R_train = R_train[:-1000000,:-100000]
R_test = None
num_users = R_train.shape[0]
num_items = R_train.shape[1]
num_train = R_train.count_nonzero()
num_test = None


# path = '../../../HW1/ml-20m/ratings.csv'
# R_train,R_test,num_users,num_items,num_train,num_test = get_data(path)

# Define ranges for grid search
# l_range = [0.01,0.05,0.1,0.2,0.4,0.8]
# r_range = [2,4,8,16,32,64]

# Compute and store results
# for lamda in l_range:
#     for r in r_range:

#np.save('resl'+str(lamda)+'r'+str(r),res)

In [6]:
r = 1
lamda = 0.01
MC = mat_comp(R_train,R_test,num_users,num_items,num_train,num_test,r,lamda)
MC.train()

('iter: ', 0, ' rmse: ', 4.126663507028058, ' time: ', 107.9257869720459)
('iter: ', 1, ' rmse: ', 4.045225200557062, ' time: ', 111.27004790306091)
('iter: ', 2, ' rmse: ', 4.033651689096913, ' time: ', 108.97926211357117)
('iter: ', 3, ' rmse: ', 4.047891371171192, ' time: ', 107.79443907737732)
('iter: ', 4, ' rmse: ', 4.103824780763576, ' time: ', 107.57871794700623)


KeyboardInterrupt: 

In [59]:
MC.rmse(R_train,num_train)

4.891958036776596

In [58]:
MC.sgd()

In [26]:
num_users

326101

In [27]:
num_items

74567

In [1]:
import graphlab
import pandas as pd
from sklearn.metrics import mean_squared_error
import numpy as np

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1524851030.log


This non-commercial license of GraphLab Create for academic use is assigned to as5414@columbia.edu and will expire on April 26, 2019.


In [2]:
df = pd.read_csv('../../dataset/review.csv')

In [51]:
df['binary']=(df['stars']>2).astype(int)

In [52]:
df

Unnamed: 0,funny,user_id,review_id,text,business_id,stars,date,useful,cool,binary
0,0,bv2nCi5Qv5vroFiqKGopiw,v0i_UHJMo_hPBq9bxWvW4w,"Love the staff, love the meat, love the place....",0W4lkclzZThpx3V65bVgig,5,2016-05-28,0,0,1
1,0,bv2nCi5Qv5vroFiqKGopiw,vkVSCC7xljjrAI4UGfnKEQ,Super simple place but amazing nonetheless. It...,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,0,0,1
2,0,bv2nCi5Qv5vroFiqKGopiw,n6QzIUObkYshz4dz2QRJTw,Small unassuming place that changes their menu...,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,0,0,1
3,0,bv2nCi5Qv5vroFiqKGopiw,MV3CcKScW05u5LVfF6ok0g,Lester's is located in a beautiful neighborhoo...,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,0,0,1
4,0,bv2nCi5Qv5vroFiqKGopiw,IXvOzsEMYtiJI0CARmj77Q,Love coming here. Yes the place always needs t...,ACFtxLv8pGrrxMm6EgjreA,4,2016-05-28,0,0,1
5,0,bv2nCi5Qv5vroFiqKGopiw,L_9BTb55X0GDtThi6GlZ6w,Had their chocolate almond croissant and it wa...,s2I_Ni76bjJNK9yG60iD-Q,4,2016-05-28,0,0,1
6,0,_4iMDXbXZ1p1ONG297YEAQ,HRPm3vEZ_F-33TYVT7Pebw,Cycle Pub Las Vegas was a blast! Got a groupon...,8QWPlVQ6D-OExqXoaD2Z1g,5,2014-09-24,1,0,1
7,0,u0LXt3Uea_GidxRW1xcsfg,ymAUG8DZfQcFTBSOiaNN4w,Who would have guess that you would be able to...,9_CGhHMz8698M9-PkVf0CQ,4,2012-05-11,0,2,1
8,0,u0LXt3Uea_GidxRW1xcsfg,8UIishPUD92hXtScSga_gw,Always drove past this coffee house and wonder...,gkCorLgPyQLsptTHalL61g,4,2015-10-27,1,0,1
9,0,u0LXt3Uea_GidxRW1xcsfg,w41ZS9shepfO3uEyhXEWuQ,"Not bad!! Love that there is a gluten-free, ve...",5r6-G9C4YLbC7Ziz57l3rQ,3,2013-02-09,1,0,1


In [None]:
# Matrix Factorization recommender using Graphlab
sf = graphlab.SFrame(df[['user_id', 'business_id', 'stars']])

train, test = graphlab.recommender.util.random_split_by_user(sf,user_id='user_id',item_id='business_id',max_num_users=None)


rec = graphlab.recommender.factorization_recommender.create(
            train,
            user_id='user_id',
            item_id='business_id',
            target='binary',
            solver='sgd',
            #sgd_step_size=0.01,
            max_iterations=50,
            binary_target=True,
            regularization=0.000005,
            num_factors=100,
            side_data_factorization=False)
eval_rmse = rec.evaluate(test,metric='rmse',target='binary')
#predictions = rec.predict(train)
#rmse = np.sqrt(mean_squared_error(train['stars'], predictions))

# print "graphlab's reported rmse:", rec['training_rmse']
# print "calculated rmse:", rmse  

In [25]:
eval_rmse

{'rmse_by_item': Columns:
 	business_id	str
 	count	int
 	rmse	float
 
 Rows: 138454
 
 Data:
 +------------------------+-------+----------------+
 |      business_id       | count |      rmse      |
 +------------------------+-------+----------------+
 | JDm9c_Gkm-N01H9K2PdgMw |   1   | 2.72815195104  |
 | icrwcgr0Pqle5PCL-Ki8Rw |   2   | 1.26681856862  |
 | Ijw0d-2wTcsciDesjCVgsA |   13  | 1.65462228994  |
 | L5CF9zPE5G0lCEExKn_qQg |   9   | 1.69396885475  |
 | o3oC0UK97tiV4cw97rAgfw |   7   | 1.32530029298  |
 | VDGMgPcGFB6acVw1qEoooA |   1   | 2.72806540532  |
 | PkM2kRiW_HJiV0E8p8yJsg |   1   |  1.518667538   |
 | pUI96wpoJtsQc-aDP9UIqw |   2   | 1.26076620991  |
 | Bl7-eu0cRYJ6BiydS23-jw |   5   | 1.05320434797  |
 | K0QZq5UwpAZCUk9yWeOFcQ |   2   | 0.249933200241 |
 +------------------------+-------+----------------+
 [138454 rows x 3 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.,
 'rmse

In [11]:
prec_rec = rec.evaluate_precision_recall(test,cutoffs=[100])

In [13]:
evla_rmse = rec.evaluate_rmse(test,target='stars')

In [14]:
evla_rmse

{'rmse_by_item': Columns:
 	business_id	str
 	count	int
 	rmse	float
 
 Rows: 138454
 
 Data:
 +------------------------+-------+---------------+
 |      business_id       | count |      rmse     |
 +------------------------+-------+---------------+
 | JDm9c_Gkm-N01H9K2PdgMw |   1   | 2.76938159419 |
 | icrwcgr0Pqle5PCL-Ki8Rw |   2   | 1.26835248405 |
 | Ijw0d-2wTcsciDesjCVgsA |   13  | 1.90521454565 |
 | L5CF9zPE5G0lCEExKn_qQg |   9   | 1.64238110263 |
 | o3oC0UK97tiV4cw97rAgfw |   7   |  1.3014528242 |
 | VDGMgPcGFB6acVw1qEoooA |   1   | 2.72806540532 |
 | PkM2kRiW_HJiV0E8p8yJsg |   1   | 1.66773834108 |
 | pUI96wpoJtsQc-aDP9UIqw |   2   | 1.18124971271 |
 | Bl7-eu0cRYJ6BiydS23-jw |   5   | 1.02134734084 |
 | K0QZq5UwpAZCUk9yWeOFcQ |   2   | 0.30406193632 |
 +------------------------+-------+---------------+
 [138454 rows x 3 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.,
 'rmse_by_user': Col

In [33]:
lat = rec['coefficients']
U = np.asarray(lat['user_id']['factors'])
V = np.asarray(lat['business_id']['factors'])