In [5]:
# coding: utf-8
"""
Source code for matrix completion
Author: Abhinav Sharma (as5414)
The details of the code and dev environment including descriptions of variables and functions is provided in the report pdf
"""
import scipy.sparse as sp
import scipy.sparse.linalg as sla
import numpy as np
import pandas as pd
import time
import math

# Function for loading and pre-processing data
def get_data(path):
    np_frame = pd.read_csv(path,usecols=[0,1,2]).as_matrix()

    users = np_frame[:,0].astype(dtype = 'uint32')-1
    items = np_frame[:,1].astype(dtype = 'uint32')-1
    ratings = np_frame[:,2].astype(dtype = 'float64')

    items_unique, items_cleaned = np.unique(items,return_inverse=True)

    num_users = np.max(users)+1
    num_items = items_unique.shape[0]

    user_dict = {i: [] for i in range(num_users)}

    for i in range(len(users)):
        user_dict[users[i]].append([items_cleaned[i],ratings[i]])

    R_train = sp.lil_matrix((num_users,num_items))
    R_test = sp.lil_matrix((num_users,num_items))
    num_train = 0
    num_test = 0

    for i in user_dict.keys():
        l = len(user_dict[i])
        indx = np.arange(l)
        np.random.shuffle(indx)
        temp = np.asarray(user_dict[i])
        R_test[i,temp[indx[:l/2],0]] = temp[indx[:l/2],1]
        R_train[i,temp[indx[l/2:],0]] = temp[indx[l/2:],1]
        num_train += len(indx[l/2:])
        num_test += len(indx[:l/2])

    return R_train,R_test,num_users,num_items,num_train,num_test

def itemize(data):
    _,indx = np.unique(data,return_index=True)
    u = data[np.sort(indx)]
    n_data = u.shape[0]
    new_indx = np.arange(n_data)
    d = dict(zip(u,new_indx))
    data_indx = np.zeros(data.shape, dtype=np.int32)
    for i in range(data_indx.shape[0]):
        data_indx[i] = d[data[i]]
    return data_indx, n_data

def load_data(path):
    data = pd.read_csv(path)
    data = data.drop(['funny', 'review_id', 'text', 'date', 'useful', 'cool'], axis=1)
    data.drop_duplicates(inplace=True)
    rows, cols, stars = np.array(data['user_id']), np.array(data['business_id']), np.array(data['stars'],dtype='float32')
    # itemize users and items
    row_indx, n_users = itemize(rows)
    col_indx, n_items = itemize(cols)
    return sp.csr_matrix((stars,(row_indx, col_indx)), dtype='float32', shape=(n_users,n_items))

# Class for matrix completion
class mat_comp():
    
    def __init__(self,R_train,R_test,num_users,num_items,num_train,num_test,r,lamda):
        self.r = r
        self.alpha = 0.01
        #self.beta = 0.01
        self.lamda = lamda
        self.iters = 15
        self.R_train = R_train
        self.R_test = R_test
        self.num_users = num_users
        self.num_items = num_items
        self.num_train = num_train
        self.num_test = num_test
        self.V = np.random.normal(scale=1.0/self.r,size=(self.num_users,self.r))
        self.W = np.random.normal(scale=1.0/self.r,size=(self.num_items,self.r))
        #self.b_u = np.zeros(self.num_users)
        #self.b_i = np.zeros(self.num_items)
        #self.b = np.sum(self.R_train)/self.R_train.nonzero()[0].shape[0]

    # Perfrom SGD on train set
    def sgd(self):
        ind = np.asarray(self.R_train.nonzero()).T
        np.random.shuffle(ind)
        for i in range(ind.shape[0]):
            err = self.R_train[ind[i,0],ind[i,1]] - (np.dot(self.V[ind[i,0],:],self.W[ind[i,1],:]))#+self.b+self.b_u[ind[i,0]]+self.b_i[ind[i,1]])
            #self.b_u[ind[i,0]] += self.alpha*(err - self.beta * self.b_u[ind[i,0]])
            #self.b_i[ind[i,1]] += self.alpha*(err - self.beta * self.b_i[ind[i,1]])
            self.V[ind[i,0],:] += self.alpha*(err*self.W[ind[i,1],:] - self.lamda*self.V[ind[i,0],:])
            self.W[ind[i,1],:] += self.alpha*(err*self.V[ind[i,0],:] - self.lamda*self.W[ind[i,1],:])

    # Compute RMSE on train / test sets
    def rmse(self,R,num):
        term = sp.lil_matrix(R.shape)
        ind = R.nonzero()
        i_prev=0
        k=0
        for i in range(len(ind[0])-1):
            if ind[0][i]==ind[0][i+1]:
                i=i+1
            else:
                term[k,ind[1][i_prev:i+1]] = np.asarray([np.dot(self.V[ind[0][i],:],self.W[j,:])for j in ind[1][i_prev:i+1]])#+self.b+self.b_u[ind[0][i]]+self.b_i[j] 
                i_prev=i+1
                k=k+1
        term[k,ind[1][i_prev:i+1]] = np.asarray([np.dot(self.V[ind[0][i],:],self.W[j,:]) for j in ind[1][i_prev:i+1]])
        term = R - term
        rmse = math.sqrt(sla.norm(term)**2/num)
        return rmse
    
    # Compute MRR on test set
    def mrr(self):
        mrr = 0
        num = self.num_users
        for u in range(self.num_users):
            ind = self.R_test[u].nonzero()[1]
            pred = (np.asarray([np.dot(self.V[u,:],self.W[j,:])+self.b+self.b_u[u]+self.b_i[j] for j in ind]),)
            pred = np.argsort(pred)
            arginv = np.zeros((len(ind)))
            arginv[pred] = np.arange(len(ind))
            indg = (self.R_test[u,ind]>=3.0).toarray()[0,:]
            if np.sum(indg)>0:
                mrr += np.sum((1.0/(arginv+1))[indg]) / np.sum(indg)
            else:
                num -=1
        mrr /= num
        return mrr
    
    # Train model
    def train(self):
        for i in range(self.iters):
            start = time.time()
            rmse = self.rmse(self.R_train,self.num_train)
            self.sgd()
            end = time.time()
            print('iter: ',i, ' rmse: ', rmse,' time: ',end-start)
        #rmse_test = self.rmse(R_test,num_test)
        #mrr_test = self.mrr()
        #return rmse_test,mrr_test,rmse

In [2]:
# Load and pre-process data
path = '../../dataset/review.csv'
R_train = load_data(path)
R_train = R_train[:-1000000,:-100000]
R_test = None
num_users = R_train.shape[0]
num_items = R_train.shape[1]
num_train = R_train.count_nonzero()
num_test = None


# path = '../../../HW1/ml-20m/ratings.csv'
# R_train,R_test,num_users,num_items,num_train,num_test = get_data(path)

# Define ranges for grid search
# l_range = [0.01,0.05,0.1,0.2,0.4,0.8]
# r_range = [2,4,8,16,32,64]

# Compute and store results
# for lamda in l_range:
#     for r in r_range:

#np.save('resl'+str(lamda)+'r'+str(r),res)

In [6]:
r = 1
lamda = 0.01
MC = mat_comp(R_train,R_test,num_users,num_items,num_train,num_test,r,lamda)
MC.train()

('iter: ', 0, ' rmse: ', 4.126663507028058, ' time: ', 107.9257869720459)
('iter: ', 1, ' rmse: ', 4.045225200557062, ' time: ', 111.27004790306091)
('iter: ', 2, ' rmse: ', 4.033651689096913, ' time: ', 108.97926211357117)
('iter: ', 3, ' rmse: ', 4.047891371171192, ' time: ', 107.79443907737732)
('iter: ', 4, ' rmse: ', 4.103824780763576, ' time: ', 107.57871794700623)


KeyboardInterrupt: 

In [1]:
import graphlab
import pandas as pd
from sklearn.metrics import mean_squared_error
import numpy as np

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1525031772.log


This non-commercial license of GraphLab Create for academic use is assigned to as5414@columbia.edu and will expire on April 26, 2019.


In [2]:
df = pd.read_csv('../../dataset/train.csv',names=['user_id','business_id','stars','timestamps'])

In [3]:
#df['binary']=(df['stars']>2).astype(int)

In [3]:
df

Unnamed: 0,user_id,business_id,stars,timestamps
0,1,15733,4,155918
1,1,67856,5,130087
2,1,87093,5,190773
3,1,113461,3,6364
4,1,153617,5,162649
5,1,160310,5,132535
6,2,77849,3,169798
7,3,14970,2,296189
8,4,18440,4,190213
9,4,19036,5,371844


In [4]:
# Matrix Factorization recommender using Graphlab
sf = graphlab.SFrame(df[['user_id', 'business_id', 'stars']])

#train, test = graphlab.recommender.util.random_split_by_user(sf,user_id='user_id',item_id='business_id',item_test_proportion=0.1,max_num_users=None)


rec = graphlab.recommender.factorization_recommender.create(
            sf,
            user_id='user_id',
            item_id='business_id',
            target='stars',
            solver='als',
            #sgd_step_size=0.01,
            max_iterations=50,
            #binary_target=True,
            regularization=0.000001,
            num_factors=50,
            side_data_factorization=False)
#eval_rmse = rec.evaluate(test,metric='rmse',target='stars')
predictions = rec.predict(sf)
rmse = np.sqrt(mean_squared_error(sf['stars'], predictions))

print "graphlab's reported rmse:", rec['training_rmse']
print "calculated rmse:", rmse  

graphlab's reported rmse: 0.598135357543
calculated rmse: 0.5981353575431414


In [12]:
predictions

dtype: float
Rows: 4801323
[3.960885850903941, 4.421873701927614, 4.456742836353731, 3.2900847626900203, 4.665315760490847, 4.696186257717562, 3.397292776224566, 2.173987819073153, 3.986887974855853, 4.157048477051211, 3.986588491318179, 3.920488281366778, 4.590969039318514, 3.2070239318584925, 2.967716051456881, 3.3226398063873774, 4.325329793808413, 2.4129422439312465, 1.6113767160152919, 4.293170286533785, 3.9800522459721095, 2.8503270281528956, 3.1796266568874842, 3.432968421099139, 4.962591601726961, 2.7741706503605372, 4.64751262795682, 4.767675472614718, 4.078980578300905, 1.912040902492953, 4.111834241268587, 3.938256574985934, 1.6166109575008876, 4.714427365181399, 2.563126279232455, 2.0307288898205287, 4.91668028962369, 2.629651142475558, 3.1028919352268702, 3.8602066321348674, 2.206939174053622, 4.874176098225069, 2.167286707279635, 1.5056936277126796, 2.190847350475741, 4.865939928410006, 4.592676831600619, 3.247379673836184, 4.123067600843859, 1.2453414930080897, 4.7689560

In [9]:
prec_rec = rec.evaluate_rmse(sf,target='stars')

In [10]:
prec_rec

{'rmse_by_item': Columns:
 	business_id	int
 	count	int
 	rmse	float
 
 Rows: 170198
 
 Data:
 +-------------+-------+----------------+
 | business_id | count |      rmse      |
 +-------------+-------+----------------+
 |    21855    |  130  | 0.332384976968 |
 |    88004    |   15  | 0.876281948869 |
 |    79732    |   3   | 0.970728077213 |
 |    63664    |   6   | 0.644049639399 |
 |    127950   |  151  | 0.348455881719 |
 |     7899    |   3   | 0.995840781512 |
 |    25263    |   33  | 0.659385962989 |
 |    130872   |   3   | 1.01982870708  |
 |    87629    |   3   | 1.18951343747  |
 |    30621    |   43  | 0.672680238364 |
 +-------------+-------+----------------+
 [170198 rows x 3 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.,
 'rmse_by_user': Columns:
 	user_id	int
 	count	int
 	rmse	float
 
 Rows: 1248782
 
 Data:
 +---------+-------+----------------+
 | user_id | count |      rmse 

In [14]:
evla_rmse

{'rmse_by_item': Columns:
 	business_id	str
 	count	int
 	rmse	float
 
 Rows: 138454
 
 Data:
 +------------------------+-------+---------------+
 |      business_id       | count |      rmse     |
 +------------------------+-------+---------------+
 | JDm9c_Gkm-N01H9K2PdgMw |   1   | 2.76938159419 |
 | icrwcgr0Pqle5PCL-Ki8Rw |   2   | 1.26835248405 |
 | Ijw0d-2wTcsciDesjCVgsA |   13  | 1.90521454565 |
 | L5CF9zPE5G0lCEExKn_qQg |   9   | 1.64238110263 |
 | o3oC0UK97tiV4cw97rAgfw |   7   |  1.3014528242 |
 | VDGMgPcGFB6acVw1qEoooA |   1   | 2.72806540532 |
 | PkM2kRiW_HJiV0E8p8yJsg |   1   | 1.66773834108 |
 | pUI96wpoJtsQc-aDP9UIqw |   2   | 1.18124971271 |
 | Bl7-eu0cRYJ6BiydS23-jw |   5   | 1.02134734084 |
 | K0QZq5UwpAZCUk9yWeOFcQ |   2   | 0.30406193632 |
 +------------------------+-------+---------------+
 [138454 rows x 3 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.,
 'rmse_by_user': Col

In [5]:
lat = rec['coefficients']
# U = np.asarray(lat['user_id']['factors'])
# V = np.asarray(lat['business_id']['factors'])

In [6]:
lat

{'business_id': Columns:
 	business_id	int
 	linear_terms	float
 	factors	array
 
 Rows: 170198
 
 Data:
 +-------------+--------------+-------------------------------+
 | business_id | linear_terms |            factors            |
 +-------------+--------------+-------------------------------+
 |    15733    |     0.0      | [-2.04808831215, -0.289387... |
 |    67856    |     0.0      | [-0.346569895744, 0.148897... |
 |    87093    |     0.0      | [-0.673363804817, -0.48200... |
 |    113461   |     0.0      | [0.115808263421, -0.009901... |
 |    153617   |     0.0      | [-0.377523034811, -0.34691... |
 |    160310   |     0.0      | [-0.0709279030561, -0.0526... |
 |    77849    |     0.0      | [0.107696078718, 0.1720171... |
 |    14970    |     0.0      | [-0.313647985458, -0.31288... |
 |    18440    |     0.0      | [0.495027035475, -0.192912... |
 |    19036    |     0.0      | [-0.156491905451, -0.18767... |
 +-------------+--------------+-------------------------------+

In [7]:
lat['business_id']

business_id,linear_terms,factors
15733,0.0,"[-2.04808831215, -0.289387613535, ..."
67856,0.0,"[-0.346569895744, 0.148897513747, ..."
87093,0.0,"[-0.673363804817, -0.482003927231, ..."
113461,0.0,"[0.115808263421, -0.00990190636367, ..."
153617,0.0,"[-0.377523034811, -0.34691748023, ..."
160310,0.0,"[-0.0709279030561, -0.0526834502816, ..."
77849,0.0,"[0.107696078718, 0.172017171979, ..."
14970,0.0,"[-0.313647985458, -0.312882661819, ..."
18440,0.0,"[0.495027035475, -0.192912846804, ..."
19036,0.0,"[-0.156491905451, -0.187671408057, ..."


In [80]:
np.unique(np.asarray(df['user_id'])).shape

(1248782,)

In [81]:
np.unique(np.asarray(lat['user_id']['user_id'])).shape

(1248782,)

In [41]:
import pickle
def save_obj(obj, name ):
    with open('dumps/'+name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [43]:
save_obj(lat,'lat_exp')

In [11]:
lat['user_id'].export_csv('dumps/lat_user_2.csv')
lat['business_id'].export_csv('dumps/lat_item_2.csv')

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../../dataset/train.csv',names=['user_id','business_id','stars','timestamps'])

In [3]:
dfu = pd.read_csv('dumps/lat_user.csv')#,names=['user_id','linear','stars','timestamps'])

In [4]:
dfi = pd.read_csv('dumps/lat_item.csv')

In [5]:
np.asarray(dfu['factors'][0][1:-1].split(),dtype=np.float64).shape

(50,)

In [6]:
len(dfu)

1248782

In [7]:

for i in range(len(dfu)):
    print(i)
    u = dfu['user_id'][i]
    uf = np.asarray(dfu['factors'][i][1:-1].split(),dtype=np.float64)
    for j in range(len(dfi)):
        v = dfi['business_id'][j]
        vf = np.asarray(dfi['factors'][j][1:-1].split(),dtype=np.float64)
        pred=np.dot(uf,vf)
        err += 

0
1
2
3
4
5


KeyboardInterrupt: 

In [10]:
import scipy.sparse as sp
lines = open('../../dataset/train.csv').readlines()
x = []
y = []
rat = []
for line in lines:
    line_parts = line.split(',')
    x.append(int(line_parts[0]))
    y.append(int(line_parts[1]))
    rat.append(int(line_parts[2]))

R_train = sp.csr_matrix((rat,(x, y)), dtype=np.int, shape=(1326102, 174568))

print len(R_train.nonzero()[0]),len(x)

4801323 4801323


In [22]:
lat

{'business_id': Columns:
 	business_id	int
 	linear_terms	float
 	factors	array
 
 Rows: 170198
 
 Data:
 +-------------+--------------+-------------------------------+
 | business_id | linear_terms |            factors            |
 +-------------+--------------+-------------------------------+
 |    15733    |     0.0      | [-2.04808831215, -0.289387... |
 |    67856    |     0.0      | [-0.346569895744, 0.148897... |
 |    87093    |     0.0      | [-0.673363804817, -0.48200... |
 |    113461   |     0.0      | [0.115808263421, -0.009901... |
 |    153617   |     0.0      | [-0.377523034811, -0.34691... |
 |    160310   |     0.0      | [-0.0709279030561, -0.0526... |
 |    77849    |     0.0      | [0.107696078718, 0.1720171... |
 |    14970    |     0.0      | [-0.313647985458, -0.31288... |
 |    18440    |     0.0      | [0.495027035475, -0.192912... |
 |    19036    |     0.0      | [-0.156491905451, -0.18767... |
 +-------------+--------------+-------------------------------+