# SVD Baseline Approximation

In [1]:
import csv
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
df = pd.read_csv('phx_rating.csv')
train, test = train_test_split(df, test_size = 0.2)
print len(train)
print len(test)

150819
37705


#### Creating dictionaries of restaurants and users

In [3]:
restr_dict = {}
user_dict = {}
rest_cnt = 0
user_cnt = 0
for index, row in train.iterrows():
    if row[0] not in user_dict:        
        user_dict[row[0]] = user_cnt
        user_cnt = user_cnt + 1
    if row[2] not in restr_dict:        
        restr_dict[row[2]] = rest_cnt
        rest_cnt = rest_cnt + 1

In [4]:
print user_cnt
print rest_cnt

56617
2921


#### Creating a numpy matrix

In [5]:
users_restr = np.zeros((int(len(user_dict)),int(len(restr_dict))))
print users_restr.shape

(56617L, 2921L)


In [6]:
for index, row in train.iterrows():
    try:
        users_restr[int(user_dict[row[0]]) , int(restr_dict[row[2]])] = row[3]
    except Exception as e:
        print e
        continue

#### Getting avg stars for each restaurant to make data dense

In [49]:
phx_business = open('../Data files/Phoenix/Restaurants_Phoenix.csv')
phx_csv = csv.reader(phx_business)
phx_data = list(phx_csv)
r_avg_dict = {}
header = True
i = 1
while i < len(phx_data):    
    avg_rat = phx_data[i][61]    
    b_id = phx_data[i][15]    
    r_avg_dict[b_id] = float(avg_rat)
    i = i + 1

In [50]:
print len(r_avg_dict)

2925


#### Replacing every 0 entry with restaurant average

In [51]:
for key in restr_dict:
    i = 0
    while i < user_cnt:
        if users_restr[i][restr_dict[key]] == 0:
            users_restr[i][restr_dict[key]] = r_avg_dict[key]
        i = i + 1

#### Finding avg user rating

In [54]:
u_avg_rat = {}
# dict in the form of ['userid',(no. of reviews by the user, sum of all ratings)]

for index, row in train.iterrows(): 
    if row[0] not in u_avg_rat:
        u_avg_rat[row[0]] = (1 , int(row[3]))
    else:
        tup = u_avg_rat[row[0]]
        new_tup = (tup[0] + 1 , tup[1] + int(row[3]))
        u_avg_rat[row[0]] = new_tup

In [66]:
print "Total no. of unique users = %d" % len(u_avg_rat)
#print u_avg_rat['t95D1tnWvAOy2sxXnI3GUA']

Total no. of unique users = 56617


#### Normalizing each element in the matrix by subtracting from avg user rating

In [67]:
for key in user_dict:
    i = 0
    pos = user_dict[key]
    tup = u_avg_rat[key]
    user_avg_rating = tup[1] / tup[0]
    while i < rest_cnt:
        users_restr[pos][i] -= user_avg_rating            
        i = i + 1

#### Finally calculating the SVD

In [None]:
U, s, V = np.linalg.svd(users_restr, full_matrices=False)

In [95]:
print U.shape
print s.shape
print V.shape

(56617L, 2921L)
(2921L,)
(2921L, 2921L)


In [98]:
S = np.diag(s)
print S.shape

(2921L, 2921L)


### Reducing to top k dimensions for a list of k values

In [128]:
kvals = list(range(5,21))
kvals.append(25)
kvals.append(50)
kvals.append(100)

#### Predicting and calculating error

In [131]:
all_rmse = []
all_mae = []
for DIM in kvals:
    U1 = U[:, :DIM]
    S1 = S[:DIM, :DIM]
    V1 = V.T[:, :DIM].T
    S12 = np.sqrt(S1)
    A1 = np.dot(U1,S12)
    A2 = np.dot(S12,V1)
    meancnt = 0
    mae_par = 0
    rmse_par = 0
    for index, row in test.iterrows(): 
        if row[0] in user_dict and row[2] in restr_dict:
            meancnt += 1
            actual = float(row[3])
            pred = np.dot(A1[user_dict[row[0]],:] , A2[:,restr_dict[row[2]]])
            tup = u_avg_rat[row[0]]
            tup_avg = tup[1] / tup[0]
            pred += tup_avg
            rmse_par += (pred - actual) ** 2
            mae_par += abs(pred - actual)
    all_rmse.append((rmse_par / meancnt) ** 0.5)
    all_mae.append(mae_par / meancnt)

#### Printing out the errors, we get a RMSE = 1.138214 and MAE = 0.886604

In [133]:
print all_rmse
print all_mae
print min(all_rmse)
print min(all_mae)

[1.1382637052454978, 1.1383221742099854, 1.1382532019720524, 1.1383680738827862, 1.1385241381360096, 1.1385558718017861, 1.1382144655433262, 1.1382176822156005, 1.1384274028911141, 1.1386196192409745, 1.1385662079402188, 1.1385449567345192, 1.1384014034900731, 1.1384502910755108, 1.1384575714074536, 1.1386494432901069, 1.1389579503432394, 1.1395250452205996, 1.140204839159803]
[0.88664308099826794, 0.88670625652862001, 0.88660430101038301, 0.88672896739585971, 0.88681778560204561, 0.88689219197229663, 0.88683872497047311, 0.88682211092816554, 0.88705552137081456, 0.88723571861847017, 0.88719823388931829, 0.88709469840083843, 0.88720383540504089, 0.88739603729439109, 0.88723596399631577, 0.88742518993845532, 0.88760804986841535, 0.88802185063597372, 0.88710847745503085]
1.13821446554
0.88660430101


In [155]:
import matplotlib
matplotlib.style.use('ggplot')
ts = pd.Series(all_rmse, index=kvals)

In [156]:
ts = ts.cumsum()

In [157]:
ts.plot();