# SVD Baseline Approximation

In [1]:
import csv
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
df = pd.read_csv('..\Data\Input\Phx_Rating.csv')
train, test = train_test_split(df, test_size = 0.2)
print len(train)
print len(test)

150819
37705


#### Creating dictionaries of restaurants and users

In [2]:
restr_dict = {}
user_dict = {}
rest_cnt = 0
user_cnt = 0
for index, row in train.iterrows():
    if row[0] not in user_dict:        
        user_dict[row[0]] = user_cnt
        user_cnt = user_cnt + 1
    if row[2] not in restr_dict:        
        restr_dict[row[2]] = rest_cnt
        rest_cnt = rest_cnt + 1

In [3]:
print user_cnt
print rest_cnt

56778
2921


#### Creating a numpy matrix

In [4]:
users_restr = np.zeros((int(len(user_dict)),int(len(restr_dict))))
print users_restr.shape

(56778L, 2921L)


In [5]:
for index, row in train.iterrows():
    try:
        users_restr[int(user_dict[row[0]]) , int(restr_dict[row[2]])] = row[3]
    except Exception as e:
        print e
        continue

#### Getting avg stars for each restaurant to make data dense

In [6]:
phx_business = open('..\Data\Input\Restaurants_Phoenix.csv')
phx_csv = csv.reader(phx_business)
phx_data = list(phx_csv)
r_avg_dict = {}
header = True
i = 1
while i < len(phx_data):    
    avg_rat = phx_data[i][61]    
    b_id = phx_data[i][15]    
    r_avg_dict[b_id] = float(avg_rat)
    i = i + 1

In [7]:
print len(r_avg_dict)

2925


#### Replacing every 0 entry with restaurant average

In [8]:
for key in restr_dict:
    i = 0
    while i < user_cnt:
        if users_restr[i][restr_dict[key]] == 0:
            users_restr[i][restr_dict[key]] = r_avg_dict[key]
        i = i + 1

#### Finding avg user rating

In [9]:
u_avg_rat = {}
# dict in the form of ['userid',(no. of reviews by the user, sum of all ratings)]

for index, row in train.iterrows(): 
    if row[0] not in u_avg_rat:
        u_avg_rat[row[0]] = (1 , int(row[3]))
    else:
        tup = u_avg_rat[row[0]]
        new_tup = (tup[0] + 1 , tup[1] + int(row[3]))
        u_avg_rat[row[0]] = new_tup

In [10]:
print "Total no. of unique users = %d" % len(u_avg_rat)
#print u_avg_rat['t95D1tnWvAOy2sxXnI3GUA']

Total no. of unique users = 56778


#### Normalizing each element in the matrix by subtracting from avg user rating

In [None]:
for key in user_dict:
    i = 0
    pos = user_dict[key]
    tup = u_avg_rat[key]
    user_avg_rating = tup[1] / tup[0]
    while i < rest_cnt:
        users_restr[pos][i] -= user_avg_rating            
        i = i + 1

#### Finally calculating the SVD

In [None]:
U, s, V = np.linalg.svd(users_restr, full_matrices=False)

In [None]:
print U.shape
print s.shape
print V.shape

In [None]:
S = np.diag(s)
print S.shape

### Reducing to top k dimensions for a list of k values

In [None]:
kvals = list(range(2,21))
kvals.append(25)
kvals.append(50)
kvals.append(100)

#### Predicting and calculating error

In [None]:
all_rmse = []
all_mae = []
for DIM in kvals:
    U1 = U[:, :DIM]
    S1 = S[:DIM, :DIM]
    V1 = V.T[:, :DIM].T
    S12 = np.sqrt(S1)
    A1 = np.dot(U1,S12)
    A2 = np.dot(S12,V1)
    meancnt = 0
    mae_par = 0
    rmse_par = 0
    for index, row in test.iterrows(): 
        if row[0] in user_dict and row[2] in restr_dict:
            meancnt += 1
            actual = float(row[3])
            pred = np.dot(A1[user_dict[row[0]],:] , A2[:,restr_dict[row[2]]])
            tup = u_avg_rat[row[0]]
            tup_avg = tup[1] / tup[0]
            pred += tup_avg
            rmse_par += (pred - actual) ** 2
            mae_par += abs(pred - actual)
    all_rmse.append((rmse_par / meancnt) ** 0.5)
    all_mae.append(mae_par / meancnt)

#### Printing out the errors, we get a min RMSE ~ 1.14 and min MAE ~ 0.89 for k ~ 5

In [None]:
print all_rmse
print all_mae
print min(all_rmse)
print min(all_mae)

#### Plotting the k values v/s the MAE and RMSE. k = [ 2-21, 25, 50, 100]

In [None]:
%matplotlib inline
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
plt.plot(kvals , all_rmse, 'ro', kvals , all_mae, 'bo')
plt.xlabel('k-vals')
plt.ylabel('Error')
red_patch = mpatches.Patch(color='red', label='RMSE')
blue_patch = mpatches.Patch(color='blue', label='MAE')
plt.legend(handles=[red_patch,blue_patch])
plt.title('Error as a function of k values')
plt.show()