In [1]:
import gzip
from collections import defaultdict,Counter
from random import shuffle
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# Fixed seed so the train/validation/test partition -- and therefore every
# MSE computed from it -- is reproducible across kernel restarts.
SPLIT_SEED = 0

allReviews = []
with open('../CA_reviews.txt', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # eval('') raises SyntaxError; tolerate blank/trailing lines.
            continue
        # SECURITY NOTE(review): eval() executes whatever expression the file
        # contains. Only run this on a trusted copy of the data set. If every
        # record is a plain Python literal, ast.literal_eval is the safe
        # drop-in replacement -- TODO confirm against the data and switch.
        allReviews.append(eval(line))

# Hold out 20% for testing, then 12.5% of the remaining 80% (i.e. 10% of the
# whole) for validation, leaving 70% for training.
middleSet, testSet = train_test_split(allReviews, test_size=0.2, random_state=SPLIT_SEED)
trainSet, validSet = train_test_split(middleSet, test_size=0.125, random_state=SPLIT_SEED)

In [3]:
# def predict(dataset, alpha, beta_u, beta_i):
def calMSE(dataset, alpha, beta_u, beta_p):
    """Return the mean squared error of the bias-only model on ``dataset``.

    Every rating is predicted as ``alpha + beta_u[user] + beta_p[place]``.
    A user (or place) that is absent from its bias table is given the mean of
    that table instead; if the table itself is empty the fallback bias is 0.

    Args:
        dataset: sized iterable of review dicts carrying the keys
            ``'gPlusUserId'``, ``'gPlusPlaceId'`` and ``'rating'``.
        alpha: global offset added to every prediction.
        beta_u: mapping of user id -> user bias.
        beta_p: mapping of place id -> place bias.

    Returns:
        The average squared difference between predicted and actual rating.

    Raises:
        ZeroDivisionError: if ``dataset`` is empty.
    """
    # Guard the averages: an empty bias table previously raised
    # ZeroDivisionError here even though a zero fallback is well defined.
    avg_beta_u = sum(beta_u.values()) / len(beta_u) if beta_u else 0.0
    avg_beta_p = sum(beta_p.values()) / len(beta_p) if beta_p else 0.0

    MSE = 0
    for review in dataset:
        user, place, rating = review['gPlusUserId'], review['gPlusPlaceId'], review['rating']

        # .get() never inserts, so a defaultdict passed in is left untouched
        # and unseen ids really do receive the table average.
        predRating = alpha + beta_u.get(user, avg_beta_u) + beta_p.get(place, avg_beta_p)
        MSE += (predRating - rating) ** 2
    MSE /= len(dataset)
    return MSE

In [4]:
# Lookup tables built from the training split only:
#   allRatings  -- user id concatenated with place id -> rating
#                  (a repeated pair keeps the rating seen last)
#   user_places -- user id  -> set of place ids that user reviewed
#   place_users -- place id -> set of user ids that reviewed the place
allRatings, user_places, place_users = defaultdict(int), defaultdict(set), defaultdict(set)

for review in trainSet:
    # Pull all three fields out first so a malformed record fails before any
    # table has been touched.
    user = review['gPlusUserId']
    place = review['gPlusPlaceId']
    rating = review['rating']

    user_places[user].add(place)
    place_users[place].add(user)
    allRatings[user + place] = rating

In [5]:
### Using Latent factor model with only alpha and beta
    
def gridSearch(lams):
    """Fit the alpha/beta bias model for one (lam1, lam2) pair and score it.

    Despite the name, a single call trains one model for the given pair of
    regularisation strengths. Each pass recomputes ``alpha``, every user bias
    and every place bias from the previous pass's biases, and stops as soon as
    the training MSE moves by less than 0.0001 between consecutive passes.

    Reads the module-level ``trainSet``, ``validSet``, ``allRatings``,
    ``user_places`` and ``place_users`` and calls ``calMSE``. Prints the
    lambda pair, alpha, and the training and validation MSE on convergence.

    Args:
        lams: ``(lam1, lam2)`` -- ``lam1`` regularises the user biases and
            ``lam2`` the place biases.

    Returns:
        ``(validMSE, alpha, beta_u, beta_p)`` for the converged model.
    """
    # NOTE(review): there is no iteration cap; if the training MSE never
    # settles within the tolerance this loop does not terminate.
    lam1, lam2 = lams
    prev_beta_u = defaultdict(int)
    prev_beta_p = defaultdict(int)
    prev_MSE = 0

    while True:
        alpha = 0
        beta_u = defaultdict(int)
        beta_p = defaultdict(int)

        # Global offset: mean residual once the previous biases are removed.
        for l in trainSet:
            user, place, rating = l['gPlusUserId'], l['gPlusPlaceId'], l['rating']
            alpha += rating - (prev_beta_u[user] + prev_beta_p[place])
        # Divide by the number of terms actually summed. The earlier
        # len(allRatings) counts unique user+place keys and so under-counts
        # whenever the same pair appears more than once in trainSet.
        alpha /= len(trainSet)

        # User biases: regularised mean residual over the places each user rated.
        for user in user_places:
            for place in user_places[user]:
                beta_u[user] += allRatings[user + place] - (alpha + prev_beta_p[place])
            beta_u[user] /= (lam1 + len(user_places[user]))

        # Place biases: same update, mirrored over the users of each place.
        for place in place_users:
            for user in place_users[place]:
                beta_p[place] += allRatings[user + place] - (alpha + prev_beta_u[user])
            beta_p[place] /= (lam2 + len(place_users[place]))

        MSE = calMSE(trainSet, alpha, beta_u, beta_p)

        if abs(prev_MSE - MSE) < 0.0001:
            print('lambda is:', lams)
            print("Alpha is " , alpha)
            print("Training Mean Square Error is ", MSE)
            validMSE = calMSE(validSet, alpha, beta_u, beta_p)
            print("Validation Mean Square Error is ", validMSE)
            return (validMSE, alpha, beta_u, beta_p)

        # Not converged yet: this pass's estimates seed the next one.
        prev_MSE = MSE
        prev_beta_u = beta_u
        prev_beta_p = beta_p

In [6]:
# Candidate (lam1, lam2) pairs: 2.5 .. 3.4 in steps of 0.1 on each axis.
lams = ((l1 / 10.0, l2 / 10.0) for l1 in range(25, 35) for l2 in range(25, 35))

# Train each candidate exactly once and remember only the best so far.
# Comparing on the validation MSE alone avoids ordering whole result tuples
# (a tie would fall through to comparing the bias dicts and raise TypeError),
# and keeping the winning parameters removes the need to retrain afterwards.
best = None
for candidate in lams:
    outcome = gridSearch(candidate)
    if best is None or outcome[0] < best[1][0]:
        best = (candidate, outcome)

lamPair, (curr_mse, alpha, beta_u, beta_i) = best
print("Best lambda is: ",lamPair)
print("Using this lambda, the mean square error on the validation set is: " ,curr_mse)

lambda is: (2.5, 2.5)
Alpha is  4.056718095557591
Training Mean Square Error is  0.5437329375610713
Validation Mean Square Error is  1.1765108644846956
lambda is: (2.5, 2.6)
Alpha is  4.056994169922862
Training Mean Square Error is  0.5469367398420749
Validation Mean Square Error is  1.1764860522620812
lambda is: (2.5, 2.7)
Alpha is  4.057276633374055
Training Mean Square Error is  0.5500411010718068
Validation Mean Square Error is  1.1765275311078591
lambda is: (2.5, 2.8)
Alpha is  4.057563910817
Training Mean Square Error is  0.5530509762947973
Validation Mean Square Error is  1.1766274156845162
lambda is: (2.5, 2.9)
Alpha is  4.057854685252135
Training Mean Square Error is  0.5559709979568128
Validation Mean Square Error is  1.1767788313094778
lambda is: (2.5, 3.0)
Alpha is  4.058147853715208
Training Mean Square Error is  0.5588054991263867
Validation Mean Square Error is  1.1769757644853795
lambda is: (2.5, 3.1)
Alpha is  4.058442491429695
Training Mean Square Error is  0.56155853

lambda is: (3.0, 3.0)
Alpha is  4.05754721563827
Training Mean Square Error is  0.5928786758801724
Validation Mean Square Error is  1.1760870694448466
lambda is: (3.0, 3.1)
Alpha is  4.057804637234239
Training Mean Square Error is  0.5957935429644358
Validation Mean Square Error is  1.1762677394775496
lambda is: (3.0, 3.2)
Alpha is  4.058064901337275
Training Mean Square Error is  0.5986274610732525
Validation Mean Square Error is  1.1764867024666579
lambda is: (3.0, 3.3)
Alpha is  4.058327197363898
Training Mean Square Error is  0.6013840801638851
Validation Mean Square Error is  1.1767397049752844
lambda is: (3.0, 3.4)
Alpha is  4.058590833241594
Training Mean Square Error is  0.6040668292119049
Validation Mean Square Error is  1.177022977469475
lambda is: (3.1, 2.5)
Alpha is  4.0561163755591565
Training Mean Square Error is  0.5828089687989961
Validation Mean Square Error is  1.1759397077337574
lambda is: (3.1, 2.6)
Alpha is  4.0563406663107875
Training Mean Square Error is  0.58622

In [7]:
# Score the held-out test split with the alpha / beta_u / beta_i currently in
# scope (beta_i holds the place biases).
testMSE = calMSE(testSet, alpha, beta_u, beta_i)
print("Test Mean Square Error is ", testMSE)

Test Mean Square Error is  1.1887627544667272
