<a href="https://colab.research.google.com/github/ajitkr1994/RestaurantRecommendation/blob/master/Assignment2_GoogleLocal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import gzip,json,ast,numpy,scipy.optimize, random
from collections import defaultdict

In [0]:
def readJSON(path):
    """Yield every line of a gzip-compressed text file with surrounding whitespace stripped.

    The file is opened in text mode as UTF-8. It is closed as soon as the
    generator is exhausted or closed, instead of staying open until the
    handle happens to be garbage-collected.

    Args:
        path: location of the ``.gz`` file.

    Yields:
        str: one stripped line at a time (the lines are not parsed here).
    """
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        for l in f:
            yield l.strip()

In [0]:
import string

def cleanReviewData(d):
    """Normalise the text fields of one review record in place.

    The reviewer name is lower-cased. The review text is lower-cased and has
    every character of ``string.punctuation`` removed. A falsy field (``None``
    or an empty string) is left as it is.

    Args:
        d: review dict with ``'reviewerName'`` and ``'reviewText'`` keys.

    Returns:
        The same dict ``d``.
    """
    name = d['reviewerName']
    if name:
        d['reviewerName'] = name.lower()

    text = d['reviewText']
    if text:
        # Mapping every punctuation character to None deletes it.
        strip_punctuation = str.maketrans('', '', string.punctuation)
        d['reviewText'] = text.lower().translate(strip_punctuation)

    return d

In [0]:
# Parse each line of the review dump as JSON and normalise its text fields.
# cleanReviewData edits the record in place and returns it, so the cleaned
# record can be collected directly.
restaurantData = [
    cleanReviewData(json.loads(line))
    for line in readJSON("./restaurant-smart-super-compact-reviews.json.gz")
]

In [0]:
import random

# Seed the generator before shuffling: the train/validation/test split below
# is cut from this order, so without a fixed seed every Restart & Run All
# produces a different split and different reported errors.
random.seed(0)
random.shuffle(restaurantData)

In [0]:
# Cut the shuffled reviews into 80% training, 10% validation and the
# remainder for testing.
REVIEW_DATASET_SIZE = len(restaurantData)
nTrain = int(REVIEW_DATASET_SIZE * 0.8)
nValid = int(REVIEW_DATASET_SIZE * 0.1)

_validEnd = nTrain + nValid
trainData, validData, testData = (
    restaurantData[:nTrain],
    restaurantData[nTrain:_validEnd],
    restaurantData[_validEnd:],
)

In [0]:
# [user, place, rating] rows plus the matching rating column for the
# validation and test splits.
xValidate, yValidate = [], []
for d in validData:
    xValidate.append([d['gPlusUserId'], d['gPlusPlaceId'], d['rating']])
    yValidate.append(d['rating'])

xTest, yTest = [], []
for d in testData:
    xTest.append([d['gPlusUserId'], d['gPlusPlaceId'], d['rating']])
    yTest.append(d['rating'])

In [0]:
# Distinct users, places and category labels appearing in the training split.
users = {d['gPlusUserId'] for d in trainData}
businesses = {d['gPlusPlaceId'] for d in trainData}

categories = set()
for d in trainData:
    # A falsy 'categories' value (e.g. None) contributes nothing.
    categories.update(d['categories'] or ())

len(users), len(businesses), len(categories)

In [0]:
# Mean rating over the training split.
ratingsList = []
for d in trainData:
    ratingsList.append(d['rating'])
ratingMean = sum(ratingsList) / len(ratingsList)

In [0]:
# Display the mean training rating (used further down as the constant baseline prediction).
ratingMean

In [0]:
# Lookup structures built from the training reviews:
#   userToPlaces[u]  -> set of places reviewed by user u
#   placesToUser[p]  -> set of users who reviewed place p
#   userSet/placesSet -> all users / places seen in training
#   ratings[(u, p)]  -> rating user u gave place p
userToPlaces = defaultdict(set)
placesToUser = defaultdict(set)
userSet = set()
placesSet = set()
ratings = {}

for review in trainData:
    u, p = review['gPlusUserId'], review['gPlusPlaceId']
    userToPlaces[u].add(p)
    placesToUser[p].add(u)
    userSet.add(u)
    placesSet.add(p)
    # A repeated (user, place) pair keeps the rating seen last.
    ratings[(u, p)] = review['rating']

In [0]:
# [user, place, rating] rows and the rating column for the training split.
xTrainG, yTrainG = [], []
for d in trainData:
    xTrainG.append([d['gPlusUserId'], d['gPlusPlaceId'], d['rating']])
    yTrainG.append(d['rating'])

# MSE With Mean Prediction

In [0]:
def MSE(predictions, labels):
    """Mean squared difference between paired predictions and labels.

    Values are paired with ``zip``, so any surplus elements of the longer
    sequence are ignored. Raises ``ZeroDivisionError`` when there are no pairs.
    """
    squared = []
    for predicted, actual in zip(predictions, labels):
        squared.append((predicted - actual) ** 2)
    return sum(squared) / len(squared)

In [0]:
# Baseline: predict the mean training rating for every validation review.
labels = [d['rating'] for d in validData]
alwaysPredictMean = [ratingMean] * len(validData)

MSE(alwaysPredictMean, labels)

# Implement Latent Factor Without Gamma

In [0]:
def getPrediction1(user, book, alpha, betaUsers, betaBooks):
    """Predict a rating as ``alpha`` plus the user's bias plus the item's bias.

    A user or item that is missing from its bias mapping contributes 0.

    Args:
        user: key into ``betaUsers``.
        book: key into ``betaBooks``.
        alpha: global offset.
        betaUsers: mapping of user -> bias.
        betaBooks: mapping of item -> bias.
    """
    userDelta = betaUsers.get(user, 0)
    bookDelta = betaBooks.get(book, 0)
    return alpha + userDelta + bookDelta


def getMSE1(x, y, alpha, betaUsers, betaBooks):
    """Mean squared error of the bias-only predictor over the rows of ``x``.

    Each row supplies the user in position 0 and the item in position 1;
    ``y`` holds the true ratings in the same order.
    """
    prediction = []
    for entry in x:
        prediction.append(getPrediction1(entry[0], entry[1], alpha, betaUsers, betaBooks))
    squaredErrors = numpy.square(numpy.subtract(prediction, y))
    return sum(squaredErrors) / len(x)


In [0]:
def train(reg_coef = 3):
    """Fit the bias-only model ``rating ~ alpha + betaUser + betaPlace``.

    Repeats closed-form updates of the offset, then every user bias, then
    every place bias, until the training MSE changes by at most 1e-06
    between two consecutive sweeps.

    Reads the module-level training structures ``xTrainG``, ``yTrainG``,
    ``userSet``, ``placesSet``, ``userToPlaces``, ``placesToUser`` and
    ``ratings``.

    Args:
        reg_coef: regularisation strength added to the denominator of every
            bias update.

    Returns:
        list: ``[alpha, betaUsers, betaPlaces]`` where the last two map
        user / place ids to their bias.
    """
    alpha = 0
    betaUsers = {user: 0 for user in userSet}
    betaPlaces = {place: 0 for place in placesSet}

    oldMse = 1
    newMse = 0

    while abs(newMse - oldMse) > 1e-06:
        # Least-squares offset with the biases held fixed: every training row
        # subtracts the bias of its own user and place. Subtracting each bias
        # only once per distinct user/place (as the sums over .values() did)
        # under-weights users and places that have many ratings.
        alpha = sum(r - betaUsers[u] - betaPlaces[p] for u, p, r in xTrainG) / len(xTrainG)

        for user in userSet:
            beta = 0
            pSet = userToPlaces[user]
            for place in pSet:
                beta += ratings[(user, place)] - alpha - betaPlaces[place]
            betaUsers[user] = beta/(reg_coef + len(pSet))

        for place in placesSet:
            beta = 0
            uSet = placesToUser[place]
            for user in uSet:
                beta += ratings[(user, place)] - alpha - betaUsers[user]
            betaPlaces[place] = beta/(reg_coef + len(uSet))

        oldMse = newMse
        newMse = getMSE1(xTrainG, yTrainG, alpha, betaUsers, betaPlaces)

    return [alpha, betaUsers, betaPlaces]

In [0]:
# Fit the bias-only model with regularisation strength 6.
alpha, bU, bB = train(6)

In [0]:
# Refit the bias-only model for regularisation strengths 0..19 and report the
# error on each split. alpha, bU and bB end up holding the last fit.
for i in range(20):
    alpha, bU, bB = train(i)
    for label, xs, ys in (
        ("Training MSE: ", xTrainG, yTrainG),
        ("Validation MSE: ", xValidate, yValidate),
        ("Test MSE: ", xTest, yTest),
    ):
        print(label, getMSE1(xs, ys, alpha, bU, bB))

# Implement Latent Factor With Gamma

In [0]:
def getPrediction(user, book, alpha, betaUsers, betaPlaces, gammaUsers, gammaPlaces):
    """Predict a rating with the latent-factor model.

    The prediction is ``alpha + betaUsers[user] + betaPlaces[book] +
    dot(gammaUsers[user], gammaPlaces[book])``. A user or place that is
    missing from a mapping contributes 0 for that term.

    Args:
        user: user id.
        book: place id (the parameter name is kept for existing callers).
        alpha: global offset.
        betaUsers, betaPlaces: mappings of id -> bias.
        gammaUsers, gammaPlaces: mappings of id -> latent factor vector.
    """
    userDelta = betaUsers.get(user, 0)
    placeDelta = betaPlaces.get(book, 0)

    gammaU = gammaUsers.get(user)
    gammaP = gammaPlaces.get(book)
    if gammaU is None or gammaP is None:
        # A missing factor vector stands for all zeros, so the dot product is
        # 0 whatever its length. Handling it here removes the dependency on a
        # module-level `k` that had to match the stored vector length.
        interaction = 0
    else:
        interaction = numpy.dot(gammaU, gammaP)

    return alpha + userDelta + placeDelta + interaction


def getMSE(x, y, alpha, betaUsers, betaPlaces, gammaU, gammaP):
    """Mean squared error of the latent-factor predictor over the rows of ``x``.

    Each row supplies the user in position 0 and the place in position 1;
    ``y`` holds the true ratings in the same order.
    """
    prediction = []
    for entry in x:
        prediction.append(
            getPrediction(entry[0], entry[1], alpha, betaUsers, betaPlaces, gammaU, gammaP)
        )
    squaredErrors = numpy.square(numpy.subtract(prediction, y))
    return sum(squaredErrors) / len(x)

def train2(l = 0.01, l1=1, l2=1, l3=1, l4=1, itr = 1):
    """Fit the latent-factor model by stochastic gradient descent.

    The model is ``alpha + betaUser + betaPlace + gammaUser . gammaPlace``.
    The offset and biases start from ``train()`` with its default
    regularisation; the factor vectors start from small random values. After
    every pass the training MSE and the validation MSE are printed.

    Reads the module-level ``k`` (length of the factor vectors), ``userSet``,
    ``placesSet``, ``xTrainG``, ``yTrainG``, ``xValidate`` and ``yValidate``.

    Args:
        l: learning rate.
        l1: penalty weight on the user bias.
        l2: penalty weight on the place bias.
        l3: penalty weight on the place factors.
        l4: penalty weight on the user factors.
        itr: number of passes over the shuffled training rows.

    Returns:
        list: ``[alpha, betaUsers, betaPlaces, gammaUsers, gammaPlaces]``.
    """
    alpha, betaUsers, betaPlaces = train()

    def _randomFactors():
        # Draw every latent dimension independently from [-0.05, 0.05).
        # Repeating one draw k times (`[x] * k`) makes all dimensions equal,
        # and because the updates below treat every dimension alike they
        # would stay equal, leaving the model with a single effective factor.
        return numpy.array([random.random() * 0.1 - 0.05 for _ in range(k)])

    gammaUsers = {user: _randomFactors() for user in userSet}
    gammaPlaces = {place: _randomFactors() for place in placesSet}

    for i in range(itr):
        # Visit the training rows in a fresh random order on every pass.
        Xy = list(zip(xTrainG, yTrainG))
        random.shuffle(Xy)

        xTrain = [d[0] for d in Xy]
        yTrain = [d[1] for d in Xy]

        for u, p, r in xTrain:
            betaU = betaUsers[u]
            betaP = betaPlaces[p]
            gammaP = gammaPlaces[p]
            gammaU = gammaUsers[u]

            pred = getPrediction(u, p, alpha, betaUsers, betaPlaces, gammaUsers, gammaPlaces)
            err = pred - r

            alpha = alpha - l * err
            betaUsers[u] = betaU - l * (err + l1 * betaU)
            betaPlaces[p] = betaP - l * (err + l2 * betaP)

            # Both factor updates use the vectors read before this step.
            # NOTE(review): the factor step size is l * k rather than l, kept
            # from the original -- confirm the scaling is intended.
            gammaPlaces[p] = gammaP - l*k * (err * gammaU + l3 * gammaP)
            gammaUsers[u] = gammaU - l*k * (err * gammaP + l4 * gammaU)

        newMse = getMSE(xTrain, yTrain, alpha, betaUsers, betaPlaces, gammaUsers, gammaPlaces)
        print(newMse)
        print("Validation MSE: ", i,  getMSE(xValidate, yValidate, alpha, betaUsers, betaPlaces, gammaUsers, gammaPlaces))

    return [alpha, betaUsers, betaPlaces, gammaUsers, gammaPlaces]



In [0]:
k = 6          # length of the latent factor vectors (read by train2)
l = 0.01       # base penalty weight, passed as l1..l4 below
lr = 0.0002    # learning rate, passed as train2's `l`
itr = 100      # passes over the training rows
alpha, bU, bB, gU, gB = train2(l=lr, l1=l, l2=l, l3=l*k, l4=l*k, itr=itr)

for label, xs, ys in (
    ("Training MSE: ", xTrainG, yTrainG),
    ("Validation MSE: ", xValidate, yValidate),
    ("Test MSE: ", xTest, yTest),
):
    print(label, getMSE(xs, ys, alpha, bU, bB, gU, gB))



# Location Study for users.

In [0]:
import gzip
import random
from collections import defaultdict
import numpy
import json


def readGz(path):
    """Yield one evaluated Python object per line of a gzip-compressed text file.

    The file is closed when the generator is exhausted or closed.

    NOTE(review): every line is passed to ``eval``, which executes arbitrary
    expressions. Only use this on trusted files; if the lines are plain
    literals, ``ast.literal_eval`` would be a safer replacement.
    """
    with gzip.open(path, 'rt') as f:
        for l in f:
            yield eval(l)

def readJSON(path):
    """Yield every line of a gzip-compressed text file with surrounding whitespace stripped.

    This definition replaces the ``readJSON`` defined near the top of the
    notebook, so it opens the file the same way (text mode, UTF-8) instead of
    falling back to the platform default encoding. The file has no header
    line to skip. It is closed when the generator is exhausted or closed.
    """
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        for l in f:
            yield l.strip()

In [0]:
# userData = []
# for line in readJSON("users.clean.json.gz"):
#     userData.append(eval(line))


In [0]:
# userToLocation = defaultdict()

# for u in userData:
#     userToLocation[u['gPlusUserId']] = u['currentPlace']

In [0]:
# count = 0
# for u in userSet:
#     if userToLocation.get(u, None) is not None:
#         count+=1

In [0]:
# count/len(userSet)

# Sentiment Analysis

In [0]:
import nltk

In [0]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer


In [0]:
from textblob import TextBlob

# Smoke test of both sentiment scorers on one sentence.
t = "It was lovely"

blob = TextBlob(t)
print(blob.sentiment)

try:
    senti = SentimentIntensityAnalyzer()
except LookupError:
    # The VADER lexicon is an NLTK data resource that is downloaded
    # separately from the package; fetch it once on a fresh environment.
    nltk.download('vader_lexicon')
    senti = SentimentIntensityAnalyzer()
print(senti.polarity_scores(t))

# ENSEMBLE

In [0]:
# Fallback biases for users / places that have no learned bias.
bUAvg = sum(bU.values())/len(bU)
bBAvg = sum(bB.values())/len(bB)

# Average review length over the reviews that have text. Records whose
# 'reviewText' is None are skipped here, as they are when the feature tables
# are built; calling len() on them would raise TypeError.
rLength = [len(d['reviewText']) for d in trainData if d['reviewText'] is not None]
avgReviewLength = sum(rLength)/len(rLength)

In [0]:
# Display the average training review length (the divisor for the length feature in getFeature).
avgReviewLength

In [0]:
def getFeature(d):
    """Build the nine-element feature row for one review.

    Columns, in order: constant 1, review length relative to
    ``avgReviewLength``, TextBlob polarity, TextBlob subjectivity, VADER
    ``neg`` / ``pos`` / ``compound`` scores, the user's bias from ``bU``
    (``bUAvg`` if absent) and the place's bias from ``bB`` (``bBAvg`` if
    absent).
    """
    text = d['reviewText']
    sentiment = TextBlob(text).sentiment
    vader = senti.polarity_scores(text)
    return [
        1,
        len(text)/avgReviewLength,
        sentiment.polarity,
        sentiment.subjectivity,
        vader['neg'],
        vader['pos'],
        vader['compound'],
        bU.get(d['gPlusUserId'], bUAvg),
        bB.get(d['gPlusPlaceId'], bBAvg),
    ]


In [0]:
# Feature rows and ratings for the training reviews that have text.
_trainWithText = [d for d in trainData if d['reviewText'] is not None]
xTrainSentiment = [getFeature(d) for d in _trainWithText]
yTrainSentiment = [d['rating'] for d in _trainWithText]

In [0]:
# Feature rows and ratings for the validation reviews that have text.
yValid = []
_validWithText = [d for d in validData if d['reviewText'] is not None]
xValidSentiment = [getFeature(d) for d in _validWithText]
yValidSentiment = [d['rating'] for d in _validWithText]
        

In [0]:
# Feature rows and ratings for the test reviews that have text.
_testWithText = [d for d in testData if d['reviewText'] is not None]
xTestSentiment = [getFeature(d) for d in _testWithText]
yTestSentiment = [d['rating'] for d in _testWithText]
        

# Using Sentiment Scores

In [0]:
def getX(X):
    """Keep only the first seven columns of every feature row.

    Raises ``IndexError`` if a row has fewer than seven elements.
    """
    return [[row[col] for col in range(7)] for row in X]

from sklearn.linear_model import LinearRegression
# Linear regression of the training ratings on the columns that getX selects.
reg = LinearRegression().fit(getX(xTrainSentiment), yTrainSentiment)

In [0]:
# Predict on every split with the fitted linear model and report the errors.
yPredTrain, yPredV, yPredT = [
    reg.predict(getX(rows))
    for rows in (xTrainSentiment, xValidSentiment, xTestSentiment)
]

for label, predicted, actual in (
    ("Training MSE: ", yPredTrain, yTrainSentiment),
    ("Validation MSE: ", yPredV, yValidSentiment),
    ("Test MSE: ", yPredT, yTestSentiment),
):
    print(label, MSE(predicted, actual))


In [0]:
from sklearn.linear_model import Ridge
# Ridge regression with penalty strength alpha=0.1; it is fitted in the next cell.
clf = Ridge(alpha=1e-1)

In [0]:
# Fit the ridge model on the columns that getX selects.
clf.fit(getX(xTrainSentiment), yTrainSentiment)

In [0]:
# Predict on every split with the fitted ridge model and report the errors.
yPredTrain, yPredV, yPredT = [
    clf.predict(getX(rows))
    for rows in (xTrainSentiment, xValidSentiment, xTestSentiment)
]

for label, predicted, actual in (
    ("Training MSE: ", yPredTrain, yTrainSentiment),
    ("Validation MSE: ", yPredV, yValidSentiment),
    ("Test MSE: ", yPredT, yTestSentiment),
):
    print(label, MSE(predicted, actual))


# Ensemble 

In [0]:
def getX(X):
    """Return the feature rows unchanged.

    This redefinition replaces the earlier ``getX`` (which kept only the
    first seven columns), so from here on every column of a feature row is
    passed to the models.
    """
    return X

from sklearn.linear_model import LinearRegression
# Linear regression on the full feature rows (getX is the identity at this point).
reg = LinearRegression().fit(getX(xTrainSentiment), yTrainSentiment)


In [0]:
from sklearn.linear_model import Ridge

# Ridge regression (alpha=1e4) on the full feature rows.
clf = Ridge(alpha=1e4)
clf.fit(getX(xTrainSentiment), yTrainSentiment)

yPredTrain, yPredV, yPredT = [
    clf.predict(getX(rows))
    for rows in (xTrainSentiment, xValidSentiment, xTestSentiment)
]

for label, predicted, actual in (
    ("Training MSE: ", yPredTrain, yTrainSentiment),
    ("Validation MSE: ", yPredV, yValidSentiment),
    ("Test MSE: ", yPredT, yTestSentiment),
):
    print(label, MSE(predicted, actual))


In [0]:
from sklearn.ensemble import RandomForestRegressor
# Random forest of 100 trees with a fixed seed, fitted on the full feature rows.
clf = RandomForestRegressor(n_estimators=100, random_state=0)
clf.fit(getX(xTrainSentiment), yTrainSentiment)

In [0]:
# Predict on every split with the fitted forest and report the errors.
yPredTrain, yPredV, yPredT = [
    clf.predict(getX(rows))
    for rows in (xTrainSentiment, xValidSentiment, xTestSentiment)
]

for label, predicted, actual in (
    ("Training MSE: ", yPredTrain, yTrainSentiment),
    ("Validation MSE: ", yPredV, yValidSentiment),
    ("Test MSE: ", yPredT, yTestSentiment),
):
    print(label, MSE(predicted, actual))

# Ensembling with Places features

In [0]:
# Load one place record per line of the places dump.
placeData = []

for line in readJSON("./places.clean.json.gz"):
    # NOTE(review): eval() executes whatever expression the line contains.
    # Only run this on a trusted file; if the lines are plain literals,
    # ast.literal_eval would be a safer way to parse them.
    d = eval(line)
    placeData.append(d)


In [0]:
def calcHours(hoursText):
    """Total the opening hours listed for a place and flag whether a weekend day is listed.

    Args:
        hoursText: iterable of day entries. ``day[0]`` is the day name and
            ``day[1]`` holds that day's opening ranges. Each range is read
            from ``entry[0]`` (or from the entry itself when it is a plain
            string). Assumes range strings such as ``'11:00 am--9:00 pm'``,
            ``'5:30--10:00 pm'``, ``'Closed'`` and ``'Open 24 hours'`` --
            TODO confirm against the places data.

    Returns:
        tuple: ``(hours_val, isWeekend)``. ``hours_val`` is the summed length
        in hours of every range that could be parsed, with 24 added for each
        ``'Open 24 hours'`` entry. ``isWeekend`` is True when any entry is
        named Saturday or Sunday. Empty input gives ``(0.0, False)``.
        NOTE(review): the flag looks only at the day name, as before; confirm
        whether it should also require opening hours on that day.
    """
    # `datetime` is not imported at the top of the notebook. Previously the
    # resulting NameError was swallowed by a bare `except`, so no range was
    # ever counted.
    import datetime

    def _range_hours(text):
        """Length in hours of one 'start--end' string, or None if it cannot be parsed."""
        parts = text.split('--')
        if len(parts) < 2:
            return None
        start, end = parts[0], parts[1]
        # First both ends carrying am/pm; then a start without am/pm, in
        # which case the end is compared on its clock part only.
        for fmt, end_text in (('%I:%M %p', end), ('%I:%M', end.split(" ")[0])):
            try:
                opened = datetime.datetime.strptime(start, fmt)
                closed = datetime.datetime.strptime(end_text, fmt)
            except ValueError:
                continue
            hr = (closed - opened).total_seconds()/60./60.
            if hr < 0.:
                # The range runs past midnight.
                hr = 24. + hr
            return hr
        return None

    hours_val = 0.
    # Set once for the whole place. Resetting it for every day made the
    # result depend only on the last entry and left it unbound for empty input.
    isWeekend = False
    for day in hoursText:
        if str(day[0]).lower() in ('saturday', 'sunday'):
            isWeekend = True

        ranges = day[1]
        if isinstance(ranges, str):
            # A bare string such as 'Closed' is one entry, not a run of characters.
            ranges = [ranges]

        for entry in ranges:
            str_range = entry if isinstance(entry, str) else entry[0]
            # Compare the range text itself; the split list can never equal a string.
            if str_range == 'Open 24 hours':
                hours_val += 24.
                continue
            hr = _range_hours(str_range)
            if hr is not None:
                hours_val += hr
            # 'Closed' and any unparseable text add nothing.
    return hours_val, isWeekend

In [0]:
# Price level per place: the length of the 'price' value, for the records
# that have one, plus the average over those records.
pricedPlaces = [place for place in placeData if place['price'] is not None]
placeToPrice = defaultdict(float)
for place in pricedPlaces:
    placeToPrice[place['gPlusPlaceId']] = len(place['price'])
countPlaceWithPrice = len(pricedPlaces)
totalPrice = float(sum(len(place['price']) for place in pricedPlaces))
placeAveragePrice = totalPrice/countPlaceWithPrice

# Opening hours and weekend flag per place, for the records that list hours,
# plus the average number of hours over those records.
placeToHours = defaultdict(float)
placeIsWeekend = defaultdict(int)
totalHours = 0.
countPlaceWithHours = 0
for place in placeData:
    if place['hours'] is None:
        continue
    hrs, weekend = calcHours(place['hours'])
    placeId = place['gPlusPlaceId']
    placeToHours[placeId] = hrs
    placeIsWeekend[placeId] = float(weekend)
    totalHours += hrs
    countPlaceWithHours += 1
placeAverageHours = totalHours/countPlaceWithHours

In [0]:
#places data
def place_price(reviewDatum):
    """Price level of the reviewed place, or ``placeAveragePrice`` when the place has none recorded."""
    return placeToPrice.get(reviewDatum['gPlusPlaceId'], placeAveragePrice)

def place_hours(reviewDatum):
    """Opening hours of the reviewed place, or ``placeAverageHours`` when the place has none recorded."""
    return placeToHours.get(reviewDatum['gPlusPlaceId'], placeAverageHours)

def place_is_weekend(reviewDatum):
    """Weekend flag recorded for the reviewed place, or 0. when the place has none recorded."""
    return placeIsWeekend.get(reviewDatum['gPlusPlaceId'], 0.)

# Mean of all training ratings; the fallback for places without any.
total_avg_rating = sum(ratings.values())/len(ratings)

def place_average_rating(reviewDatum):
    """Mean training rating of the reviewed place.

    A place with no training ratings gets ``total_avg_rating`` rather than 0,
    which is not a meaningful average rating.
    """
    place_id = reviewDatum['gPlusPlaceId']
    # .get() avoids inserting an empty set into the placesToUser defaultdict
    # every time an unseen place is looked up.
    place_ratings = [ratings[u, place_id] for u in placesToUser.get(place_id, ())]
    if not place_ratings:
        return total_avg_rating
    return sum(place_ratings)/len(place_ratings)

In [0]:
def getPlacesFeature(d):
    """Place feature row for one review: [price level, opening hours, weekend flag]."""
    extractors = (place_price, place_hours, place_is_weekend)
    return [extract(d) for extract in extractors]
    

In [0]:
# Place features and ratings for the training reviews that have text.
_trainWithText = [d for d in trainData if d['reviewText'] is not None]
xTrainPlaces = [getPlacesFeature(d) for d in _trainWithText]
yTrainPlaces = [d['rating'] for d in _trainWithText]

In [0]:
# Place features and ratings for the validation reviews that have text.
_validWithText = [d for d in validData if d['reviewText'] is not None]
xValidationPlaces = [getPlacesFeature(d) for d in _validWithText]
yValidationPlaces = [d['rating'] for d in _validWithText]

In [0]:
# Place features and ratings for the test reviews that have text.
_testWithText = [d for d in testData if d['reviewText'] is not None]
xTestPlaces = [getPlacesFeature(d) for d in _testWithText]
yTestPlaces = [d['rating'] for d in _testWithText]

In [0]:
# Linear regression on the place features alone.
# The three-column place rows are passed to the model directly on every
# split. Previously getX was applied to some of these matrices and not to
# others, which only worked while getX happened to be the identity.
reg = LinearRegression().fit(xTrainPlaces, yTrainPlaces)

yPredTrain = reg.predict(xTrainPlaces)
yPredV = reg.predict(xValidationPlaces)
yPredT = reg.predict(xTestPlaces)

# Score against the labels collected together with the place features.
print("Training MSE: ", MSE(yPredTrain, yTrainPlaces))
print("Validation MSE: ", MSE(yPredV, yValidationPlaces))
print("Test MSE: ", MSE(yPredT, yTestPlaces))

In [0]:
# Ridge regression (alpha=1e2) on the place features alone.
# The three-column place rows are passed to the model directly on every
# split. Previously getX was applied to some of these matrices and not to
# others, which only worked while getX happened to be the identity.
clf = Ridge(alpha=1e2)
clf.fit(xTrainPlaces, yTrainPlaces)

yPredTrain = clf.predict(xTrainPlaces)
yPredV = clf.predict(xValidationPlaces)
yPredT = clf.predict(xTestPlaces)

# Score against the labels collected together with the place features.
print("Training MSE: ", MSE(yPredTrain, yTrainPlaces))
print("Validation MSE: ", MSE(yPredV, yValidationPlaces))
print("Test MSE: ", MSE(yPredT, yTestPlaces))

# All Combined


In [0]:
import numpy as np

# Put the sentiment/bias columns and the place columns side by side for each split.
XTrainCombined, XValidationCombined, XTestCombined = [
    np.concatenate(pair, axis=1)
    for pair in (
        (xTrainSentiment, xTrainPlaces),
        (xValidSentiment, xValidationPlaces),
        (xTestSentiment, xTestPlaces),
    )
]

In [0]:
# Linear regression on the combined feature matrix.
reg = LinearRegression().fit(XTrainCombined, yTrainPlaces)

yPredTrain, yPredV, yPredT = [
    reg.predict(rows)
    for rows in (XTrainCombined, XValidationCombined, XTestCombined)
]

for label, predicted, actual in (
    ("Training MSE: ", yPredTrain, yTrainSentiment),
    ("Validation MSE: ", yPredV, yValidSentiment),
    ("Test MSE: ", yPredT, yTestSentiment),
):
    print(label, MSE(predicted, actual))

In [0]:
# Ridge regression (alpha=1e4) on the combined feature matrix.
clf = Ridge(alpha=1e4)
clf.fit(XTrainCombined, yTrainPlaces)

In [0]:
# Predict on every split with the combined-feature ridge model and report the errors.
yPredTrain, yPredV, yPredT = [
    clf.predict(rows)
    for rows in (XTrainCombined, XValidationCombined, XTestCombined)
]

for label, predicted, actual in (
    ("Training MSE: ", yPredTrain, yTrainSentiment),
    ("Validation MSE: ", yPredV, yValidSentiment),
    ("Test MSE: ", yPredT, yTestSentiment),
):
    print(label, MSE(predicted, actual))