In [97]:
import random
import loader 

from collections import defaultdict

from sklearn import linear_model
import numpy as np

import heapq

In [98]:
def getGlobalAverage(trainRatings):
    """Return the arithmetic mean of the ratings in the training set.

    trainRatings: sequence of numeric ratings.
    Returns whatever ``np.average`` yields for that sequence.
    """
    return np.average(trainRatings)

def alphaUpdate(ratingsTrain, alpha, betaU, betaLoc, lamb):
    """Compute the next value of the global offset alpha.

    alpha = sum over (u, loc, rating) in ratingsTrain of
            (rating - (betaU[u] + betaLoc[loc])), divided by len(ratingsTrain)

    ratingsTrain: sized collection of (user, location, rating) triples.
    betaU / betaLoc: current user and location biases, indexed by id.
    alpha and lamb are part of the signature but are not read here.
    """
    sampleCount = len(ratingsTrain)

    # Accumulate what is left of each rating once both biases are removed.
    residualTotal = 0
    for user, place, stars in ratingsTrain:
        residualTotal += stars - (betaU[user] + betaLoc[place])

    return residualTotal / sampleCount

def betaUUpdate(ratingsPerUser, alpha, betaU, betaLoc, lamb):
    """Compute the next bias of every user in ratingsPerUser.

    For each user u with rated locations I_u:
        betaU[u] = sum_{i in I_u} (R_u,i - (alpha + betaLoc[i])) / (lamb + |I_u|)

    ratingsPerUser: mapping user -> list of (location, rating) pairs.
    Returns a new dict; the betaU argument is not read or modified.
    """
    updatedBias = {}

    for user, history in ratingsPerUser.items():
        # Residual of this user's ratings after the offset and location bias.
        residual = 0
        for place, stars in history:
            residual += stars - (alpha + betaLoc[place])

        # lamb in the denominator shrinks the bias of users with few ratings.
        updatedBias[user] = residual / (lamb + len(history))

    return updatedBias

def betaLocUpdate(ratingsPerLocation, alpha, betaU, betaLoc, lamb):
    """Compute the next bias of every location in ratingsPerLocation.

    For each location i with raters U_i:
        betaLoc[i] = sum_{u in U_i} (R_u,i - (alpha + betaU[u])) / (lamb + |U_i|)

    ratingsPerLocation: mapping location -> list of (user, rating) pairs.
    Returns a new dict; the betaLoc argument is not read or modified.
    """
    updatedBias = {}

    for place, visitors in ratingsPerLocation.items():
        # Residual of this location's ratings after the offset and user bias.
        residual = 0
        for user, stars in visitors:
            residual += stars - (alpha + betaU[user])

        # lamb in the denominator shrinks the bias of rarely rated locations.
        updatedBias[place] = residual / (lamb + len(visitors))

    return updatedBias

def goodModel(ratingsTrain, ratingsPerUser, ratingsPerLocation, alpha, betaU, betaLoc,
              lamb=0.5, iterations=100):
    """Fit the bias-only model by repeating the three update steps.

    Each round recomputes alpha, then the user biases, then the location
    biases, always feeding in the values produced by the previous step.

    ratingsTrain: collection of (user, location, rating) triples.
    ratingsPerUser: mapping user -> list of (location, rating).
    ratingsPerLocation: mapping location -> list of (user, rating).
    alpha, betaU, betaLoc: starting values of the parameters.
    lamb: regularisation strength passed to every update (default 0.5,
        the value that used to be hard-coded).
    iterations: number of update rounds (default 100, the value that used
        to be hard-coded); 0 returns the starting values unchanged.

    Returns the tuple (alpha, betaU, betaLoc) after the last round.
    """
    for _ in range(iterations):
        alpha = alphaUpdate(ratingsTrain, alpha, betaU, betaLoc, lamb)
        betaU = betaUUpdate(ratingsPerUser, alpha, betaU, betaLoc, lamb)
        betaLoc = betaLocUpdate(ratingsPerLocation, alpha, betaU, betaLoc, lamb)

    return alpha, betaU, betaLoc

In [99]:
# Load the review and metadata files for the train and test states.
# Alternative dataset pair, kept for reference:
#   train: "review-Washington_10.json.gz" / "meta-Washington.json.gz"
#   test:  "review-Oregon_10.json.gz"     / "meta-Oregon.json.gz"
train_reviews, train_metadata, test_reviews, test_metadata = (
    loader.load_to_dict(path)
    for path in (
        "review-Alaska_10.json.gz",
        "meta-Alaska.json.gz",
        "review-Wyoming_10.json.gz",
        "meta-Wyoming.json.gz",
    )
)

In [100]:
# Build, per user, the set of places they rated 4 or higher.
# users_likes[user_id] = {gmap_id, ...}
users_likes = defaultdict(set)
dupe_review_count = 0
dupe_removed_count = 0

for review in test_reviews:
    user_id, gmap_id, rating = review["user_id"], review["gmap_id"], review["rating"]

    # Looking the user up also registers users that never liked anything.
    seen_likes = users_likes[user_id]

    if gmap_id in seen_likes:
        # The place is already liked, so this is a repeated review of it.
        dupe_review_count += 1

        # A later low rating overrides the earlier like
        # (presumably reviews arrive oldest first; verify against the loader).
        if rating < 4:
            seen_likes.remove(gmap_id)
            dupe_removed_count += 1
    elif rating >= 4:
        seen_likes.add(gmap_id)

print("Num dupes: ", dupe_review_count)
print("Num dupes removed: ", dupe_removed_count)

# Split each user's liked places into a revealed part and a hidden part.
users_revealed_likes = defaultdict(list)
users_hidden_likes = defaultdict(list)
users_total_likes = defaultdict(list)

random.seed(42)
for user_id, liked_places in users_likes.items():
    # Roughly 8:2 revealed vs hidden, shuffled before splitting.
    #
    # liked_places is a set, and the iteration order of a set of strings
    # changes between interpreter runs (hash randomization). Sorting first
    # gives the seeded shuffle a fixed starting order, so the split is the
    # same after every kernel restart.
    # Assumes gmap_ids are mutually comparable (they print as strings).
    liked_list = sorted(liked_places)
    num_likes = len(liked_list)

    random.shuffle(liked_list)

    # Hide at least one like, otherwise 20% of them (rounded down).
    min_hidden_count = 1
    split_point = max(min_hidden_count, int(0.2 * num_likes))

    revealed = liked_list[split_point:]
    hidden = liked_list[:split_point]

    # Users without any like have nothing to hide and are left out.
    # A user with a single like ends up with it hidden and nothing revealed.
    if len(hidden) >= min_hidden_count:
        users_revealed_likes[user_id] = revealed
        users_hidden_likes[user_id] = hidden
        users_total_likes[user_id] = liked_list

# Save user likes: revealed, hidden, and full
loader.save_likes("users_likes_full.json", users_total_likes)
loader.save_likes("users_revealed_likes.json", users_revealed_likes)
loader.save_likes("users_hidden_likes.json", users_hidden_likes)

Num dupes:  3224
Num dupes removed:  0
Saved to  eval/users_likes_full.json
Saved to  eval/users_revealed_likes.json
Saved to  eval/users_hidden_likes.json


In [101]:
# Gather every user and location that appears in the test reviews, together
# with the reviews that may be used for fitting.
test_users = set()
test_locations = set()

# Set of (user_id, gmap_id, rating) triples
test_revealed_and_negative_reviews = set()

for review in test_reviews:
    user, loc, rating = review["user_id"], review["gmap_id"], review["rating"]
    test_users.add(user)
    test_locations.add(loc)

    # Keep a review when its place is one of the user's revealed likes or when
    # the rating is below 4. Reviews rated 4 or higher whose place is not
    # revealed are left out.
    is_revealed_like = loc in users_revealed_likes[user]
    if not (is_revealed_like or rating < 4):
        continue

    test_revealed_and_negative_reviews.add((user, loc, rating))

# Later cells iterate over these as lists.
test_users = [*test_users]
test_locations = [*test_locations]

In [102]:
# Fit alpha, betaU and betaLoc on test_revealed_and_negative_reviews,
# a set of (user, location, rating) triples.
ratingsTrain = test_revealed_and_negative_reviews

# Index the same triples by user and by location for the bias updates.
ratingsPerUser = defaultdict(list)
ratingsPerLocation = defaultdict(list)
for u, loc, r in ratingsTrain:
    ratingsPerUser[u].append((loc, r))
    ratingsPerLocation[loc].append((u, r))

trainRatings = [triple[2] for triple in ratingsTrain]

# Every user and location bias starts at zero.
betaU = dict.fromkeys(ratingsPerUser, 0)
betaLoc = dict.fromkeys(ratingsPerLocation, 0)

# Starting guess for alpha: the mean training rating.
alpha = getGlobalAverage(trainRatings)

alpha, betaU, betaLoc = goodModel(ratingsTrain, ratingsPerUser, ratingsPerLocation, alpha, betaU, betaLoc)

In [103]:
# Score every (user, location) pair with alpha + betaU[user] + betaLoc[location]
# and keep the k best-scoring locations per user.
k = 30
recommendation = {}

for user_id in test_users:
    # Users that were not part of the fit fall back to a zero bias.
    user_score = alpha + betaU.get(user_id, 0)

    # Already-revealed likes are never recommended again. A set makes the
    # per-location check cheap, and .get avoids adding empty entries to
    # users_revealed_likes.
    revealed = set(users_revealed_likes.get(user_id, ()))

    # Locations that were not part of the fit get a zero bias. The previous
    # code set the default on `bl` but scored with `bi`, so such locations
    # silently reused the bias of the location scored just before them.
    curr_user_list = [
        (user_score + betaLoc.get(gmap_id, 0), gmap_id)
        for gmap_id in test_locations
        if gmap_id not in revealed
    ]

    # Same result as sorting in descending order and slicing the first k,
    # without sorting the full list.
    recommendation[user_id] = [loc for r, loc in heapq.nlargest(k, curr_user_list)]

In [104]:
# Sanity check: show the recommendation list of the first user in the dict.
print(recommendation[list(recommendation)[0]])

['0x87688fc120455555:0x44076d7148cf9270', '0x8760bcc89d4d78e7:0xc53bc074835bfd13', '0x876f3ba7841f29c5:0x6655febc1b5e8982', '0x87688f8bb09a8053:0xd04f17b5404837bf', '0x8760bcc86bd59b23:0x9be8883c087dd079', '0x875a60f06403a8e5:0xc09bada4728331e7', '0x876f3bc5c9f330f3:0xe7cb83191d2fccd9', '0x876f3dbe174d6dbb:0xf3b11e29694a8bdd', '0x876f3b2e64f6905f:0x5b6b9ed602b819c4', '0x534b1f9fcde76181:0xd4159a1a2a0f965a', '0x8760bba2d8a2af79:0x45376e789893eb50', '0x87688fe23c528257:0x3b860f0518f2e8e4', '0x875708f085a3c085:0x4210a53468aa349d', '0x876f3b3c320f679b:0x25752932aa6d7d24', '0x876f3a16ccb3ceff:0x99781dc3c03b3972', '0x876f3b9dce619ccb:0x92de6ed944646fed', '0x876f3b6bdcd67ae9:0x3c5b4fbf5f12a125', '0x8760bdf5fe1dc225:0xa6f56aa19473457e', '0x8756725b8e8bc895:0xb73b39b37f3f0165', '0x876f252ea5603beb:0xeb46992d5bf71e42', '0x876f3b3e168d0177:0x30e02889fc91e081', '0x53531a5d1f9a86fb:0x4359e8d9daaee2b8', '0x8760bc7d139fc4a1:0x555b7acc2a3ba72d', '0x876f3ac14465ed89:0xb9331b23742c01d1', '0x8759a661a94a

In [105]:
# Persist the per-user top-k recommendation lists through the project loader.
# The recorded cell output reports the file being saved under eval/.
loader.save_likes("hw3_iterative_recommendation_per_user.json", recommendation)

Saved to  eval/hw3_iterative_recommendation_per_user.json
