In [7]:
import random
import loader 

from collections import defaultdict

from sklearn import linear_model
import numpy

In [8]:
# Load the review and metadata dumps for the training and test states.
# Later cells iterate over these objects record by record and index the
# metadata by position, so each one is used as a sequence of dict records.
# The loads run in the order listed: train reviews, train metadata,
# test reviews, test metadata.
train_reviews, train_metadata, test_reviews, test_metadata = (
    loader.load_to_dict(dataset_path)
    for dataset_path in (
        "review-Washington_10.json.gz",
        "meta-Washington.json.gz",
        "review-Oregon_10.json.gz",
        "meta-Oregon.json.gz",
    )
)

# train_reviews = loader.load_to_dict("review-Alaska_10.json.gz")
# train_metadata = loader.load_to_dict("meta-Alaska.json.gz")

# test_reviews = loader.load_to_dict("review-Wyoming_10.json.gz")
# test_metadata = loader.load_to_dict("meta-Wyoming.json.gz")

In [9]:
# Build each user's set of liked places, then split it into a revealed part
# (known to the recommender) and a hidden part (held out for evaluation).
# users_likes[user_id] = {gmap_ids the user rated >= LIKE_THRESHOLD}
LIKE_THRESHOLD = 4
HIDDEN_FRACTION = 0.2  # 8:2 ratio for revealed vs hidden

users_likes = defaultdict(set)
dupe_review_count = 0    # re-reviews of a place that is currently in the user's liked set
dupe_removed_count = 0   # re-reviews that turned a like into a non-like

for review in test_reviews:
    user_id = review["user_id"]
    gmap_id = review["gmap_id"]
    rating = review["rating"]

    liked = users_likes[user_id]

    if gmap_id in liked:
        dupe_review_count += 1

        # The later review wins: a low re-rating retracts the earlier like.
        # NOTE(review): "later" means later in test_reviews' iteration order;
        # this is only "most recent" if the loader yields reviews chronologically.
        if rating < LIKE_THRESHOLD:
            liked.remove(gmap_id)
            dupe_removed_count += 1

    if rating >= LIKE_THRESHOLD:
        liked.add(gmap_id)  # set.add is a no-op when the place is already liked

print("Num dupes: ", dupe_review_count) 
print("Num dupes removed: ", dupe_removed_count) 

# Split off the users_likes to revealed and hidden
users_revealed_likes = defaultdict(list)
users_hidden_likes = defaultdict(list)
users_total_likes = defaultdict(list)

# Every user must contribute at least this many hidden likes to be kept.
min_hidden_count = 1

random.seed(42)
for user_id, liked_places in users_likes.items():
    # Sort before shuffling: a set of strings has no stable iteration order
    # across interpreter runs (string hashing is randomised per process), so
    # without this the seeded shuffle would still give a different split after
    # every kernel restart. Assumes gmap_ids are mutually comparable (strings).
    liked_list = sorted(liked_places)
    num_likes = len(liked_list)

    random.shuffle(liked_list)

    # The first `split_point` shuffled items are hidden, the rest are revealed.
    split_point = max(min_hidden_count, int(HIDDEN_FRACTION * num_likes))

    hidden = liked_list[:split_point]
    revealed = liked_list[split_point:]

    # Users with too few likes (including users whose liked set is empty) are
    # left out of all three dictionaries. A user with exactly one like is kept
    # with that like hidden and an empty revealed list.
    if len(hidden) >= min_hidden_count:
        users_revealed_likes[user_id] = revealed
        users_hidden_likes[user_id] = hidden
        users_total_likes[user_id] = liked_list

# Save user likes: revealed, hidden, and full
loader.save_likes("users_likes_full.json", users_total_likes)
loader.save_likes("users_revealed_likes.json", users_revealed_likes)
loader.save_likes("users_hidden_likes.json", users_hidden_likes)

Num dupes:  68767
Num dupes removed:  0
Saved to  eval/users_likes_full.json
Saved to  eval/users_revealed_likes.json
Saved to  eval/users_hidden_likes.json


In [10]:
# Linear regression baseline (not personalized): the label is each location's average rating.
# Plan: fit the model on features built from each training location's metadata,
# predict the average rating for every location in the test set,
# then rank the test locations from highest to lowest predicted rating.

# cats = {}
# i = 0

# for d in train_metadata + test_metadata:
#     if d["category"]:
#         for c in d["category"]:
#             if c not in cats:
#                 cats[c] = i
#                 i += 1

# print(len(list(cats.keys())))

# plannings = {}
# i = 0

# for d in train_metadata + test_metadata:
#     if d["MISC"] and "Planning" in d["MISC"] and d["MISC"]["Planning"]:
#         for p in d["MISC"]["Planning"]:
#             if p not in plannings:
#                 plannings[p] = i
#                 i += 1

# print(len(list(plannings.keys())))

# payments = {}
# i = 0

# for d in train_metadata + test_metadata:
#     if d["MISC"] and "Payments" in d["MISC"] and d["MISC"]["Payments"]:
#         for p in d["MISC"]["Payments"]:
#             if p not in payments:
#                 payments[p] = i
#                 i += 1

# print(len(list(payments.keys())))

In [11]:
def feature(datum):
    """Build the regression feature vector for one location metadata record.

    Layout of the returned list:
      [0]     constant 1 (bias term)
      [1]     length of the ``price`` value, or 0 when it is empty/None
      [2]     length of the ``description`` value, or 0 when it is empty/None
      [3]     ``num_of_reviews`` exactly as stored in the record
      [4:11]  one 0/1 flag per weekday, Monday first, set when that day is
              named by the first element of any ``hours`` entry

    Disabled experiments (not part of the vector): raw latitude/longitude and
    one-hot encodings of ``category``, ``MISC["Planning"]`` and
    ``MISC["Payments"]``.
    """
    weekdays = (
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    )

    price = datum["price"]
    price_length = len(price) if price else 0

    description = datum["description"]
    description_length = len(description) if description else 0

    review_count = datum["num_of_reviews"]

    # NOTE(review): a day is flagged as soon as it is listed, whatever the rest
    # of the entry says. If the data lists closed days explicitly they are
    # counted as open here -- confirm against the dataset.
    open_flags = [0] * len(weekdays)
    for entry in datum["hours"] or ():
        day_name = entry[0]
        if day_name in weekdays:
            open_flags[weekdays.index(day_name)] = 1

    return [1, price_length, description_length, review_count] + open_flags

In [12]:
# Fit the non-personalized baseline and turn its predictions into per-user recommendations.

# Label: each training location's average rating; inputs: feature() of its metadata.
y = numpy.array([d["avg_rating"] for d in train_metadata])
x = numpy.array([feature(d) for d in train_metadata])

# Ordinary least-squares linear regression (not logistic; no class weighting involved).
regr = linear_model.LinearRegression()

# Train the model using feature = x, label = y
regr.fit(x, y)

# Predict an average rating for every location in the test metadata.
x_test = numpy.array([feature(d) for d in test_metadata])
y_test_prediction = regr.predict(x_test)
# These are predictions for the TEST locations; the original, misleading name
# is kept as an alias in case other cells still read it.
y_train_prediction = y_test_prediction

# (predicted rating, gmap_id) pairs, highest prediction first.
# Equal predictions are ordered by gmap_id, also descending.
loc_and_pred = sorted(
    zip(y_test_prediction, (d["gmap_id"] for d in test_metadata)),
    reverse=True,
)

# Ranked gmap_ids with repeats dropped (first, i.e. best-ranked, occurrence wins),
# so a place can never take two slots in one user's top-k list.
# NOTE(review): this is a no-op when the metadata has unique gmap_ids -- confirm
# whether the metadata files can contain the same place more than once.
popularity_list_id = list(dict.fromkeys(gmap_id for _, gmap_id in loc_and_pred))

# Building the dictionary to feed to the evaluation function (same as baseline)
# recommendation[user_id] = [top k items the model recommend]
recommendation = {}

# Number of places recommended to every user.
k = 30 # 2 * len(users_hidden_likes[user_id])

# Get each user that has reviewed
for review in test_reviews:
    user_id = review["user_id"]

    if user_id in recommendation:
        continue

    # Places the user is already known to like must not be recommended again.
    # .get() avoids inserting empty entries into the users_revealed_likes
    # defaultdict for users without likes; the set makes each lookup O(1).
    already_revealed = set(users_revealed_likes.get(user_id, ()))

    # Walk the global ranking and keep the first k places not yet revealed.
    filtered_popularity_list = []
    for name in popularity_list_id:
        if len(filtered_popularity_list) >= k:
            break
        if name not in already_revealed:
            filtered_popularity_list.append(name)

    recommendation[user_id] = filtered_popularity_list

loader.save_likes("linear_regression_recommendation_per_user.json", recommendation)

Saved to  eval/linear_regression_recommendation_per_user.json
