In [6]:
import random
import loader 

from collections import defaultdict

In [7]:
# The Washington training files are deliberately not loaded in this cell; the
# original calls are kept below for reference only.
# train_reviews = loader.load_to_dict(root+"review-Washington_10.json.gz")
# train_metadata = loader.load_to_dict(root+"meta-Washington.json.gz")

# Oregon files used as the test set, resolved relative to the working directory.
reviews_path = "review-Oregon_10.json.gz"
metadata_path = "meta-Oregon.json.gz"

test_reviews = loader.load_to_dict(reviews_path)
test_metadata = loader.load_to_dict(metadata_path)

In [8]:
# Collect, for every user, the set of places they rated highly.
# users_likes[user_id] = {gmap_id of each place the user rated >= 4}
users_likes = defaultdict(set)
dupe_review_count = 0    # reviews of a place that is already in the user's liked set
dupe_removed_count = 0   # of those, how many had rating < 4 and evicted the place

for review in test_reviews:
    user_id = review["user_id"]
    gmap_id = review["gmap_id"]
    rating = review["rating"]

    # Indexing the defaultdict registers the user even if they end up with no likes.
    user_liked_set = users_likes[user_id]

    if gmap_id in user_liked_set:
        dupe_review_count += 1
        # A later low rating for an already-liked place overrides the earlier like.
        # NOTE(review): this is "most recent review wins" only if test_reviews is
        # ordered oldest-to-newest for each user -- TODO confirm.
        if rating < 4:
            user_liked_set.remove(gmap_id)
            dupe_removed_count += 1
    elif rating >= 4:
        user_liked_set.add(gmap_id)

print("Num dupes: ", dupe_review_count)
print("Num dupes removed: ", dupe_removed_count)

# Split each user's liked places into a "revealed" part and a "hidden" part
# (roughly 8:2), and keep the full shuffled list alongside them.
users_revealed_likes = defaultdict(list)
users_hidden_likes = defaultdict(list)
users_total_likes = defaultdict(list)

# Every user kept in the split has at least this many hidden places.
min_hidden_count = 1

random.seed(42)
for user_id, liked_places in users_likes.items():
    # Iteration order of a set of strings changes between interpreter runs
    # (hash randomisation), so seeding the RNG alone does not make the shuffle
    # repeatable after a kernel restart. Sorting first gives the shuffle a fixed
    # starting order, which makes the split reproducible.
    liked_list = sorted(liked_places)
    num_likes = len(liked_list)

    random.shuffle(liked_list)

    # 20% of the likes are hidden, but never fewer than min_hidden_count.
    split_point = max(min_hidden_count, int(0.2 * num_likes))

    hidden = liked_list[:split_point]
    revealed = liked_list[split_point:]

    # Users who cannot supply enough hidden places (e.g. no likes at all) are
    # left out of all three dictionaries.
    if len(hidden) < min_hidden_count:
        continue

    # Note: a user with a single like ends up with that like hidden and an
    # empty revealed list.
    users_revealed_likes[user_id] = revealed
    users_hidden_likes[user_id] = hidden
    users_total_likes[user_id] = liked_list

# Save user likes: revealed, hidden, and full
loader.save_likes("users_likes_full.json", users_total_likes)
loader.save_likes("users_revealed_likes.json", users_revealed_likes)
loader.save_likes("users_hidden_likes.json", users_hidden_likes)

Num dupes:  68767
Num dupes removed:  0
Saved to  eval/users_likes_full.json
Saved to  eval/users_revealed_likes.json
Saved to  eval/users_hidden_likes.json


In [9]:
# Popularity baseline. It needs no training, so it is built from the test set
# alone: every place gets the score
#     popularity = (number of reviews in test_reviews) * (avg_rating from metadata)
# and places are ranked from most to least popular.

locations_review_count = defaultdict(int)
locations_avg_rating = defaultdict(int)

# Tally how many reviews each place received.
for review in test_reviews:
    locations_review_count[review["gmap_id"]] += 1

# Record the average rating listed in the metadata (a later entry for the same
# gmap_id overwrites an earlier one).
for metadata in test_metadata:
    locations_avg_rating[metadata["gmap_id"]] = metadata["avg_rating"]

# Combine the two. A reviewed place with no metadata entry falls back to the
# defaultdict value 0 and therefore scores 0.
locations_popularity = defaultdict(int)
for gmap_id, review_count in locations_review_count.items():
    locations_popularity[gmap_id] = review_count * locations_avg_rating[gmap_id]

# (score, gmap_id) tuples ordered from highest to lowest score; equal scores are
# ordered by gmap_id, also descending.
popularity_list = sorted(
    ((score, place_id) for place_id, score in locations_popularity.items()),
    reverse=True,
)

# The same ranking reduced to the gmap_ids.
popularity_list_id = [entry[1] for entry in popularity_list]


# Build the dictionary that is fed to the evaluation function:
# recommendation[user_id] = the k most popular places that are not already in
# the user's revealed likes.
recommendation = {}

# Number of places recommended to every user.
k = 30  # alternative that was considered: 2 * len(users_hidden_likes[user_id])

# Every user that appears in the reviews gets a recommendation list.
for review in test_reviews:
    user_id = review["user_id"]
    if user_id in recommendation:
        continue

    # Use .get() rather than indexing: users_revealed_likes is a defaultdict, so
    # indexing would insert an empty entry for every user left out of the split
    # and silently change the dictionary after it has been saved.
    # A set makes each membership test below constant-time.
    revealed_set = set(users_revealed_likes.get(user_id, ()))

    # Walk the ranking from most popular down, skipping revealed likes, and stop
    # as soon as k places have been collected.
    filtered_popularity_list = []
    for name in popularity_list_id:
        if len(filtered_popularity_list) == k:
            break
        if name not in revealed_set:
            filtered_popularity_list.append(name)

    recommendation[user_id] = filtered_popularity_list

loader.save_likes("baseline_recommendation_per_user.json", recommendation)

print("Baseline Implementation")

Saved to  eval/baseline_recommendation_per_user.json
Baseline Implementation


In [10]:
# Spot-check: show the baseline recommendations produced for one user.
sample_user_id = "116238557567455956213"
print(recommendation[sample_user_id])

['0x54950c57cd36d3cb:0x6f866d66264752ab', '0x549575786b8b269f:0xb3548313bd7fccfd', '0x5495a40b739922e1:0x1386e3022fac1181', '0x5495a752d90af2bd:0xac4da428b0c0c9a7', '0x54950a2eb856b805:0xe7e720b09eddfc26', '0x54ea92b04ed31491:0xf497b5d87639810', '0x5495a0b4338cb23b:0xdf44bd5a7cbcbde4', '0x549618d8d788f7cd:0xe1cbc1f79d7ed701', '0x54eb2c9d759f3191:0xe3b93c57067a899a', '0x54950ee686837673:0x4c238b5a749b4606', '0x54959ad6cca60deb:0xc5a043e2f635f', '0x54950a03b7b42a07:0x60d8820872cc912f', '0x54c11de4265cd325:0xa3a7d53c0aae45d3', '0x5495a2ae617645a7:0x9975eb0f90219342', '0x54950a0764850d75:0xfd20dac4cfa02bff', '0x54950ebce09dcfb9:0xbfe26b0a2d6fc123', '0x54950a72e8b112d1:0xd074694827faf84c', '0x54c5b55eda22e2a9:0x1f87d215776a4838', '0x54bfedda86d4a279:0x3d55c036fa2ccf71', '0x54950a051d703e13:0xfebc36dc49ec79c7', '0x54b8c632c16b82f3:0xc2979d3396b38cf0', '0x54950c04c07d5089:0x5b2f345008809c32', '0x54be1eaaa37312a3:0x6738b4912b8ca4f2', '0x5495a0370172b005:0x22f3cf34eb51d736', '0x5495a40c5458df53