# Imports

In [None]:
import random
import loader 
import pandas as pd
import numpy as np
import tensorflow as tf

import gzip
import json

from collections import defaultdict

# Loader Functions

In [None]:
def readGz(path):
    """Lazily yield one parsed JSON value per line of a gzipped JSON-lines file.

    Args:
        path: File name relative to the ``datasets/`` directory.

    Yields:
        The object produced by ``json.loads`` for each non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        Any error raised by ``gzip.open`` or while decompressing propagates
        to the caller.
    """
    path = "datasets/" + path
    # Text mode ('rt') with an explicit encoding so decoding does not depend
    # on the platform default.
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        for line in f:
            # A blank or whitespace-only line holds no record; handing it to
            # json.loads would raise and abort the whole read.
            if not line.strip():
                continue
            yield json.loads(line)

def load_to_dict(file_to_read):
    """Collect every record yielded by ``readGz(file_to_read)`` into a list.

    Loading is best-effort: if reading fails part-way, the error is printed
    and the records gathered up to that point are still returned.

    Args:
        file_to_read: File name forwarded unchanged to ``readGz``.

    Returns:
        A list of the records read (despite the function's name); it may be
        partial or empty when an error occurred.
    """
    records = []
    try:
        for record in readGz(file_to_read):
            records.append(record)
    except EOFError as err:
        # The compressed stream stopped early: report it and keep what we have.
        print(f"EOFError: Compressed file '{file_to_read}' ended prematurely. Error: {err}")
        print(f"This often indicates a corrupted or incomplete gzip file. Successfully loaded {len(records)} items before the error.")
    except Exception as err:
        # Any other failure (decompression, JSON parsing, ...) is reported the
        # same way rather than raised.
        print(f"An unexpected error occurred while reading '{file_to_read}': {err}")
        print(f"Successfully loaded {len(records)} items before the error.")
    return records

def save_likes(filename, data_dict):
    """Write ``data_dict`` as indented JSON to ``eval/<filename>``.

    The target directory is created when it does not exist yet, so the first
    save in a fresh checkout does not fail with ``FileNotFoundError``.

    Args:
        filename: File name relative to the ``eval/`` directory.
        data_dict: JSON-serialisable object to store.

    Raises:
        TypeError: If ``data_dict`` contains values ``json.dump`` cannot
            serialise.
    """
    # Local import: the file's import section does not bring in ``os``.
    import os

    filename = "eval/"+filename
    # The path always starts with "eval/", so dirname is never empty.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "w") as fp:
        json.dump(data_dict, fp, indent=4)
    print("Saved to ", filename)

def load_user_likes(filename):
    """
    Read ``eval/<filename>`` (JSON) and return it as dict[user_id] = list of likes.

    Every value of the stored mapping is passed through ``list`` so callers
    always receive list values.
    """
    with open("eval/" + filename, "r") as handle:
        stored = json.load(handle)

    likes_by_user = {}
    for user_id, likes in stored.items():
        likes_by_user[user_id] = list(likes)
    return likes_by_user


# Exploratory Analysis

# Preprocessing

In [None]:
# Load the two raw datasets: the review records and the place metadata.
# load_to_dict returns a list of records and reports (rather than raises)
# read errors, so either list may be partial or empty.

test_reviews = load_to_dict(file_to_read="review-Oregon_10.json.gz")
test_metadata = load_to_dict(file_to_read="meta-Oregon.json.gz")

In [None]:
# Build, for every user, the set of places they rated highly.
# users_likes[user_id] = {gmap_id of each place currently counted as liked (rating >= 4)}
# Every user seen gets an entry, even if their set stays empty.
users_likes = defaultdict(set)
dupe_review_count = 0
dupe_removed_count = 0

for review in test_reviews:
    user_id, gmap_id, rating = review["user_id"], review["gmap_id"], review["rating"]
    liked_so_far = users_likes[user_id]

    if gmap_id in liked_so_far:
        # The user already has a like recorded for this place: this is a repeat review.
        dupe_review_count += 1

        # Later reviews win: a repeat review below 4 withdraws the earlier like.
        # NOTE(review): this treats the order of test_reviews as chronological - confirm.
        if rating < 4:
            liked_so_far.remove(gmap_id)
            dupe_removed_count += 1
    elif rating >= 4:
        liked_so_far.add(gmap_id)

print("Num dupes: ", dupe_review_count) 
print("Num dupes removed: ", dupe_removed_count) 

# Split each user's likes into a "revealed" part and a "hidden" part.
# users_total_likes keeps the full (shuffled) list for every user that was split.
users_revealed_likes = defaultdict(list)
users_hidden_likes = defaultdict(list)
users_total_likes = defaultdict(list)

# Target ratio is 8:2 revealed vs hidden, with at least one hidden like per user.
hidden_fraction = 0.2
min_hidden_count = 1

random.seed(42)
for user_id, liked_places in users_likes.items():
    # The likes are stored in a set, and the iteration order of a set of
    # strings can differ from one interpreter run to the next (string hash
    # randomisation). Sorting first gives the seeded shuffle a fixed starting
    # order, so the split is actually reproducible.
    # Assumes the gmap_ids of one user are mutually comparable (e.g. all str).
    liked_list = sorted(liked_places)
    num_likes = len(liked_list)

    random.shuffle(liked_list)

    # The first `split_point` shuffled items are hidden, the rest revealed.
    split_point = max(min_hidden_count, int(hidden_fraction * num_likes))

    revealed = liked_list[split_point:]
    hidden = liked_list[:split_point]

    # Users with no likes produce an empty hidden list and are left out.
    # A user with a single like has it hidden and gets an empty revealed list.
    if len(hidden) >= min_hidden_count:
        users_revealed_likes[user_id] = revealed
        users_hidden_likes[user_id] = hidden
        users_total_likes[user_id] = liked_list

# Save user likes: revealed, hidden, and full
save_likes("users_likes_full.json", users_total_likes)
save_likes("users_revealed_likes.json", users_revealed_likes)
save_likes("users_hidden_likes.json", users_hidden_likes)

# Models

# Evaluation and Baseline

## Baseline Model

In [None]:
# Popularity baseline.
# The baseline needs no training, so the train sets are ignored and it is built
# directly from the test data: every user is recommended the most popular
# places, where
#     popularity = (number of reviews in test_reviews) * (avg_rating from the metadata)


def _top_k_unseen(ranked_ids, seen, k):
    """Return the first `k` ids of `ranked_ids` that are not in `seen`."""
    picked = []
    for place_id in ranked_ids:
        if len(picked) >= k:
            break
        if place_id not in seen:
            picked.append(place_id)
    return picked


# Number of reviews per place
locations_review_count = defaultdict(int)
for review in test_reviews:
    locations_review_count[review["gmap_id"]] += 1

# Average rating per place, as listed in the metadata
locations_avg_rating = defaultdict(int)
for metadata in test_metadata:
    locations_avg_rating[metadata["gmap_id"]] = metadata["avg_rating"]

# locations_popularity[gmap_id] = number of reviews * average rating.
# A reviewed place without a metadata entry gets the defaultdict value 0 as
# its rating, and therefore popularity 0.
locations_popularity = defaultdict(int)
for gmap_id in locations_review_count:
    locations_popularity[gmap_id] = locations_review_count[gmap_id] * locations_avg_rating[gmap_id]


# (popularity, gmap_id) tuples, most popular first; equal popularity is
# ordered by gmap_id, descending, because whole tuples are compared.
popularity_list = sorted(
    ((pop, gmap_id) for gmap_id, pop in locations_popularity.items()),
    reverse=True,
)

# The same ranking with only the gmap_ids
popularity_list_id = [gmap_id for _, gmap_id in popularity_list]


# Build the dictionary that is fed to the evaluation function
# recommendation[user_id] = [top k items the model recommends]
recommendation = {}

# Every user gets a list of the same fixed length (a per-user length of
# 2 * len(users_hidden_likes[user_id]) was the noted alternative).
k = 30

# Walk the reviews to visit each reviewing user once, in first-appearance order
for review in test_reviews:
    user_id = review["user_id"]
    if user_id in recommendation:
        continue

    # Places the user's revealed likes already contain must not be recommended.
    # A set makes each membership test O(1) instead of a scan of the list.
    # NOTE: indexing the defaultdict (as the code always did) inserts an empty
    # list for users that have no stored split.
    already_revealed = set(users_revealed_likes[user_id])
    recommendation[user_id] = _top_k_unseen(popularity_list_id, already_revealed, k)

# NOTE(review): this calls save_likes from the imported `loader` module, not the
# save_likes defined in this file - confirm both write to the same place.
loader.save_likes("baseline_recommendation_per_user.json", recommendation)

## Evaluation