In [None]:
import random
import loader 
import numpy as np
import pandas as pd

from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

In [None]:
# Input archives for the training split (file names point at the Washington data).
TRAIN_REVIEWS_PATH = "review-Washington_10.json.gz"
TRAIN_METADATA_PATH = "meta-Washington.json.gz"

# Reviews are loaded first, then the metadata archive.
train_reviews = loader.load_to_dict(TRAIN_REVIEWS_PATH)
train_metadata = loader.load_to_dict(TRAIN_METADATA_PATH)

# Oregon counterparts are currently disabled; uncomment to load them as well.
#test_reviews = loader.load_to_dict("review-Oregon_10.json.gz")
#test_metadata = loader.load_to_dict("meta-Oregon.json.gz")

In [None]:
# Build the ratings table, keep only users with enough positive reviews,
# and report how much the filter shrinks the data.
# (pandas is imported once in the notebook's import cell; no re-import needed here.)

# -------------------------------
#          PARAMETERS
# -------------------------------
RATING_COLUMNS = ["user_id", "gmap_id", "rating"]
POS_THRESHOLD = 4            # a rating at or above this counts as "positive"
MIN_POSITIVE_REVIEWS = 10    # users need at least this many positive ratings to be kept


def _count_entities(frame):
    """Return (distinct users, distinct items, number of rows) for a ratings frame."""
    return frame["user_id"].nunique(), frame["gmap_id"].nunique(), len(frame)


def _pct_reduction(before, after):
    """Return the percentage drop from `before` to `after`; 0.0 when `before` is 0."""
    if before == 0:
        # Nothing to reduce; avoids ZeroDivisionError on an empty dataset.
        return 0.0
    return 100 * (1 - after / before)


# --- Build DF from the review data ---
# Indexing r[col] (rather than .get) keeps the original strictness:
# a record missing one of the keys still raises KeyError.
ratings_list = [
    {col: r[col] for col in RATING_COLUMNS}
    for r in train_reviews
]

# `columns=` fixes the column order and keeps the frame well-formed
# (three empty columns) even when there are no reviews at all.
df = pd.DataFrame(ratings_list, columns=RATING_COLUMNS)


# -------------------------------
#       POSITIVE THRESHOLD
# -------------------------------
df["positive"] = df["rating"] >= POS_THRESHOLD


# -------------------------------
#        BEFORE FILTERING
# -------------------------------
n_users_before, n_items_before, n_interactions_before = _count_entities(df)

print("=== BEFORE FILTERING ===")
print(f"Users: {n_users_before}")
print(f"Items: {n_items_before}")
print(f"Interactions: {n_interactions_before}")


# -------------------------------
#       APPLY USER FILTER
# -------------------------------
# Summing the boolean column counts each user's positive ratings.
positive_counts = df.groupby("user_id")["positive"].sum()
good_users = positive_counts[positive_counts >= MIN_POSITIVE_REVIEWS].index

# All interactions of a qualifying user are kept, not only the positive ones.
df_filtered = df[df["user_id"].isin(good_users)].copy()


# -------------------------------
#        AFTER FILTERING
# -------------------------------
n_users_after, n_items_after, n_interactions_after = _count_entities(df_filtered)

print("\n=== AFTER FILTERING ===")
print(f"Users: {n_users_after}")
print(f"Items: {n_items_after}")
print(f"Interactions: {n_interactions_after}")


# -------------------------------
#         REDUCTIONS
# -------------------------------
print("\n=== REDUCTION ===")
print("Users reduced by: {:.2f}%".format(
    _pct_reduction(n_users_before, n_users_after)))
print("Interactions reduced by: {:.2f}%".format(
    _pct_reduction(n_interactions_before, n_interactions_after)))
print("Items reduced by: {:.2f}%".format(
    _pct_reduction(n_items_before, n_items_after)))


In [None]:
# Inspect the filtered interactions; as the cell's last expression, the frame is rendered as output.
df_filtered


In [None]:
# Wrap the filtered ratings in a Surprise dataset.
# load_from_df is given the columns in the order user, item, rating.
surprise_columns = ["user_id", "gmap_id", "rating"]

reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(df_filtered[surprise_columns], reader)

# No hold-out split here: every filtered interaction goes into the trainset.
trainset = train_data.build_full_trainset()

In [None]:
# Train a biased matrix-factorization (SVD) model on the full trainset.
# SVD initializes its factors randomly and trains with SGD, so a fixed
# random_state is passed to make Restart & Run All reproduce the same model.
SVD_RANDOM_SEED = 42
SVD_N_FACTORS = 20
SVD_N_EPOCHS = 20

svd = SVD(
    n_factors=SVD_N_FACTORS,
    n_epochs=SVD_N_EPOCHS,
    biased=True,
    random_state=SVD_RANDOM_SEED,
)
svd.fit(trainset)

print("Training done!")