In [22]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

In [23]:
df = pd.read_json("/Users/kevinli/Downloads/filtered_small_df_v1.json")

In [24]:
df.shape

(336389, 22)

In [25]:
df_ubcf = df[['user_id', 'business_id', 'stars_x']]

In [26]:
user_rating_counts = df_ubcf['user_id'].value_counts()
users_with_enough_ratings = user_rating_counts[user_rating_counts >= 50].index.tolist()
# Keep only the rows for users with at least 50 ratings
df_ubcf = df_ubcf[df_ubcf['user_id'].isin(users_with_enough_ratings)]

## Class Implementation

In [27]:
import random
import numpy as np
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
from collections import defaultdict

# Assuming df_ubcf is your DataFrame with columns ['user_id', 'business_id', 'stars_x']
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_ubcf[['user_id', 'business_id', 'stars_x']], reader)

# Explicit train-test split
trainset, testset = train_test_split(data, test_size=0.2, random_state = 42)
#trainset = trainset.build_full_trainset()

In [21]:
import pandas as pd
from collections import defaultdict
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
import pickle

class RecommendationSystem:
    def __init__(self, trainset):
        self.trainset = trainset
        self.user_based_algo = KNNBasic(user_based=True)
        self.item_based_algo = KNNBasic(user_based=False)
    
    def train(self):
        self.user_based_algo.fit(self.trainset)
        self.item_based_algo.fit(self.trainset)
    
    def get_top_n(self, predictions, n=10):
        top_n = defaultdict(list)
        for uid, iid, _, est, _ in predictions:
            top_n[uid].append((iid, est))
        for uid, user_ratings in top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]
        return top_n
    
    def refine_with_item_based(self, top_n_recommendations):
        final_recommendations = defaultdict(list)
        for uid, items in top_n_recommendations.items():
            try:
                inner_uid = self.trainset.to_inner_uid(uid)
            except ValueError:
                print(f"Skipping refinement for {uid} as they are not in the training set.")
                continue
            item_based_predictions = []
            for iid, _ in items:
                try:
                    inner_iid = self.trainset.to_inner_iid(iid)
                    est = self.item_based_algo.predict(uid, iid, verbose=False).est
                    item_based_predictions.append((iid, est))
                except ValueError:
                    continue
            item_based_predictions.sort(key=lambda x: x[1], reverse=True)
            final_recommendations[uid] = item_based_predictions[:5]
        return final_recommendations

class EvaluationSystem:
    @staticmethod
    def calculate_precision_at_k_with_ties(final_recommendations, testset, k=5):
        test_ratings_by_user = defaultdict(list)
        for uid, iid, rating in testset:
            test_ratings_by_user[uid].append((iid, rating))
        top_5_items_with_ties_by_user = defaultdict(set)
        for uid, items_ratings in test_ratings_by_user.items():
            sorted_items = sorted(items_ratings, key=lambda x: x[1], reverse=True)
            top_ratings = sorted_items[:k] if len(sorted_items) > k else sorted_items
            min_top_rating = min(top_ratings, key=lambda x: x[1])[1]
            top_5_items_with_ties_by_user[uid] = {item for item, rating in sorted_items if rating >= min_top_rating}
        hits = 0
        total_predictions = 0
        for uid, recommendations in final_recommendations.items():
            recommended_item_ids = {iid for iid, _ in recommendations[:5]}
            actual_top_items = top_5_items_with_ties_by_user.get(uid, set())
            hits += len(recommended_item_ids.intersection(actual_top_items))
            total_predictions += len(recommended_item_ids)
        precision_at_k_with_ties = hits / total_predictions if total_predictions > 0 else 0
        return precision_at_k_with_ties

# Example usage:
reader = Reader(rating_scale=(1, 5))
#train_data = Dataset.load_from_df(train_df[['user_id', 'business_id', 'stars_y']], reader)
#trainset = train_data.build_full_trainset()

rec_sys = RecommendationSystem(trainset)
rec_sys.train()

# Save the trained models
with open('recommendation_system.pkl', 'wb') as file:
    pickle.dump(rec_sys, file)

user_based_predictions = rec_sys.user_based_algo.test(testset)
top_n_user_based = rec_sys.get_top_n(user_based_predictions, n=10)
final_recommendations = rec_sys.refine_with_item_based(top_n_user_based)

eval_sys = EvaluationSystem()
precision = eval_sys.calculate_precision_at_k_with_ties(final_recommendations, testset, k=5)
print(f"Precision@5 (with ties): {precision}")

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Precision@5 (with ties): 0.8495265184362968


## Cascade Model 

#### Train User-Based and Item-Based Models
Functions: KNNBasic(user_based=True) and KNNBasic(user_based=False)

Purpose: Initializes the collaborative filtering algorithms for user-based and item-based recommendations, respectively.
Use: These functions are key to creating models that can predict item ratings based on similar users (user-based) or similar items (item-based)

#### Generate Limited Anti-Testset
Function: Custom function using trainset.all_users(), trainset.all_items(), and sampling logic.

Purpose: Reduces the computational complexity by limiting the number of items to predict per user, focusing on a manageable subset

#### Predict Ratings with User-Based Model and Select Top N
Function: user_based_algo.test(limited_testset) and top_n

Purpose: Generates predictions for the limited set of user-item pairs using the user-based model.

#### Item Based Refinement

For each recommended item in the top 10 user list: 
Use the item-based model to predict the user's rating for the item. This step leverages the item-based model's understanding of item similarities.

### Individual Models

In [90]:
import random
import numpy as np
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
from collections import defaultdict

# Assuming df_ubcf is your DataFrame with columns ['user_id', 'business_id', 'stars_x']
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_ubcf[['user_id', 'business_id', 'stars_x']], reader)

# Explicit train-test split
trainset, testset = train_test_split(data, test_size=0.2, random_state = 42)

# Define algorithms for user-based and item-based collaborative filtering
user_based_algo = KNNBasic(user_based=True)
item_based_algo = KNNBasic(user_based=False)

# Train both models on the training set
user_based_algo.fit(trainset)
item_based_algo.fit(trainset)

# Test both models on the testing set
user_based_predictions = user_based_algo.test(testset)
item_based_predictions = item_based_algo.test(testset)

# Calculate and print the RMSE for both models
user_based_rmse = rmse(user_based_predictions)
item_based_rmse = rmse(item_based_predictions)

print(f"User-Based CF RMSE: {user_based_rmse}")
print(f"Item-Based CF RMSE: {item_based_rmse}")

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0729
RMSE: 1.0729
User-Based CF RMSE: 1.0729180967373162
Item-Based CF RMSE: 1.0729180967373162


In [91]:
# Initialize an empty list to store user_id, business_id, and stars_x
ratings_list = []
# Iterate through each rating in the trainset
for uid, iid, rating in trainset.all_ratings():
    # Convert internal IDs to the raw IDs
    raw_uid = trainset.to_raw_uid(uid)
    raw_iid = trainset.to_raw_iid(iid)
    # Append the raw IDs and rating to the list
    ratings_list.append((raw_uid, raw_iid, rating))

# Convert the list to a DataFrame
df_ratings = pd.DataFrame(ratings_list, columns=['user_id', 'business_id', 'stars_x'])
df_ratings.to_csv('trainset_rec_ind.csv')

In [92]:
df_limited_testset = pd.DataFrame(limited_testset, columns=['user_id', 'business_id', 'predicted_rating'])
df_limited_testset.to_csv('testset_ind.csv')

## Final Cascade Model

In [110]:
import pandas as pd
from surprise import Dataset, Reader
from sklearn.model_selection import train_test_split

# Assuming df_ubcf is your DataFrame with columns ['user_id', 'business_id', 'stars_x']

# Step 1: Filter users with at least 20 ratings
user_rating_counts = df_ubcf['user_id'].value_counts()
users_with_enough_ratings = user_rating_counts[user_rating_counts >= 20].index.tolist()

# Keep only the rows for users with at least 20 ratings
filtered_df = df_ubcf[df_ubcf['user_id'].isin(users_with_enough_ratings)]

# Step 2: Sample 20 ratings for each of these users to create a test set
def sample_test_set(df, n=20):
    # Randomly sample n ratings for each user to form the test set
    test_df = df.groupby('user_id').sample(n=n, random_state=42)
    return test_df

test_df = sample_test_set(filtered_df)
# Exclude these test set ratings from the original filtered DataFrame to form the training set
train_df = filtered_df.drop(test_df.index)

# Step 3: Load the train and test DataFrames into Surprise
reader = Reader(rating_scale=(1, 5))

# Load the training set
train_data = Dataset.load_from_df(train_df[['user_id', 'business_id', 'stars_y']], reader)
trainset = train_data.build_full_trainset()

# Since Surprise doesn't directly support loading a test set from a DataFrame,
# convert the test DataFrame into the list of tuples expected by the test method in Surprise
testset = [tuple(x) for x in test_df[['user_id', 'business_id', 'stars_y']].values]

In [96]:
df_limited_testset = pd.DataFrame(testset, columns=['user_id', 'business_id', 'predicted_rating'])
df_limited_testset.to_csv('testset.csv')

In [97]:
## Creating explicit df dataframe 
# Initialize an empty list to store user_id, business_id, and stars_x
ratings_list = []
# Iterate through each rating in the trainset
for uid, iid, rating in trainset.all_ratings():
    # Convert internal IDs to the raw IDs
    raw_uid = trainset.to_raw_uid(uid)
    raw_iid = trainset.to_raw_iid(iid)
    # Append the raw IDs and rating to the list
    ratings_list.append((raw_uid, raw_iid, rating))
# Convert the list to a DataFrame
df_ratings = pd.DataFrame(ratings_list, columns=['user_id', 'business_id', 'stars_x'])
df_ratings.head()  # Display the first few rows of the dataframe
df_ratings.to_csv('trainset_rec.csv')

In [98]:
# Define algorithms for user-based and item-based collaborative filtering
user_based_algo = KNNBasic(user_based=True)
item_based_algo = KNNBasic(user_based=False)

# Train both models on the training set
user_based_algo.fit(trainset)
item_based_algo.fit(trainset)

# Function to get top N recommendations from predictions
def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, _, est, _ in predictions:
        top_n[uid].append((iid, est))

    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n

# Generate predictions for the testset using the user-based model
user_based_predictions = user_based_algo.test(testset)
top_n_user_based = get_top_n(user_based_predictions, n=10)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


In [99]:
def refine_with_item_based(top_n_recommendations, item_based_algo, trainset):
    final_recommendations = defaultdict(list)
    for uid, items in top_n_recommendations.items():
        # Check if the user is in the training set
        try:
            inner_uid = trainset.to_inner_uid(uid)
        except ValueError:
            print(f"Skipping refinement for {uid} as they are not in the training set.")
            continue

        item_based_predictions = []
        for iid, _ in items:
            try:
                inner_iid = trainset.to_inner_iid(iid)
                # Only predict if both user and item are in the training set
                est = item_based_algo.predict(uid, iid, verbose=False).est
                item_based_predictions.append((iid, est))
            except ValueError:
                # Item not in training set, skip it
                continue

        item_based_predictions.sort(key=lambda x: x[1], reverse=True)
        final_recommendations[uid] = item_based_predictions[:5]

    return final_recommendations


final_recommendations = refine_with_item_based(top_n_user_based, item_based_algo, trainset)

Skipping refinement for -1MF2tosrw2WcCxeVNk81Q as they are not in the training set.
Skipping refinement for -A8NWVsLSAQX_XTqt4WPmg as they are not in the training set.
Skipping refinement for -Od0vvWj3RISQ0pNBGqXnQ as they are not in the training set.
Skipping refinement for -Rocdfu1eqSbyqCEBvOzDw as they are not in the training set.
Skipping refinement for -T-i6BSAeSuqLGqIQ-u3wQ as they are not in the training set.
Skipping refinement for -VOCFO1QSCjAxl_0LIrZ9Q as they are not in the training set.
Skipping refinement for -dcWUGQY6uDGQ_FHRySDeQ as they are not in the training set.
Skipping refinement for -mLeT8Ya1D2USbysd6yjrQ as they are not in the training set.
Skipping refinement for -wL-2J0enMz1DDPgPlLHwA as they are not in the training set.
Skipping refinement for 01EE-OfsMFnZJhfNQsp2vg as they are not in the training set.
Skipping refinement for 0DZQA74K8IodQfBEMxJbQw as they are not in the training set.
Skipping refinement for 0Ersa47HgrkiHD8GhqIpgw as they are not in the traini

In [100]:
def calculate_precision_at_k_with_ties(final_recommendations, testset, k=5):
    # Step 1: Prepare the data structures
    test_ratings_by_user = defaultdict(list)
    for uid, iid, rating in testset:
        test_ratings_by_user[uid].append((iid, rating))

    top_5_items_with_ties_by_user = defaultdict(set)
    for uid, items_ratings in test_ratings_by_user.items():
        sorted_items = sorted(items_ratings, key=lambda x: x[1], reverse=True)
        top_ratings = sorted_items[:k] if len(sorted_items) > k else sorted_items
        min_top_rating = min(top_ratings, key=lambda x: x[1])[1]
        top_5_items_with_ties_by_user[uid] = {item for item, rating in sorted_items if rating >= min_top_rating}

    # Step 2: Calculate precision
    hits = 0
    total_predictions = 0
    for uid, recommendations in final_recommendations.items():
        recommended_item_ids = {iid for iid, _ in recommendations[:5]}
        actual_top_items = top_5_items_with_ties_by_user.get(uid, set())
        
        hits += len(recommended_item_ids.intersection(actual_top_items))
        total_predictions += len(recommended_item_ids)

    precision_at_k_with_ties = hits / total_predictions if total_predictions > 0 else 0
    return precision_at_k_with_ties

# Usage example
precision = calculate_precision_at_k_with_ties(final_recommendations, testset, k=5)
print(f"Precision@5 (with ties): {precision}")

Precision@5 (with ties): 0.6576055523423945
