In [9]:
import pandas as pd
import os
import json
import re
import scipy as sp
import pickle
import numpy as np
from sklearn.metrics import mean_squared_error
import surprise

# MAIN IDEA

- The notebook is targetted to develop a hybrid Collaborative Filter using Item-Based and User-Based CF. The approach that we have taken is the weighted average of the two systems. The weights are chosen such that they result in the lowest MSE

In [10]:
data_dir = '/Users/dare_devil/Documents/MLDS_2024/Quarter2/DataMining/Project/data'
os.listdir(data_dir)

['test_data.json',
 'user_profiles_bert.pkl',
 '.DS_Store',
 'collaborative_filters',
 'dataset',
 'train_df_with_cat.json',
 'topK_categories.json',
 'user_profiles.pkl',
 'item_profiles_bert.pkl',
 'test_df_filtered.json',
 '.ipynb_checkpoints',
 'train_data.json',
 'archive.zip',
 'item_profiles.pkl']

In [11]:
# Training and Testing filenames
train_fname = 'train_df_with_cat.json'
test_fname = 'test_df_filtered.json'


# Collaborative filters filenames
cf_ub_fname = 'collaborative_filters/user_based.pkl'
cf_ib_fname = 'collaborative_filters/item_based.pkl'

In [12]:
# Loading train and test data 
#train_df = pd.read_json(os.path.join(data_dir, train_fname))
test_df = pd.read_json(os.path.join(data_dir, test_fname))
print(f"Test Size : {test_df.shape[0]}")
print("Training and Test Data Loaded")

# Loading Collaborative Filters
cf_ub = pickle.load(open(os.path.join(data_dir, cf_ub_fname), 'rb'))
cf_ib = pickle.load(open(os.path.join(data_dir, cf_ib_fname), 'rb'))
print("Collaborative Filters Loaded")

Test Size : 6832
Training and Test Data Loaded
Collaborative Filters Loaded


In [18]:
# Fine Tuning with Different User Weight to optimize for MSE
user_wt_list = np.arange(0, 1, 0.05)

best_mse = np.inf
best_user_wt = -1


y_true = test_df['user_rating'].tolist()

for user_wt in user_wt_list:
    user_wt = round(user_wt, 2)
    item_wt = round(1 - user_wt,2)
    print(f"User Weight : {user_wt}, Item Weight : {item_wt}")
    y_pred = []
    for idx,row in test_df.iterrows():
        user_id = row['user_id']
        item_id = row['business_id']

        ub_rating = cf_ub.predict(user_id, item_id).est
        ib_rating = cf_ub.predict(user_id, item_id).est

        pred_rating = (user_wt * ub_rating) + (item_wt *  ib_rating)
        y_pred.append(pred_rating)

    mse = mean_squared_error(y_true, y_pred)
    print(f"MSE : {mse}")
    if mse < best_mse:
        best_mse = mse 
        best_user_wt = user_wt 
        print(f"Updating Best User Weight : {best_user_wt}")

User Weight : 0.0, Item Weight : 1.0
MSE : 0.3624400639756419
Updating Best User Weight : 0.0
User Weight : 0.05, Item Weight : 0.95
MSE : 0.3624400639756419
User Weight : 0.1, Item Weight : 0.9
MSE : 0.3624400639756418
Updating Best User Weight : 0.1
User Weight : 0.15, Item Weight : 0.85
MSE : 0.3624400639756418
User Weight : 0.2, Item Weight : 0.8
MSE : 0.3624400639756419
User Weight : 0.25, Item Weight : 0.75
MSE : 0.3624400639756419
User Weight : 0.3, Item Weight : 0.7
MSE : 0.3624400639756419
User Weight : 0.35, Item Weight : 0.65
MSE : 0.3624400639756418
User Weight : 0.4, Item Weight : 0.6
MSE : 0.3624400639756419
User Weight : 0.45, Item Weight : 0.55
MSE : 0.3624400639756419
User Weight : 0.5, Item Weight : 0.5
MSE : 0.3624400639756419
User Weight : 0.55, Item Weight : 0.45
MSE : 0.3624400639756419
User Weight : 0.6, Item Weight : 0.4
MSE : 0.3624400639756419
User Weight : 0.65, Item Weight : 0.35
MSE : 0.3624400639756418
User Weight : 0.7, Item Weight : 0.3
MSE : 0.362440063

In [19]:
print(f"Best MSE : {best_mse}")
print(f"Best User Weight : {best_user_wt}")

Best MSE : 0.3624400639756418
Best User Weight : 0.1
