In [27]:
import warnings
warnings.filterwarnings('ignore')


from engines import RecommenderInterface
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pickle
import os
import numpy as np 
import pandas as pd

In [20]:
# Root folder holding every data artefact used in this notebook.
# Set the RECSYS_DATA_DIR environment variable to run on another machine;
# when it is unset, the original local path is used, so existing runs are unaffected.
data_dir = os.environ.get(
    'RECSYS_DATA_DIR',
    '/Users/dare_devil/Documents/MLDS_2024/Quarter2/DataMining/Project/data',
)

# Train / test review files (joined with data_dir when they are read)
test_fname = 'test_df_filtered.json'
train_fname = 'train_data_red.json'

# Collaborative filters Data Fnames (joined with data_dir before use)
cf_ub_fname = 'collaborative_filters/user_based.pkl'
cf_ib_fname = 'collaborative_filters/item_based.pkl'

# Content Based Data Fnames (pickled user / item profiles, joined with data_dir before use)
cb_user_profiles_fname = 'user_profiles.pkl'
cb_item_profiles_fname = 'item_profiles.pkl'

In [3]:
# Read the training reviews from data_dir and preview the first two rows
train_path = os.path.join(data_dir, train_fname)
train_df = pd.read_json(train_path)
train_df.head(2)

Unnamed: 0,review_id,user_id,business_id,user_rating,useful,funny,cool,text,date,name,...,Wine & Spirits,Beer,Arts & Entertainment,Cafes,Bakeries,Burgers,Asian Fusion,Vegetarian,Japanese,Salad
0,LEGT4hPIyEVMzy4HUROtoQ,-1MF2tosrw2WcCxeVNk81Q,9c7MUiE6VI8NesjPdj5FkA,3,4,0,1,A coworker and I stopped by here for an aftern...,2017-12-13 22:58:16,Bubblefish,...,0,0,0,0,0,0,0,0,1,0
1,QvoAHbml7mkqFrx9K18h2Q,-1MF2tosrw2WcCxeVNk81Q,1OfhM-ZKvcpxyxptCCzEwA,4,4,0,0,Stopped by here for happy hour and was fairly ...,2017-08-01 22:47:11,Independence Beer Garden,...,1,1,0,0,0,0,0,0,0,0


In [4]:
# Content Based Data

## Loading user and item profiles
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load profile files that were produced by a trusted pipeline.
# The context managers close each file handle as soon as it has been read,
# instead of leaving it open until the garbage collector reclaims it.
with open(os.path.join(data_dir, cb_user_profiles_fname), 'rb') as profile_file:
    cb_user_profiles = pickle.load(profile_file)

with open(os.path.join(data_dir, cb_item_profiles_fname), 'rb') as profile_file:
    cb_item_profiles = pickle.load(profile_file)

## Stack the profile values into matrices (one row per dict entry, in dict order)
cb_user_matrix = np.array(list(cb_user_profiles.values()))
cb_item_matrix = np.array(list(cb_item_profiles.values()))

## Row position -> id lookups, built in the same dict order as the matrices above
cb_idx2user = dict(enumerate(cb_user_profiles))
cb_idx2item = dict(enumerate(cb_item_profiles))

# Pairwise cosine similarity between profile rows (user-user and item-item)
cb_user_sim_matrix = cosine_similarity(cb_user_matrix)
cb_item_sim_matrix = cosine_similarity(cb_item_matrix)

# Zero out the self-similarity entries on each diagonal (done in place)
# -- presumably so an entity is never picked as its own neighbour; confirm in engines
np.fill_diagonal(cb_user_sim_matrix, 0)
np.fill_diagonal(cb_item_sim_matrix, 0)

In [5]:
# Tunable weights and search radius handed to the recommendation engine

# Radius, in miles, passed to the engine as search_range_in_miles
SEARCH_RANGE_IN_MILES = 2

# User-based weight inside the content-based engine
CB_USER_WT = 0.6

# User-based weight inside the collaborative-filtering engine
CF_USER_WT = 0.1

# Weight of the content-based engine
# (presumably the remainder goes to collaborative filtering -- confirm in engines)
CB_WT = 0.5

In [6]:
# Initializing the recommendation engine from the artefacts prepared above
recommender = RecommenderInterface(
    # Training reviews and search radius
    train_df=train_df,
    search_range_in_miles=SEARCH_RANGE_IN_MILES,
    # Content-based inputs: similarity matrices plus row-index -> id lookups
    cb_user_sim_matrix=cb_user_sim_matrix,
    cb_item_sim_matrix=cb_item_sim_matrix,
    cb_idx2user=cb_idx2user,
    cb_idx2item=cb_idx2item,
    # Collaborative-filtering inputs: paths to the pickled user/item based engines
    cf_ub_engine_path=os.path.join(data_dir, cf_ub_fname),
    cf_ib_engine_path=os.path.join(data_dir, cf_ib_fname),
    # Weights
    cb_user_wt=CB_USER_WT,
    cf_user_wt=CF_USER_WT,
    cb_wt=CB_WT,
)

Initializing Engine
CB User Wt : 0.6
CF user Wt : 0.1


In [7]:
# Ask the hybrid engine for 20 recommendations for one user at one location,
# then preview the first five rows of the result
preds = recommender.recommend(
    user_id='TVN3qoXU0-sateboutdCZg',
    lat=39.958211,
    long=-75.173137,
    user_preference='Italian',
    topk=20,
)
preds.head()

Unnamed: 0,item_id,rating
171,wUnLSg_GKfEIQ5CQQ770_g,4.759277
73,oZzN706lKoL4faaTK739xA,4.636019
76,qjIN4UbE96Cq6JKwLIQ9VQ,4.591428
240,ZsSsyknkpARZUrD6rRH27A,4.508464
79,d5fAUl4lKaNxGfiXj4Kygg,4.481208


In [13]:
# Number of training reviews per user, least active users first
(
    train_df
    .groupby('user_id')['review_id']
    .count()
    .reset_index()
    .sort_values('review_id')
)

Unnamed: 0,user_id,review_id
0,-1MF2tosrw2WcCxeVNk81Q,5
2535,ObpMR3OjoJDmiTcObXPZPQ,5
2594,P3oTNMEOVLCrbtDTgxH8Tw,5
2617,PEhyruu0fJtayrVJ7qYzaQ,5
2624,PIGRRUZlIsssfuLWkZ4vyw,5
...,...,...
2101,Jt3GylPuH64uA3zTdbMdCg,485
3346,WJVedIAJN0FRxDn6M1tLkA,485
1909,HxyLRaoH9PS09M6R3rV-EQ,485
933,8EMU7d4pCkdqUnvlIW40CA,485


In [23]:
# Read the held-out test reviews from data_dir
test_path = os.path.join(data_dir, test_fname)
test_df = pd.read_json(test_path)

In [25]:
# Predict a rating for every (user, business) pair in the test set.
# y_true and y_pred stay index-aligned: both follow the row order of test_df.
y_true = test_df['user_rating'].tolist()
y_pred = []
topk = 20  # passed as the third argument to recommender.predict

done = 0  # stays 0 when test_df is empty
total = test_df.shape[0]

# Iterate the two id columns directly: iterrows() would build a full Series
# for every row just to read these two values. enumerate() replaces the
# hand-maintained counter.
test_pairs = zip(test_df['user_id'], test_df['business_id'])
for done, (user_id, item_id) in enumerate(test_pairs, start=1):
    pred = recommender.predict(user_id, item_id, topk)
    y_pred.append(pred)

    # Progress heartbeat every 500 predictions
    if done % 500 == 0:
        print(f"{done}/{total} Done")

500/6823 Done
1000/6823 Done
1500/6823 Done
2000/6823 Done
2500/6823 Done
3000/6823 Done
3500/6823 Done
4000/6823 Done
4500/6823 Done
5000/6823 Done
5500/6823 Done
6000/6823 Done
6500/6823 Done


In [30]:
# Mean squared error and mean absolute error of the predictions, rounded to 4 decimals
mse, mae = (
    round(metric(y_true, y_pred), 4)
    for metric in (mean_squared_error, mean_absolute_error)
)

print(f"MSE : {mse} MAE : {mae}")

MSE : 0.3491 MAE : 0.4312


In [22]:
#print('\n\n'.join(train_df[train_df['user_id'] == '-1MF2tosrw2WcCxeVNk81Q'].text.tolist()))