In [2]:
!pip install surprise

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.11/bin/python3.11 -m pip install --upgrade pip' command.[0m


In [1]:
import numpy as np
import pandas as pd
from surprise import Dataset, Reader, NormalPredictor, KNNBasic, KNNWithZScore, KNNWithMeans, KNNWithZScore, SVD
from surprise.model_selection import train_test_split, cross_validate
from collections import defaultdict
from surprise import accuracy
from tqdm import tqdm

In [2]:
def read_chunks(file, cols, chunk_size=500000,
                data_dir='/Users/ryounes/Documents/Feup/RecSNA_Project'):
    """Load selected columns of a Yelp JSON-lines file, reading it chunk by chunk.

    Parameters
    ----------
    file : str
        Dataset suffix; the file read is
        ``{data_dir}/yelp_academic_dataset_{file}.json``.
    cols : list of str
        Columns to keep from every chunk.
    chunk_size : int, default 500000
        Number of lines parsed per chunk.
    data_dir : str, default '/Users/ryounes/Documents/Feup/RecSNA_Project'
        Directory that contains the dataset files. The default preserves the
        location that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated row-wise with a fresh 0..n-1 index.
    """
    path = f'{data_dir}/yelp_academic_dataset_{file}.json'

    # With `chunksize` set, read_json returns a reader that holds the file open;
    # the context manager closes it once all chunks have been consumed.
    with pd.read_json(path, chunksize=chunk_size, lines=True) as reader:
        # Select the wanted columns per chunk so unused columns are dropped early.
        chunk_list = [chunk[cols] for chunk in reader]

    return pd.concat(chunk_list, ignore_index=True, join='outer', axis=0)

In [3]:
# Read the whole business file in one go (one JSON object per line).
business_json_path = (
    '/Users/ryounes/Documents/Feup/RecSNA_Project/'
    'yelp_academic_dataset_business.json'
)
df_b = pd.read_json(path_or_buf=business_json_path, lines=True)

In [4]:
# User table: keep only the id, display name and the friends column.
df_u = read_chunks(
    file='user',
    cols=['user_id', 'name', 'friends'],
)

In [5]:
# Review table: keep only the identifiers and the star rating.
df_r = read_chunks(
    file='review',
    cols=['review_id', 'user_id', 'business_id', 'stars'],
)

In [11]:
# Keep only the businesses whose `city` value is exactly 'Franklin'.
is_franklin_business = df_b['city'] == 'Franklin'
df_Franklin_b = df_b[is_franklin_business]

In [12]:
# Filter reviews for businesses in Franklin directly
business_ids_Franklin = df_Franklin_b['business_id'].unique()
# Use the ids computed on the line above; the former `business_ids_tucson`
# is not defined anywhere in this notebook and fails on a fresh kernel.
df_r_Franklin = df_r[df_r['business_id'].isin(business_ids_Franklin)]

In [18]:



# Users who left at least one review for a Franklin business
Franklin_reviewed_user_ids = df_r_Franklin['user_id'].unique()

# Rows of the user table that belong to those reviewers
is_franklin_reviewer = df_u['user_id'].isin(Franklin_reviewed_user_ids)

# Split each reviewer's ', '-separated `friends` string and flatten to unique ids
all_friends_user_ids = (
    df_u.loc[is_franklin_reviewer, 'friends']
    .str.split(', ')
    .explode()
    .unique()
)

# User rows for the reviewers themselves plus their friends
is_friend_of_reviewer = df_u['user_id'].isin(all_friends_user_ids)
df_users_Franklin_and_friends = df_u[is_friend_of_reviewer | is_franklin_reviewer]

# Reviews written by those users, or written about a Franklin business
review_by_selected_user = df_r['user_id'].isin(df_users_Franklin_and_friends['user_id'])
review_of_franklin_business = df_r['business_id'].isin(business_ids_Franklin)
df_r_u_and_f_Franklin = df_r[review_by_selected_user | review_of_franklin_business]

df_r_u_and_f_Franklin


Unnamed: 0,review_id,user_id,business_id,stars
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3
24,lUUhg8ltDsUZ9h0xnwY4Dg,RreNy--tOmXMl1en0wiBOg,cPepkJeRMtHapc_b2Oe_dw,4
30,-P5E9BYUaK7s3PwBF5oAyg,Jha0USGDMefGFRLik_xFQg,bMratNjTG5ZFEA6hVyr-xQ,5
31,YbMyvlDA2W3Py5lTz8VK-A,4hBhtCSgoxkrFgHa4YAD-w,bbEXAEFr4RYHLlZ-HFssTA,5
34,p198qZsKOMCUhgdtRWsOKQ,3MpDvy5gEdsbZh9-p92dHg,8QnuWGVNBhzyYXGSeRdi4g,4
...,...,...,...,...
6990270,7NgXAuTFiJHYbuepOPwU0w,x1QLCwZGFAjxRRw4EHc3-g,1_BVWDzi5cVqWxNe9bOMMQ,5
6990272,wD5ZWao_vjyT2h4xmGam8Q,7L7GL5Pi2cf8mbm2Dpw4zw,e_E-jq9mwm7wk75k7Yi-Xw,5
6990274,YVX1Wsa4LYxjvFwuHBb_gA,RKPkxOYQlM0BjhM-H6_vAw,X4mouE_cMiwbfyCPZ_K-FA,4
6990275,H0RIamZu0B0Ei0P4aeh3sQ,qskILQ3k0I_qcCMI-k6_QQ,jals67o91gcrD4DC81Vk6w,5


In [13]:
# Show the reviews of Franklin businesses; as the cell's last expression the
# frame is rendered as the cell output.
df_r_Franklin

Unnamed: 0,review_id,user_id,business_id,stars
406,89SF8MFoiY4mIhOgzAMVhA,S8rqFLdHCFAeibs52a4QpQ,kKk_2OAnCRbxX5rLqQYBHg,5
425,HyD8P-CcFU3FR75T0dJAhA,YsaJXW7VWV2bhoN5tokKrg,HCqmx4ENAZ76SAjoalj-MQ,4
693,3B41BdxNIENFP1EsCCybnw,B0fzg0eUWplNBzfSJsCFDg,VUOD8yD1jwq_pqRP8EETBQ,3
802,h0N47vwJhQMBH9mscX9zhw,11jQWQYTJkzIE6o3s5ID4Q,hn3Rg2JrhQoDJBEhrpuwWg,1
804,cgppGTg8LpqZObn_k69w2Q,TSxzAqKzU2OvjWSLoSU8dA,skN2XhKXlcjf53uIwzAedw,5
...,...,...,...,...
6989373,Rjv4qf02STIbSUu5mq4W4w,U0kiORl1Gpc8K9jOWWgHzg,yi5VHfhTIPopCob2toQH8A,1
6989504,B50r7a0YV-WnBgf3bW-Dyw,H9AGj0t3ghPaHug0Ri3kOQ,y8AENR609baGvVe-d_F-dg,1
6989863,lbhfWHGz3C6dyLp_aiKadg,EUITvLX8HgKpJPs8ngXRPw,7rxLUgFNwAWd69Y4iZnUCA,5
6990131,sXMx4wAegK38PRbzyWe5Bg,4Ry0q14UDewv2LMa2wibCQ,IoRTm7IAJqBbHpiisUBnLg,4


In [19]:
# Build the Surprise dataset from the Franklin reviews (user, item, rating columns)
rating_columns = ['user_id', 'business_id', 'stars']
reader = Reader(rating_scale=(1, 5))  # ratings are declared on a 1-5 scale
data = Dataset.load_from_df(df_r_Franklin[rating_columns], reader)


In [20]:
#df_r_u_and_f_PHI.to_csv("df_r_u_and_f_PHI.csv", sep=',', index=False, encoding='utf-8')

In [21]:
# Hold out 20% of the ratings for testing; the seed keeps the split repeatable
trainset, testset = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Define evaluation function
def evaluate_algorithm(algo, trainset, testset):
    """Fit `algo` on `trainset` and return its RMSE on `testset`.

    The score is obtained through `accuracy.rmse` applied to the predictions
    that `algo.test(testset)` returns after fitting.
    """
    algo.fit(trainset)
    test_predictions = algo.test(testset)
    return accuracy.rmse(test_predictions)

In [24]:
# User-based KNN: similarities are computed between users
ubcf_sim_options = {'user_based': True}
ubcf_algo = KNNBasic(sim_options=ubcf_sim_options)
ubcf_rmse = evaluate_algorithm(algo=ubcf_algo, trainset=trainset, testset=testset)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.5499


In [25]:
# Item-based KNN: similarities are computed between businesses
ibcf_sim_options = {'user_based': False}
ibcf_algo = KNNBasic(sim_options=ibcf_sim_options)
ibcf_rmse = evaluate_algorithm(algo=ibcf_algo, trainset=trainset, testset=testset)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.5693


In [26]:
# Singular Value Decomposition (SVD)
# Seed the algorithm, using the same seed as the train/test split, so that
# re-running the notebook reproduces the reported RMSE.
svd_algo = SVD(random_state=42)
svd_rmse = evaluate_algorithm(svd_algo, trainset, testset)

RMSE: 1.3738


In [27]:
# 5-fold cross-validation of user-based KNN, reporting RMSE and MAE per fold
ubcf_algo = KNNBasic(sim_options={'user_based': True})
cross_validate(
    ubcf_algo,
    data,
    measures=["RMSE", "MAE"],
    cv=5,
    verbose=True,
)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5324  1.5489  1.5447  1.5593  1.5376  1.5446  0.0093  
MAE (testset)     1.2708  1.2898  1.2813  1.2944  1.2706  1.2814  0.0097  
Fit time          6.47    6.93    7.04    6.94    6.98    6.87    0.21    
Test time         0.90    0.68    0.68    0.77    0.69    0.74    0.09    


{'test_rmse': array([1.53243037, 1.54893376, 1.54466266, 1.55928158, 1.53764759]),
 'test_mae': array([1.27079061, 1.2897851 , 1.28128195, 1.29435797, 1.27059805]),
 'fit_time': (6.466526985168457,
  6.929379224777222,
  7.03980016708374,
  6.9407618045806885,
  6.975151300430298),
 'test_time': (0.9030938148498535,
  0.6826369762420654,
  0.6790499687194824,
  0.7651760578155518,
  0.6874241828918457)}

In [None]:
# Build a user-based collaborative filtering recommender model
sim_options = {'name': 'cosine', 'user_based': True}
model = KNNBasic(sim_options=sim_options)

# Train the model on the training set
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

# Evaluate the model using RMSE (Root Mean Squared Error)
rmse = accuracy.rmse(predictions)

# Example: Get top N recommendations for a user
# Replace 'user_id' and 'n' with the actual user ID and the number of recommendations you want.
user_id = 'your_user_id'
n = 10

# Surprise keeps ratings under integer *inner* ids that are specific to each
# trainset, while `predict` takes the *raw* ids from the DataFrame. Everything
# below is therefore converted to raw ids before being compared or predicted.
full_trainset = data.build_full_trainset()
try:
    inner_uid = full_trainset.to_inner_uid(user_id)
except ValueError:
    # The user has no ratings in `data`, so nothing has to be excluded.
    user_items = set()
else:
    # `ur[inner_uid]` is a list of (inner item id, rating) pairs.
    user_items = {
        full_trainset.to_raw_iid(inner_iid)
        for inner_iid, _ in full_trainset.ur[inner_uid]
    }

# Candidate businesses: every item known to the fitted model that the user has not rated
user_unseen_items = [
    raw_iid
    for raw_iid in (trainset.to_raw_iid(inner_iid) for inner_iid in trainset.all_items())
    if raw_iid not in user_items
]
user_unseen_items_ratings = [model.predict(user_id, item_id).est for item_id in user_unseen_items]
top_n_items = [x for _, x in sorted(zip(user_unseen_items_ratings, user_unseen_items), reverse=True)][:n]

# Print the top N recommended items
print(f"Top {n} Recommended Items for User {user_id}:")
for i, item_id in enumerate(top_n_items, 1):
    print(f"{i}. Business ID: {item_id}")

print(f"RMSE: {rmse}")
