In [2]:
# Use the %pip magic (not !pip) so the package is installed into the environment
# of the running kernel. The `surprise` module is distributed as `scikit-surprise`.
%pip install scikit-surprise

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.11/bin/python3.11 -m pip install --upgrade pip' command.[0m


In [3]:
import numpy as np
import pandas as pd
from surprise import Dataset, Reader, NormalPredictor, KNNBasic, KNNWithZScore, KNNWithMeans, KNNWithZScore, SVD
from surprise.model_selection import train_test_split, cross_validate
from collections import defaultdict
from surprise import accuracy

In [4]:
def read_chunks(file, cols, chunk_size=500000,
                data_dir='/Users/ryounes/Documents/Feup/RecSNA_Project'):
    """Load selected columns of one Yelp JSON-lines file, reading it chunk by chunk.

    Parameters
    ----------
    file : str
        Dataset suffix; the file read is
        ``<data_dir>/yelp_academic_dataset_<file>.json``.
    cols : list of str
        Columns to keep from every chunk. A missing column raises ``KeyError``.
    chunk_size : int, default 500000
        Number of lines parsed per chunk.
    data_dir : str or path-like, optional
        Directory containing the dataset files. Defaults to the location the
        notebook was originally written against.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated in file order with a fresh 0..n-1 index.
        An empty frame with the requested columns is returned when the file
        contains no records.
    """
    path = f'{data_dir}/yelp_academic_dataset_{file}.json'

    # With `chunksize`, read_json returns a reader object that holds the file
    # open; the `with` block closes it even if selecting `cols` fails midway.
    with pd.read_json(path, chunksize=chunk_size, lines=True) as reader:
        chunk_list = [chunk[cols] for chunk in reader]

    # pd.concat raises on an empty list, so handle an empty file explicitly.
    if not chunk_list:
        return pd.DataFrame(columns=cols)

    return pd.concat(chunk_list, ignore_index=True)

In [5]:
# The business file is read in a single call (one JSON record per line),
# unlike the user and review files, which go through read_chunks.
business_json_path = (
    '/Users/ryounes/Documents/Feup/RecSNA_Project/'
    'yelp_academic_dataset_business.json'
)
df_b = pd.read_json(path_or_buf=business_json_path, lines=True)

In [6]:
# User table: keep only the id, the display name and the friends column.
df_u = read_chunks(
    file='user',
    cols=['user_id', 'name', 'friends'],
)

In [7]:
# Review table: ids linking user and business, plus the star rating.
df_r = read_chunks(
    file='review',
    cols=['review_id', 'user_id', 'business_id', 'stars'],
)

In [8]:
# Keep only businesses whose `city` is exactly 'Philadelphia' (case-sensitive match).
in_philadelphia = df_b['city'].eq('Philadelphia')
df_philadelphia_b = df_b.loc[in_philadelphia]

In [9]:
# --- Reviews written about Philadelphia businesses ---
business_ids_philadelphia = df_philadelphia_b['business_id'].unique()
is_philadelphia_review = df_r['business_id'].isin(business_ids_philadelphia)
df_r_philadelphia = df_r[is_philadelphia_review]

# --- Users who wrote at least one of those reviews ---
philadelphia_reviewed_user_ids = df_r_philadelphia['user_id'].unique()
is_philadelphia_reviewer = df_u['user_id'].isin(philadelphia_reviewed_user_ids)

# --- Friends of those users ---
# `friends` is a single string per user; splitting on ', ' and exploding
# yields one id per row before de-duplication.
all_friends_user_ids = (
    df_u.loc[is_philadelphia_reviewer, 'friends']
    .str.split(', ')
    .explode()
    .unique()
)

# --- User rows for the reviewers and their friends ---
is_friend_of_reviewer = df_u['user_id'].isin(all_friends_user_ids)
df_users_philadelphia_and_friends = df_u[is_friend_of_reviewer | is_philadelphia_reviewer]

# --- Final review set ---
# A review is kept when its author is one of the selected users OR when it is
# about a Philadelphia business.
is_selected_author = df_r['user_id'].isin(df_users_philadelphia_and_friends['user_id'])
df_r_u_and_f_PHI = df_r[is_selected_author | is_philadelphia_review]

df_r_u_and_f_PHI


Unnamed: 0,review_id,user_id,business_id,stars
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5
5,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1
...,...,...,...,...
6990275,H0RIamZu0B0Ei0P4aeh3sQ,qskILQ3k0I_qcCMI-k6_QQ,jals67o91gcrD4DC81Vk6w,5
6990276,shTPgbgdwTHSuU67mGCmZQ,Zo0th2m8Ez4gLSbHftiQvg,2vLksaMmSEcGbjI5gywpZA,5
6990277,YNfNhgZlaaCO5Q_YJR4rEw,mm6E4FbCMwJmb7kPDZ5v2Q,R1khUUxidqfaJmcpmGd4aw,4
6990278,i-I4ZOhoX70Nw5H0FwrQUA,YwAMC-jvZ1fvEUum6QkEkw,Rr9kKArrMhSLVE9a53q-aA,5


In [10]:
# Wrap the filtered reviews in a Surprise Dataset.
# The three columns are passed in the order user, item, rating.
reader = Reader(rating_scale=(1, 5))  # ratings are declared to span 1 to 5
ratings_frame = df_r_u_and_f_PHI.loc[:, ['user_id', 'business_id', 'stars']]
data = Dataset.load_from_df(ratings_frame, reader)


In [11]:
# Hold out 20% of the ratings for testing; random_state is fixed at 42 so the
# same split is produced on every run.
trainset, testset = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [13]:
def evaluate_algorithm(algo, trainset, testset):
    """Fit ``algo`` on ``trainset``, predict ``testset`` and return the RMSE.

    Parameters
    ----------
    algo
        Object exposing ``fit(trainset)`` and ``test(testset)``.
    trainset
        Training data handed to ``algo.fit``.
    testset
        Held-out data handed to ``algo.test``.

    Returns
    -------
    The value returned by ``accuracy.rmse`` for the test predictions.
    """
    algo.fit(trainset)
    return accuracy.rmse(algo.test(testset))

In [1]:
# User-based KNN baseline: only `user_based` is set, every other similarity
# option is left at the library's default.
ubcf_algo = KNNBasic(sim_options=dict(user_based=True))
ubcf_rmse = evaluate_algorithm(
    algo=ubcf_algo,
    trainset=trainset,
    testset=testset,
)

NameError: name 'KNNBasic' is not defined

In [None]:
# Build a user-based collaborative filtering recommender model (cosine similarity)
sim_options = {'name': 'cosine', 'user_based': True}
model = KNNBasic(sim_options=sim_options)

# Train the model on the training set
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

# Evaluate the model using RMSE (Root Mean Squared Error)
rmse = accuracy.rmse(predictions)

# Example: Get top N recommendations for a user
# Replace 'user_id' and 'n' with the actual user ID and the number of recommendations you want.
user_id = 'your_user_id'
n = 10

# Surprise keeps two id spaces: *raw* ids (the strings from the DataFrame) and
# *inner* ids (integers private to one Trainset). `trainset.ur` and
# `trainset.all_items()` use inner ids, while `model.predict` expects raw ids,
# so every id is converted explicitly below.

# Businesses the user has already rated anywhere in the data (raw ids), taken
# from the same DataFrame the Surprise dataset was built from.
rated_by_user = df_r_u_and_f_PHI['user_id'] == user_id
user_items = set(df_r_u_and_f_PHI.loc[rated_by_user, 'business_id'])

try:
    # Raises ValueError when the user has no rating in the training set; in
    # that case the model can only return its default estimate.
    trainset.to_inner_uid(user_id)
except ValueError:
    print(f"User {user_id} is not in the training set; no recommendations available.")
    user_unseen_items = []
else:
    # Candidate items are those known to the trained model, as raw ids.
    candidate_items = (trainset.to_raw_iid(inner_iid) for inner_iid in trainset.all_items())
    user_unseen_items = [item_id for item_id in candidate_items if item_id not in user_items]

# Estimated rating for every unseen business, then keep the n highest.
user_unseen_items_ratings = [model.predict(user_id, item_id).est for item_id in user_unseen_items]
top_n_items = [x for _, x in sorted(zip(user_unseen_items_ratings, user_unseen_items), reverse=True)][:n]

# Print the top N recommended items
print(f"Top {n} Recommended Items for User {user_id}:")
for i, item_id in enumerate(top_n_items, 1):
    print(f"{i}. Business ID: {item_id}")

print(f"RMSE: {rmse}")
