In [32]:
# import libraries

import json
import pandas as pd
import random
import numpy as np

from utils import load_filtered_data

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity



from surprise import Reader, Dataset, KNNBasic, accuracy, SVD
from surprise.model_selection import train_test_split

city_info = load_filtered_data('Sicklerville')

In [33]:
with open('communities/Sicklerville_business_reviews_communities.json') as f:
    data = json.load(f)
    
print(data['communities'])

[['RgDVC3ZUBqpEe6Y1kPhIpw', 'FV6QtuA5P52Cm5q7SOeMLQ', 'JSpwp4pAp_Hu4xmPuNitBA', 'UKjsdQvypama0X_henp77A', 'p_qFRYEqyT93hgzCJ4Jnxw', '1C5kFjSPdf9P74IhPy6e-w', '3Qqi2SFrqZDmzb_txv3yaQ', 'iJe_4Z_731GUnEFYZ8xREw', 'XDHIa7cmHBL47XYcPPe5Rw', 'hof0egGfooFnQOSPscC7vA', 'YV7cc6mX_58JzMbj13TF5A', '1-icdXJv2v9MMbytcVLBcQ', 'e0vKaw6oJl0o-aou8YzVNw', '8DGc1UEegk6SBGV39OJtQA', 'ET8n-r7glWYqZhuR6GcdNw', 'ypAXUlbv4GCbI-l2O1JfXw', 'W8HUz05IY7AkX70BaSSwFw', '4TE2IPdI9_4dYAalTGImUg', 'uoeB_8Cff5zFWWzzXUOLNw', 'NRBvM9AHOPYSXMB45RkZ9g', '1n3IENFR9dpPztzTh0A6dA', '0Q3dKYtiXehjNlTjiMD8dg', 'uxkOQHfcphFghmwUYjEvng', 'M6fECH_et0ffSODBrvQABA'], ['zkamNMEjihh3zN7lC7_WVw', 'LI_A-_R6SQotk6fX8C0nPQ', 'ApJ9YgYU-AhS4a-F5oTGHw', 'A_BF2dDDUTKGVXrqxO9mag', 'B-s-8VUnuBjGTP3d01jsyw', 'WTNE5YK9RTgP5jzStlbWjQ', 'wpfvF8ue3yr1m5xD4OrZNw', 'ZTvil7E2JbtORKm0bu6MKQ', 'Vm0fyrUjwMTDgwqV6DhqDw', 'KwC7I2KR-HlEb3KSH4a0bw', 'uIKD7cJF4ZWkicin2ulWPg', 'Djxt8nbOFGJK6F0bzOnL3g', '0ZUHtvIiZ1DPKKDTiieuKA', 'HiKXqt366G3FmHZyIxM20g', '8ybqiII

In [34]:
# Define evaluation function
def evaluate_algorithm(algo, trainset, testset):
    algo.fit(trainset)
    predictions = algo.test(testset)
    
    # Compute and return RMSE
    rmse = accuracy.rmse(predictions)
    return rmse

In [35]:
merged = pd.merge(city_info['review'], city_info['user'], on='user_id', how='left')
merged = pd.merge(merged, city_info['business'], on='business_id', how='left')
merged = merged.drop_duplicates(subset=['user_id', 'business_id'])


In [36]:
# Recommend top N items for a user using a recommender model
def recommend_top_n(algo, trainset, user_id, n=10):
    user_ratings = trainset.ur[user_id]
    items = [item_id for (item_id, _) in user_ratings]
    
    item_scores = {}
    for item_id in trainset.all_items():
        if item_id not in items:
            prediction = algo.predict(trainset.to_raw_uid(user_id), trainset.to_raw_iid(item_id), verbose=True)
            item_scores[item_id] = prediction.est
    
    top_items = sorted(item_scores, key=item_scores.get, reverse=True)[:n]

    #from raw_id to actual_id
    return [trainset.to_raw_iid(i) for i in top_items]

# Get users with positive ratings in testset and training set
def get_users(testset, trainset):
    # we just want to recommend positive ratings
    pos_rating = 3
    pos_testset = testset[testset["stars_x"] > pos_rating]
    # which users exist in the training and testset
    users = []
    for u in pos_testset["user_id"].unique():
        try :
            trainset.to_inner_uid(u)
            users.append(u)
        except ValueError:
            continue
    return users



In [37]:
def get_community_recommendations(community, merged, min_ratings_for_split=5, test_size=0.2):
    global count
    community_reviews = merged[merged['user_id'].isin(community)]
    if len(community_reviews) < min_ratings_for_split:
        count = count + 1
        return None
    
    # Convert DataFrame to Surprise Dataset
    reader = Reader(rating_scale=(1, 5))  # Ratings scale is from 1 to 5
    surprise_dataset = Dataset.load_from_df(community_reviews[['user_id', 'business_id', 'stars_x']], reader)
    
    # Split the dataset into train and test sets
    trainset, testset = train_test_split(surprise_dataset, test_size=test_size)

    # Train the KNNBasic algorithm
    algo = KNNBasic(sim_options={'user_based': True})
    algo.fit(trainset)
    
    # Make predictions on the test set
    predictions_knn = algo.test(testset)
    rmse_knn = accuracy.rmse(predictions_knn)

    return rmse_knn


# Generate recommendations for each community
community_recommendations = {}
count = 0
id = 0
for community in data['communities']:
    recommendation = get_community_recommendations(community, merged)
    if recommendation is not None:
        community_recommendations[id] = recommendation
    id = id + 1


Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.9698
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.3993
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.4334
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.6434
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.8258
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.6379
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.5600
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.7325
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.7340
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.6487
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.0000
Computing the msd similarity matrix...
Done computing 

In [38]:
import pandas as pd
import ast

def convert_to_binary(value):
    if isinstance(value, str) and (value.lower() == 'true' or value.lower() == "u'free'" or value.lower() == "'free'" or value.lower() == "'casual'" or value.lower() == "u'full_bar'"):
        return 1
    elif isinstance(value, str) and value.lower() == 'false':
        return 0
    elif isinstance(value, str) and (value.lower() == 'nan' or value.lower() == 'none'):
        return 0
    elif isinstance(value, str) and (value.lower() == "u'no'" or value.lower() == "u'casual'" or value.lower() == "u'none'"):
        return 0
    elif isinstance(value, str) and (value.lower().strip("'") == 'no' or value.lower().strip("'") == 'none'):
        return 0
    return value


for index, row in merged.iterrows():
    try:
        attributes_dict = ast.literal_eval(row['attributes'])
    except ValueError:
        # Skip this row if attributes column is not a valid dictionary
        continue
    
    merged.fillna(0, inplace=True)
    merged.replace('None', 0, inplace=True)

    for key, value in attributes_dict.items():
        attributes_dict[key] = convert_to_binary(value)
        
        if attributes_dict[key] is not None:  
            merged.at[index, key] = attributes_dict[key]
        else:
            merged.at[index, key] = 0 
    
    merged.at[index, 'attributes'] = attributes_dict

merged.drop('attributes', axis=1, inplace=True)

merged.to_csv("merged.csv", index=False)

In [46]:
from sklearn.metrics.pairwise import cosine_similarity

def get_community_recommendations(community, merged, min_ratings_for_split=5, top_n=5):
    global count
    community_reviews = merged[merged['user_id'].isin(community)]
    if len(community_reviews) < min_ratings_for_split:
        count += 1
        return None
    
    # Content-based recommendation
    business_data = merged[['user_id', 'business_id', 'stars_x']].drop_duplicates()

    # # Standardize numerical features
    # scaler = StandardScaler()
    # business_data[['stars', 'review_count']] = scaler.fit_transform(business_data[['stars', 'review_count']])

    # Calculate similarity matrix
    similarity_matrix = cosine_similarity(merged[['stars_x', 'RestaurantsPriceRange2','ByAppointmentOnly', 'BusinessAcceptsCreditCards',
                                                         'WiFi','GoodForKids','RestaurantsReservations','RestaurantsTakeOut','BikeParking',
                                                         'Caters','RestaurantsAttire','RestaurantsGoodForGroups','HasTV','OutdoorSeating','DogsAllowed',
                                                         'Alcohol','WheelchairAccessible','RestaurantsTableService', 'RestaurantsDelivery','HappyHour',
                                                         'Corkage','BusinessAcceptsBitcoin','CoatCheck', 'DriveThru']])
    # Check Smoking

    # Get top recommendations for each user
    community_recommendations = {}
    for user_id in community:
        reviewed_businesses = community_reviews[community_reviews['user_id'] == user_id]['business_id'].unique()
        
        # Find indices of reviewed businesses within business data
        reviewed_indices = [i for i, bid in enumerate(merged['business_id']) if bid in reviewed_businesses]
        
        # Calculate average similarity with reviewed businesses
        avg_similarity = similarity_matrix[:, reviewed_indices].mean(axis=1)
        
        # Get top similar businesses (excluding those already reviewed)
        top_indices = avg_similarity.argsort()[-top_n:][::-1]
        top_recommendations = business_data.iloc[top_indices]['business_id'].tolist()
        
        community_recommendations[user_id] = top_recommendations
    
    return community_recommendations


# Generate recommendations for each community
community_recommendations = {}
count = 0
id = 0
for community in data['communities']:
    recommendation = get_community_recommendations(community, merged)
    print(recommendation)
    if recommendation is not None:
        community_recommendations[id] = recommendation
    id += 1


{'RgDVC3ZUBqpEe6Y1kPhIpw': ['DNuhEgKexpttzRobB221Vg', 'g0166v2G6DjO8LS73FVzlw', '4_II4KQ4EO_2Tcld-_Rxvg', 'g0166v2G6DjO8LS73FVzlw', '8fCFeHZzujXOZPAnGNR9gg'], 'FV6QtuA5P52Cm5q7SOeMLQ': ['DNuhEgKexpttzRobB221Vg', 'g0166v2G6DjO8LS73FVzlw', '4_II4KQ4EO_2Tcld-_Rxvg', 'g0166v2G6DjO8LS73FVzlw', '8fCFeHZzujXOZPAnGNR9gg'], 'JSpwp4pAp_Hu4xmPuNitBA': ['DNuhEgKexpttzRobB221Vg', 'g0166v2G6DjO8LS73FVzlw', '4_II4KQ4EO_2Tcld-_Rxvg', 'g0166v2G6DjO8LS73FVzlw', '8fCFeHZzujXOZPAnGNR9gg'], 'UKjsdQvypama0X_henp77A': ['8fCFeHZzujXOZPAnGNR9gg', '4_II4KQ4EO_2Tcld-_Rxvg', '2NLe7mbq_MpW4jXpV4VRYQ', '4_II4KQ4EO_2Tcld-_Rxvg', 'tx00Ucw5aza5q-yxxU2Efg'], 'p_qFRYEqyT93hgzCJ4Jnxw': ['DNuhEgKexpttzRobB221Vg', 'g0166v2G6DjO8LS73FVzlw', '4_II4KQ4EO_2Tcld-_Rxvg', 'g0166v2G6DjO8LS73FVzlw', '8fCFeHZzujXOZPAnGNR9gg'], '1C5kFjSPdf9P74IhPy6e-w': ['DNuhEgKexpttzRobB221Vg', 'g0166v2G6DjO8LS73FVzlw', '4_II4KQ4EO_2Tcld-_Rxvg', 'g0166v2G6DjO8LS73FVzlw', '8fCFeHZzujXOZPAnGNR9gg'], '3Qqi2SFrqZDmzb_txv3yaQ': ['DNuhEgKexpttzRobB221Vg'