In [70]:
import os
import pandas as pd
import numpy as np

### Loading datasets

In [71]:
data_path = './data'

# define file path
train_file = os.path.join(data_path, 'review_train.csv')
test_file = os.path.join(data_path, 'review_test.csv')
weight_matrix_file = os.path.join(data_path, 'weight_matrix.csv')


In [72]:
# read datasets
train_data = pd.read_csv(train_file)
test_data = pd.read_csv(test_file)
weight_matrix = pd.read_csv(weight_matrix_file)


### Item-based CF Prediction

In [73]:
# convert the review data into a dictionary {(uid, bid): rating}
review_dict = {(row[0], row[1]): row[2] for row in train_data.values.tolist()}

# map each user_id to a list of business {user_id: list[business_ids]}
user_groups = train_data.groupby('uid')
user_business_dict = {uid: list(user_groups.get_group(uid)['bid']) for uid in user_groups.groups}

# transfer weight_matrix to a dictionary {(bid1, bid2): weight}
weight_matrix_dict = {(row[0], row[1]): row[2] for row in weight_matrix.values.tolist()}

In [74]:
# find the most N correlated business according to the weight matrix

def find_n_nearest_business(target_bid, neighbor_business, n=3):
    
    ### pseudocode ###
    # for bid in neighbor_business:
    #     get the correlation between the target_bid and bid
    
    # find the most N correlated business
    # return top N business
    
    neighbor_business_weight = {}
    
    for bid in neighbor_business:
        b_pair = tuple(sorted([target_bid, bid]))
        if weight_matrix_dict.get(b_pair):
            neighbor_business_weight[bid] = weight_matrix_dict[b_pair]
            
    sorted_neighbor_business_weight = sorted(neighbor_business_weight.items(), key = lambda kv:kv[1], reverse = True)
    
    if len(sorted_neighbor_business_weight) > n:
        return sorted_neighbor_business_weight[:n]
    else:
        return sorted_neighbor_business_weight

In [81]:
# compute weighted average over neighborhood set

def weighted_average_prediction(target_user, target_business, nearest_business_weight):
    
    ### pseudocode ###
    
    weighted_sum, sum_weight = 0.0, 0.0
    
    w_list, r_list = [], []
    
    for item in nearest_business_weight:  
        bid, weight = item[0], item[1]
        rating = review_dict[(target_user, bid)]
        w_list.append(weight)
        r_list.append(rating)
    
    weighted_sum = sum([w_list[i] * r for i,r in enumerate(r_list)]) if len(r_list) > 0 else 0.0
    sum_weight = sum(w_list) if len(w_list) > 0 else 0.0
    
    prediction = weighted_sum / sum_weight if sum_weight != 0.0 else 0.0
    
    return prediction


In [91]:
# make prediction for each given (user, business) pair in the testing data

results = []
NEIGHBOR_THRE = 3

for pair in test_data.values.tolist():
    
    target_user, target_business = pair[0], pair[1]
    neighbor_business = user_business_dict.get(target_user)
    
    if neighbor_business is None:
        results.append([target_user, target_business, 0.0])
        continue
    ### pseudocode ###
    n_nearest_business = find_n_nearest_business(target_business, neighbor_business, n = 3)
    
    prediction = weighted_average_prediction(target_user, target_business, n_nearest_business)
    results.append([target_user, target_business, prediction])


In [93]:
# finalize the results
# fill the missing predictions with average values

avg_rating = sum(list(train_data['ratings'])) / len(train_data)

def quick_check(x):
    if x > 5:
        return 5.0
    elif x < 1:
        return 1.0
    else:
        return x

final_results = []

for result in results:
    uid, bid, prediction = result[0], result[1], result[2]
    prediction = avg_rating if prediction == 0.0 else prediction
    prediction = quick_check(prediction)
    final_results.append([uid, bid, prediction])


In [112]:
print(len(final_results), len(test_data))

36480 36480


In [113]:
# write the results to a CSV file

results_df = pd.DataFrame(final_results, columns=['uid', 'bid', 'prediction'])
results_file_path = os.path.join(data_path, 'review_prediction.csv')
results_df.to_csv(results_file_path, index=False)


### Evaluation

In [117]:
# load ground truth for the testing data

test_ground_truth_file = os.path.join(data_path, 'review_test_ground_truth.csv')
test_ground_truth_data = pd.read_csv(test_ground_truth_file)
test_ground_truth_data = test_ground_truth_data.rename(columns={'user_id': 'uid', 'business_id': 'bid', 'stars': 'rating'})

evaluation = test_ground_truth_data.merge(results_df, on=['uid', 'bid'])
evaluation['delta'] = evaluation['ratings'] - evaluation['prediction']

In [118]:
# method 1: count the segments

segments = {0:0, 1:0, 2:0, 3:0, 4:0}

for _, row in evaluation.iterrows():
    if row['delta'] < 1.0:
        segments[0] +=1
    elif 1.0<= row['delta'] < 2.0:
        segments[1] +=1
    elif 2.0 <= row['delta'] < 3.0:
        segments[2] +=1
    elif 3.0 <= row['delta'] < 4.0:
        segments[3] +=1
    elif row['delta'] >= 4.0:
        segments[4] +=1
    
print(segments)
    

{0: 28070, 1: 7913, 2: 1443, 3: 267, 4: 59}


In [119]:
# method 2: compute RMSE
RMSE = (sum(evaluation['delta'] ** 2) / len(evaluation)) ** 0.5
print('RMSE = {}.'.format(RMSE))

RMSE = 1.2221553493271007.
