In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import deque
from surprise import Dataset, NormalPredictor, Reader
from surprise.model_selection import cross_validate
import os
from sklearn.model_selection import train_test_split
import pickle
from surprise import accuracy
from surprise import accuracy, Dataset, Reader, SVD, KNNBaseline
from surprise.model_selection import PredefinedKFold
import pprint
import tempfile
from typing import Dict, Text
import tensorflow as tf
import tensorflow_recommenders as tfrs
from collections import defaultdict
import joblib
from scipy import stats

In [2]:
#!pwd

/Users/alecclarkfeather/capstone_project


In [3]:
# Project root; every CSV path used below is built by string concatenation
# onto PATH, so it must keep its trailing slash.
PATH = '/Users/alecclarkfeather/capstone_project/'

# Locations of the first hit-rate fold (train and test splits).
train_hr0_path = PATH + 'hit_rate_folds_actual/train_hr0.csv'
test_hr0_path = PATH + 'hit_rate_folds_actual/test_hr0.csv'

# The files are read with header=None, so the columns are named in the next cell.
train_hr0 = pd.read_csv(train_hr0_path, header=None)
test_hr0 = pd.read_csv(test_hr0_path, header=None)

In [4]:
# Both folds were read without a header row; give them the same column names.
rating_columns = ['user_id', 'movie_id', 'rating']
for fold_df in (train_hr0, test_hr0):
    fold_df.columns = rating_columns

In [5]:
# Preview the first rows of the training fold to check the column names just assigned.
train_hr0.head()

Unnamed: 0,user_id,movie_id,rating
0,826574,9729,4.0
1,2200645,1719,4.0
2,2150434,5356,5.0
3,2142065,8753,3.0
4,867086,6721,4.0


In [10]:
# Take the ratings with index labels 0 through 4 (label slicing with .loc
# includes the end label, so this is five values) for a small RMSE hand check.
sample = train_hr0.loc[:4, 'rating']
sample

0    4.0
1    4.0
2    5.0
3    3.0
4    4.0
Name: rating, dtype: float64

In [11]:
# Constant prediction of 3 for each of the five sampled ratings.
sample_preds = [3] * 5

In [12]:
# Convert both the actual ratings and the predictions to NumPy arrays so the
# error can be computed element-wise.
sample_actual = sample.to_numpy()
sample_preds_arr = np.asarray(sample_preds)

In [13]:
# Root-mean-squared error of the constant predictions against the sampled ratings.
squared_errors = np.square(sample_actual - sample_preds_arr)
np.sqrt(squared_errors.mean())

1.1832159566199232

In [15]:
# Hand check of the RMSE above: the errors for ratings 4, 4, 5, 3, 4 against a
# prediction of 3 are 1, 1, 2, 0, 1, so the squared errors sum to 7 over 5 samples.
np.sqrt(7/5)

1.1832159566199232

In [16]:
# Load the neural-network hit-rate results from hr_nn_df.csv and display them.
hr_nn_df = pd.read_csv(PATH + 'hr_nn_df.csv')
hr_nn_df

Unnamed: 0,neural_network_hr
0,0.488122
1,0.487022
2,0.488501
3,0.486388
4,0.487842
5,0.486011
6,0.48702
7,0.486285
8,0.487874
9,0.486775


In [17]:
# Load the hit-rate results from hr_results_df.csv and display them.
hr_results_df = pd.read_csv(PATH + 'hr_results_df.csv')
hr_results_df

Unnamed: 0,baseline,cf,svd
0,0.296561,0.439228,0.458413
1,0.297034,0.439001,0.460726
2,0.297715,0.4389,0.459328
3,0.298997,0.437977,0.460333
4,0.295871,0.439655,0.460662
5,0.298046,0.438874,0.459719
6,0.297368,0.43956,0.461304
7,0.29803,0.439275,0.460432
8,0.296864,0.438533,0.459555
9,0.298423,0.439623,0.459121


In [18]:
# Put the neural-network hit rates next to the other models' hit rates.
# axis=1 concatenation aligns rows on the index; both frames were read with
# the default integer index.
# NOTE(review): this assumes row i holds the same fold in both files — confirm.
hr_results_df_final = pd.concat([hr_results_df, hr_nn_df], axis=1)
hr_results_df_final

Unnamed: 0,baseline,cf,svd,neural_network_hr
0,0.296561,0.439228,0.458413,0.488122
1,0.297034,0.439001,0.460726,0.487022
2,0.297715,0.4389,0.459328,0.488501
3,0.298997,0.437977,0.460333,0.486388
4,0.295871,0.439655,0.460662,0.487842
5,0.298046,0.438874,0.459719,0.486011
6,0.297368,0.43956,0.461304,0.48702
7,0.29803,0.439275,0.460432,0.486285
8,0.296864,0.438533,0.459555,0.487874
9,0.298423,0.439623,0.459121,0.486775


In [24]:
def five_by_two_paired_t_test(df, col1, col2, two_sided=False):
    """Compute the 5x2cv paired t-test statistic for two columns of scores.

    The first ten rows of ``df`` are consumed as five consecutive pairs:
    rows (0, 1) form the first replication, rows (2, 3) the second, and so
    on. Any rows after the tenth are ignored.

    For each replication the two differences ``col1 - col2`` are taken and a
    variance estimate is formed as the sum of their squared deviations from
    their own mean. The statistic is the very first difference (row 0)
    divided by the square root of the mean of the five variance estimates,
    and is referred to a Student t distribution with five degrees of freedom.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one score per row for each model being compared.
    col1, col2 : str
        Names of the two score columns; differences are ``col1 - col2``.
    two_sided : bool, default False
        When False the returned p-value is the one-tailed survival
        probability ``P(T >= |t|)``, which is what this function has always
        returned. Pass True to get the two-tailed p-value (twice the
        one-tailed value).

    Returns
    -------
    tuple
        ``(t_statistic, p_value)``.

    Raises
    ------
    IndexError
        If ``df`` has fewer than ten rows.

    Notes
    -----
    If every replication has zero variance the denominator is zero and the
    statistic comes back as ``inf`` or ``nan`` (NumPy emits a RuntimeWarning
    instead of raising).
    """
    n_replications = 5
    n_rows = 2 * n_replications

    scores1 = df[col1].values[:n_rows]
    scores2 = df[col2].values[:n_rows]
    if len(scores1) < n_rows:
        # IndexError is kept on purpose: it is the exception that indexing
        # past the end of a short column raised before this explicit check.
        raise IndexError(
            f"five_by_two_paired_t_test needs at least {n_rows} rows "
            f"(5 replications x 2 folds), got {len(scores1)}"
        )

    # One row per replication, one column per fold within that replication.
    fold_diffs = (scores1 - scores2).reshape(n_replications, 2)
    replication_means = fold_diffs.mean(axis=1, keepdims=True)
    variances = ((fold_diffs - replication_means) ** 2).sum(axis=1)

    # Only the first fold's difference goes in the numerator; the other
    # differences contribute through the pooled variance.
    pooled_variance = (1 / n_replications) * np.sum(variances)
    t_statistic = fold_diffs[0, 0] / np.sqrt(pooled_variance)

    p_value = stats.t.sf(abs(t_statistic), df=n_replications)
    if two_sided:
        p_value = 2 * p_value

    return t_statistic, p_value
        

In [20]:
# Load the t-test results stored in t_test_results_hr_df.csv and display them.
t_test_results_hr_df = pd.read_csv(PATH + 't_test_results_hr_df.csv')
t_test_results_hr_df

Unnamed: 0,t_statistic,p_value_unadjusted
0,-115.729967,4.567698e-10
1,-115.890316,4.536195e-10
2,-16.915946,6.601872e-06


In [21]:
# Label each row of the loaded t-test results with the pair of models it compares.
# NOTE(review): assumes the CSV rows are stored in exactly this order —
# confirm against the code that wrote t_test_results_hr_df.csv.
index=['baseline_vs_cf','baseline_vs_svd','cf_vs_svd']
t_test_results_hr_df.index = index
t_test_results_hr_df

Unnamed: 0,t_statistic,p_value_unadjusted
baseline_vs_cf,-115.729967,4.567698e-10
baseline_vs_svd,-115.890316,4.536195e-10
cf_vs_svd,-16.915946,6.601872e-06


In [27]:
# Compare each of the three other models against the neural network with the
# 5x2cv paired t-test.
t_b_vs_nn, p_b_vs_nn = five_by_two_paired_t_test(hr_results_df_final, 'baseline', 'neural_network_hr')
t_cf_vs_nn, p_cf_vs_nn = five_by_two_paired_t_test(hr_results_df_final, 'cf', 'neural_network_hr')
t_svd_vs_nn, p_svd_vs_nn = five_by_two_paired_t_test(hr_results_df_final, 'svd', 'neural_network_hr')

# Collect the results in the order baseline, cf, svd; the row labels given to
# the results frame later rely on this order.
nn_hr_t_stats = [t_b_vs_nn, t_cf_vs_nn, t_svd_vs_nn]
nn_hr_p_vals = [p_b_vs_nn, p_cf_vs_nn, p_svd_vs_nn]

In [28]:
# Show the t-statistics collected for the three neural-network comparisons.
nn_hr_t_stats

[-96.93358254993991, -53.738861718903244, -19.75200638954657]

In [29]:
# Show the p-values collected for the three neural-network comparisons.
nn_hr_p_vals

[1.107662498914664e-09, 2.109704460894901e-08, 3.0715976673604223e-06]

In [30]:
# Column-oriented mapping for the neural-network comparisons, using the same
# column names as the t-test results loaded from CSV.
t_test_results_dict_nn_hr = {
    't_statistic': nn_hr_t_stats,
    'p_value_unadjusted': nn_hr_p_vals,
}
t_test_results_dict_nn_hr

{'t_statistic': [-96.93358254993991, -53.738861718903244, -19.75200638954657],
 'p_value_unadjusted': [1.107662498914664e-09,
  2.109704460894901e-08,
  3.0715976673604223e-06]}

In [31]:
# Row labels follow the order in which the comparisons were collected:
# baseline, cf, svd (each against the neural network).
nn_comparison_labels = [
    'baseline_vs_neural_network',
    'cf_vs_neural_network',
    'svd_vs_neural_network',
]
t_test_results_hr_nn_df = pd.DataFrame(t_test_results_dict_nn_hr, index=nn_comparison_labels)
t_test_results_hr_nn_df

Unnamed: 0,t_statistic,p_value_unadjusted
baseline_vs_neural_network,-96.933583,1.107662e-09
cf_vs_neural_network,-53.738862,2.109704e-08
svd_vs_neural_network,-19.752006,3.071598e-06


In [32]:
# Stack the comparisons loaded from CSV on top of the neural-network
# comparisons (row-wise concat); both frames have the columns
# t_statistic and p_value_unadjusted.
t_test_results_hr_final = pd.concat([t_test_results_hr_df, t_test_results_hr_nn_df], axis=0)
t_test_results_hr_final

Unnamed: 0,t_statistic,p_value_unadjusted
baseline_vs_cf,-115.729967,4.567698e-10
baseline_vs_svd,-115.890316,4.536195e-10
cf_vs_svd,-16.915946,6.601872e-06
baseline_vs_neural_network,-96.933583,1.107662e-09
cf_vs_neural_network,-53.738862,2.109704e-08
svd_vs_neural_network,-19.752006,3.071598e-06


In [33]:
# Bonferroni correction: divide the significance level by the number of
# comparisons. The results table has six rows, one per pair of the four models.
original_alpha = 0.05
n_comparisons = 6
bonferroni_adjusted_alpha = original_alpha / n_comparisons
bonferroni_adjusted_alpha

0.008333333333333333

In [34]:
def _significance_label(p_value):
    """Return 'Yes' if p_value is at or below the Bonferroni-adjusted alpha, else 'No'."""
    return 'Yes' if p_value <= bonferroni_adjusted_alpha else 'No'


# Flag each comparison by testing its unadjusted p-value against the adjusted
# alpha (equivalent to adjusting the p-values and testing against 0.05).
t_test_results_hr_final['significant'] = (
    t_test_results_hr_final['p_value_unadjusted'].apply(_significance_label)
)

t_test_results_hr_final

Unnamed: 0,t_statistic,p_value_unadjusted,significant
baseline_vs_cf,-115.729967,4.567698e-10,Yes
baseline_vs_svd,-115.890316,4.536195e-10,Yes
cf_vs_svd,-16.915946,6.601872e-06,Yes
baseline_vs_neural_network,-96.933583,1.107662e-09,Yes
cf_vs_neural_network,-53.738862,2.109704e-08,Yes
svd_vs_neural_network,-19.752006,3.071598e-06,Yes


In [35]:
# Re-display the combined hit-rate table before computing summary statistics from it.
hr_results_df_final

Unnamed: 0,baseline,cf,svd,neural_network_hr
0,0.296561,0.439228,0.458413,0.488122
1,0.297034,0.439001,0.460726,0.487022
2,0.297715,0.4389,0.459328,0.488501
3,0.298997,0.437977,0.460333,0.486388
4,0.295871,0.439655,0.460662,0.487842
5,0.298046,0.438874,0.459719,0.486011
6,0.297368,0.43956,0.461304,0.48702
7,0.29803,0.439275,0.460432,0.486285
8,0.296864,0.438533,0.459555,0.487874
9,0.298423,0.439623,0.459121,0.486775


In [36]:
# Per-model mean and sample standard deviation (ddof=1) of the hit rates,
# keyed by column name.
hr_means = {}
hr_stds = {}

for col, scores in hr_results_df_final.items():
    score_values = scores.values
    hr_means[col] = np.mean(score_values)
    hr_stds[col] = np.std(score_values, ddof=1)



In [37]:
# Show the mean hit rate per model.
hr_means

{'baseline': 0.2974909388054058,
 'cf': 0.43906257989659575,
 'svd': 0.45995943789521176,
 'neural_network_hr': 0.48718396105595063}

In [38]:
# Show the sample standard deviation (ddof=1) of the hit rate per model.
hr_stds

{'baseline': 0.0009363190822027744,
 'cf': 0.000528164959683763,
 'svd': 0.0008797856905011649,
 'neural_network_hr': 0.0008538506238907006}

In [39]:
# Turn each dict of per-model scalars into a one-row frame; the index label
# names the statistic held in that row.
hr_means_df = pd.DataFrame(hr_means, index=['mean'])
hr_stds_df = pd.DataFrame(hr_stds, index=['standard deviation'])

# Stack the two rows into a single summary table (one column per model).
hr_summary_statistics_final = pd.concat([hr_means_df, hr_stds_df], axis=0)

hr_summary_statistics_final

Unnamed: 0,baseline,cf,svd,neural_network_hr
mean,0.297491,0.439063,0.459959,0.487184
standard deviation,0.000936,0.000528,0.00088,0.000854


In [40]:
# Persist the summary statistics, the combined hit-rate table and the t-test
# results under PATH. to_csv writes the index by default, which keeps the row
# labels of the summary and t-test tables.
# NOTE(review): hr_results_df_final has only the default integer index, so its
# file gains an unnamed first column that the input CSVs read above do not
# have — confirm that downstream readers expect it (or pass index=False).
hr_summary_statistics_final.to_csv(PATH + 'hr_summary_statistics_final.csv')
hr_results_df_final.to_csv(PATH + 'hr_results_df_final.csv')
t_test_results_hr_final.to_csv(PATH + 't_test_results_hr_final.csv')

In [41]:
# Look at the training fold again before checking it for duplicate rows.
train_hr0.head()

Unnamed: 0,user_id,movie_id,rating
0,826574,9729,4.0
1,2200645,1719,4.0
2,2150434,5356,5.0
3,2142065,8753,3.0
4,867086,6721,4.0


In [42]:
# Count rows that repeat an earlier row across all three columns
# (user_id, movie_id, rating).
train_hr0.duplicated().sum()

0