In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [2]:
# this data is based on the output of 1.0.0_Data_Filtering
data = pd.read_csv('../data/filtered_reviews_in_Phonex.csv')
data.head(3)

Unnamed: 0.1,Unnamed: 0,review_id,user_id,business_id,stars,text,date,business_name,city,state,categories
0,1,aBWKb49Xfkv1946YN7_SIg,sSPbLBHcEMXaJfoO8zs1bA,poSV39UqEg-gpESXafS9-g,5.0,"Amazing food, drinks, service!\n\nWe started w...",2016-01-17 05:33:14,Angry Crab Shack,Phoenix,AZ,"Restaurants, American (New), Seafood, Cajun/Cr..."
1,2,jCHaWXgppmZjkOdpFltWGA,D5ywfFmwtJxLReqAYlDDmw,poSV39UqEg-gpESXafS9-g,5.0,I couldn't be more excited to have found this ...,2016-01-30 01:13:29,Angry Crab Shack,Phoenix,AZ,"Restaurants, American (New), Seafood, Cajun/Cr..."
2,5,CfueO6B4_WauGRQ0cq9Whg,fhwZh6_7YxWeVEuskKMTcA,poSV39UqEg-gpESXafS9-g,5.0,Loved this place! \nI came for crab legs and w...,2016-05-07 01:19:53,Angry Crab Shack,Phoenix,AZ,"Restaurants, American (New), Seafood, Cajun/Cr..."


In [3]:
def train_valid_test_split(data, m, n):

    '''
    construct rating matrix from data
    the columns of which represent business_id
    the rows of which represent user_id
    the values of whose elements represent the according ratings
    @ data: filterd_reviews 
    @ m: counts of ratings for validation
    @ n: counts of ratings for test
    '''

    # to construct sparse matrix
    # train
    train_user_id = []
    train_business_id = []
    train_stars = []
    # validation
    valid_user_id = []
    valid_business_id = []
    valid_stars = []
    # train + validation
    train_valid_user_id = []
    train_valid_business_id = []
    train_valid_stars = []
    # test
    test_user_id = []
    test_business_id = []
    test_stars = []
    
    user_id_lst = data['user_id'].unique().tolist() # rows of sparse matrix
    busi_id_lst = data['business_id'].unique().tolist() # columns of sparse matrix

    train_sparse_matrix = np.zeros(shape=(len(user_id_lst), len(busi_id_lst)))
    valid_sparse_matrix = np.zeros(shape=(len(user_id_lst), len(busi_id_lst)))
    train_valid_sparse_matrix = np.zeros(shape=(len(user_id_lst), len(busi_id_lst)))
    test_sparse_matrix = np.zeros(shape=(len(user_id_lst), len(busi_id_lst)))

    ranking_df = data[['user_id','business_id','stars','date']].groupby(['user_id'])
    
    for group_name, group_df in ranking_df:
        group_df = group_df.sort_values(by='date')

        # if the len(group_df) > valid_m + test_n, split the group_df as 
        # training set : group_df.iloc[:len(group_df)-m-n, :]
        # validation set : group_df.iloc[len(group_df)-m-n:len(group_df)-n, :]
        # test set : group_df.iloc[len(group_df)-n:, :]

        # otherwise, not split the group_df
        # keep the group_df as training set

        if len(group_df) > m+n: 

            training_set = group_df.iloc[:len(group_df)-m-n, :]
            train_user_id.extend(training_set.loc[:,'user_id'].tolist()) 
            train_business_id.extend(training_set.loc[:,'business_id'].tolist())
            train_stars.extend(training_set.loc[:,'stars'].tolist())

            validation_set = group_df.iloc[len(group_df)-m-n:len(group_df)-n, :]
            valid_user_id.extend(validation_set.loc[:,'user_id'].tolist()) 
            valid_business_id.extend(validation_set.loc[:,'business_id'].tolist())
            valid_stars.extend(validation_set.loc[:,'stars'].tolist())
            
            train_validation_set = group_df.iloc[:len(group_df)-n, :]
            train_valid_user_id.extend(train_validation_set.loc[:,'user_id'].tolist()) 
            train_valid_business_id.extend(train_validation_set.loc[:,'business_id'].tolist())
            train_valid_stars.extend(train_validation_set.loc[:,'stars'].tolist())

            testing_set = group_df.iloc[len(group_df)-n:, :]
            test_user_id.extend(testing_set.loc[:,'user_id'].tolist()) 
            test_business_id.extend(testing_set.loc[:,'business_id'].tolist())
            test_stars.extend(testing_set.loc[:,'stars'].tolist())

        else:
            training_set = group_df
            train_user_id.extend(training_set.loc[:,'user_id'].tolist()) 
            train_business_id.extend(training_set.loc[:,'business_id'].tolist())
            train_stars.extend(training_set.loc[:,'stars'].tolist())

    train_df = pd.DataFrame({'user_id': train_user_id, 'business_id': train_business_id, 'stars': train_stars})
    valid_df = pd.DataFrame({'user_id': valid_user_id, 'business_id': valid_business_id, 'stars': valid_stars})
    train_valid_df = pd.DataFrame({'user_id': train_valid_user_id, 'business_id': train_valid_business_id, 'stars': train_valid_stars})
    test_df = pd.DataFrame({'user_id': test_user_id, 'business_id': test_business_id, 'stars': test_stars})


    for i in range(len(train_df)):
        ratings = train_df.iloc[i, 2] # stars
        row_index = user_id_lst.index(train_df.iloc[i, 0]) # user_id
        column_index = busi_id_lst.index(train_df.iloc[i, 1]) # business_id
        train_sparse_matrix[row_index, column_index] = ratings

    for i in range(len(valid_df)):
        ratings = valid_df.iloc[i, 2] # stars
        row_index = user_id_lst.index(valid_df.iloc[i, 0]) # user_id
        column_index = busi_id_lst.index(valid_df.iloc[i, 1]) # business_id
        valid_sparse_matrix[row_index, column_index] = ratings
        
    for i in range(len(train_valid_df)):
        ratings = train_valid_df.iloc[i, 2] # stars
        row_index = user_id_lst.index(train_valid_df.iloc[i, 0]) # user_id
        column_index = busi_id_lst.index(train_valid_df.iloc[i, 1]) # business_id
        train_valid_sparse_matrix[row_index, column_index] = ratings
        
    for i in range(len(test_df)):
        ratings = test_df.iloc[i, 2] # stars
        row_index = user_id_lst.index(test_df.iloc[i, 0]) # user_id
        column_index = busi_id_lst.index(test_df.iloc[i, 1]) # business_id
        test_sparse_matrix[row_index, column_index] = ratings

    # calculate sparstiy of the matrix
    train_sparsity = 1 - np.count_nonzero(train_sparse_matrix)/ (train_sparse_matrix.shape[0] * train_sparse_matrix.shape[1])
    valid_sparsity = 1 - np.count_nonzero(valid_sparse_matrix)/ (valid_sparse_matrix.shape[0] * valid_sparse_matrix.shape[1])
    train_valid_sparsity = 1 - np.count_nonzero(train_valid_sparse_matrix)/ (train_valid_sparse_matrix.shape[0] * train_valid_sparse_matrix.shape[1])
    test_sparsity = 1 - np.count_nonzero(test_sparse_matrix)/ (test_sparse_matrix.shape[0] * test_sparse_matrix.shape[1])

    train_sparsity *= 100
    valid_sparsity *=100
    train_valid_sparse_matrix *= 100
    test_sparsity *= 100

    print (f'{len(user_id_lst)} users')
    print (f'{len(busi_id_lst)} business')

    print (f'Train_rating_matrix Sparsity: {round(train_sparsity,4)}%')
    print (f'Valid_rating_matrix Sparsity: {round(valid_sparsity,4)}%')
    print(f'Test_rating_matrix Sparsity:  {round(test_sparsity,4)}%')


    return train_sparse_matrix, valid_sparse_matrix, train_valid_sparse_matrix, test_sparse_matrix, \
           train_df, valid_df, train_valid_df, test_df, \
           user_id_lst, busi_id_lst

In [4]:
train_sparse_matrix, valid_sparse_matrix, train_valid_sparse_matrix, test_sparse_matrix, \
           train_df, valid_df, train_valid_df, test_df, \
           user_id_lst, busi_id_lst = train_valid_test_split(data=data, m=1, n=1)

19683 users
1728 business
Train_rating_matrix Sparsity: 99.4691%
Valid_rating_matrix Sparsity: 99.9427%
Test_rating_matrix Sparsity:  99.9427%


In [5]:
np.save('../data/train_sparse_matrix.npy', train_sparse_matrix)
np.save('../data/valid_sparse_matrix.npy', valid_sparse_matrix)
np.save('../data/test_sparse_matrix.npy', test_sparse_matrix)
np.save('../data/train_valid_sparse_matrix.npy', train_valid_sparse_matrix)

train_df.to_pickle('../data/train_df.pkl')
valid_df.to_pickle('../data/valid_df.pkl')
test_df.to_pickle('../data/test_df.pkl')
train_valid_df.to_pickle('../data/train_valid_df.pkl')

In [6]:
test_sparse_matrix.shape

(19683, 1728)

In [7]:
train_sparse_matrix.shape

(19683, 1728)

In [8]:
train_valid_sparse_matrix.shape

(19683, 1728)

# sparse to long format

In [9]:
def sparse_to_long_format(sparse_matrix):

    user_loc_lst = np.nonzero(sparse_matrix)[0]
    busi_loc_lst = np.nonzero(sparse_matrix)[1]
    
    prediction = [nlp_sparse[loc] for loc in zip(user_loc_lst, busi_loc_lst)]
    
    user_id = [user_id_lst[i] for i in user_loc_lst]
    busi_id = [busi_id_lst[i] for i in busi_loc_lst]
    
    long_format = pd.DataFrame({'user_id': user_id,
                               'busi_id': busi_id,
                               'prediction_ratings': prediction})
    
    return long_format

# long format to sparse

In [10]:
def long_format_to_sparse(data, pre_feature):
    '''
    @ pre_feature: the feature represents predict_ratings
    '''
    test_sparse_matrix = np.zeros(shape=(len(user_id_lst), len(busi_id_lst)))
    for i in range(len(data)):
        predict_col_index = data.columns.get_loc(pre_feature)
        predict_ratings = data.iloc[i, predict_col_index]
        row_index = user_id_lst.index(data.iloc[i, 0]) # user_id
        column_index = busi_id_lst.index(data.iloc[i, 1]) # business_id
        
        test_sparse_matrix[row_index, column_index] = predict_ratings
        
    return test_sparse_matrix

# Ensemble 

- Baseline model
- SGD(funk-svd)
- ALS(with regularization)
- SGD(svd+bias)
- CB
- tf-idf cos

In [11]:
nlp_long_format = pd.read_csv('../data/Predictions_CB_bus.csv')
nlp_sparse = long_format_to_sparse(nlp_long_format, 'prediction_ratings')
nlp_sparse

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
cb_tfidf_long_format = pd.read_csv('../data/Predictions_CB_tfidf.csv')
cb_tfidf_sparse = long_format_to_sparse(cb_tfidf_long_format, 'prediction_ratings')
cb_tfidf_sparse

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
cb_lsi_long_format = pd.read_csv('../data/Predictions_CB_LSI.csv')
# cb_lsi_long_format.head()
cb_lsi_sparse = long_format_to_sparse(cb_lsi_long_format, 'pred_lsi')
cb_lsi_sparse

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
baseline_long_format = pd.read_csv('../data/baseline_predict_df.csv')
baseline_long_format = baseline_long_format.loc[ : , ~baseline_long_format.columns.str.contains('Unnamed')]
baseline_sparse = long_format_to_sparse(baseline_long_format, 'predict')
baseline_sparse

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
sgd_long_format = pd.read_csv('../data/sgd_predict_df.csv')
sgd_long_format = sgd_long_format.loc[ : , ~sgd_long_format.columns.str.contains('Unnamed')]
sgd_sparse = long_format_to_sparse(sgd_long_format, 'predict')
sgd_sparse

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
# np.save('../data/test_prediction_CF_ALS.npy', prediction)

cf_als_sparse = np.load('../data/test_prediction_CF_ALS.npy')
rows = np.nonzero(test_sparse_matrix)[0]
cols = np.nonzero(test_sparse_matrix)[1]
cf_als_test = np.zeros(shape=(len(user_id_lst), len(busi_id_lst)))
loc_zip = zip(rows, cols)
for loc in loc_zip:
    cf_als_test[loc] = cf_als_sparse[loc]
cf_als_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
cf_sgd_sparse = np.load('../data/test_prediction_CF_SGD.npy')
rows = np.nonzero(test_sparse_matrix)[0]
cols = np.nonzero(test_sparse_matrix)[1]
cf_sgd_test = np.zeros(shape=(len(user_id_lst), len(busi_id_lst)))
loc_zip = zip(rows, cols)
for loc in loc_zip:
    cf_sgd_test[loc] = cf_sgd_sparse[loc]
cf_sgd_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# MSE & MAE

In [18]:
def get_mse(pred, actual):
        # Ignore nonzero terms.
        pred = pred[actual.nonzero()].flatten()
        actual = actual[actual.nonzero()].flatten()
        
        return mean_squared_error(pred, actual)

In [19]:
def get_mae(pred, actual):
        # Ignore nonzero terms.
        pred = pred[actual.nonzero()].flatten()
        actual = actual[actual.nonzero()].flatten()
        
        return mean_absolute_error(pred, actual)

In [36]:
nlp_mse = get_mse(nlp_sparse, valid_sparse_matrix)
nlp_mse

1.7866242990130192

In [37]:
nlp_mae = get_mae(nlp_sparse, valid_sparse_matrix)
nlp_mae

1.0084162683470375

In [38]:
cb_tfidf_mse = get_mse(cb_tfidf_sparse, valid_sparse_matrix)
cb_tfidf_mse

1.7902632606937827

In [39]:
cb_tfidf_mae = get_mae(cb_tfidf_sparse, valid_sparse_matrix)
cb_tfidf_mae

1.009961664148823

In [40]:
cb_lsi_mse = get_mse(cb_lsi_sparse, valid_sparse_matrix)
cb_lsi_mse

1.7904323585257838

In [41]:
cb_lsi_mae = get_mae(cb_lsi_sparse, valid_sparse_matrix)
cb_lsi_mae

1.0100283174474598

In [31]:
baseline_mse = get_mse(baseline_sparse, valid_sparse_matrix)
baseline_mse

1.436931792125898

In [67]:
baseline_mae = get_mae(baseline_sparse, valid_sparse_matrix)
baseline_mae

0.9584879141703012

In [32]:
sgd_mse = get_mse(sgd_sparse, valid_sparse_matrix)
sgd_mse

1.4163437783279094

In [44]:
sgd_mae = get_mae(sgd_sparse, valid_sparse_matrix)
sgd_mae

0.9420043986294402

In [42]:
cf_als_mse = get_mse(cf_als_test, valid_sparse_matrix)
cf_als_mse

17.037692307692307

In [43]:
cf_als_mae = get_mae(cf_als_test, valid_sparse_matrix)
cf_als_mae 

3.914820512820513

In [45]:
cf_sgd_mse = get_mse(cf_sgd_sparse, valid_sparse_matrix)
cf_sgd_mse

1.4178860388775774

In [46]:
cf_sgd_mae = get_mae(cf_sgd_sparse, valid_sparse_matrix)
cf_sgd_mae

0.9477884332870176

# Weight the models

In [27]:
# nlp + cb_tfidf + baseline + sgd + cf_als + cf_sgd

In [64]:
nlp = 1/ nlp_mse
cb_tfidf = 1/ cb_tfidf_mse
cb_lsi = 1/cb_lsi_mse
baseline = 1/ baseline_mse
sgd = 1/ sgd_mse
cf_als = 1/ cf_als_mse
cf_sgd = 1/ cf_sgd_mse

all_ = nlp + cb_tfidf + cb_lsi + cf_sgd+  baseline + sgd 

nlp = nlp/ all_
cb_tfidf = cb_tfidf/ all_
cb_lsi = cb_lsi/ all_
baseline = baseline/ all_
sgd = sgd/ all_
cf_als = cf_als/ all_
cf_sgd = cf_sgd/ all_


ensemble_matrix = \
nlp * nlp_sparse + cb_tfidf * cb_tfidf_sparse + cb_lsi * cb_lsi_sparse + cf_als * cf_als_sparse \
+ cf_sgd * cf_sgd_sparse + baseline * baseline_sparse + sgd * sgd_sparse 

In [65]:
ensemble_mse = get_mse(ensemble_matrix, valid_sparse_matrix)
ensemble_mse

1.48058483886994

In [66]:
ensemble_mae = get_mae(cf_sgd_sparse, valid_sparse_matrix)
ensemble_mae

0.9477884332870176

In [None]:
# cf_sgd + nlp

In [None]:
# cf_sgd = 1/ cf_sgd_mse
# nlp = 1/ nlp_mse
# all_ = cf_sgd + nlp
# cf_sgd = cf_sgd/ all_
# nlp = nlp/ all_

# ensemble_matrix = nlp * nlp_sparse + cf_sgd * sgd_sparse
# ensemble_mse = get_mse(ensemble_matrix, test_sparse_matrix)
# ensemble_mse