In [1]:
import pandas as pd
import numpy as np

In [2]:
# this data is based on the output of 1.0.0_Data_Filtering
data = pd.read_csv('../data/filtered_reviews_in_Phonex.csv')
data.head(3)


Unnamed: 0.1,Unnamed: 0,review_id,user_id,business_id,stars,date,business_name,city,state,categories
0,1,aBWKb49Xfkv1946YN7_SIg,sSPbLBHcEMXaJfoO8zs1bA,poSV39UqEg-gpESXafS9-g,5.0,2016-01-17 05:33:14,Angry Crab Shack,Phoenix,AZ,"Restaurants, American (New), Seafood, Cajun/Cr..."
1,2,jCHaWXgppmZjkOdpFltWGA,D5ywfFmwtJxLReqAYlDDmw,poSV39UqEg-gpESXafS9-g,5.0,2016-01-30 01:13:29,Angry Crab Shack,Phoenix,AZ,"Restaurants, American (New), Seafood, Cajun/Cr..."
2,3,KvJ8yT-dODmCYe21J-Pp_A,gjoN4q-f61kwjmmU4mm1_g,poSV39UqEg-gpESXafS9-g,2.0,2016-03-30 22:28:36,Angry Crab Shack,Phoenix,AZ,"Restaurants, American (New), Seafood, Cajun/Cr..."


In [3]:
def train_valid_test_split(data, m, n):

    '''
    construct rating matrix from data
    the columns of which represent business_id
    the rows of which represent user_id
    the values of whose elements represent the according ratings
    @ data: filterd_reviews 
    @ m: counts of ratings for validation
    @ n: counts of ratings for test
    '''

    # to construct sparse matrix
    # train
    train_user_id = []
    train_business_id = []
    train_stars = []
    # validation
    valid_user_id = []
    valid_business_id = []
    valid_stars = []
    # test
    test_user_id = []
    test_business_id = []
    test_stars = []
    
    user_id_lst = data['user_id'].unique().tolist() # rows of sparse matrix
    busi_id_lst = data['business_id'].unique().tolist() # columns of sparse matrix

    train_sparse_matrix = np.zeros(shape=(len(user_id_lst), len(busi_id_lst)))
    valid_sparse_matrix = np.zeros(shape=(len(user_id_lst), len(busi_id_lst)))
    test_sparse_matrix = np.zeros(shape=(len(user_id_lst), len(busi_id_lst)))

    ranking_df = data[['user_id','business_id','stars','date']].groupby(['user_id'])
    
    for group_name, group_df in ranking_df:
        group_df.sort_values(by='date')

        # if the len(group_df) > valid_m + test_n, split the group_df as 
        # training set : group_df.iloc[:len(group_df)-m-n, :]
        # validation set : group_df.iloc[len(group_df)-m-n:len(group_df)-n, :]
        # test set : group_df.iloc[len(group_df)-n:, :]

        # otherwise, not split the group_df
        # keep the group_df as training set

        if len(group_df) > m+n: 

            training_set = group_df.iloc[:len(group_df)-m-n, :]
            train_user_id.extend(training_set.loc[:,'user_id'].tolist()) 
            train_business_id.extend(training_set.loc[:,'business_id'].tolist())
            train_stars.extend(training_set.loc[:,'stars'].tolist())

            validation_set = group_df.iloc[len(group_df)-m-n:len(group_df)-n, :]
            valid_user_id.extend(validation_set.loc[:,'user_id'].tolist()) 
            valid_business_id.extend(validation_set.loc[:,'business_id'].tolist())
            valid_stars.extend(validation_set.loc[:,'stars'].tolist())

            testing_set = group_df.iloc[len(group_df)-n:, :]
            test_user_id.extend(testing_set.loc[:,'user_id'].tolist()) 
            test_business_id.extend(testing_set.loc[:,'business_id'].tolist())
            test_stars.extend(testing_set.loc[:,'stars'].tolist())

        else:
            training_set = group_df
            train_user_id.extend(training_set.loc[:,'user_id'].tolist()) 
            train_business_id.extend(training_set.loc[:,'business_id'].tolist())
            train_stars.extend(training_set.loc[:,'stars'].tolist())

    train_df = pd.DataFrame({'user_id': train_user_id, 'business_id': train_business_id, 'stars': train_stars})
    valid_df = pd.DataFrame({'user_id': valid_user_id, 'business_id': valid_business_id, 'stars': valid_stars})
    test_df = pd.DataFrame({'user_id': test_user_id, 'business_id': test_business_id, 'stars': test_stars})


    for i in range(len(train_df)):
        ratings = train_df.iloc[i, 2] # stars
        row_index = user_id_lst.index(train_df.iloc[i, 0]) # user_id
        column_index = busi_id_lst.index(train_df.iloc[i, 1]) # business_id
        train_sparse_matrix[row_index, column_index] = ratings

    for i in range(len(valid_df)):
        ratings = valid_df.iloc[i, 2] # stars
        row_index = user_id_lst.index(valid_df.iloc[i, 0]) # user_id
        column_index = busi_id_lst.index(valid_df.iloc[i, 1]) # business_id
        valid_sparse_matrix[row_index, column_index] = ratings

    for i in range(len(test_df)):
        ratings = test_df.iloc[i, 2] # stars
        row_index = user_id_lst.index(test_df.iloc[i, 0]) # user_id
        column_index = busi_id_lst.index(test_df.iloc[i, 1]) # business_id
        test_sparse_matrix[row_index, column_index] = ratings

    # calculate sparstiy of the matrix
    train_sparsity = 1 - np.count_nonzero(train_sparse_matrix)/ (train_sparse_matrix.shape[0] * train_sparse_matrix.shape[1])
    valid_sparsity = 1 - np.count_nonzero(valid_sparse_matrix)/ (valid_sparse_matrix.shape[0] * valid_sparse_matrix.shape[1])
    test_sparsity = 1 - np.count_nonzero(test_sparse_matrix)/ (test_sparse_matrix.shape[0] * test_sparse_matrix.shape[1])

    train_sparsity *= 100
    valid_sparsity *=100
    test_sparsity *= 100

    print (f'{len(user_id_lst)} users')
    print (f'{len(busi_id_lst)} business')

    print (f'Train_rating_matrix Sparsity: {round(train_sparsity,4)}%')
    print (f'Valid_rating_matrix Sparsity: {round(valid_sparsity,4)}%')
    print(f'Test_rating_matrix Sparsity:  {round(test_sparsity,4)}%')


    return train_sparse_matrix, valid_sparse_matrix, test_sparse_matrix, train_df, valid_df, test_df

In [4]:
train_sparse_matrix, valid_sparse_matrix, test_sparse_matrix, train_df, valid_df, test_df = train_valid_test_split(data=data, m=1, n=1)

20472 users
1782 business
Train_rating_matrix Sparsity: 99.4921%
Valid_rating_matrix Sparsity: 99.9444%
Test_rating_matrix Sparsity:  99.9444%


In [5]:
np.save('train_sparse_matrix.npy', train_sparse_matrix)
np.save('valid_sparse_matrix.npy', valid_sparse_matrix)
np.save('test_sparse_matrix.npy', test_sparse_matrix)

In [6]:
train_df

Unnamed: 0,user_id,business_id,stars
0,--2HUmLkcNHZp0xw6AMBPg,zidkKI_N1OPxsiddTOQH_Q,5.0
1,--2HUmLkcNHZp0xw6AMBPg,YOD9dXrnpu8HTRILpF0onw,5.0
2,--2HUmLkcNHZp0xw6AMBPg,YOD9dXrnpu8HTRILpF0onw,5.0
3,--2HUmLkcNHZp0xw6AMBPg,uTCOEqjuVAXUOzti5TWj2Q,5.0
4,--2HUmLkcNHZp0xw6AMBPg,APXWKd1N-COyUdncd_FdyQ,5.0
...,...,...,...
197092,zzO9aMo33jA3pPv8SoYskw,EhplcymNbSX5TvPgGilL7Q,5.0
197093,zzO9aMo33jA3pPv8SoYskw,pDewiJY6KCcZgLxxgxg13Q,5.0
197094,zzO9aMo33jA3pPv8SoYskw,ylxqmxh2gO1yCpQkIk6o3A,5.0
197095,zzO9aMo33jA3pPv8SoYskw,5eK_pgro9_LxPYDoRVJnEA,2.0


In [7]:
train_df.to_pickle('../data/train_df.pkl')
valid_df.to_pickle('../data/valid_df.pkl')
test_df.to_pickle('../data/test_df.pkl')