In [1]:
##### MODELING #####

In [2]:
print('Starting modeling...')

Starting modeling...


In [3]:
### import libraries
import numpy as np
import pandas as pd
from datetime import datetime,date
import csv

start_time = datetime.now()
print('Start time: ', start_time)

Start time:  2019-02-26 20:43:18.077215


In [4]:
#### SELECT INPUT FILE

In [5]:
input_file = '3_day_sample_preprocessed_with_additional_features.tsv.gz'
#input_file = '6_week_sample_preprocessed_with_additional_features.tsv.gz'
#input_file = '12_week_sample_preprocessed_with_additional_features.tsv.gz'
#input_file = '25_week_sample_preprocessed_with_additional_features.tsv.gz'

print('Input file selected: ', input_file)

Input file selected:  3_day_sample_preprocessed_with_additional_features.tsv.gz


In [6]:
##### LOAD DATA
print('Loading data...')

Loading data...


In [7]:
df = pd.read_csv('../data/processed_data/'+input_file, compression='gzip', sep='\t', low_memory=False, encoding='iso-8859-1', parse_dates=['hit_time_gmt', 
                                                                                                                                           'last_hit_time_gmt_visit',
                                                                                                                                           'date_time', 
                                                                                                                                           'last_date_time_visit'])

print('Loading data complete.')

Loading data complete.


In [8]:
##### DESCRIPTIVES
print('Calculating descriptives...')

Calculating descriptives...


In [9]:
descriptives_dict = {'sample size (days)' : len(df['date_time'].apply(lambda x: x.date()).unique()),
                     'unique visitors' : df['visitor_id'].nunique(),
                     'visits' : df.shape[0],
                     'features' : df.shape[1],
                     'conversion rate' : round(df['purchase'].value_counts()[1]/(len(df['purchase'])), 4)}
print('Sample descriptives: ', descriptives_dict)

descriptives_dict_file = '3_day_sample_descriptives.txt'
#descriptives_dict_file = '6_week_sample_descriptives.txt'
#descriptives_dict_file = '12_week_sample_descriptives.txt'
#descriptives_dict_file = '25_week_sample_descriptives.txt'

f = open('../data/descriptives/'+descriptives_dict_file, 'w')
f.write(str(descriptives_dict))
f.close()

print('Calculating descriptives complete.')

Sample descriptives:  {'sample size (days)': 3, 'unique visitors': 188746, 'visits': 214009, 'features': 142, 'conversion rate': 0.0077000000000000002}
Calculating descriptives complete.


In [10]:
##### PREPARE DATA FOR MODELING

In [11]:
print('Preparing data for modeling...')

Preparing data for modeling...


In [12]:
# drop columns that have many missing values, are static or where their usefulness is unclear
cols_to_drop = ['visitor_id_lag',
                'hit_time_gmt',
                'last_hit_time_gmt_visit', 
                'last_hit_time_gmt_visit_lag',
                'last_date_time_visit',
                'days_since_last_visit',
                'purchase_date',
                'purchase_date_lag',
                'days_since_last_purchase',
                'country', 
                'geo_region',
                'geo_city',
                'geo_zip',
                'geo_dma',
                'search_page_num',
                'net_promoter_score_raw_(v10)_-_user',
                'registration_(any_form)_(e20)',
                'hit_of_logged_in_user_(e23)', # duplicate of login_status
                'newsletter_signup_(any_form)_(e26)', 
                'newsletter_subscriber_(e27)', 
                'user_gender_(v61)',
                'user_age_(v62)',
                'login_success_(e72)', 
                'logout_success_(e73)', 
                'login_fail_(e74)', 
                'registration_fail_(e75)',
                'product_categories_level_1',
                'product_categories_level_2',
                'product_categories_level_3', 
                'user_agent',
                'connection_type',
                'browser',
                'operating_system_generalized',
                'search_engine_generalized',
                'marketing_channel',
                'referrer_type']

for col in cols_to_drop:
    if col in df.columns:
        df.drop(col, axis=1, inplace=True)
    else:
        pass

In [13]:
### generate training set from 2/3 of the data
y_train = df[df['date_time'] <= '2016-05-10 23:59:59']['purchase']
X_train = df[df['date_time'] <= '2016-05-10 23:59:59'].copy()
X_train = X_train.reset_index(drop=True)

train_descriptives_dict = {'days for training': len(X_train['date_time'].apply(lambda x: x.date()).unique()),
                           'unique visitors' : X_train['visitor_id'].nunique(),
                           'visits' : X_train.shape[0],
                           'features' : X_train.shape[1] - 3,
                           'conversion rate' : round(y_train.value_counts()[1]/(len(y_train)), 4)}
X_train.drop(['purchase', 'date_time', 'visitor_id'], axis=1, inplace=True)
print('Descriptives training set: ', train_descriptives_dict)

train_descriptives_dict_file = '3_day_sample_train_descriptives.txt'
#train_descriptives_dict_file = '6_week_sample_train_descriptives.txt'
#train_descriptives_dict_file = '12_week_sample_train_descriptives.txt'
#train_descriptives_dict_file = '25_week_sample_train_descriptives.txt'

f = open('../data/descriptives/'+train_descriptives_dict_file, 'w')
f.write(str(descriptives_dict))
f.close()

Descriptives training set:  {'days for training': 2, 'unique visitors': 134912, 'visits': 150771, 'features': 103, 'conversion rate': 0.0071999999999999998}


In [14]:
### generate test set from 1/3 of the data
y_test = df[df['date_time'] > '2016-05-10 23:59:59']['purchase']
X_test = df[df['date_time'] > '2016-05-10 23:59:59'].copy()
X_test = X_test.reset_index(drop=True)

test_descriptives_dict = {'days for training': len(X_test['date_time'].apply(lambda x: x.date()).unique()),
                           'unique visitors' : X_test['visitor_id'].nunique(),
                           'visits' : X_test.shape[0],
                           'features' : X_test.shape[1] - 3,
                           'conversion rate' : round(y_test.value_counts()[1]/(len(y_test)), 4)}
X_test.drop(['purchase', 'date_time', 'visitor_id'], axis=1, inplace=True)
print('Descriptives test set: ', test_descriptives_dict)

test_descriptives_dict_file = '3_day_sample_test_descriptives.txt'
#test_descriptives_dict_file = '6_week_sample_test_descriptives.txt'
#test_descriptives_dict_file = '12_week_sample_test_descriptives.txt'
#test_descriptives_dict_file = '25_week_sample_test_descriptives.txt'

f = open('../data/descriptives/'+test_descriptives_dict_file, 'w')
f.write(str(descriptives_dict))
f.close()

Descriptives test set:  {'days for training': 1, 'unique visitors': 59070, 'visits': 63238, 'features': 103, 'conversion rate': 0.0088000000000000005}


In [15]:
print('Preparing data for modeling complete.')

Preparing data for modeling complete.


In [16]:
##### TRAIN, TEST AND EVALUATE MODELS
print('Starting training, testing and evaluating models...')

Starting training, testing and evaluating models...


In [17]:
### import libraries for modeling and performance evaluation
from sklearn.linear_model import LogisticRegression
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.naive_bayes import GaussianNB
#from sklearn.svm import SVC
#from sklearn.ensemble import RandomForestClassifier

#from sklearn.model_selection import KFold,cross_val_score
from sklearn.metrics import accuracy_score,roc_curve,auc,confusion_matrix,classification_report

In [18]:
### build models, do 10-fold cross validation and evaluate each model in turn
models = []
models.append(('LR', LogisticRegression()))
#models.append(('KNN', KNeighborsClassifier()))
#models.append(('CART', DecisionTreeClassifier()))
#models.append(('NB', GaussianNB()))
#models.append(('SVM', SVC()))
#models.append(('RF', RandomForestClassifier()))

In [19]:
### train, test and evaluate each model in turn
for name, model in models:
    
    # train
    print('Training ', name, '...')
    training_start_time = datetime.now()
    model.fit(X_train, y_train)
    training_duration = (datetime.now() - training_start_time)
    print('Training ', name, 'complete, training_duration: ', training_duration)
    
    # test
    print('Testing ', name, '...')
    test_start_time = datetime.now()
    y_pred = model.predict(X_test)
    test_duration = datetime.now() - test_start_time
    print('Testing ', name, 'complete, test_duration: ', test_duration)
    
    # evaluate
    print('Evaluating ', name, '...')
    accuracy = accuracy_score(y_test, y_pred)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    auc_score = auc(fpr, tpr)
    true_negatives = confusion_matrix(y_test, y_pred)[0,0]
    false_negatives = confusion_matrix(y_test, y_pred)[1,0]
    true_positives = confusion_matrix(y_test, y_pred)[1,1]
    false_positives = confusion_matrix(y_test, y_pred)[0,1]
    precision = true_positives/(true_positives+false_positives)
    recall = true_positives/(true_positives+false_negatives)
    f1_score = 2*((precision*recall)/(precision+recall))

    current_model = name
    if current_model == 'LR':
        
        lr_output_dict = {'model' : 'LR',
                          'training duration' : training_duration,
                          'test duration' : test_duration, 
                          'accuracy' : accuracy, 
                          'auc' : auc_score, 
                          'true negatives' : true_negatives, 
                          'false negatives' : false_negatives, 
                          'true positives' : true_positives, 
                          'false positives' : false_positives, 
                          'precision' : precision, 
                          'recall' : recall, 
                          'f1 score' : f1_score}    
        
        print(lr_output_dict)
        
        f = open('../data/model_output/lr_output.txt', 'w')
        f.write(str(lr_output_dict))
        f.close()
        
    else:
        pass

    print('Evaluating ', name, 'complete.')

Training  LR ...




Training  LR complete, training_duration:  0:00:16.773270
Testing  LR ...
Testing  LR complete, test_duration:  0:00:00.093745
Evaluating  LR ...
{'model': 'LR', 'training duration': datetime.timedelta(seconds=16, microseconds=773270), 'test duration': datetime.timedelta(microseconds=93745), 'accuracy': 0.99011670198298496, 'auc': 0.53587683242869855, 'true negatives': 62572, 'false negatives': 517, 'true positives': 41, 'false positives': 108, 'precision': 0.27516778523489932, 'recall': 0.073476702508960573, 'f1 score': 0.11598302687411596}
Evaluating  LR complete.


In [20]:
##### RESAMPLE DATA DUE TO CLASS IMBALANCE USING SMOTENC
print('Starting resampling data using SMOTENC...')

Starting resampling data using SMOTENC...


In [21]:
from imblearn.over_sampling import SMOTENC

resampling_start_time = datetime.now()
sm = SMOTENC(random_state=42, categorical_features=[17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
                                                    35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
                                                    53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
                                                    71, 72, 73, 74, 75, 76, 77, 78, 79, 81, 82, 85, 86])
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
resampling_duration = (datetime.now() - resampling_start_time)

resampled_train_descriptives_dict = {'resampling duration' : resampling_duration,
                                     'original training set size' : X_train.shape[0], 
                                     'resampled training set size' : X_train_res.shape[0],
                                     'original conversion rate' : round(y_train.value_counts()[1]/(len(y_train)), 4),
                                     'new conversion rate' : round(y_train_res.sum()/(len(y_train_res)), 4)}
print(resampled_train_descriptives_dict)

resampled_train_descriptives_dict_file = '3_day_sample_resampled_train_descriptives.txt'
#resampled_train_descriptives_dict_file = '6_week_sample_resampled_train_descriptives.txt'
#resampled_train_descriptives_dict_file = '12_week_sample_resampled_train_descriptives.txt'
#resampled_train_descriptives_dict_file = '25_week_sample_resampled_train_descriptives.txt'

f = open('../data/descriptives/'+resampled_train_descriptives_dict_file, 'w')
f.write(str(resampled_train_descriptives_dict))
f.close()

print('Resampling complete.')

{'resampling duration': datetime.timedelta(seconds=1746, microseconds=548441), 'original training set size': 150771, 'resampled training set size': 299382, 'original conversion rate': 0.0071999999999999998, 'new conversion rate': 0.5}
Resampling complete.


In [22]:
print('Reevaluating models...')

Reevaluating models...


In [23]:
### train, test and evaluate each model in turn using the resampled training set
for name, model in models:
    
    # train
    print('Training ', name, '...')
    training_start_time = datetime.now()
    model.fit(X_train_res, y_train_res)
    training_duration = (datetime.now() - training_start_time)
    print('Training ', name, 'complete, training_duration: ', training_duration)
    
    # test
    print('Testing ', name, '...')
    test_start_time = datetime.now()
    y_pred_res = model.predict(X_test)
    test_duration = datetime.now() - test_start_time
    print('Testing ', name, 'complete, test_duration: ', test_duration)
    
    # evaluate
    print('Evaluating ', name, '...')
    accuracy = accuracy_score(y_test, y_pred_res)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_res)
    auc_score = auc(fpr, tpr)
    true_negatives = confusion_matrix(y_test, y_pred_res)[0,0]
    false_negatives = confusion_matrix(y_test, y_pred_res)[1,0]
    true_positives = confusion_matrix(y_test, y_pred_res)[1,1]
    false_positives = confusion_matrix(y_test, y_pred_res)[0,1]
    precision = true_positives/(true_positives+false_positives)
    recall = true_positives/(true_positives+false_negatives)
    f1_score = 2*((precision*recall)/(precision+recall))

    current_model = name
    if current_model == 'LR':
        
        lr_res_output_dict = {'model' : 'LR resampled',
                              'training duration' : training_duration,
                              'test duration' : test_duration, 
                              'accuracy' : accuracy, 
                              'auc' : auc_score, 
                              'true negatives' : true_negatives, 
                              'false negatives' : false_negatives, 
                              'true positives' : true_positives, 
                              'false positives' : false_positives, 
                              'precision' : precision, 
                              'recall' : recall, 
                              'f1 score' : f1_score}    
        
        print(lr_res_output_dict)
        
        f = open('../data/model_output/lr_res_output.txt', 'w')
        f.write(str(lr_res_output_dict))
        f.close()
        
    else:
        pass

    print('Evaluating ', name, 'complete.')

Training  LR ...




Training  LR complete, training_duration:  0:00:36.348734
Testing  LR ...
Testing  LR complete, test_duration:  0:00:00.234365
Evaluating  LR ...
{'model': 'LR resampled', 'training duration': datetime.timedelta(seconds=36, microseconds=348734), 'test duration': datetime.timedelta(microseconds=234365), 'accuracy': 0.98293747430342515, 'auc': 0.9780715839457631, 'true negatives': 61616, 'false negatives': 15, 'true positives': 543, 'false positives': 1064, 'precision': 0.33789670192906035, 'recall': 0.9731182795698925, 'f1 score': 0.50161662817551966}
Evaluating  LR complete.


In [24]:
print('Reevaluating models complete.')

Reevaluating models complete.


In [25]:
##### HYPERPARAMETER TUNING USING RANDOM SEARCH
print('Starting hyperparameter tuning using RandomSearch...')

Starting hyperparameter tuning using RandomSearch...


In [26]:
from sklearn.model_selection import RandomizedSearchCV

In [27]:
print('Tuning logistic regression with original training data...')

lr = LogisticRegression()
params = {'C' : np.arange(0.0001, 1000, 10)}
grid = RandomizedSearchCV(lr, params)

tuning_start_time = datetime.now()
grid.fit(X_train, y_train)
tuning_duration = (datetime.now() - training_start_time)

evaluation_start_time = datetime.now()
y_pred_tuned = grid.best_estimator_.predict(X_test)
evaluation_duration = (datetime.now() - evaluation_start_time)

accuracy = accuracy_score(y_test, y_pred_tuned)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_tuned)
auc_score = auc(fpr, tpr)
true_negatives = confusion_matrix(y_test, y_pred_tuned)[0,0]
false_negatives = confusion_matrix(y_test, y_pred_tuned)[1,0]
true_positives = confusion_matrix(y_test, y_pred_tuned)[1,1]
false_positives = confusion_matrix(y_test, y_pred_tuned)[0,1]
precision = true_positives/(true_positives+false_positives)
recall = true_positives/(true_positives+false_negatives)
f1_score = 2*((precision*recall)/(precision+recall))

lr_tuned_output_dict = {'model' : 'LR tuned', 
                        'default params' : lr,
                        'optimal params' : grid.best_params_,
                        'tuning duration' : tuning_duration,
                        'evaluation duration' : evaluation_duration, 
                        'accuracy' : accuracy, 
                        'auc' : auc_score, 
                        'true negatives' : true_negatives, 
                        'false negatives' : false_negatives, 
                        'true positives' : true_positives, 
                        'false positives' : false_positives, 
                        'precision' : precision, 
                        'recall' : recall, 
                        'f1 score' : f1_score}    

print(lr_tuned_output_dict)

f = open('../data/model_output/lr_tuned_output.txt', 'w')
f.write(str(lr_tuned_output_dict))
f.close()

print('Tuning logistic regression with original training data complete.')

Tuning logistic regression with original training data...






{'model': 'LR tuned', 'default params': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False), 'optimal params': {'C': 720.00009999999997}, 'tuning duration': datetime.timedelta(seconds=405, microseconds=252638), 'evaluation duration': datetime.timedelta(microseconds=93748), 'accuracy': 0.99011670198298496, 'auc': 0.53410067178568732, 'true negatives': 62574, 'false negatives': 519, 'true positives': 39, 'false positives': 106, 'precision': 0.26896551724137929, 'recall': 0.069892473118279563, 'f1 score': 0.11095305832147936}
Tuning logistic regression with original training data complete.


In [28]:
print('Tuning logistic regression with resampled training data...')

lr = LogisticRegression()
params = {'C' : np.arange(0.0001, 1000, 10)}
grid = RandomizedSearchCV(lr, params)

tuning_start_time = datetime.now()
grid.fit(X_train_res, y_train_res)
tuning_duration = (datetime.now() - training_start_time)

evaluation_start_time = datetime.now()
y_pred_res_tuned = grid.best_estimator_.predict(X_test)
evaluation_duration = (datetime.now() - evaluation_start_time)

accuracy = accuracy_score(y_test, y_pred_res_tuned)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_res_tuned)
auc_score = auc(fpr, tpr)
true_negatives = confusion_matrix(y_test, y_pred_res_tuned)[0,0]
false_negatives = confusion_matrix(y_test, y_pred_res_tuned)[1,0]
true_positives = confusion_matrix(y_test, y_pred_res_tuned)[1,1]
false_positives = confusion_matrix(y_test, y_pred_res_tuned)[0,1]
precision = true_positives/(true_positives+false_positives)
recall = true_positives/(true_positives+false_negatives)
f1_score = 2*((precision*recall)/(precision+recall))

lr_res_tuned_output_dict = {'model' : 'LR resampled tuned',
                            'default params' : lr,
                            'optimal params' : grid.best_params_,
                            'tuning duration' : tuning_duration,
                            'evaluation duration' : evaluation_duration,  
                            'accuracy' : accuracy, 
                            'auc' : auc_score, 
                            'true negatives' : true_negatives, 
                            'false negatives' : false_negatives, 
                            'true positives' : true_positives, 
                            'false positives' : false_positives, 
                            'precision' : precision, 
                            'recall' : recall, 
                            'f1 score' : f1_score}    

print(lr_res_tuned_output_dict)

f = open('../data/model_output/lr_res_tuned_output.txt', 'w')
f.write(str(lr_res_tuned_output_dict))
f.close()

print('Tuning logistic regression with resampled training data complete.')

Tuning logistic regression with resampled training data...






{'model': 'LR resampled tuned', 'default params': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False), 'optimal params': {'C': 930.00009999999997}, 'tuning duration': datetime.timedelta(seconds=1077, microseconds=76141), 'evaluation duration': datetime.timedelta(microseconds=109372), 'accuracy': 0.97696005566273447, 'auc': 0.96262314355444845, 'true negatives': 61252, 'false negatives': 29, 'true positives': 529, 'false positives': 1428, 'precision': 0.27031170158405721, 'recall': 0.94802867383512546, 'f1 score': 0.42067594433399602}
Tuning logistic regression with resampled training data complete.


In [29]:
print('Hyperparameter tuning complete.')

Hyperparameter tuning complete.


In [30]:
print('Modeling complete.')
print('Run time: ', datetime.now() - start_time)

Modeling complete.
Run time:  0:48:03.482051
