In [1]:
import numpy as np
import pandas as pd
import re
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, recall_score, classification_report,confusion_matrix
import glob
import os
import matplotlib.pyplot as plt

In [2]:
def _latest_file(pattern):
    """Return the most recent path matching the glob `pattern`.

    "Most recent" is decided by ``os.path.getctime`` (on Unix this is the time
    of the last metadata change, not the creation time).

    Raises:
        FileNotFoundError: if nothing matches `pattern`.
    """
    matches = glob.glob(pattern)
    if not matches:
        # max() over an empty list would raise an opaque ValueError instead.
        raise FileNotFoundError(f"No files match {pattern!r}")
    return max(matches, key=os.path.getctime)


# Pick up the newest CSV export in each split directory.
train_path = _latest_file('./data/train/*.csv')
test_path = _latest_file('./data/test/*.csv')
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print(train_df.shape)
print(test_df.shape)

(24756, 576)
(4369, 576)


In [3]:
# Print the first 25 column labels of each split so the two listings can be
# compared by eye (a full check would be: test_df.columns == train_df.columns).
for frame in (train_df, test_df):
    print(frame.columns[:25])

Index(['reward_tiers', 'min_reward', 'max_reward', 'goal', 'state',
       'staff_pick', 'has_video', 'rewards_0', 'rewards_1', 'rewards_2',
       'rewards_3', 'rewards_4', 'rewards_5', 'rewards_6', 'rewards_7',
       'rewards_8', 'rewards_9', 'rewards_10', 'rewards_11', 'rewards_12',
       'rewards_13', 'rewards_14', 'rewards_15', 'rewards_16', 'rewards_17'],
      dtype='object')
Index(['reward_tiers', 'min_reward', 'max_reward', 'goal', 'state',
       'staff_pick', 'has_video', 'rewards_0', 'rewards_1', 'rewards_2',
       'rewards_3', 'rewards_4', 'rewards_5', 'rewards_6', 'rewards_7',
       'rewards_8', 'rewards_9', 'rewards_10', 'rewards_11', 'rewards_12',
       'rewards_13', 'rewards_14', 'rewards_15', 'rewards_16', 'rewards_17'],
      dtype='object')


In [3]:
# Disabled list of columns to drop; per the original note these are
# features that are dependent on time and the final outcome:
# features_to_drop = ['rewards', 'deadline', 'launched_at', 'rewards_processed', 'created_at',
# 'description_processed', 'description_story_processed','description_risks_processed',
# 'id', 'name', 'description', 'description_story', 'description_risks', 'video', 'state',
# 'location', 'category']

# Separate the target column ('state') from the predictors, for each split.
X_train = train_df.drop(columns='state')
y_train = train_df['state']

X_test = test_df.drop(columns='state')
y_test = test_df['state']

In [4]:
# Stack the train and test splits back together (train rows first, then test).
# NOTE(review): row labels are kept as-is, so index values from the two
# splits may repeat in the combined objects.
X = pd.concat([X_train, X_test], axis=0)
y = pd.concat([y_train, y_test], axis=0)

In [5]:
# Class balance of the training split: rows with state == 1, then state == 0.
for label in (1, 0):
    print(train_df[train_df.state == label].shape)

# Shapes of the feature matrices and target vectors for both splits.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

(17482, 576)
(7274, 576)
(24756, 575)
(24756,)
(4369, 575)
(4369,)


In [6]:
# Make sure all data are of integer or float type:
# display the distinct dtypes found among the X_train columns.
X_train.dtypes.unique()

array([dtype('float64'), dtype('int64')], dtype=object)

In [12]:
# Baseline: XGBoost classifier with library-default hyperparameters,
# reporting AUC on the evaluation set after every boosting round.
model_xgboost_default = xgb.XGBClassifier(eval_metric='auc')

# Training stops once the evaluation AUC has not improved for 10 rounds.
# NOTE(review): the evaluation set here is the test split, so the stopping
# point is chosen using test data.
model_xgboost_default.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    verbose=True,
)

# Hard class predictions for both splits (train first, then test).
xgb_train_default_predict, xgb_test_default_predict = (
    model_xgboost_default.predict(features) for features in (X_train, X_test)
)



[0]	validation_0-auc:0.92330
[1]	validation_0-auc:0.93843
[2]	validation_0-auc:0.94577
[3]	validation_0-auc:0.95097
[4]	validation_0-auc:0.95267
[5]	validation_0-auc:0.95624
[6]	validation_0-auc:0.96052
[7]	validation_0-auc:0.96136
[8]	validation_0-auc:0.96290
[9]	validation_0-auc:0.96415
[10]	validation_0-auc:0.96495
[11]	validation_0-auc:0.96647
[12]	validation_0-auc:0.96779
[13]	validation_0-auc:0.96815
[14]	validation_0-auc:0.96919
[15]	validation_0-auc:0.96966
[16]	validation_0-auc:0.97024
[17]	validation_0-auc:0.97077
[18]	validation_0-auc:0.97111
[19]	validation_0-auc:0.97175
[20]	validation_0-auc:0.97224
[21]	validation_0-auc:0.97246
[22]	validation_0-auc:0.97237
[23]	validation_0-auc:0.97275
[24]	validation_0-auc:0.97314
[25]	validation_0-auc:0.97306
[26]	validation_0-auc:0.97325
[27]	validation_0-auc:0.97340
[28]	validation_0-auc:0.97365
[29]	validation_0-auc:0.97366
[30]	validation_0-auc:0.97377
[31]	validation_0-auc:0.97395
[32]	validation_0-auc:0.97392
[33]	validation_0-au

In [9]:
# Disabled experiment: a larger XGBoost configuration trained with the same
# early-stopping arguments as the default model, predicting on X_train.
# NOTE(review): the training log displayed under this cell presumably comes
# from a run made before the code was commented out.
# model_xgboost_complex = xgb.XGBClassifier(learning_rate=0.1,
#                                       max_depth=5,
#                                       n_estimators=5000,
#                                       subsample=0.5,
#                                       colsample_bytree=0.5,
#                                       eval_metric='auc',
#                                       verbosity=1)

# model_xgboost_complex.fit(X_train,
#                   y_train,
#                   early_stopping_rounds=10,
#                   eval_set=[(X_test, y_test)],
#                   verbose=True)
# xgb_complex_predict = model_xgboost_complex.predict(X_train)

[0]	validation_0-auc:0.85166
[1]	validation_0-auc:0.91481
[2]	validation_0-auc:0.92373
[3]	validation_0-auc:0.93388
[4]	validation_0-auc:0.93562
[5]	validation_0-auc:0.93624
[6]	validation_0-auc:0.94064
[7]	validation_0-auc:0.94356
[8]	validation_0-auc:0.94420
[9]	validation_0-auc:0.94361
[10]	validation_0-auc:0.94358
[11]	validation_0-auc:0.94443
[12]	validation_0-auc:0.94417
[13]	validation_0-auc:0.94540
[14]	validation_0-auc:0.94505
[15]	validation_0-auc:0.94838
[16]	validation_0-auc:0.95047
[17]	validation_0-auc:0.95008
[18]	validation_0-auc:0.95013
[19]	validation_0-auc:0.95049
[20]	validation_0-auc:0.95036
[21]	validation_0-auc:0.95041
[22]	validation_0-auc:0.95036
[23]	validation_0-auc:0.95048
[24]	validation_0-auc:0.95026
[25]	validation_0-auc:0.95276
[26]	validation_0-auc:0.95421
[27]	validation_0-auc:0.95578
[28]	validation_0-auc:0.95583
[29]	validation_0-auc:0.95619
[30]	validation_0-auc:0.95656
[31]	validation_0-auc:0.95677
[32]	validation_0-auc:0.95818
[33]	validation_0-au

**Evaluate Model Performance**

In [14]:
# Training-split diagnostics for the default model: confusion matrix first,
# then the per-class precision/recall/F1 report.
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_train, xgb_train_default_predict))

[[ 7000   274]
 [   74 17408]]
              precision    recall  f1-score   support

           0       0.99      0.96      0.98      7274
           1       0.98      1.00      0.99     17482

    accuracy                           0.99     24756
   macro avg       0.99      0.98      0.98     24756
weighted avg       0.99      0.99      0.99     24756



In [15]:
# Test-split diagnostics for the default model: confusion matrix first,
# then the per-class precision/recall/F1 report.
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test, xgb_test_default_predict))

[[1063  219]
 [  71 3016]]
              precision    recall  f1-score   support

           0       0.94      0.83      0.88      1282
           1       0.93      0.98      0.95      3087

    accuracy                           0.93      4369
   macro avg       0.93      0.90      0.92      4369
weighted avg       0.93      0.93      0.93      4369



In [16]:
# Column 1 of predict_proba is the predicted probability of an observation
# being class 1; AUC is computed from these scores rather than hard labels.
y_train_pred = model_xgboost_default.predict_proba(X_train)[:, 1]
y_test_pred = model_xgboost_default.predict_proba(X_test)[:, 1]

auc_train = roc_auc_score(y_train, y_train_pred)
auc_valid = roc_auc_score(y_test, y_test_pred)
print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(auc_train, auc_valid))

# Disabled variant kept from the original notebook:
# print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(recall_score(y_train, y_train_pred),
#                                                     recall_score(y_test, y_test_pred)))

AUC Train: 0.9992
AUC Valid: 0.9753


In [13]:
# Disabled: AUC report for model_xgboost_complex, whose training code is
# also commented out in this notebook.
# y_train_pred = model_xgboost_complex.predict_proba(X_train)[:,1]
# y_test_pred = model_xgboost_complex.predict_proba(X_test)[:,1] # Slicing to obtain prob of observation being 1

# print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(roc_auc_score(y_train, y_train_pred),
#                                                     roc_auc_score(y_test, y_test_pred)))

AUC Train: 0.9891
AUC Valid: 0.9750


**Hyperparameter Tuning**

We will use GridSearchCV for hyperparameter tuning.


In [17]:
# Candidate values for each hyperparameter explored by the grid search.
learning_rate_list = [0.02, 0.05, 0.1]
max_depth_list = [2, 3, 5]
n_estimators_list = [1000, 2000, 3000]

# Grid in the {parameter name: candidate list} form used as a param_grid.
params_dict = dict(
    learning_rate=learning_rate_list,
    max_depth=max_depth_list,
    n_estimators=n_estimators_list,
)

# Size of the grid = product of the number of candidates per hyperparameter.
num_combinations = 1
for candidates in params_dict.values():
    num_combinations = num_combinations * len(candidates)

print(num_combinations)
params_dict

27


{'learning_rate': [0.02, 0.05, 0.1],
 'max_depth': [2, 3, 5],
 'n_estimators': [1000, 2000, 3000]}

In [18]:
# Scorer wrapper around roc_auc_score
def auc_score(model, X, y):
    """Return the ROC AUC of `model` on (`X`, `y`).

    The score is computed from column 1 of ``model.predict_proba(X)``
    (the predicted probability of class 1) against the labels `y`.
    """
    positive_proba = model.predict_proba(X)[:, 1]
    return roc_auc_score(y, positive_proba)

# Hyperparameters held fixed during the search (they are not part of the grid).
fixed_estimator = xgb.XGBClassifier(
    subsample=0.5,
    colsample_bytree=0.25,
    eval_metric='auc',
    use_label_encoder=False,
)

# cv=2: half of the data is used for validation in every iteration.
# Candidates are ranked by the auc_score wrapper; train scores are kept too.
model_xgboost_hp = GridSearchCV(
    estimator=fixed_estimator,
    param_grid=params_dict,
    cv=2,
    scoring=auc_score,
    return_train_score=True,
    verbose=4,
)

# NOTE(review): the search is fit on X, y (train and test combined), so the
# test rows also take part in choosing the hyperparameters.
model_xgboost_hp.fit(X, y)

Fitting 2 folds for each of 27 candidates, totalling 54 fits
[CV 1/2] END learning_rate=0.02, max_depth=2, n_estimators=1000;, score=(train=0.977, test=0.968) total time=  21.4s


KeyboardInterrupt: 

In [16]:
# Summarize the grid search: rank, mean test/train score and the tuned
# hyperparameters of every candidate, best-ranked candidate first.
cv_summary_columns = [
    'rank_test_score', 'mean_test_score', 'mean_train_score',
    'param_learning_rate', 'param_max_depth', 'param_n_estimators',
]
df_cv_results = (
    pd.DataFrame(model_xgboost_hp.cv_results_)[cv_summary_columns]
    .sort_values(by='rank_test_score')
)
df_cv_results

Unnamed: 0,rank_test_score,mean_test_score,mean_train_score,param_learning_rate,param_max_depth,param_n_estimators
5,1,0.975185,0.998228,0.02,3,3000
4,2,0.974875,0.995068,0.02,3,2000
12,3,0.974697,0.996764,0.05,3,1000
8,4,0.97466,1.0,0.02,5,3000
2,5,0.974535,0.990758,0.02,2,3000
7,6,0.97452,0.999973,0.02,5,2000
13,7,0.974347,0.999809,0.05,3,2000
10,8,0.974261,0.995414,0.05,2,2000
14,9,0.974135,0.999999,0.05,3,3000
18,10,0.974088,0.995228,0.1,2,1000


## Final Model
Using the best parameters found by the grid search above (learning_rate=0.02, max_depth=3, n_estimators=3000).

In [18]:
# Final classifier: the top-ranked grid-search combination (learning_rate,
# max_depth, n_estimators) together with the settings held fixed in the search.
final_params = {
    'learning_rate': 0.02,
    'max_depth': 3,
    'n_estimators': 3000,
    'subsample': 0.5,
    'colsample_bytree': 0.25,
    'eval_metric': 'auc',
    'verbosity': 1,
    'use_label_encoder': False,
}
model_xgboost_fin = xgb.XGBClassifier(**final_params)

# Both the training and the validation data are passed so that AUC is
# recorded for each of them (needed to plot the two curves).
eval_set = [(X_train, y_train), (X_test, y_test)]

model_xgboost_fin.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    early_stopping_rounds=20,
    verbose=True,
)



[0]	validation_0-auc:0.79799	validation_1-auc:0.80196
[1]	validation_0-auc:0.90261	validation_1-auc:0.90683
[2]	validation_0-auc:0.91192	validation_1-auc:0.91179
[3]	validation_0-auc:0.91154	validation_1-auc:0.91270
[4]	validation_0-auc:0.91492	validation_1-auc:0.91528
[5]	validation_0-auc:0.91704	validation_1-auc:0.91609
[6]	validation_0-auc:0.91929	validation_1-auc:0.91835
[7]	validation_0-auc:0.92176	validation_1-auc:0.92103
[8]	validation_0-auc:0.92356	validation_1-auc:0.92308
[9]	validation_0-auc:0.92479	validation_1-auc:0.92398
[10]	validation_0-auc:0.92460	validation_1-auc:0.92372
[11]	validation_0-auc:0.92433	validation_1-auc:0.92337
[12]	validation_0-auc:0.92444	validation_1-auc:0.92324
[13]	validation_0-auc:0.92561	validation_1-auc:0.92274
[14]	validation_0-auc:0.92556	validation_1-auc:0.92262
[15]	validation_0-auc:0.92590	validation_1-auc:0.92290
[16]	validation_0-auc:0.92695	validation_1-auc:0.92406
[17]	validation_0-auc:0.92721	validation_1-auc:0.92446
[18]	validation_0-au

In [20]:
# Feature names to pair with the model's importance values; the target
# column 'state' is excluded if present.
var_colums = [col for col in X_train.columns if col != 'state']

# One row per feature, most important first; show the top ten.
importance_table = pd.DataFrame(
    {"Feature": var_colums,
     "Importance": model_xgboost_fin.feature_importances_}
)
df_var_imp = importance_table.sort_values(by='Importance', ascending=False)
df_var_imp.head(10)

Unnamed: 0,Variable,Importance
4,staff_pick,0.032358
7,rewards_1,0.031856
9,rewards_3,0.03106
6,rewards_0,0.030741
82,rewards_76,0.019662
104,rewards_98,0.018016
37,rewards_31,0.016567
70,rewards_64,0.016051
105,rewards_99,0.014675
416,rewards_word_count,0.014463


In [4]:
# Inspect the raw values of the rewards_1 column in the training frame.
train_df['rewards_1']

0        0.000000
1        0.069940
2        0.077307
3        0.014744
4        0.000000
           ...   
24751    0.094775
24752    0.034807
24753    0.026905
24754    0.034975
24755    0.173234
Name: rewards_1, Length: 24756, dtype: float64