In [21]:
import numpy as np
import pandas as pd
import re
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
import glob
import os
import matplotlib.pyplot as plt

In [30]:
def _latest_csv(pattern):
    """Return the path matching `pattern` with the largest ``os.path.getctime``.

    Args:
        pattern: glob pattern selecting the candidate files.

    Returns:
        The matching path whose ctime is greatest.

    Raises:
        FileNotFoundError: if nothing matches `pattern`. Without this check
            ``max()`` fails with an opaque "arg is an empty sequence" error.
    """
    matches = glob.glob(pattern)
    if not matches:
        raise FileNotFoundError("No files match {!r}".format(pattern))
    # NOTE: getctime is the creation time on Windows but the time of the last
    # metadata change on Unix, so "latest" is platform dependent.
    return max(matches, key=os.path.getctime)


# Use the most recent CSV export found in each data directory.
train_path = _latest_csv('./data/train/*.csv')
test_path = _latest_csv('./data/test/*.csv')
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print(train_df.shape)
print(test_df.shape)

(1557, 563)
(293, 563)


In [43]:
# Columns excluded from the model inputs. The list includes 'state', which is
# the target column extracted separately below.
features_to_drop = [
    'rewards', 'deadline', 'launched_at', 'rewards_processed',
    'description_processed', 'description_story_processed', 'description_risks_processed',
    'id', 'name', 'description', 'description_story', 'description_risks', 'video', 'state',
    'pledged', 'category', 'location',
]

# Features that depend on time and on the final outcome are dropped as well.
to_drop_more = [
    *features_to_drop,
    'staff_pick', 'spotlight', 'backers_count', 'update_count', 'faq_count',
]

# Feature matrices keep every remaining column; the label is the 'state' column.
X_train = train_df.drop(columns=to_drop_more)
y_train = train_df['state']
X_test = test_df.drop(columns=to_drop_more)
y_test = test_df['state']


In [53]:
# Combine the train and test sets together: train rows first, then test rows.
# Original row labels are kept (no ignore_index), so index labels may repeat.
X = pd.concat([X_train, X_test], axis=0)
y = pd.concat([y_train, y_test], axis=0)

In [44]:
# Class balance check: number of training rows for each outcome class.
is_positive = train_df['state'] == 1
is_negative = train_df['state'] == 0
print(train_df[is_positive].shape)
print(train_df[is_negative].shape)

# Shapes of the feature matrices and label vectors for both splits.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

(784, 563)
(773, 563)
(1557, 541)
(1557,)
(293, 541)
(293,)


In [26]:
# Inspect the parameters of an XGBClassifier built with no arguments.
default_classifier = xgb.XGBClassifier()
default_classifier.get_params()


{'objective': 'binary:logistic',
 'use_label_encoder': False,
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [47]:
# List the distinct column dtypes present in the training features.
train_column_dtypes = X_train.dtypes
train_column_dtypes.unique()

array([dtype('int64'), dtype('float64')], dtype=object)

In [48]:
# First candidate: depth-5 trees with row/column subsampling and a large
# n_estimators budget that early stopping on AUC is expected to cut short.
# early_stopping_rounds is set on the estimator because passing it to fit()
# is deprecated since xgboost 1.6 and removed in later releases.
model_xgboost1 = xgb.XGBClassifier(learning_rate=0.1,
                                   max_depth=5,
                                   n_estimators=5000,
                                   subsample=0.5,
                                   colsample_bytree=0.5,
                                   eval_metric='auc',
                                   early_stopping_rounds=10,
                                   verbosity=1)

# NOTE: the test split doubles as the early-stopping set here, so any AUC
# later reported on X_test is optimistically biased.
model_xgboost1.fit(X_train,
                   y_train,
                   eval_set=[(X_test, y_test)],
                   verbose=True)

[0]	validation_0-auc:0.77547
[1]	validation_0-auc:0.85995
[2]	validation_0-auc:0.87097
[3]	validation_0-auc:0.87854




[4]	validation_0-auc:0.88912
[5]	validation_0-auc:0.89122
[6]	validation_0-auc:0.89059
[7]	validation_0-auc:0.89043
[8]	validation_0-auc:0.88875
[9]	validation_0-auc:0.89271
[10]	validation_0-auc:0.89686
[11]	validation_0-auc:0.89677
[12]	validation_0-auc:0.89555
[13]	validation_0-auc:0.89784
[14]	validation_0-auc:0.89728
[15]	validation_0-auc:0.90133
[16]	validation_0-auc:0.90366
[17]	validation_0-auc:0.90571
[18]	validation_0-auc:0.90893
[19]	validation_0-auc:0.91051
[20]	validation_0-auc:0.91354
[21]	validation_0-auc:0.91448
[22]	validation_0-auc:0.91839
[23]	validation_0-auc:0.91820
[24]	validation_0-auc:0.91937
[25]	validation_0-auc:0.91932
[26]	validation_0-auc:0.91993
[27]	validation_0-auc:0.92249
[28]	validation_0-auc:0.92161
[29]	validation_0-auc:0.92161
[30]	validation_0-auc:0.92235
[31]	validation_0-auc:0.92105
[32]	validation_0-auc:0.92123
[33]	validation_0-auc:0.91988
[34]	validation_0-auc:0.91974
[35]	validation_0-auc:0.92030
[36]	validation_0-auc:0.91993
[37]	validation_

In [49]:
# Second candidate: library defaults apart from the learning rate.
# eval_metric and early_stopping_rounds are set on the estimator because
# passing them to fit() is deprecated since xgboost 1.6 and removed in later
# releases.
model_xgboost2 = xgb.XGBClassifier(learning_rate=0.1,
                                   eval_metric='auc',
                                   early_stopping_rounds=10)

# NOTE: the test split doubles as the early-stopping set here, so any AUC
# later reported on X_test is optimistically biased.
model_xgboost2.fit(X_train,
                   y_train,
                   eval_set=[(X_test, y_test)],
                   verbose=True)

[0]	validation_0-auc:0.86151
[1]	validation_0-auc:0.86237
[2]	validation_0-auc:0.88432
[3]	validation_0-auc:0.88777




[4]	validation_0-auc:0.88908
[5]	validation_0-auc:0.89807
[6]	validation_0-auc:0.90324
[7]	validation_0-auc:0.90341
[8]	validation_0-auc:0.90711
[9]	validation_0-auc:0.91271
[10]	validation_0-auc:0.91238
[11]	validation_0-auc:0.91660
[12]	validation_0-auc:0.91734
[13]	validation_0-auc:0.91869
[14]	validation_0-auc:0.92095
[15]	validation_0-auc:0.92450
[16]	validation_0-auc:0.92324
[17]	validation_0-auc:0.92426
[18]	validation_0-auc:0.92594
[19]	validation_0-auc:0.92790
[20]	validation_0-auc:0.92837
[21]	validation_0-auc:0.92762
[22]	validation_0-auc:0.92664
[23]	validation_0-auc:0.92869
[24]	validation_0-auc:0.92781
[25]	validation_0-auc:0.92795
[26]	validation_0-auc:0.92790
[27]	validation_0-auc:0.92925
[28]	validation_0-auc:0.92878
[29]	validation_0-auc:0.92757
[30]	validation_0-auc:0.92850
[31]	validation_0-auc:0.92804
[32]	validation_0-auc:0.92781
[33]	validation_0-auc:0.92701
[34]	validation_0-auc:0.92715
[35]	validation_0-auc:0.92767
[36]	validation_0-auc:0.92832


**Evaluate Model Performance**

In [50]:
# Score model_xgboost1 on both splits. Column 1 of predict_proba is the
# probability of an observation being 1.
y_train_pred = model_xgboost1.predict_proba(X_train)[:, 1]
y_test_pred = model_xgboost1.predict_proba(X_test)[:, 1]

auc_train = roc_auc_score(y_train, y_train_pred)
auc_valid = roc_auc_score(y_test, y_test_pred)
print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(auc_train, auc_valid))

AUC Train: 0.9879
AUC Valid: 0.9225


In [51]:
# Score model_xgboost2 on both splits. Column 1 of predict_proba is the
# probability of an observation being 1.
y_train_pred = model_xgboost2.predict_proba(X_train)[:, 1]
y_test_pred = model_xgboost2.predict_proba(X_test)[:, 1]

auc_train = roc_auc_score(y_train, y_train_pred)
auc_valid = roc_auc_score(y_test, y_test_pred)
print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(auc_train, auc_valid))

AUC Train: 0.9997
AUC Valid: 0.9293


**Hyperparameter Tuning**

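We use scikit-learn's `GridSearchCV` to tune `learning_rate`, `max_depth` and `n_estimators`, scoring each candidate by ROC AUC under 2-fold cross-validation.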


In [52]:
# Candidate values for each hyperparameter explored by the grid search.
learning_rate_list = [0.02, 0.05, 0.1]
max_depth_list = [2, 3, 5]
n_estimators_list = [1000, 2000, 3000]

params_dict = dict(
    learning_rate=learning_rate_list,
    max_depth=max_depth_list,
    n_estimators=n_estimators_list,
)

# Grid size = product of the number of candidates per hyperparameter.
num_combinations = 1
for candidates in params_dict.values():
    num_combinations = num_combinations * len(candidates)

print(num_combinations)
params_dict

27


{'learning_rate': [0.02, 0.05, 0.1],
 'max_depth': [2, 3, 5],
 'n_estimators': [1000, 2000, 3000]}

In [54]:
def auc_score(model, X, y):
    """Score `model` on (X, y) by the ROC AUC of its predicted probabilities.

    Args:
        model: fitted estimator exposing ``predict_proba``.
        X: features to score.
        y: true labels for ``X``.

    Returns:
        ``roc_auc_score`` computed from column 1 of ``predict_proba(X)``.
    """
    positive_proba = model.predict_proba(X)[:, 1]
    return roc_auc_score(y, positive_proba)

# Base estimator with fixed row/column subsampling; the remaining
# hyperparameters are supplied by the grid in params_dict.
xgb_base_estimator = xgb.XGBClassifier(
    subsample=0.5,
    colsample_bytree=0.25,
    eval_metric='auc',
    use_label_encoder=False,
)

# Exhaustive search over params_dict with 2-fold CV, scored by auc_score.
model_xgboost_hp = GridSearchCV(
    estimator=xgb_base_estimator,
    param_grid=params_dict,
    cv=2,
    scoring=auc_score,
    return_train_score=True,
    verbose=4,
)

# NOTE: X and y are expected to be the train+test concatenation built in an
# earlier cell, so no rows are held out from the search.
model_xgboost_hp.fit(X, y)

Fitting 2 folds for each of 27 candidates, totalling 54 fits
[CV 1/2] END learning_rate=0.02, max_depth=2, n_estimators=1000;, score=(train=0.999, test=0.942) total time=   2.2s
[CV 2/2] END learning_rate=0.02, max_depth=2, n_estimators=1000;, score=(train=0.999, test=0.932) total time=   2.2s
[CV 1/2] END learning_rate=0.02, max_depth=2, n_estimators=2000;, score=(train=1.000, test=0.942) total time=   4.2s
[CV 2/2] END learning_rate=0.02, max_depth=2, n_estimators=2000;, score=(train=1.000, test=0.935) total time=   4.1s
[CV 1/2] END learning_rate=0.02, max_depth=2, n_estimators=3000;, score=(train=1.000, test=0.942) total time=   6.3s
[CV 2/2] END learning_rate=0.02, max_depth=2, n_estimators=3000;, score=(train=1.000, test=0.934) total time=   6.4s
[CV 1/2] END learning_rate=0.02, max_depth=3, n_estimators=1000;, score=(train=1.000, test=0.943) total time=   2.7s
[CV 2/2] END learning_rate=0.02, max_depth=3, n_estimators=1000;, score=(train=1.000, test=0.929) total time=   3.2s
[CV

In [55]:
# Summarise the grid search: keep the rank, mean scores and the searched
# parameters, ordered from best to worst test rank.
summary_columns = [
    'rank_test_score', 'mean_test_score', 'mean_train_score',
    'param_learning_rate', 'param_max_depth', 'param_n_estimators',
]
df_cv_results = (
    pd.DataFrame(model_xgboost_hp.cv_results_)
    .loc[:, summary_columns]
    .sort_values(by='rank_test_score')
)
df_cv_results

Unnamed: 0,rank_test_score,mean_test_score,mean_train_score,param_learning_rate,param_max_depth,param_n_estimators
1,1,0.938422,1.0,0.02,2,2000
2,2,0.937772,1.0,0.02,2,3000
0,3,0.936851,0.998899,0.02,2,1000
9,4,0.936736,1.0,0.05,2,1000
4,5,0.936245,1.0,0.02,3,2000
5,6,0.936189,1.0,0.02,3,3000
11,7,0.93611,1.0,0.05,2,3000
12,8,0.936105,1.0,0.05,3,1000
13,9,0.935993,1.0,0.05,3,2000
10,10,0.935993,1.0,0.05,2,2000
