In [60]:
import numpy as np
import pandas as pd
import re
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt

In [26]:
df = pd.read_csv("./data/kickstarter_train_final_20220927-134328.csv")


In [27]:
# Only care about successful and failed projects
ignored_states =  ['canceled', 'live']
df = df[~df.state.isin(ignored_states)]
df['binary_state'] = np.where(df['state'] == 'successful', 1, 0)
df.shape

(1557, 563)

In [28]:
# Remove id, dates and textual features
features_to_drop = ['id', 'name', 'description','description_story', 'description_risks','rewards', 'deadline', 'state', 'video', 'launched_at', 'rewards_processed', 'description_processed', 'description_story_processed', 'description_risks_processed' ]
df = df.drop(features_to_drop, axis=1)


In [71]:
# Convert faq_count to integer
print(df.faq_count.dtype)
df["faq_count"] = df["faq_count"].apply(lambda x: re.sub("[^\d\.]", "", x))
df.faq_count = pd.to_numeric(df.faq_count)

int64


TypeError: expected string or bytes-like object

In [70]:
# Pretty balanced dataset
print(df[df.binary_state == 1].shape)
print(df[df.binary_state == 0].shape)

(784, 549)
(773, 549)


In [72]:
var_cols = [c for c in df.columns if c not in ['binary_state']]
X = df.loc[:, var_cols]
y = df.loc[:, 'binary_state']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1245, 548), (312, 548), (1245,), (312,))

In [61]:
xgb.XGBClassifier().get_params()


{'objective': 'binary:logistic',
 'use_label_encoder': False,
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [74]:
model_xgboost = xgb.XGBClassifier(learning_rate=0.1,
                                      max_depth=5,
                                      n_estimators=5000,
                                      subsample=0.5,
                                      colsample_bytree=0.5,
                                      eval_metric='auc',
                                      verbosity=1)

model_xgboost.fit(X_train,
                  y_train,
                  early_stopping_rounds=10,
                  eval_set=[(X_test, y_test)],
                  verbose=True)

[0]	validation_0-auc:0.84276
[1]	validation_0-auc:0.88027
[2]	validation_0-auc:0.98991
[3]	validation_0-auc:0.98052
[4]	validation_0-auc:0.99671
[5]	validation_0-auc:1.00000
[6]	validation_0-auc:0.99938
[7]	validation_0-auc:1.00000
[8]	validation_0-auc:1.00000
[9]	validation_0-auc:1.00000
[10]	validation_0-auc:1.00000
[11]	validation_0-auc:1.00000
[12]	validation_0-auc:1.00000
[13]	validation_0-auc:1.00000
[14]	validation_0-auc:1.00000
[15]	validation_0-auc:1.00000


In [73]:
model_xgboost = xgb.XGBClassifier(learning_rate=0.1)

model_xgboost.fit(X_train,
                  y_train,
                  early_stopping_rounds=10,
                  eval_set=[(X_test, y_test)],
                  eval_metric='auc',
                  verbose=True)

[0]	validation_0-auc:1.00000
[1]	validation_0-auc:1.00000
[2]	validation_0-auc:1.00000
[3]	validation_0-auc:1.00000
[4]	validation_0-auc:1.00000
[5]	validation_0-auc:1.00000




[6]	validation_0-auc:1.00000
[7]	validation_0-auc:1.00000
[8]	validation_0-auc:1.00000
[9]	validation_0-auc:1.00000
[10]	validation_0-auc:1.00000


**Evaluate Model Performance**

In [None]:
y_train_pred = model_xgboost.predict_proba(X_train)[:,1]
y_test_pred = model_xgboost.predict_proba(X_test)[:,1] # Slicing to obtain Prob of observation being 1

print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(roc_auc_score(y_train, y_train_pred),
                                                    roc_auc_score(y_test, y_test_pred)))