In [24]:
import pandas as pd
import pickle

In [25]:
with open('/content/drive/MyDrive/datasets/ps_split_data_dict.pickle', 'rb') as file:
    data_dict = pickle.load(file)

In [26]:
X_train, X_val, X_holdout = data_dict['X_train'], data_dict['X_val'], data_dict['X_holdout']

In [27]:
y_train, y_val, y_holdout = data_dict['y_train'], data_dict['y_val'], data_dict['y_holdout']

In [28]:
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
from xgboost import XGBClassifier

In [29]:
%run /content/drive/MyDrive/Colab\ Notebooks/common_functions.ipynb

In [30]:
xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)

In [35]:
fit_and_score(xgb, X_train, y_train, X_val, y_val)

Recall: 0.000 
Precision: 0.000
F1: 0.000 
Accuracy: 0.963 
ROC AUC: 0.638


In [36]:
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.offline as py

In [37]:
len(X_train.columns.values)

58

In [38]:
len(xgb.feature_importances_)

58

In [39]:
features = X_train.columns.values

In [40]:
x, y = (list(x) for x in zip(*sorted(zip(xgb.feature_importances_, features), 
                                                            reverse = True)))
trace2 = go.Bar(
    x=x ,
    y=y,
    marker=dict(
        color=x,
        colorscale = 'Viridis',
        reversescale = True
    ),
    name='Gradient Boosting Classifer Feature importance',
    orientation='h',
)

layout = dict(
    title='Barplot of Feature importances',
     width = 900, height = 2000,
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=True,
    ))

fig1 = go.Figure(data=[trace2])
fig1['layout'].update(layout)
py.iplot(fig1, filename='plots')

In [41]:
top_25_features = y[:25]

In [42]:
# xgb2 = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
#                     silent=True, nthread=1)

# fit_and_score(xgb2, X_train[top_25_features], y_train, X_val[top_25_features], y_val)

In [43]:
top_9_features = y[:9]

In [44]:
# xgb3 = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
#                     silent=True, nthread=1)

# fit_and_score(xgb3, X_train[top_9_features], y_train, X_val[top_9_features], y_val)

In [45]:
top_18_features = y[:18]

In [46]:
# xgb3 = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
#                     silent=True, nthread=1)

# fit_and_score(xgb3, X_train[top_18_features], y_train, X_val[top_18_features], y_val)

In [47]:
top_28_features = y[:28]

In [48]:
# xgb4 = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
#                     silent=True, nthread=1)

# fit_and_score(xgb4, X_train[top_28_features], y_train, X_val[top_28_features], y_val)

In [49]:
X_train_25 = X_train[top_25_features]
X_val_25 = X_val[top_25_features]
X_holdout_25 = X_holdout[top_25_features]

In [None]:
scale_pos_weight=100

In [50]:
def fit_and_score_gbm(scale_pos_weight=1, max_depth=3, subsample=1, min_child_weight=1, colsample_bytree=1):
    gbm = XGBClassifier( 
                        n_estimators=30000,
                        max_depth=max_depth,
                        objective='binary:logistic', 
                        learning_rate=.05, 
                        subsample=subsample,
                        min_child_weight=min_child_weight,
                        colsample_bytree=colsample_bytree,
                        scale_pos_weight=26.5
                        )

    eval_set=[(X_train_25,y_train),(X_val_25,y_val)]
    gbm.fit( 
            X_train_25, y_train, 
            eval_set=eval_set,
            eval_metric='auc',
            early_stopping_rounds=50,
            verbose=False
            )
    auc = roc_auc_score(y_val, gbm.predict_proba(X_val_25)[:,1])
    
    print(f"Scale Pos Weight: {scale_pos_weight} \nMax Depth: {max_depth} \nSubsample: {subsample} \n\
Min Child Weight: {min_child_weight} \nROC AUC: {auc:.3f}\n")

In [51]:
for max_depth in [3, 5, 7, 9]:
	fit_and_score_gbm(max_depth=max_depth)

Scale Pos Weight: 1 
Max Depth: 3 
Subsample: 1 
Min Child Weight: 1 
ROC AUC: 0.643

Scale Pos Weight: 1 
Max Depth: 5 
Subsample: 1 
Min Child Weight: 1 
ROC AUC: 0.641

Scale Pos Weight: 1 
Max Depth: 7 
Subsample: 1 
Min Child Weight: 1 
ROC AUC: 0.639

Scale Pos Weight: 1 
Max Depth: 9 
Subsample: 1 
Min Child Weight: 1 
ROC AUC: 0.631



In [64]:
from collections import Counter

# count examples in each class
counter = Counter(y_train)
# estimate scale_pos_weight value
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

Estimate: 26.528


In [53]:
for scale_pos_weight in [20, 26.5,  30]:
	fit_and_score_gbm(scale_pos_weight=scale_pos_weight)

Scale Pos Weight: 20 
Max Depth: 3 
Subsample: 1 
Min Child Weight: 1 
ROC AUC: 0.643

Scale Pos Weight: 26.5 
Max Depth: 3 
Subsample: 1 
Min Child Weight: 1 
ROC AUC: 0.643

Scale Pos Weight: 30 
Max Depth: 3 
Subsample: 1 
Min Child Weight: 1 
ROC AUC: 0.643



In [55]:
for subsample in [0.9, 0.8]:
	fit_and_score_gbm(subsample=subsample)

Scale Pos Weight: 1 
Max Depth: 3 
Subsample: 0.9 
Min Child Weight: 1 
ROC AUC: 0.644

Scale Pos Weight: 1 
Max Depth: 3 
Subsample: 0.8 
Min Child Weight: 1 
ROC AUC: 0.644



In [56]:
for subsample in [0.7]:
	fit_and_score_gbm(subsample=subsample)

Scale Pos Weight: 1 
Max Depth: 3 
Subsample: 0.7 
Min Child Weight: 1 
ROC AUC: 0.643



In [57]:
for min_child_weight in [3, 12]:
	fit_and_score_gbm(min_child_weight=min_child_weight)

Scale Pos Weight: 1 
Max Depth: 3 
Subsample: 1 
Min Child Weight: 3 
ROC AUC: 0.644

Scale Pos Weight: 1 
Max Depth: 3 
Subsample: 1 
Min Child Weight: 12 
ROC AUC: 0.642



In [58]:
for max_depth in [2, 4]:
	fit_and_score_gbm(max_depth=max_depth)

Scale Pos Weight: 1 
Max Depth: 2 
Subsample: 1 
Min Child Weight: 1 
ROC AUC: 0.641

Scale Pos Weight: 1 
Max Depth: 4 
Subsample: 1 
Min Child Weight: 1 
ROC AUC: 0.643



In [59]:
for min_child_weight in [2,4]:
	fit_and_score_gbm(min_child_weight=min_child_weight)

Scale Pos Weight: 1 
Max Depth: 3 
Subsample: 1 
Min Child Weight: 4 
ROC AUC: 0.642



In [60]:
import imblearn.over_sampling

n_pos = np.sum(y_train == 1)
n_neg = np.sum(y_train == 0)

ratio2 = {1 : n_pos * 10, 0 : n_neg} 

ROS2 = imblearn.over_sampling.RandomOverSampler(sampling_strategy = ratio2, random_state=42)

X_train_rs, y_train_rs = ROS2.fit_resample(X_train_25, y_train)

In [61]:
def fit_and_score_gbm_rebalance(X_train_new, y_train_new):
    gbm = XGBClassifier( 
                        n_estimators=30000,
                        max_depth=3,
                        objective='binary:logistic', 
                        learning_rate=.05, 
                        subsample=0.9,
                        min_child_weight=3,
                        colsample_bytree=0.9,
                        )

    eval_set=[(X_train_new,y_train_new),(X_val_25,y_val)]
    gbm.fit( 
            X_train_new, y_train_new, 
            eval_set=eval_set,
            eval_metric='auc',
            early_stopping_rounds=50,
            verbose=False
            )
    auc = roc_auc_score(y_val, gbm.predict_proba(X_val_25)[:,1])
    
    print(f"ROC AUC: {auc:.3f}\n")

In [62]:
fit_and_score_gbm_rebalance(X_train_rs, y_train_rs)

Scale Pos Weight: 30 
Max Depth: 4 
Subsample: 0.7 
Min Child Weight: 4 
ROC AUC: 0.644



In [63]:
for colsample_bytree in [0.8,0.6]:
	fit_and_score_gbm(colsample_bytree=colsample_bytree)

Scale Pos Weight: 1 
Max Depth: 3 
Subsample: 1 
Min Child Weight: 1 
ROC AUC: 0.643

Scale Pos Weight: 1 
Max Depth: 3 
Subsample: 1 
Min Child Weight: 1 
ROC AUC: 0.643



In [None]:
# from numpy import mean
# # fit balanced xgboost on an imbalanced classification dataset
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import RepeatedStratifiedKFold

# # define model
# model = XGBClassifier(scale_pos_weight=26.5)
# # define evaluation procedure
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# # evaluate model
# scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
# # summarize performance
# print('Mean ROC AUC: %.5f' % mean(scores))