Objective: Develop a method to implement a good baseline model

# Import packages

In [1]:
import pandas as pd
import category_encoders as ce
import numpy as np
import seaborn as sns
import math

from matplotlib import pyplot as plt

from feature_engine.selection import (DropFeatures, DropConstantFeatures, 
                                      DropDuplicateFeatures)

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold

from sklearn.preprocessing import (StandardScaler, FunctionTransformer, Normalizer, OrdinalEncoder, OneHotEncoder,
                                   RobustScaler, MinMaxScaler, PowerTransformer, MaxAbsScaler, LabelEncoder)

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer

from sklearn.compose import ColumnTransformer

from sklearn.metrics import roc_auc_score, f1_score, classification_report

from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV

# Read Data

In [2]:
N_ROWS=1000

In [3]:
df=pd.read_csv('data/application_train.csv', nrows=N_ROWS)

In [4]:
df.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
X=df.loc[:, ~df.columns.isin(['SK_ID_CURR', 'TARGET'])]
y=df['TARGET'].astype(int)

In [6]:
# using the train test split function
X_train, X_test,y_train, y_test = train_test_split(X,y,random_state=104,test_size=0.25,shuffle=True)

# Feature Selection

In [7]:
for i in list(df.columns):
    
    try:
        sns.violinplot(x=df["TARGET"], y=df["i"], palette="Blues")
    except:
        print(i, 'failed...')
        pass

SK_ID_CURR failed...
TARGET failed...
NAME_CONTRACT_TYPE failed...
CODE_GENDER failed...
FLAG_OWN_CAR failed...
FLAG_OWN_REALTY failed...
CNT_CHILDREN failed...
AMT_INCOME_TOTAL failed...
AMT_CREDIT failed...
AMT_ANNUITY failed...
AMT_GOODS_PRICE failed...
NAME_TYPE_SUITE failed...
NAME_INCOME_TYPE failed...
NAME_EDUCATION_TYPE failed...
NAME_FAMILY_STATUS failed...
NAME_HOUSING_TYPE failed...
REGION_POPULATION_RELATIVE failed...
DAYS_BIRTH failed...
DAYS_EMPLOYED failed...
DAYS_REGISTRATION failed...
DAYS_ID_PUBLISH failed...
OWN_CAR_AGE failed...
FLAG_MOBIL failed...
FLAG_EMP_PHONE failed...
FLAG_WORK_PHONE failed...
FLAG_CONT_MOBILE failed...
FLAG_PHONE failed...
FLAG_EMAIL failed...
OCCUPATION_TYPE failed...
CNT_FAM_MEMBERS failed...
REGION_RATING_CLIENT failed...
REGION_RATING_CLIENT_W_CITY failed...
WEEKDAY_APPR_PROCESS_START failed...
HOUR_APPR_PROCESS_START failed...
REG_REGION_NOT_LIVE_REGION failed...
REG_REGION_NOT_WORK_REGION failed...
LIVE_REGION_NOT_WORK_REGION failed...


# Create a Scikit Learn Pipeline

Categoric: Nominal, Ordinal  
Numeric: Discrete, Continious

In [8]:
categorical_feature_mask = X.dtypes==object
categorical_features = X.columns[categorical_feature_mask].tolist()
print(categorical_features)

['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']


In [9]:
numeric_feature_mask = X.dtypes!=object
numeric_features = X.columns[numeric_feature_mask].tolist()
print(numeric_features)

['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAR

In [10]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('cat_enc', OneHotEncoder(handle_unknown='ignore')),
])

In [11]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
#     ('norm', Normalizer()),
])

In [12]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Determine Base Pipeline Performance

In [13]:
ratio = float(np.sum(y_train == 0)) / np.sum(y_train==1)

best_params = {
    'learning_rate': 0.01,
    'n_estimators': 100,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 1,
    'gamma': 1,
    'scale_pos_weight': ratio}

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(eval_metric='aucpr', 
                                 objective='binary:logistic', 
                                 use_label_encoder=False,
                                 **best_params))
])

In [14]:
for parameter in pipeline.get_params():
    print(parameter)

memory
steps
verbose
preprocessor
classifier
preprocessor__n_jobs
preprocessor__remainder
preprocessor__sparse_threshold
preprocessor__transformer_weights
preprocessor__transformers
preprocessor__verbose
preprocessor__verbose_feature_names_out
preprocessor__num
preprocessor__cat
preprocessor__num__memory
preprocessor__num__steps
preprocessor__num__verbose
preprocessor__num__imputer
preprocessor__num__scaler
preprocessor__num__imputer__add_indicator
preprocessor__num__imputer__copy
preprocessor__num__imputer__fill_value
preprocessor__num__imputer__missing_values
preprocessor__num__imputer__strategy
preprocessor__num__imputer__verbose
preprocessor__num__scaler__copy
preprocessor__num__scaler__with_mean
preprocessor__num__scaler__with_std
preprocessor__cat__memory
preprocessor__cat__steps
preprocessor__cat__verbose
preprocessor__cat__imputer
preprocessor__cat__cat_enc
preprocessor__cat__imputer__add_indicator
preprocessor__cat__imputer__copy
preprocessor__cat__imputer__fill_value
preproce

In [15]:
pipeline.fit(X_train, y_train)

In [16]:
train_score=f1_score(y_train, pipeline.predict(X_train))
test_score=f1_score(y_test, pipeline.predict(X_test))
print('Training set score: ' + str(train_score))
print('Test set score: ' + str(test_score))

Training set score: 0.2816696914700544
Test set score: 0.22451081359423272


In [17]:
print('*'*15, 'BASELINE TRAIN RESULTS', '*'*15, '\n')
print(classification_report(y_train, pipeline.predict(X_train)))

print('*'*15, 'BASELINE TEST RESULTS', '*'*15, '\n')
print(classification_report(y_test, pipeline.predict(X_test)))

*************** BASELINE TRAIN RESULTS *************** 

              precision    recall  f1-score   support

           0       0.96      0.74      0.84      6909
           1       0.18      0.66      0.28       591

    accuracy                           0.74      7500
   macro avg       0.57      0.70      0.56      7500
weighted avg       0.90      0.74      0.79      7500

*************** BASELINE TEST RESULTS *************** 

              precision    recall  f1-score   support

           0       0.96      0.71      0.81      2316
           1       0.14      0.59      0.22       184

    accuracy                           0.70      2500
   macro avg       0.55      0.65      0.52      2500
weighted avg       0.90      0.70      0.77      2500



In [18]:
from sklearn.metrics import confusion_matrix

TN, FP, FN, TP = confusion_matrix(list(y_train), list(pipeline.predict(X_train)), labels=[0, 1]).ravel()

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 

print(TPR)
print(TNR)

0.6565143824027073
0.7429439861050803


In [19]:
from sklearn.metrics import confusion_matrix

TN, FP, FN, TP = confusion_matrix(list(y_test), list(pipeline.predict(X_test)), labels=[0, 1]).ravel()

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 

print(TPR)
print(TNR)

0.592391304347826
0.7072538860103627


# Determine Best Pipeline Transformations

Could also add: RobustScaler, Normalizer, ('log', FunctionTransformer(np.log1p)), Imputer = KNNImputer, LabelBinarizer

In [20]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

parameters = {'preprocessor__num__scaler': [StandardScaler(), 
                                            MinMaxScaler(), 
                                            MaxAbsScaler(), 
                                            RobustScaler()],
              'preprocessor__cat__cat_enc': [OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-999), 
                                             OneHotEncoder(handle_unknown='ignore'),
                                             ce.LeaveOneOutEncoder(handle_unknown='value', handle_missing='value')],
             'preprocessor__cat__imputer': [SimpleImputer(strategy='most_frequent'), 
                                            SimpleImputer(strategy='constant', fill_value='MI') 
                                           ],
             'preprocessor__num__imputer': [SimpleImputer(strategy='mean'), 
                                            SimpleImputer(strategy='median') 
                                           ]}
 
cv_feat = RepeatedStratifiedKFold(n_splits=2, n_repeats=1, random_state=1)
grid = GridSearchCV(pipeline, parameters, cv=cv_feat, scoring = 'f1', verbose=10).fit(X_train, y_train)
 
train_score=f1_score(y_train, grid.predict(X_train))
test_score=f1_score(y_test, grid.predict(X_test))
print('Training set score: ' + str(train_score))
print('Test set score: ' + str(test_score))

Fitting 2 folds for each of 48 candidates, totalling 96 fits
[CV 1/2; 1/48] START preprocessor__cat__cat_enc=OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-999), preprocessor__cat__imputer=SimpleImputer(strategy='most_frequent'), preprocessor__num__imputer=SimpleImputer(), preprocessor__num__scaler=StandardScaler()
[CV 1/2; 1/48] END preprocessor__cat__cat_enc=OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-999), preprocessor__cat__imputer=SimpleImputer(strategy='most_frequent'), preprocessor__num__imputer=SimpleImputer(), preprocessor__num__scaler=StandardScaler();, score=0.254 total time=   0.9s
[CV 2/2; 1/48] START preprocessor__cat__cat_enc=OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-999), preprocessor__cat__imputer=SimpleImputer(strategy='most_frequent'), preprocessor__num__imputer=SimpleImputer(), preprocessor__num__scaler=StandardScaler()
[CV 2/2; 1/48] END preprocessor__cat__cat_enc=OrdinalEncoder(handle_unknown='use_en

[CV 1/2; 8/48] END preprocessor__cat__cat_enc=OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-999), preprocessor__cat__imputer=SimpleImputer(strategy='most_frequent'), preprocessor__num__imputer=SimpleImputer(strategy='median'), preprocessor__num__scaler=RobustScaler();, score=0.250 total time=   1.0s
[CV 2/2; 8/48] START preprocessor__cat__cat_enc=OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-999), preprocessor__cat__imputer=SimpleImputer(strategy='most_frequent'), preprocessor__num__imputer=SimpleImputer(strategy='median'), preprocessor__num__scaler=RobustScaler()
[CV 2/2; 8/48] END preprocessor__cat__cat_enc=OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-999), preprocessor__cat__imputer=SimpleImputer(strategy='most_frequent'), preprocessor__num__imputer=SimpleImputer(strategy='median'), preprocessor__num__scaler=RobustScaler();, score=0.259 total time=   1.0s
[CV 1/2; 9/48] START preprocessor__cat__cat_enc=OrdinalEncoder(handl

[CV 1/2; 15/48] END preprocessor__cat__cat_enc=OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-999), preprocessor__cat__imputer=SimpleImputer(fill_value='MI', strategy='constant'), preprocessor__num__imputer=SimpleImputer(strategy='median'), preprocessor__num__scaler=MaxAbsScaler();, score=0.249 total time=   1.0s
[CV 2/2; 15/48] START preprocessor__cat__cat_enc=OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-999), preprocessor__cat__imputer=SimpleImputer(fill_value='MI', strategy='constant'), preprocessor__num__imputer=SimpleImputer(strategy='median'), preprocessor__num__scaler=MaxAbsScaler()
[CV 2/2; 15/48] END preprocessor__cat__cat_enc=OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-999), preprocessor__cat__imputer=SimpleImputer(fill_value='MI', strategy='constant'), preprocessor__num__imputer=SimpleImputer(strategy='median'), preprocessor__num__scaler=MaxAbsScaler();, score=0.259 total time=   1.0s
[CV 1/2; 16/48] START preproc

[CV 2/2; 22/48] END preprocessor__cat__cat_enc=OneHotEncoder(handle_unknown='ignore'), preprocessor__cat__imputer=SimpleImputer(strategy='most_frequent'), preprocessor__num__imputer=SimpleImputer(strategy='median'), preprocessor__num__scaler=MinMaxScaler();, score=0.260 total time=   1.5s
[CV 1/2; 23/48] START preprocessor__cat__cat_enc=OneHotEncoder(handle_unknown='ignore'), preprocessor__cat__imputer=SimpleImputer(strategy='most_frequent'), preprocessor__num__imputer=SimpleImputer(strategy='median'), preprocessor__num__scaler=MaxAbsScaler()
[CV 1/2; 23/48] END preprocessor__cat__cat_enc=OneHotEncoder(handle_unknown='ignore'), preprocessor__cat__imputer=SimpleImputer(strategy='most_frequent'), preprocessor__num__imputer=SimpleImputer(strategy='median'), preprocessor__num__scaler=MaxAbsScaler();, score=0.249 total time=   1.5s
[CV 2/2; 23/48] START preprocessor__cat__cat_enc=OneHotEncoder(handle_unknown='ignore'), preprocessor__cat__imputer=SimpleImputer(strategy='most_frequent'), prep

[CV 1/2; 30/48] END preprocessor__cat__cat_enc=OneHotEncoder(handle_unknown='ignore'), preprocessor__cat__imputer=SimpleImputer(fill_value='MI', strategy='constant'), preprocessor__num__imputer=SimpleImputer(strategy='median'), preprocessor__num__scaler=MinMaxScaler();, score=0.249 total time=   1.5s
[CV 2/2; 30/48] START preprocessor__cat__cat_enc=OneHotEncoder(handle_unknown='ignore'), preprocessor__cat__imputer=SimpleImputer(fill_value='MI', strategy='constant'), preprocessor__num__imputer=SimpleImputer(strategy='median'), preprocessor__num__scaler=MinMaxScaler()
[CV 2/2; 30/48] END preprocessor__cat__cat_enc=OneHotEncoder(handle_unknown='ignore'), preprocessor__cat__imputer=SimpleImputer(fill_value='MI', strategy='constant'), preprocessor__num__imputer=SimpleImputer(strategy='median'), preprocessor__num__scaler=MinMaxScaler();, score=0.260 total time=   1.5s
[CV 1/2; 31/48] START preprocessor__cat__cat_enc=OneHotEncoder(handle_unknown='ignore'), preprocessor__cat__imputer=SimpleImp

[CV 1/2; 38/48] END preprocessor__cat__cat_enc=LeaveOneOutEncoder(), preprocessor__cat__imputer=SimpleImputer(strategy='most_frequent'), preprocessor__num__imputer=SimpleImputer(strategy='median'), preprocessor__num__scaler=MinMaxScaler();, score=0.000 total time=   1.0s
[CV 2/2; 38/48] START preprocessor__cat__cat_enc=LeaveOneOutEncoder(), preprocessor__cat__imputer=SimpleImputer(strategy='most_frequent'), preprocessor__num__imputer=SimpleImputer(strategy='median'), preprocessor__num__scaler=MinMaxScaler()
[CV 2/2; 38/48] END preprocessor__cat__cat_enc=LeaveOneOutEncoder(), preprocessor__cat__imputer=SimpleImputer(strategy='most_frequent'), preprocessor__num__imputer=SimpleImputer(strategy='median'), preprocessor__num__scaler=MinMaxScaler();, score=0.000 total time=   0.9s
[CV 1/2; 39/48] START preprocessor__cat__cat_enc=LeaveOneOutEncoder(), preprocessor__cat__imputer=SimpleImputer(strategy='most_frequent'), preprocessor__num__imputer=SimpleImputer(strategy='median'), preprocessor__n

[CV 1/2; 46/48] END preprocessor__cat__cat_enc=LeaveOneOutEncoder(), preprocessor__cat__imputer=SimpleImputer(fill_value='MI', strategy='constant'), preprocessor__num__imputer=SimpleImputer(strategy='median'), preprocessor__num__scaler=MinMaxScaler();, score=0.000 total time=   1.1s
[CV 2/2; 46/48] START preprocessor__cat__cat_enc=LeaveOneOutEncoder(), preprocessor__cat__imputer=SimpleImputer(fill_value='MI', strategy='constant'), preprocessor__num__imputer=SimpleImputer(strategy='median'), preprocessor__num__scaler=MinMaxScaler()
[CV 2/2; 46/48] END preprocessor__cat__cat_enc=LeaveOneOutEncoder(), preprocessor__cat__imputer=SimpleImputer(fill_value='MI', strategy='constant'), preprocessor__num__imputer=SimpleImputer(strategy='median'), preprocessor__num__scaler=MinMaxScaler();, score=0.000 total time=   0.9s
[CV 1/2; 47/48] START preprocessor__cat__cat_enc=LeaveOneOutEncoder(), preprocessor__cat__imputer=SimpleImputer(fill_value='MI', strategy='constant'), preprocessor__num__imputer=S

In [21]:
print('*'*15, 'TRANSFORMATION TRAIN RESULTS', '*'*15, '\n')
print(classification_report(y_train, grid.predict(X_train)))

print('*'*15, 'TRANSFORMATION TEST RESULTS', '*'*15, '\n')
print(classification_report(y_test, grid.predict(X_test)))

*************** TRANSFORMATION TRAIN RESULTS *************** 

              precision    recall  f1-score   support

           0       0.96      0.75      0.84      6909
           1       0.18      0.65      0.28       591

    accuracy                           0.74      7500
   macro avg       0.57      0.70      0.56      7500
weighted avg       0.90      0.74      0.80      7500

*************** TRANSFORMATION TEST RESULTS *************** 

              precision    recall  f1-score   support

           0       0.96      0.71      0.81      2316
           1       0.14      0.61      0.23       184

    accuracy                           0.70      2500
   macro avg       0.55      0.66      0.52      2500
weighted avg       0.90      0.70      0.77      2500



In [22]:
best_features=grid.best_params_
print(best_features)

{'preprocessor__cat__cat_enc': OneHotEncoder(handle_unknown='ignore'), 'preprocessor__cat__imputer': SimpleImputer(strategy='most_frequent'), 'preprocessor__num__imputer': SimpleImputer(), 'preprocessor__num__scaler': StandardScaler()}


# Set-up Best Pipeline

In [23]:
best_params = {
    'classifier__learning_rate': 0.01,
    'classifier__n_estimators': 100,
    'classifier__max_depth': 3,
    'classifier__subsample': 0.8,
    'classifier__colsample_bytree': 1,
    'classifier__gamma': 1,
    'classifier__scale_pos_weight': ratio}

best_params = {**best_params, **best_features}

# Semi-Manual Opt

In [24]:
# # use early stopping to determine the estimators
# X_train_es, X_test_es,y_train_es, y_test_es = train_test_split(X_train,
#                                                                y_train, 
#                                                                random_state=104,
#                                                                test_size=0.25,
#                                                                shuffle=True)

# best_params['n_estimators']=1000
# xgb = XGBClassifier(eval_metric='auc', use_label_encoder=False, **best_params)
# xgb.fit(X_train_es, y_train_es,
#             eval_set=[(X_train_es, y_train_es), (X_test_es, y_test_es)], 
#             early_stopping_rounds=10) 

# results = xgb.evals_result()

# # plot the results
# plt.figure(figsize=(10,7))
# plt.plot(results["validation_0"]["auc"], label="Training loss")
# plt.plot(results["validation_1"]["auc"], label="Testing loss")
# plt.axvline(21, color="gray", label="Optimal tree number")
# plt.xlabel("Number of trees")
# plt.ylabel("Loss")
# plt.legend()

In [25]:
def get_best_params(opt):
    
    """
    function to add mean score and std score to find the most stable model
    """
    
    df=pd.DataFrame(opt.cv_results_)
    df['score']=df['mean_test_score']+df['std_test_score']
    df=df.sort_values(by='score', ascending=False)
    
    return df['params'].iloc[0]

In [26]:
"""
following: https://towardsdatascience.com/fine-tuning-xgboost-in-python-like-a-boss-b4543ed8b1e
"""

# define the cross-fold
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)

# begin by determining the optimal scale_pos_weight
ratio = float(np.sum(y_train == 0)) / np.sum(y_train==1)
ratio_root = math.sqrt(ratio)
print('scale_pos_weight ratio: ', ratio)
print('scale_pos_weight ratio root: ', ratio_root)

hardcoded_params = {
    
}

find_scale_pos_weight = {
    'classifier__scale_pos_weight' : [1, ratio_root, ratio, ratio*2, ratio*3]
    
}

find_learning_rate = {
    'classifier__learning_rate': list([i/100 for i in range(0,100)])
}

find_depth_and_child = {
    'classifier__max_depth': list(range(3,12,1)),
    'classifier__min_child_weight': list(range(1,10,1))
}

find_gamma = {
    'classifier__gamma': list([i/10.0 for i in range(0,5)])
}

find_subsample_and_colsample = {
    'classifier__subsample':list([i/10.0 for i in range(6,10)]),
    'classifier__colsample_bytree':list([i/10.0 for i in range(6,10)])
}

find_alpha = {
    'classifier__reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}

find_estimators = {
    'classifier__n_estimators': list(range(0,1000,50))
}

scale_pos_weight ratio:  11.690355329949238
scale_pos_weight ratio root:  3.4191161621023114


In [27]:
print('*'*43)
print('*'*15, 'MODEL TUNER', '*'*15)
print('*'*43, '\n')

*******************************************
*************** MODEL TUNER ***************
******************************************* 



In [28]:
# find the optimal value of scale pos_weight
print('*'*15, 'SCALE POS WEIGHT', '*'*15, '\n')
pipeline.set_params(**best_params).fit(X_train, y_train)
opt = GridSearchCV(pipeline, param_grid = find_scale_pos_weight, scoring = 'f1', verbose=10, cv=cv).fit(X_train, y_train)
best_scale_pos_weight=get_best_params(opt)
print(best_scale_pos_weight)

*************** SCALE POS WEIGHT *************** 

Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV 1/2; 1/5] START classifier__scale_pos_weight=1..............................
[CV 1/2; 1/5] END classifier__scale_pos_weight=1;, score=0.000 total time=   1.4s
[CV 2/2; 1/5] START classifier__scale_pos_weight=1..............................
[CV 2/2; 1/5] END classifier__scale_pos_weight=1;, score=0.007 total time=   1.4s
[CV 1/2; 2/5] START classifier__scale_pos_weight=3.4191161621023114.............
[CV 1/2; 2/5] END classifier__scale_pos_weight=3.4191161621023114;, score=0.221 total time=   1.4s
[CV 2/2; 2/5] START classifier__scale_pos_weight=3.4191161621023114.............
[CV 2/2; 2/5] END classifier__scale_pos_weight=3.4191161621023114;, score=0.227 total time=   1.5s
[CV 1/2; 3/5] START classifier__scale_pos_weight=11.690355329949238.............
[CV 1/2; 3/5] END classifier__scale_pos_weight=11.690355329949238;, score=0.254 total time=   1.4s
[CV 2/2; 3/5] START cla

In [29]:
# find the optimal value of depth and child
print('*'*15, 'LEARNING_RATE', '*'*15, '\n')
best_params = {**best_params,**best_scale_pos_weight}
pipeline.set_params(**best_params).fit(X_train, y_train)
opt = GridSearchCV(pipeline, param_grid = find_learning_rate, scoring = 'f1', verbose=10, cv=cv).fit(X_train, y_train)
learning_rate=get_best_params(opt)
print(learning_rate)

*************** LEARNING_RATE *************** 

Fitting 2 folds for each of 100 candidates, totalling 200 fits
[CV 1/2; 1/100] START classifier__learning_rate=0.0.............................
[CV 1/2; 1/100] END classifier__learning_rate=0.0;, score=0.000 total time=   1.5s
[CV 2/2; 1/100] START classifier__learning_rate=0.0.............................
[CV 2/2; 1/100] END classifier__learning_rate=0.0;, score=0.000 total time=   1.4s
[CV 1/2; 2/100] START classifier__learning_rate=0.01............................
[CV 1/2; 2/100] END classifier__learning_rate=0.01;, score=0.254 total time=   1.5s
[CV 2/2; 2/100] START classifier__learning_rate=0.01............................
[CV 2/2; 2/100] END classifier__learning_rate=0.01;, score=0.261 total time=   1.5s
[CV 1/2; 3/100] START classifier__learning_rate=0.02............................
[CV 1/2; 3/100] END classifier__learning_rate=0.02;, score=0.260 total time=   1.5s
[CV 2/2; 3/100] START classifier__learning_rate=0.02..............

[CV 2/2; 25/100] END classifier__learning_rate=0.24;, score=0.286 total time=   1.4s
[CV 1/2; 26/100] START classifier__learning_rate=0.25...........................
[CV 1/2; 26/100] END classifier__learning_rate=0.25;, score=0.243 total time=   1.4s
[CV 2/2; 26/100] START classifier__learning_rate=0.25...........................
[CV 2/2; 26/100] END classifier__learning_rate=0.25;, score=0.292 total time=   1.4s
[CV 1/2; 27/100] START classifier__learning_rate=0.26...........................
[CV 1/2; 27/100] END classifier__learning_rate=0.26;, score=0.246 total time=   1.4s
[CV 2/2; 27/100] START classifier__learning_rate=0.26...........................
[CV 2/2; 27/100] END classifier__learning_rate=0.26;, score=0.273 total time=   1.4s
[CV 1/2; 28/100] START classifier__learning_rate=0.27...........................
[CV 1/2; 28/100] END classifier__learning_rate=0.27;, score=0.249 total time=   1.4s
[CV 2/2; 28/100] START classifier__learning_rate=0.27...........................
[CV 

[CV 2/2; 50/100] END classifier__learning_rate=0.49;, score=0.232 total time=   1.4s
[CV 1/2; 51/100] START classifier__learning_rate=0.5............................
[CV 1/2; 51/100] END classifier__learning_rate=0.5;, score=0.213 total time=   1.4s
[CV 2/2; 51/100] START classifier__learning_rate=0.5............................
[CV 2/2; 51/100] END classifier__learning_rate=0.5;, score=0.227 total time=   1.5s
[CV 1/2; 52/100] START classifier__learning_rate=0.51...........................
[CV 1/2; 52/100] END classifier__learning_rate=0.51;, score=0.231 total time=   1.4s
[CV 2/2; 52/100] START classifier__learning_rate=0.51...........................
[CV 2/2; 52/100] END classifier__learning_rate=0.51;, score=0.216 total time=   1.5s
[CV 1/2; 53/100] START classifier__learning_rate=0.52...........................
[CV 1/2; 53/100] END classifier__learning_rate=0.52;, score=0.226 total time=   1.4s
[CV 2/2; 53/100] START classifier__learning_rate=0.52...........................
[CV 2/

[CV 2/2; 75/100] END classifier__learning_rate=0.74;, score=0.224 total time=   1.4s
[CV 1/2; 76/100] START classifier__learning_rate=0.75...........................
[CV 1/2; 76/100] END classifier__learning_rate=0.75;, score=0.206 total time=   1.4s
[CV 2/2; 76/100] START classifier__learning_rate=0.75...........................
[CV 2/2; 76/100] END classifier__learning_rate=0.75;, score=0.219 total time=   1.4s
[CV 1/2; 77/100] START classifier__learning_rate=0.76...........................
[CV 1/2; 77/100] END classifier__learning_rate=0.76;, score=0.220 total time=   1.4s
[CV 2/2; 77/100] START classifier__learning_rate=0.76...........................
[CV 2/2; 77/100] END classifier__learning_rate=0.76;, score=0.243 total time=   1.4s
[CV 1/2; 78/100] START classifier__learning_rate=0.77...........................
[CV 1/2; 78/100] END classifier__learning_rate=0.77;, score=0.199 total time=   1.4s
[CV 2/2; 78/100] START classifier__learning_rate=0.77...........................
[CV 

[CV 2/2; 100/100] END classifier__learning_rate=0.99;, score=0.160 total time=   1.4s
{'classifier__learning_rate': 0.25}


In [30]:
# find the optimal value of depth and child
print('*'*15, 'DEPTH AND CHILD', '*'*15, '\n')
best_params = { **best_params, **learning_rate}
pipeline.set_params(**best_params).fit(X_train, y_train)
opt = GridSearchCV(pipeline, param_grid = find_depth_and_child, scoring = 'f1', verbose=10, cv=cv).fit(X_train, y_train)
depth_and_child=get_best_params(opt)
print(depth_and_child)

*************** DEPTH AND CHILD *************** 

Fitting 2 folds for each of 81 candidates, totalling 162 fits
[CV 1/2; 1/81] START classifier__max_depth=3, classifier__min_child_weight=1....
[CV 1/2; 1/81] END classifier__max_depth=3, classifier__min_child_weight=1;, score=0.243 total time=   1.4s
[CV 2/2; 1/81] START classifier__max_depth=3, classifier__min_child_weight=1....
[CV 2/2; 1/81] END classifier__max_depth=3, classifier__min_child_weight=1;, score=0.292 total time=   1.4s
[CV 1/2; 2/81] START classifier__max_depth=3, classifier__min_child_weight=2....
[CV 1/2; 2/81] END classifier__max_depth=3, classifier__min_child_weight=2;, score=0.232 total time=   1.4s
[CV 2/2; 2/81] START classifier__max_depth=3, classifier__min_child_weight=2....
[CV 2/2; 2/81] END classifier__max_depth=3, classifier__min_child_weight=2;, score=0.284 total time=   1.4s
[CV 1/2; 3/81] START classifier__max_depth=3, classifier__min_child_weight=3....
[CV 1/2; 3/81] END classifier__max_depth=3, classif

[CV 2/2; 22/81] END classifier__max_depth=5, classifier__min_child_weight=4;, score=0.241 total time=   2.2s
[CV 1/2; 23/81] START classifier__max_depth=5, classifier__min_child_weight=5...
[CV 1/2; 23/81] END classifier__max_depth=5, classifier__min_child_weight=5;, score=0.233 total time=   2.1s
[CV 2/2; 23/81] START classifier__max_depth=5, classifier__min_child_weight=5...
[CV 2/2; 23/81] END classifier__max_depth=5, classifier__min_child_weight=5;, score=0.246 total time=   2.1s
[CV 1/2; 24/81] START classifier__max_depth=5, classifier__min_child_weight=6...
[CV 1/2; 24/81] END classifier__max_depth=5, classifier__min_child_weight=6;, score=0.242 total time=   2.1s
[CV 2/2; 24/81] START classifier__max_depth=5, classifier__min_child_weight=6...
[CV 2/2; 24/81] END classifier__max_depth=5, classifier__min_child_weight=6;, score=0.230 total time=   2.1s
[CV 1/2; 25/81] START classifier__max_depth=5, classifier__min_child_weight=7...
[CV 1/2; 25/81] END classifier__max_depth=5, class

[CV 2/2; 44/81] END classifier__max_depth=7, classifier__min_child_weight=8;, score=0.199 total time=   2.8s
[CV 1/2; 45/81] START classifier__max_depth=7, classifier__min_child_weight=9...
[CV 1/2; 45/81] END classifier__max_depth=7, classifier__min_child_weight=9;, score=0.234 total time=   2.7s
[CV 2/2; 45/81] START classifier__max_depth=7, classifier__min_child_weight=9...
[CV 2/2; 45/81] END classifier__max_depth=7, classifier__min_child_weight=9;, score=0.186 total time=   2.7s
[CV 1/2; 46/81] START classifier__max_depth=8, classifier__min_child_weight=1...
[CV 1/2; 46/81] END classifier__max_depth=8, classifier__min_child_weight=1;, score=0.206 total time=   3.2s
[CV 2/2; 46/81] START classifier__max_depth=8, classifier__min_child_weight=1...
[CV 2/2; 46/81] END classifier__max_depth=8, classifier__min_child_weight=1;, score=0.144 total time=   3.3s
[CV 1/2; 47/81] START classifier__max_depth=8, classifier__min_child_weight=2...
[CV 1/2; 47/81] END classifier__max_depth=8, class

[CV 2/2; 66/81] END classifier__max_depth=10, classifier__min_child_weight=3;, score=0.165 total time=   3.5s
[CV 1/2; 67/81] START classifier__max_depth=10, classifier__min_child_weight=4..
[CV 1/2; 67/81] END classifier__max_depth=10, classifier__min_child_weight=4;, score=0.238 total time=   3.5s
[CV 2/2; 67/81] START classifier__max_depth=10, classifier__min_child_weight=4..
[CV 2/2; 67/81] END classifier__max_depth=10, classifier__min_child_weight=4;, score=0.197 total time=   3.5s
[CV 1/2; 68/81] START classifier__max_depth=10, classifier__min_child_weight=5..
[CV 1/2; 68/81] END classifier__max_depth=10, classifier__min_child_weight=5;, score=0.232 total time=   3.4s
[CV 2/2; 68/81] START classifier__max_depth=10, classifier__min_child_weight=5..
[CV 2/2; 68/81] END classifier__max_depth=10, classifier__min_child_weight=5;, score=0.195 total time=   3.4s
[CV 1/2; 69/81] START classifier__max_depth=10, classifier__min_child_weight=6..
[CV 1/2; 69/81] END classifier__max_depth=10,

In [31]:
# find the optimal value of depth and child
print('*'*15, 'GAMMA', '*'*15, '\n')
best_params = { **best_params, **depth_and_child}
pipeline.set_params(**best_params).fit(X_train, y_train)
opt = GridSearchCV(pipeline, param_grid = find_gamma, scoring = 'f1', verbose=10, cv=cv).fit(X_train, y_train)
gamma=get_best_params(opt)
print(gamma)

*************** GAMMA *************** 

Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV 1/2; 1/5] START classifier__gamma=0.0.......................................
[CV 1/2; 1/5] END ........classifier__gamma=0.0;, score=0.243 total time=   1.5s
[CV 2/2; 1/5] START classifier__gamma=0.0.......................................
[CV 2/2; 1/5] END ........classifier__gamma=0.0;, score=0.292 total time=   1.4s
[CV 1/2; 2/5] START classifier__gamma=0.1.......................................
[CV 1/2; 2/5] END ........classifier__gamma=0.1;, score=0.243 total time=   1.4s
[CV 2/2; 2/5] START classifier__gamma=0.1.......................................
[CV 2/2; 2/5] END ........classifier__gamma=0.1;, score=0.292 total time=   1.4s
[CV 1/2; 3/5] START classifier__gamma=0.2.......................................
[CV 1/2; 3/5] END ........classifier__gamma=0.2;, score=0.243 total time=   1.4s
[CV 2/2; 3/5] START classifier__gamma=0.2.......................................
[CV 2/2; 

In [32]:
# find the optimal value of depth and child
print('*'*15, 'SUBSAMPLE AND COLSAMPLE', '*'*15, '\n')
best_params = { **best_params, **gamma}
pipeline.set_params(**best_params).fit(X_train, y_train)
opt = GridSearchCV(pipeline, param_grid = find_subsample_and_colsample, scoring = 'f1', verbose=10, cv=cv).fit(X_train, y_train)
subsample_and_colsample=get_best_params(opt)
print(subsample_and_colsample)

*************** SUBSAMPLE AND COLSAMPLE *************** 

Fitting 2 folds for each of 16 candidates, totalling 32 fits
[CV 1/2; 1/16] START classifier__colsample_bytree=0.6, classifier__subsample=0.6
[CV 1/2; 1/16] END classifier__colsample_bytree=0.6, classifier__subsample=0.6;, score=0.254 total time=   1.0s
[CV 2/2; 1/16] START classifier__colsample_bytree=0.6, classifier__subsample=0.6
[CV 2/2; 1/16] END classifier__colsample_bytree=0.6, classifier__subsample=0.6;, score=0.276 total time=   1.0s
[CV 1/2; 2/16] START classifier__colsample_bytree=0.6, classifier__subsample=0.7
[CV 1/2; 2/16] END classifier__colsample_bytree=0.6, classifier__subsample=0.7;, score=0.260 total time=   1.0s
[CV 2/2; 2/16] START classifier__colsample_bytree=0.6, classifier__subsample=0.7
[CV 2/2; 2/16] END classifier__colsample_bytree=0.6, classifier__subsample=0.7;, score=0.274 total time=   1.0s
[CV 1/2; 3/16] START classifier__colsample_bytree=0.6, classifier__subsample=0.8
[CV 1/2; 3/16] END classifie

In [33]:
# find the optimal value of depth and child
print('*'*15, 'ALPHA', '*'*15, '\n')
best_params = { **best_params, **subsample_and_colsample}
pipeline.set_params(**best_params).fit(X_train, y_train)
opt = GridSearchCV(pipeline, param_grid = find_alpha, scoring = 'f1', verbose=10, cv=cv).fit(X_train, y_train)
alpha=get_best_params(opt)
print(alpha)

*************** ALPHA *************** 

Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV 1/2; 1/5] START classifier__reg_alpha=1e-05.................................
[CV 1/2; 1/5] END ..classifier__reg_alpha=1e-05;, score=0.256 total time=   1.3s
[CV 2/2; 1/5] START classifier__reg_alpha=1e-05.................................
[CV 2/2; 1/5] END ..classifier__reg_alpha=1e-05;, score=0.280 total time=   1.3s
[CV 1/2; 2/5] START classifier__reg_alpha=0.01..................................
[CV 1/2; 2/5] END ...classifier__reg_alpha=0.01;, score=0.256 total time=   1.3s
[CV 2/2; 2/5] START classifier__reg_alpha=0.01..................................
[CV 2/2; 2/5] END ...classifier__reg_alpha=0.01;, score=0.280 total time=   1.3s
[CV 1/2; 3/5] START classifier__reg_alpha=0.1...................................
[CV 1/2; 3/5] END ....classifier__reg_alpha=0.1;, score=0.247 total time=   1.3s
[CV 2/2; 3/5] START classifier__reg_alpha=0.1...................................
[CV 2/2; 

In [34]:
# best_params['n_estimators']=xgb.best_ntree_limit

In [35]:
# find the optimal value of depth and child
print('*'*15, 'N_ESTIMATORS', '*'*15, '\n')
best_params = {**best_params, **alpha}
pipeline.set_params(**best_params).fit(X_train, y_train)
opt = GridSearchCV(pipeline, param_grid = find_estimators, scoring = 'f1', verbose=10, cv=cv).fit(X_train, y_train)
estimators=get_best_params(opt)
print(estimators)

*************** N_ESTIMATORS *************** 

Fitting 2 folds for each of 20 candidates, totalling 40 fits
[CV 1/2; 1/20] START classifier__n_estimators=0.................................
[CV 1/2; 1/20] END ..classifier__n_estimators=0;, score=0.000 total time=   0.2s
[CV 2/2; 1/20] START classifier__n_estimators=0.................................
[CV 2/2; 1/20] END ..classifier__n_estimators=0;, score=0.000 total time=   0.2s
[CV 1/2; 2/20] START classifier__n_estimators=50................................
[CV 1/2; 2/20] END .classifier__n_estimators=50;, score=0.245 total time=   0.7s
[CV 2/2; 2/20] START classifier__n_estimators=50................................
[CV 2/2; 2/20] END .classifier__n_estimators=50;, score=0.271 total time=   0.7s
[CV 1/2; 3/20] START classifier__n_estimators=100...............................
[CV 1/2; 3/20] END classifier__n_estimators=100;, score=0.256 total time=   1.3s
[CV 2/2; 3/20] START classifier__n_estimators=100...............................
[

In [36]:
n_estimators=sum(estimators.values())
find_refined_estimators = {
    'classifier__n_estimators': list(range(n_estimators-25,n_estimators+25,1))
}

In [37]:
# find the optimal value of depth and child
print('*'*15, 'N_ESTIMATORS', '*'*15, '\n')
best_params = {**best_params}
pipeline.set_params(**best_params).fit(X_train, y_train)
opt = GridSearchCV(pipeline, param_grid = find_refined_estimators, scoring = 'f1', verbose=10, cv=cv).fit(X_train, y_train)
estimators=get_best_params(opt)
print(estimators)
best_params = {**best_params, **estimators}

*************** N_ESTIMATORS *************** 

Fitting 2 folds for each of 50 candidates, totalling 100 fits
[CV 1/2; 1/50] START classifier__n_estimators=75................................
[CV 1/2; 1/50] END .classifier__n_estimators=75;, score=0.244 total time=   1.0s
[CV 2/2; 1/50] START classifier__n_estimators=75................................
[CV 2/2; 1/50] END .classifier__n_estimators=75;, score=0.282 total time=   1.0s
[CV 1/2; 2/50] START classifier__n_estimators=76................................
[CV 1/2; 2/50] END .classifier__n_estimators=76;, score=0.228 total time=   1.0s
[CV 2/2; 2/50] START classifier__n_estimators=76................................
[CV 2/2; 2/50] END .classifier__n_estimators=76;, score=0.272 total time=   1.0s
[CV 1/2; 3/50] START classifier__n_estimators=77................................
[CV 1/2; 3/50] END .classifier__n_estimators=77;, score=0.232 total time=   1.0s
[CV 2/2; 3/50] START classifier__n_estimators=77................................


[CV 1/2; 26/50] END classifier__n_estimators=100;, score=0.256 total time=   1.5s
[CV 2/2; 26/50] START classifier__n_estimators=100..............................
[CV 2/2; 26/50] END classifier__n_estimators=100;, score=0.280 total time=   1.4s
[CV 1/2; 27/50] START classifier__n_estimators=101..............................
[CV 1/2; 27/50] END classifier__n_estimators=101;, score=0.259 total time=   1.5s
[CV 2/2; 27/50] START classifier__n_estimators=101..............................
[CV 2/2; 27/50] END classifier__n_estimators=101;, score=0.282 total time=   1.5s
[CV 1/2; 28/50] START classifier__n_estimators=102..............................
[CV 1/2; 28/50] END classifier__n_estimators=102;, score=0.254 total time=   1.5s
[CV 2/2; 28/50] START classifier__n_estimators=102..............................
[CV 2/2; 28/50] END classifier__n_estimators=102;, score=0.280 total time=   1.4s
[CV 1/2; 29/50] START classifier__n_estimators=103..............................
[CV 1/2; 29/50] END cl

In [38]:
print(best_params)

{'classifier__learning_rate': 0.25, 'classifier__n_estimators': 94, 'classifier__max_depth': 3, 'classifier__subsample': 0.6, 'classifier__colsample_bytree': 0.9, 'classifier__gamma': 0.0, 'classifier__scale_pos_weight': 11.690355329949238, 'preprocessor__cat__cat_enc': OneHotEncoder(handle_unknown='ignore'), 'preprocessor__cat__imputer': SimpleImputer(strategy='most_frequent'), 'preprocessor__num__imputer': SimpleImputer(), 'preprocessor__num__scaler': StandardScaler(), 'classifier__min_child_weight': 1, 'classifier__reg_alpha': 1e-05}


In [39]:
# Fit best model
pipeline.set_params(**best_params).fit(X_train, y_train)

# Comparing Manual Tune vs Baseline

In [40]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

print('Training actual total:', y_train.sum())
print('Training predicted total:', pipeline.predict(X_train).sum())
print('Testing actual total:', y_test.sum())
print('Testing predicted total:', pipeline.predict(X_test).sum())

train=classification_report(y_train, pipeline.predict(X_train))
test=classification_report(y_test, pipeline.predict(X_test))

print(train)
print(test)

Training actual total: 591
Training predicted total: 1564
Testing actual total: 184
Testing predicted total: 551
              precision    recall  f1-score   support

           0       0.99      0.85      0.92      6909
           1       0.34      0.91      0.50       591

    accuracy                           0.86      7500
   macro avg       0.67      0.88      0.71      7500
weighted avg       0.94      0.86      0.88      7500

              precision    recall  f1-score   support

           0       0.95      0.80      0.87      2316
           1       0.16      0.47      0.24       184

    accuracy                           0.78      2500
   macro avg       0.55      0.64      0.55      2500
weighted avg       0.89      0.78      0.82      2500



In [41]:
from sklearn.metrics import confusion_matrix

TN, FP, FN, TP = confusion_matrix(list(y_train), list(pipeline.predict(X_train)), labels=[0, 1]).ravel()

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 

print(TPR)
print(TNR)

0.9052453468697124
0.851063829787234


In [42]:
from sklearn.metrics import confusion_matrix

TN, FP, FN, TP = confusion_matrix(list(y_test), list(pipeline.predict(X_test)), labels=[0, 1]).ravel()

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 

print(TPR)
print(TNR)

0.47282608695652173
0.7996545768566494


In [43]:
# Plot Hyperparams

In [44]:

# dxgb=XGBClassifier(eval_metric='auc', use_label_encoder=False)
# dxgb.fit(X_train_transformed, y_train)

# print('actual 1s:', y_train.sum())
# print('predicted 1s:', dxgb.predict(X_train_transformed).sum())

# print('actual 1s:', y_test.sum())
# print('predicted 1s:', dxgb.predict(X_test_transformed).sum())

# f1_train=classification_report(y_train, dxgb.predict(X_train_transformed))
# f1_test=classification_report(y_test, dxgb.predict(X_test_transformed))

# print(f1_train)
# print(f1_test)

In [45]:
# # scoring on test dataset
# test_df=pd.read_csv('data/application_test.csv')
# test_df_=test_df.loc[:, ~test_df.columns.isin(['SK_ID_CURR'])]
# test_df_transformed=pipe.transform(test_df_)

# submission_df=test_df[['SK_ID_CURR']]
# submission_df['TARGET']=opt.predict(test_df_transformed)
# submission_df.to_csv('submission.csv', index=False)

# Bayes Opt

In [46]:
# N_ITER=10

# estimator = XGBClassifier(eval_metric='auc', use_label_encoder=False)

# fit_params = {
#     'early_stopping_rounds': 10,
#     'eval_set':[(X_test, y_test)],
#     'verbose': False,
# }

# ratio = float(np.sum(y_train == 0)) / np.sum(y_train==1)

# search_space = {
#     'max_depth': (1, 6),
#     'n_estimators': (50, 500),
#     'min_child_weight': (1, 100),
#     'gamma': [0.5, 1, 1.5, 2, 5],
#     'subsample': [0.6, 0.8, 1.0],
#     'colsample_bytree': [0.6, 0.8, 1.0],
#     'scale_pos_weight' : (0, ratio),
# }

# opt = BayesSearchCV(
#     estimator=estimator,
#     search_spaces=search_space,
#     fit_params=fit_params,
#     cv=cv,
#     scoring="roc_auc",
#     random_state=42,
#     n_iter=N_ITER,
#     verbose=1,
#     return_train_score=True,
# )

# opt.fit(X_train_transformed, y_train)