In [1]:
import pandas as pd
import numpy as np

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
from sklearn.model_selection import StratifiedKFold
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval

from sklearn import metrics

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, \
f1_score

from ax.service.ax_client import AxClient
from ax.utils.notebook.plotting import render, init_notebook_plotting

from tools import *

# Data

In [2]:
# Load the raw train and test CSVs; the quote date column is parsed as a datetime.
date_columns = ['Original_Quote_Date']
train = pd.read_csv('data/train.csv', parse_dates=date_columns)
test = pd.read_csv('data/test.csv', parse_dates=date_columns)

In [3]:
# Split the first 5,000 rows of `train` into a training and a validation subset
# (training_validation_subset comes from the project's `tools` module).
train_sample = train.iloc[:5000]
training_df, validation_df = training_validation_subset(train_sample)

Training dataset rows:	 3500
Validation dataset rows:	 1500


In [4]:
# Null accuracy: the accuracy obtained by predicting class 0 for every row.
y_train = training_df['QuoteConversion_Flag']
lst = [0 for _ in range(len(y_train))]
null_accuracy = accuracy_score(y_train, pd.Series(lst))
print(f'Null Accuracy Score: {null_accuracy}')

Null Accuracy Score: 0.8165714285714286


# Feature Engineering

In [5]:
def mean_encoder(training_df, validation_df, test_df, col, response):
    """Mean (target) encode a categorical column on all three frames, in place.

    The per-category mean of ``response`` is learned from ``training_df`` only
    and written to a new ``<col>_ME`` column on every frame.

    Parameters
    ----------
    training_df : pandas.DataFrame
        Frame the encoding is learned from; must contain ``col`` and ``response``.
    validation_df, test_df : pandas.DataFrame
        Frames the learned encoding is applied to; must contain ``col``.
    col : str
        Name of the categorical column to encode.
    response : str
        Name of the target column in ``training_df``.

    Returns
    -------
    None
        The three frames are modified in place.

    Raises
    ------
    IndexError
        If the training encoding has no mode (e.g. ``col`` is entirely missing).

    Notes
    -----
    Missing values, and categories that never occur in ``training_df``, are
    imputed with the mode of the *training* encoding, so validation and test
    rows receive exactly the transformation learned from the training data.
    """
    encoded_col = col + '_ME'

    # Category -> mean response, learned from the training frame only.
    mean_encoding = training_df.groupby(col)[response].mean().to_dict()

    # ``map`` turns anything absent from the dictionary into NaN, whereas
    # ``replace`` would leave unseen categories behind as raw (string) values.
    training_df[encoded_col] = training_df[col].map(mean_encoding).astype(float)
    fill_value = training_df[encoded_col].mode()[0]
    training_df[encoded_col] = training_df[encoded_col].fillna(fill_value)

    # Apply the training encoding and the training fill value to the other frames.
    for frame in (validation_df, test_df):
        frame[encoded_col] = frame[col].map(mean_encoding).astype(float).fillna(fill_value)

In [6]:
def response_outlier_capping(df, variable, multiplier):
    """Cap and collar ``df[variable]`` at its IQR fences, in place.

    Values at or below ``Q1 - multiplier * IQR`` are set to that lower fence
    and values at or above ``Q3 + multiplier * IQR`` are set to that upper
    fence. The same ``df`` object is returned with the column overwritten.
    """
    lower_quartile, upper_quartile = np.percentile(df[variable], [25, 75])
    spread = upper_quartile - lower_quartile
    floor = lower_quartile - (spread * multiplier)
    ceiling = upper_quartile + (spread * multiplier)

    # Collar first, then cap, each step overwriting the column.
    collared = np.where(df[variable] <= floor, floor, df[variable])
    df[variable] = collared
    capped = np.where(df[variable] >= ceiling, ceiling, df[variable])
    df[variable] = capped

    return df

# numeric_types = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'uint8']

In [7]:
# Mean-encode every categorical column except the quote date, printing the
# distinct encoded values of each column as a quick sanity check.
cat_cols = return_categoric_columns(training_df)
cat_cols.remove('Original_Quote_Date')

for column in cat_cols:
    encoded_name = column + '_ME'

    print(encoded_name)
    mean_encoder(training_df, validation_df, test, column, 'QuoteConversion_Flag')
    print(training_df[encoded_name].unique())
    print(column, 'transformed...')
    print()

PersonalField18_ME
[0.15162455 0.19047619 0.23574144 0.17073171 0.27835052 0.22222222
 0.1        0.22972973 0.25358852 0.18137255 0.2244898  0.23076923
 0.25       0.11764706 0.13157895 0.31914894 0.2        0.2173913
 0.29411765 0.375      0.09090909 0.09375    0.2962963  0.14285714
 0.11111111 0.17647059 0.0625     0.         0.16666667 0.15
 0.14705882 0.21428571 0.5        0.27272727 0.28571429 0.33333333]
PersonalField18 transformed...

Field10_ME
[0.16466552 0.32397959 0.22543353 0.08485857 0.28219697 0.05925926
 0.11560694 0.10344828]
Field10 transformed...

PersonalField19_ME
[0.15162455 0.24020619 0.         0.20555556 0.10344828 0.125
 0.11940299 0.3        0.2        0.07692308 0.1        0.30769231
 0.22727273 0.07142857 0.4        0.18181818 0.16129032 0.16
 0.13043478 0.22222222 0.09090909 0.25       0.17647059 0.05555556
 0.66666667 0.5        0.33333333]
PersonalField19 transformed...

PropertyField38_ME
[0.18552169 0.12857143]
PropertyField38 transformed...

CoverageF

In [8]:
# dtypes treated as numeric when selecting model features
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# names of the numeric columns in the raw `train` frame
# NOTE(review): the selection is made on `train`, while the `<col>_ME` columns are
# added to training_df / validation_df / test, so the mean-encoded features do not
# appear to be part of this list (and hence of the models) - confirm this is intended.
train_numeric_predictors = train.select_dtypes(include=numerics).columns

In [9]:
# X_train=training_df.drop(['QuoteNumber','Original_Quote_Date','QuoteConversion_Flag'], axis=1)
# Y_train=training_df['QuoteConversion_Flag']
# X_valid=validation_df.drop(['QuoteNumber','Original_Quote_Date','QuoteConversion_Flag'], axis=1)
# Y_valid=validation_df['QuoteConversion_Flag']

In [10]:
# Feature matrices: the numeric columns minus the quote id and the target.
non_feature_cols = ['QuoteNumber', 'QuoteConversion_Flag']
target_col = 'QuoteConversion_Flag'

X_train = training_df[train_numeric_predictors].drop(columns=non_feature_cols)
Y_train = training_df[target_col]
X_valid = validation_df[train_numeric_predictors].drop(columns=non_feature_cols)
Y_valid = validation_df[target_col]

In [11]:
# Missing values: impute both frames with the per-column medians of the
# *training* features, so the validation data receives the preprocessing
# learned from the training data rather than statistics of its own.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_valid = X_valid.fillna(train_medians)

# Logistic Regression

In [12]:
# Standardise the features: the scaler is fitted on the training data only and
# then re-used to transform the validation data.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(X_train)
valid_scaled = scaler.transform(X_valid)

# Fit the logistic regression on the scaled training features.
logistic_model = LogisticRegression(max_iter=5000)
logistic_model.fit(train_scaled, Y_train)

# Predict on the validation set.
predictions = logistic_model.predict(valid_scaled)

# Report validation accuracy and the per-class F1 scores.
accuracy = metrics.accuracy_score(Y_valid, predictions)
print("Validation Accuracy : %s" % "{0:.3%}".format(accuracy))

f1 = f1_score(Y_valid, predictions, average=None)
print(f'Validation F1 Score: {f1}')

Validation Accuracy : 85.600%
Validation F1 Score: [0.91212368 0.60147601]


In [13]:
# Store the predicted probability of the second class (column 1 of
# predict_proba) on both frames, for the stacking step.
for frame, features in ((training_df, train_scaled), (validation_df, valid_scaled)):
    frame['logistic_prediction'] = logistic_model.predict_proba(features)[:, 1]

# Elastic Net

In [14]:
# Standardise the features: fit the scaler on the training data, re-use it
# for the validation data.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(X_train)
valid_scaled = scaler.transform(X_valid)

# Logistic regression with an elastic-net penalty (equal L1/L2 mix, saga solver).
elastic_net_model = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.5,
    max_iter=5000,
)
elastic_net_model.fit(train_scaled, Y_train)

# Predict on the validation set.
predictions = elastic_net_model.predict(valid_scaled)

# Report validation accuracy and the per-class F1 scores.
accuracy = metrics.accuracy_score(Y_valid, predictions)
print("Validation Accuracy : %s" % "{0:.3%}".format(accuracy))

f1 = f1_score(Y_valid, predictions, average=None)
print(f'Validation F1 Score: {f1}')

Validation Accuracy : 85.867%
Validation F1 Score: [0.91403082 0.60299625]


In [15]:
# Store the elastic-net probability of the second class on both frames.
for frame, features in ((training_df, train_scaled), (validation_df, valid_scaled)):
    frame['elastic_prediction'] = elastic_net_model.predict_proba(features)[:, 1]

# SVM

In [16]:
# Standardise the features: fit the scaler on the training data, re-use it
# for the validation data.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(X_train)
valid_scaled = scaler.transform(X_valid)

# Base estimator; probability=True is what makes predict_proba available for
# the stacking step. The kernel and C are set by the grid below.
svc = SVC(probability=True)

# Deliberately small grid (linear kernel only). A wider grid kept for reference:
#   {'C':[1, 10, 100, 1000], 'kernel':['linear']},
#   {'C':[1, 10, 100, 1000], 'kernel':['rbf'], 'gamma':[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]},
#   {'C':[1, 10, 100, 1000], 'kernel':['poly'], 'degree': [2,3,4] ,'gamma':[0.01,0.02,0.03,0.04,0.05]}
parameters = [{'C': [1, 10], 'kernel': ['linear']}]

# 5-fold cross-validated grid search scored on accuracy.
grid_search = GridSearchCV(
    estimator=svc,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    verbose=0,
)
grid_search.fit(train_scaled, Y_train)

# Predict on the validation set with the search's fitted estimator.
predictions = grid_search.predict(valid_scaled)

# Report validation accuracy and the per-class F1 scores.
accuracy = metrics.accuracy_score(Y_valid, predictions)
print("Validation Accuracy : %s" % "{0:.3%}".format(accuracy))

f1 = f1_score(Y_valid, predictions, average=None)
print(f'Validation F1 Score: {f1}')

Validation Accuracy : 84.867%
Validation F1 Score: [0.90761091 0.58195212]


In [18]:
# Store the SVM probability of the second class on both frames.
for frame, features in ((training_df, train_scaled), (validation_df, valid_scaled)):
    frame['svm_prediction'] = grid_search.predict_proba(features)[:, 1]

# Decision Tree

In [19]:
# Decision tree with default settings, fitted on the unscaled features.
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, Y_train)

# Predict on the validation set.
predictions = tree_model.predict(X_valid)

# Report validation accuracy and the per-class F1 scores.
accuracy = metrics.accuracy_score(Y_valid, predictions)
print("Validation Accuracy : %s" % "{0:.3%}".format(accuracy))

f1 = f1_score(Y_valid, predictions, average=None)
print(f'Validation F1 Score: {f1}')

Validation Accuracy : 80.867%
Validation F1 Score: [0.88096226 0.51273345]


In [None]:
# Store the decision-tree probability of the second class on both frames.
for frame, features in ((training_df, X_train), (validation_df, X_valid)):
    frame['tree_prediction'] = tree_model.predict_proba(features)[:, 1]

# Random Forest

In [None]:
# Random forest on all features: 100 trees, depth capped at 7, at least 25
# samples to split a node, 2 candidate features per split.
rf_settings = dict(n_estimators=100, min_samples_split=25, max_depth=7, max_features=2)
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train, Y_train)

# Predict on the validation set.
predictions = rf_model.predict(X_valid)

# Report validation accuracy and the per-class F1 scores.
accuracy = metrics.accuracy_score(Y_valid, predictions)
print("Validation Accuracy : %s" % "{0:.3%}".format(accuracy))

f1 = f1_score(Y_valid, predictions, average=None)
print(f'Validation F1 Score: {f1}')

Using all the features improves the prediction accuracy and the cross-validation score is great.

An advantage of Random Forest is that it returns feature importances, which can be used to select features. So let's select the top 10 features and use them as predictors.

In [None]:
# Feature importances of the full random forest, most important first.
feature_names = list(X_train.columns)
featimp = pd.Series(rf_model.feature_importances_, index=feature_names)
featimp = featimp.sort_values(ascending=False)

In [None]:
# Refit the same random forest configuration on the 10 most important features.
predictor_var = featimp.index[:10].tolist()
rf2_model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=25,
    max_depth=7,
    max_features=2,
)
rf2_model.fit(X_train[predictor_var], Y_train)

# Predict on the validation set, restricted to the same features.
predictions = rf2_model.predict(X_valid[predictor_var])

# Report validation accuracy and the per-class F1 scores.
accuracy = metrics.accuracy_score(Y_valid, predictions)
print("Validation Accuracy : %s" % "{0:.3%}".format(accuracy))

f1 = f1_score(Y_valid, predictions, average=None)
print(f'Validation F1 Score: {f1}')

In [None]:
# Store the random-forest probability of the second class on both frames.
# The all-features model (rf_model) is used here, not the top-10 refit.
for frame, features in ((training_df, X_train), (validation_df, X_valid)):
    frame['rf_prediction'] = rf_model.predict_proba(features)[:, 1]

# XGB (Hyperopt)

In [None]:
# Ratio of negative (0) to positive (1) training rows, rounded to a whole
# number; used as the upper end of the scale_pos_weight search range, which
# re-weights the positive class for imbalanced data.
target_flags = training_df['QuoteConversion_Flag']
negative_count = np.sum(target_flags == 0)
positive_count = np.sum(target_flags == 1)
ratio = float(negative_count) / positive_count
scale_pos_weight_val = round(ratio, 0)

In [None]:
# Hyperopt search space for the XGBoost classifier.
# hp.choice samples one entry of the given array; hp.quniform samples a
# uniform value and rounds it to a multiple of the last argument.
space = {
        'max_depth':hp.choice('max_depth', np.arange(2, 25, 1, dtype=int)),
        'n_estimators':hp.choice('n_estimators', np.arange(50, 12000, 10, dtype=int)),
        'colsample_bytree':hp.quniform('colsample_bytree', 0.4, 0.9, 0.1),
        'min_child_weight':hp.choice('min_child_weight', np.arange(1, 12, 1, dtype=int)),
        # Candidates run from 1 up to and including the rounded class ratio; the
        # max(..., 1) guard keeps the candidate list non-empty for a ratio below 1.
        'scale_pos_weight':hp.choice('scale_pos_weight',
                                     np.arange(1, int(max(scale_pos_weight_val, 1)) + 1, 1, dtype=int)),
        'lambda':hp.choice('lambda', np.arange(1, 5, 1, dtype=int)),
        'subsample':hp.quniform('subsample', 0.6, 1.0, 0.1),
        'gamma':hp.quniform('gamma', 0, 10, 1),
        'objective':'binary:logistic',
        # Lower bound and step of 0.05 so the rounded learning rate can never be
        # 0.0 (a zero learning rate would make every boosting round a no-op).
        'eta':hp.quniform('eta', 0.05, 0.5, 0.05),
        'eval_metric': 'auc',
    }

def score(params):
    """Hyperopt objective: fit an XGBoost classifier and return its validation loss.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``XGBClassifier``, as sampled from the search space.

    Returns
    -------
    dict
        ``{'loss': <negative validation ROC-AUC>, 'status': STATUS_OK}``. The AUC
        is negated because hyperopt minimises the loss.

    Notes
    -----
    Reads the module-level ``X_train``, ``Y_train``, ``X_valid`` and ``Y_valid``.
    """
    model = XGBClassifier(**params)

    # NOTE(review): recent XGBoost releases expect early_stopping_rounds on the
    # constructor rather than on fit() - confirm against the installed version.
    model.fit(X_train,
              Y_train,
              eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
              verbose=False,
              early_stopping_rounds=25)

    # ROC-AUC measures ranking quality, so it is computed from the predicted
    # probabilities of the positive class rather than from hard 0/1 labels.
    train_proba = model.predict_proba(X_train)[:, 1]
    valid_proba = model.predict_proba(X_valid)[:, 1]
    train_score = -1 * metrics.roc_auc_score(Y_train, train_proba)
    test_score = -1 * metrics.roc_auc_score(Y_valid, valid_proba)

    # The printed values are the (positive) AUCs.
    print('Training loss:', round(abs(train_score),4), 'Test loss:', round(abs(test_score),4))
    return {'loss': test_score, 'status': STATUS_OK}

def optimize(trials, space, max_evals=25):
    """Run the TPE search over ``space`` and return the best assignment found.

    Parameters
    ----------
    trials : hyperopt.Trials or None
        Trials object that records every evaluation; it is handed to ``fmin``
        so the caller can inspect the search history afterwards.
    space : dict
        Hyperopt search space passed to the ``score`` objective.
    max_evals : int, optional
        Number of evaluations to run (default 25; raise it for a real search).

    Returns
    -------
    dict
        The best assignment as returned by ``fmin`` (decode it with ``space_eval``).
    """
    best = fmin(score, space, algo=tpe.suggest, max_evals=max_evals, trials=trials)
    return best

In [None]:
# Optimise: run the hyperopt search over `space`.
# `trials` is created here and passed to optimize(); `best_params` holds the
# value fmin returned, which is decoded with space_eval before use.
trials = Trials()
best_params = optimize(trials, space)

In [None]:
# Decode the assignment returned by fmin into actual parameter values.
parameters = space_eval(space, best_params)

# Build the final model from the complete tuned configuration, exactly as the
# objective function does, so that no tuned value (e.g. 'lambda') silently
# falls back to its default.
xgb_model = XGBClassifier(**parameters)

# The evaluation metric ('auc') already comes from `parameters`; it is not
# repeated on fit() because XGBoost rejects a metric given in both places.
# NOTE(review): recent XGBoost releases expect early_stopping_rounds on the
# constructor rather than on fit() - confirm against the installed version.
xgb_model.fit(
    X_train,
    Y_train,
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
    verbose=True,
    early_stopping_rounds=10)

In [None]:
# Predict on the validation set with the hyperopt-tuned model.
predictions = xgb_model.predict(X_valid)

# Report validation accuracy and the per-class F1 scores.
accuracy = metrics.accuracy_score(Y_valid, predictions)
print("Validation Accuracy : %s" % "{0:.3%}".format(accuracy))

f1 = f1_score(Y_valid, predictions, average=None)
print(f'Validation F1 Score: {f1}')

In [None]:
# Store the hyperopt-tuned XGBoost probability of the second class on both frames.
for frame, features in ((training_df, X_train), (validation_df, X_valid)):
    frame['xgbho_prediction'] = xgb_model.predict_proba(features)[:, 1]

# XGB (Skopt)

In [None]:
# SETTINGS - CHANGE THESE TO GET SOMETHING MEANINGFUL
# Number of parameter settings the Bayesian search samples (passed to
# BayesSearchCV as n_iter); the trailing 1000 is the larger alternative value.
ITERATIONS = 25 # 1000

In [None]:
# Bayesian hyperparameter search (scikit-optimize) over an XGBoost classifier.
xgb_estimator = xgb.XGBClassifier(
    n_jobs=1,
    objective='binary:logistic',
    eval_metric='auc',
    tree_method='approx',
)

# Each entry is (low, high) or (low, high, prior).
# Not searched, left at their defaults: max_delta_step, colsample_bytree,
# colsample_bylevel, reg_lambda.
xgb_search_spaces = {
    'learning_rate': (0.001, 0.3, 'uniform'),
    'min_child_weight': (0, 10),
    'max_depth': (2, 30),
    'subsample': (0.5, 1.0, 'uniform'),
    'reg_alpha': (1e-5, 1.0, 'uniform'),
    'gamma': (1e-9, 0.5, 'uniform'),
    'n_estimators': (2, 100),
    'scale_pos_weight': (1, 500, 'uniform'),
}

# Shuffled, stratified 3-fold split with a fixed seed.
cv_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

bayes_cv_tuner = BayesSearchCV(
    estimator=xgb_estimator,
    search_spaces=xgb_search_spaces,
    scoring='roc_auc',
    cv=cv_folds,
    n_jobs=3,
    n_iter=ITERATIONS,
    verbose=0,
    refit=True,
    random_state=45,
)

def status_print(optim_result):
    """Status callback during the Bayesian hyperparameter search.

    Prints how many models have been evaluated so far together with the best
    ROC-AUC and the best parameters found. All values are read from the
    module-level ``bayes_cv_tuner``.

    Parameters
    ----------
    optim_result : object
        The argument supplied to the callback by the search; it is not used.
    """
    # Every model evaluated so far, one row per model.
    # (To persist them: all_models.to_csv("<estimator name>_cv_results.csv").)
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)

    print('Model #{}\nBest ROC-AUC: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner.best_score_, 4),
        bayes_cv_tuner.best_params_
    ))

In [None]:
# Fit the model: run the Bayesian search on the training data, calling
# status_print as the progress callback. `xgb_skopt` is the value returned by
# fit() and is used for prediction in the following cells.
xgb_skopt = bayes_cv_tuner.fit(X_train, Y_train, callback=status_print)

In [None]:
# Predict on the validation set with the skopt-tuned model.
predictions = xgb_skopt.predict(X_valid)

# Report validation accuracy and the per-class F1 scores.
accuracy = metrics.accuracy_score(Y_valid, predictions)
print("Validation Accuracy : %s" % "{0:.3%}".format(accuracy))

f1 = f1_score(Y_valid, predictions, average=None)
print(f'Validation F1 Score: {f1}')

In [None]:
# Store the skopt-tuned XGBoost probability of the second class on both frames.
for frame, features in ((training_df, X_train), (validation_df, X_valid)):
    frame['xgbsko_prediction'] = xgb_skopt.predict_proba(features)[:, 1]

# XGB (Ax)

In [None]:
# Wrap the training features and labels in an XGBoost DMatrix; the
# cross-validation scorer below reads this module-level object.
dtrain = xgb.DMatrix(X_train, label=Y_train)

In [None]:
def xgboost_cv_score_ax(parameterization, weight=None):
    """Score one Ax parameterization by k-fold cross-validated AUC.

    Parameters
    ----------
    parameterization : dict
        Parameter name -> value for the trial. Names are looked up under both
        the native XGBoost spelling and its alias (e.g. ``lambda`` /
        ``reg_lambda``), so the names used in the Ax search space reach
        ``xgb.cv``.
    weight : optional
        Unused; kept for interface compatibility.

    Returns
    -------
    tuple of float
        ``(mean, sem)``: the mean test AUC of the final boosting round and an
        estimate of its standard error (the fold standard deviation divided by
        the square root of the number of folds).

    Notes
    -----
    Reads the module-level ``dtrain`` DMatrix.
    """
    NFOLD = 7
    NUM_BOOST_ROUND = 500

    # Booster parameters that are forwarded when the trial provides them.
    p_names = ['learning_rate', 'max_depth', 'subsample', 'gamma', 'min_split_loss',
               'max_delta_step', 'min_child_weight', 'colsample_bytree',
               'colsample_bylevel', 'colsample_bynode', 'lambda', 'reg_lambda',
               'alpha', 'reg_alpha', 'scale_pos_weight']

    params = {'objective': 'binary:logistic'}
    for p in p_names:
        value = parameterization.get(p)
        # Skip names the trial does not define instead of passing None along.
        if value is not None:
            params[p] = value

    # K-Fold cross validation score.
    cv_results = xgb.cv(dtrain=dtrain,
                        params=params,
                        nfold=NFOLD,
                        num_boost_round=NUM_BOOST_ROUND,
                        metrics="auc",
                        as_pandas=True,
                        seed=987)

    final_round = cv_results.iloc[-1]
    mean = float(final_round['test-auc-mean'])
    # Ax expects the standard error of the mean, not the spread across folds.
    sem = float(final_round['test-auc-std']) / np.sqrt(NFOLD)

    return mean, sem

In [None]:
# Ax search space for the XGBoost tree-booster parameters, documented at
# https://xgboost.readthedocs.io/en/latest/parameter.html#parameters-for-tree-booster

def _range_parameter(name, lower, upper, value_type):
    """Return an Ax ``range`` parameter spec searched between the two bounds."""
    return {"name": name, "type": "range", "bounds": [lower, upper], "value_type": value_type}


def _fixed_parameter(name, value, value_type):
    """Return an Ax ``fixed`` parameter spec held at a single value."""
    return {"name": name, "type": "fixed", "value": value, "value_type": value_type}


parameters = [
    _range_parameter("max_depth", 1, 18, "int"),
    _range_parameter("learning_rate", 0.000001, 1, "float"),
    _fixed_parameter("gamma", 0.0, "float"),
    _fixed_parameter("max_delta_step", 0.0, "float"),
    _range_parameter("min_child_weight", 0, 500, "int"),
    _range_parameter("subsample", 0.5, 1, "float"),
    _range_parameter("colsample_bytree", 0.1, 1, "float"),
    _range_parameter("colsample_bylevel", 0.1, 1, "float"),
    _range_parameter("colsample_bynode", 0.1, 1, "float"),
    _range_parameter("reg_alpha", 0, 2000, "float"),
    _range_parameter("reg_lambda", 0, 300, "float"),
    _range_parameter("scale_pos_weight", 1, 500, "float"),
]

In [None]:
# set up Ax plotting for the notebook
init_notebook_plotting()

# client object that proposes trials and records their results
ax_client = AxClient()

# create the experiment over the `parameters` search space.
ax_client.create_experiment(
    name="xgboost_experiment",
    parameters=parameters,
    objective_name='xgboost_cv',
    minimize=False) # False because the objective is an AUC, where higher is better

# The result key must be the experiment's objective name, "xgboost_cv".
def evaluate(parameters):
    """Evaluate one trial and return its result keyed by the objective name."""
    cv_outcome = xgboost_cv_score_ax(parameters)
    return {"xgboost_cv": cv_outcome}

In [None]:
# Number of trials to run.
# Uses Gaussian Processes with Expected Improvement to do the trials. This seems to be the usual way people do hyperparameter tuning.
# Can use other models/strategies, but won't be covering it here.

for _ in range(50):
    # Keep the trial's values in their own name so the module-level `parameters`
    # search space is not overwritten (the experiment-creation cell needs it on re-run).
    trial_parameters, trial_index = ax_client.get_next_trial()

    print(trial_parameters, trial_index)

    # Local evaluation here can be replaced with deployment to external system.
    ax_client.complete_trial(trial_index=trial_index, raw_data=evaluate(trial_parameters))

In [None]:
# Show every trial recorded so far, ordered by trial index.
trials_frame = ax_client.get_trials_data_frame()
trials_frame.sort_values('trial_index')

In [None]:
# best parameterization found by Ax, plus the values reported for it
best_parameters, values = ax_client.get_best_parameters()

# the best set of parameters
print(best_parameters)

# the best score achieved: `values` unpacks into (means, covariances)
means, covariances = values
print(means)

In [None]:
# plot the optimisation trace of the experiment
render(ax_client.get_optimization_trace()) # objective_optimum is optional

In [None]:
# contour plot of the xgboost_cv metric over two of the tuned parameters
# (learning_rate on the x axis, max_depth on the y axis).

render(ax_client.get_contour_plot(param_x="learning_rate", param_y="max_depth", metric_name="xgboost_cv"))

In [None]:
# Build the final model from the best Ax parameterization.
tuned_names = ('max_depth', 'learning_rate', 'min_child_weight', 'subsample',
               'colsample_bytree', 'colsample_bylevel', 'colsample_bynode',
               'reg_alpha', 'reg_lambda', 'gamma', 'max_delta_step',
               'scale_pos_weight')
tuned_values = {name: best_parameters[name] for name in tuned_names}

# The scikit-learn wrapper takes the number of boosting rounds as n_estimators
# (num_boost_round belongs to the native xgb.train / xgb.cv API); early
# stopping below ends training well before the 5000-round ceiling when the
# validation AUC stops improving.
xgbax_model = XGBClassifier(objective='binary:logistic',
                            n_estimators=5000,
                            **tuned_values)

# NOTE(review): recent XGBoost releases expect eval_metric and
# early_stopping_rounds on the constructor rather than on fit() - confirm
# against the installed version.
xgbax_model.fit(
    X_train,
    Y_train,
    eval_metric="auc",
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
    verbose=True,
    early_stopping_rounds=25)

In [None]:
# Predict on the validation set with the Ax-tuned model.
predictions = xgbax_model.predict(X_valid)

# Report validation accuracy and the per-class F1 scores.
accuracy = metrics.accuracy_score(Y_valid, predictions)
print("Validation Accuracy : %s" % "{0:.3%}".format(accuracy))

f1 = f1_score(Y_valid, predictions, average=None)
print(f'Validation F1 Score: {f1}')

In [None]:
# Store the Ax-tuned XGBoost probability of the second class on both frames.
for frame, features in ((training_df, X_train), (validation_df, X_valid)):
    frame['xgbax_prediction'] = xgbax_model.predict_proba(features)[:, 1]

In [None]:
# # save results to json file.
# ax_client.save_to_json_file()

# # restore the client from json file. Handy if you want to do more trials or if your computer crashed in the middle of the trials.
# restored_ax_client = AxClient.load_from_json_file() 

# Stacking

In [None]:
# Columns holding each model's predicted probability; these are the inputs to
# the stacked prediction.
cols = [
    'logistic_prediction', 'elastic_prediction', 'svm_prediction',
    'tree_prediction', 'rf_prediction',
    'xgbho_prediction', 'xgbsko_prediction', 'xgbax_prediction',
]

# Save the training-set predictions of every model.
training_df.to_csv('data/all_predictions.csv', columns=cols, index=False)
print('predictions saved...')

# Stacking

In [None]:
# Stacked prediction: the plain row-wise mean of all model probabilities.
for frame in (training_df, validation_df):
    frame['Prediction'] = frame[cols].mean(axis=1)

# Optimal Threshold

In [None]:
# Find the cut-off on the training predictions (optimal_cutoff comes from the
# project's `tools` module; its first element is used as the threshold).
# NOTE(review): the cut-off is derived from in-sample predictions, which may be
# optimistic for models that fit the training data closely - confirm intended.
val = optimal_cutoff(training_df['QuoteConversion_Flag'], training_df['Prediction'])
cutoff = val[0]

# Apply the cut-off: predictions at or below it become 0, the rest 1.
at_or_below_cutoff = validation_df['Prediction'] <= cutoff
validation_df['Prediction_Outcome'] = np.where(at_or_below_cutoff, 0, 1)

# Report validation accuracy and the per-class F1 scores of the stacked outcome.
accuracy = metrics.accuracy_score(Y_valid, validation_df['Prediction_Outcome'])
print("Validation Accuracy : %s" % "{0:.3%}".format(accuracy))

f1 = f1_score(Y_valid, validation_df['Prediction_Outcome'], average=None)
print(f'Validation F1 Score: {f1}')

# Validation

In [None]:
# Test features: the same columns, in the same order, as the training features.
feature_columns = list(X_train.columns)
X_test = test[feature_columns]

In [None]:
# Impute missing test values with the per-column medians of the *training*
# features, so the test data gets the preprocessing learned from training
# data. Building a new frame (rather than assigning column by column into a
# slice of `test`) also avoids pandas' SettingWithCopyWarning.
X_test = X_test.fillna(X_train.median())

In [None]:
# Score the test set with every fitted model, storing the probability of the
# second class. The linear models and the SVM take the scaled features, the
# tree-based models the raw ones.
test_scaled = scaler.transform(X_test)

scaled_feature_models = (
    ('logistic_prediction', logistic_model),
    ('elastic_prediction', elastic_net_model),
    ('svm_prediction', grid_search),
)
raw_feature_models = (
    ('tree_prediction', tree_model),
    ('rf_prediction', rf_model),
    ('xgbho_prediction', xgb_model),
    ('xgbsko_prediction', xgb_skopt),
    ('xgbax_prediction', xgbax_model),
)

for column, model in scaled_feature_models:
    test[column] = model.predict_proba(test_scaled)[:, 1]

for column, model in raw_feature_models:
    test[column] = model.predict_proba(X_test)[:, 1]

In [None]:
# Stacked test prediction: the row-wise mean of all model probabilities.
test['Prediction'] = test[cols].mean(axis=1)

# Apply the cut-off: predictions at or below it become 0, the rest 1.
cutoff = val[0]
test['QuoteConversion_Flag'] = np.where(test['Prediction'] <= cutoff, 0, 1)

In [None]:
# output
# Write the QuoteConversion_Flag produced by the stacked, thresholded
# prediction. It is deliberately not replaced by a single model's prediction
# here, which would discard the stacking and cut-off steps.
test[['QuoteNumber', 'QuoteConversion_Flag']].to_csv('data/predictions.csv', index=False)