Useful links: https://www.kaggle.com/buddhiniw/breast-cancer-prediction

In [1]:
import pandas as pd
import numpy as np

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from skopt import BayesSearchCV
from sklearn.model_selection import StratifiedKFold
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval

from sklearn import metrics

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, \
f1_score

from tools import *

# Data

In [2]:
# read data
train = pd.read_csv('train.csv',parse_dates=['Original_Quote_Date'])
test = pd.read_csv('test.csv',parse_dates=['Original_Quote_Date'])

In [3]:
# create training and test subset
training_df, validation_df=training_validation_subset(train)

Training dataset rows:	 182527
Validation dataset rows:	 78226


In [4]:
# Null Accuracy Score
y_train=training_df['QuoteConversion_Flag']
lst = [0] * len(y_train)
print(f'Null Accuracy Score: {accuracy_score(y_train, pd.Series(lst))}')

Null Accuracy Score: 0.8121866901883009


# Feature Engineering

In [5]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# numeric and categoric columns in train
train_numeric_predictors = train.select_dtypes(include=numerics).columns

In [6]:
# X_train=training_df.drop(['QuoteNumber','Original_Quote_Date','QuoteConversion_Flag'], axis=1)
# Y_train=training_df['QuoteConversion_Flag']
# X_valid=validation_df.drop(['QuoteNumber','Original_Quote_Date','QuoteConversion_Flag'], axis=1)
# Y_valid=validation_df['QuoteConversion_Flag']

In [7]:
X_train=training_df[train_numeric_predictors].drop(['QuoteNumber','QuoteConversion_Flag'], axis=1)
Y_train=training_df['QuoteConversion_Flag']
X_valid=validation_df[train_numeric_predictors].drop(['QuoteNumber','QuoteConversion_Flag'], axis=1)
Y_valid=validation_df['QuoteConversion_Flag']

In [8]:
# missing values
for i in list(X_train.columns):
    X_train[i]=X_train[i].fillna(X_train[i].median())
    
for i in list(X_valid.columns):
    X_valid[i]=X_valid[i].fillna(X_valid[i].median())

# Logistic Regression

In [9]:
logistic_model=LogisticRegression(max_iter=5000)

# define standard scaler
scaler = StandardScaler()

# transform train data
train_scaled = scaler.fit_transform(X_train)

#Fit the model:
logistic_model.fit(train_scaled,Y_train)

# transform valid
valid_scaled=scaler.transform(X_valid)

#Make predictions on training set:
predictions = logistic_model.predict(valid_scaled)

#Print accuracy
accuracy = metrics.accuracy_score(predictions,Y_valid)
print("Validation Accuracy : %s" % "{0:.3%}".format(accuracy))

f1=f1_score(Y_valid, predictions, average=None)
print(f'Validation F1 Score: {f1}')

Validation Accuracy : 87.867%
Validation F1 Score: [0.92760102 0.62573445]


# Elastic Net

In [10]:
elastic_net_model = LogisticRegression(penalty = 'elasticnet', solver = 'saga', l1_ratio = 0.5, max_iter=5000)

# define standard scaler
scaler = StandardScaler()

# transform train data
train_scaled = scaler.fit_transform(X_train)

#Fit the model:
elastic_net_model.fit(train_scaled,Y_train)

# transform valid
valid_scaled=scaler.transform(X_valid)

#Make predictions on training set:
predictions = elastic_net_model.predict(valid_scaled)

#Print accuracy
accuracy = metrics.accuracy_score(predictions,Y_valid)
print("Validation Accuracy : %s" % "{0:.3%}".format(accuracy))

f1=f1_score(Y_valid, predictions, average=None)
print(f'Validation F1 Score: {f1}')

Validation Accuracy : 87.874%
Validation F1 Score: [0.92765185 0.62559204]


# Decision Tree

In [11]:
tree_model = DecisionTreeClassifier()

#Fit the model:
tree_model.fit(X_train,Y_train)

#Make predictions on training set
predictions = tree_model.predict(X_valid)

#Print accuracy
accuracy = metrics.accuracy_score(predictions,Y_valid)
print("Validation Accuracy : %s" % "{0:.3%}".format(accuracy))

f1=f1_score(Y_valid, predictions, average=None)
print(f'Validation F1 Score: {f1}')

Validation Accuracy : 85.224%
Validation F1 Score: [0.90884573 0.61008602]


# Random Forest

In [12]:
# Use all the features of the nucleus
rf_model = RandomForestClassifier(n_estimators=100,min_samples_split=25, max_depth=7, max_features=2)

#Fit the model:
rf_model.fit(X_train,Y_train)

#Make predictions on training set
predictions = rf_model.predict(X_valid)

#Print accuracy
accuracy = metrics.accuracy_score(predictions,Y_valid)
print("Validation Accuracy : %s" % "{0:.3%}".format(accuracy))

f1=f1_score(Y_valid, predictions, average=None)
print(f'Validation F1 Score: {f1}')

Validation Accuracy : 81.575%
Validation F1 Score: [0.8982413  0.02700331]


Using all the features improves the prediction accuracy and the cross-validation score is great.

An advantage with Random Forest is that it returns a feature importance matrix which can be used to select features. So lets select the top 5 features and use them as predictors.

In [13]:
#Create a series with feature importances
featimp = pd.Series(rf_model.feature_importances_, index=list(X_train.columns)).sort_values(ascending=False)

In [14]:
# Using top 10 features
predictor_var = list(featimp[:10].keys())
rf2_model = RandomForestClassifier(n_estimators=100, min_samples_split=25, max_depth=7, max_features=2)

#Fit the model:
rf2_model.fit(X_train[predictor_var],Y_train)

#Make predictions on training set
predictions = rf2_model.predict(X_valid[predictor_var])

#Print accuracy
accuracy = metrics.accuracy_score(predictions,Y_valid)
print("Validation Accuracy : %s" % "{0:.3%}".format(accuracy))

f1=f1_score(Y_valid, predictions, average=None)
print(f'Validation F1 Score: {f1}')

Validation Accuracy : 86.352%
Validation F1 Score: [0.91914326 0.56274574]


# XGB (Hyperopt)

In [15]:
# Control the balance of positive and negative weights, useful for unbalanced classes. 
ratio = float(np.sum(training_df['QuoteConversion_Flag'] == 0)) / np.sum(training_df['QuoteConversion_Flag']==1)
scale_pos_weight_val=round(ratio,0)

In [16]:
# Choose hyperparameter search space
space = {
        'max_depth':hp.choice('max_depth', np.arange(2, 25, 1, dtype=int)),
        'n_estimators':hp.choice('n_estimators', np.arange(50, 12000, 10, dtype=int)),
        'colsample_bytree':hp.quniform('colsample_bytree', 0.4, 0.9, 0.1),
        'min_child_weight':hp.choice('min_child_weight', np.arange(1, 12, 1, dtype=int)),
        'scale_pos_weight':hp.choice('scale_pos_weight', np.arange(1, scale_pos_weight_val, 1, dtype=int)),   
        'lambda':hp.choice('lambda', np.arange(1, 5, 1, dtype=int)),    
        'subsample':hp.quniform('subsample', 0.6, 1.0, 0.1),
        'gamma':hp.quniform('gamm', 0, 10, 1),
        'objective':'binary:logistic',
        'eta':hp.quniform('eta', 0.01, 0.5, 0.1),
        'eval_metric': 'auc',
    }

def score(params):
    model = XGBClassifier(**params)
    
    model.fit(X_train, 
              Y_train, 
              eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
              verbose=False, 
              early_stopping_rounds=25)
    Y_pred_train = model.predict(X_train)
    Y_pred_test = model.predict(X_valid)
    train_score = (-1*roc_auc_score(Y_train, Y_pred_train))
    test_score = (-1*roc_auc_score(Y_valid, Y_pred_test))
    print('Training loss:', round(abs(train_score),2), 'Test loss:', round(abs(test_score),2)
    return {'loss': test_score, 'status': STATUS_OK}   
    
    
def optimize(trials, space):
    
    best = fmin(score, space, algo=tpe.suggest, max_evals=25) # up this amount
    return best

SyntaxError: invalid syntax (<ipython-input-16-935cedac22f0>, line 29)

In [None]:
# Optimise
trials = Trials()
best_params = optimize(trials, space)

In [None]:
# Return the best parameters
parameters=space_eval(space, best_params)

# Apply to model
xgb_model = XGBClassifier(colsample_bytree=parameters['colsample_bytree'],
 eta=parameters['eta'],
 eval_metric=parameters['eval_metric'],
 gamma=parameters['gamma'],
 max_depth=parameters['max_depth'],
 min_child_weight=parameters['min_child_weight'],
 n_estimators=parameters['n_estimators'],
 scale_pos_weight=parameters['scale_pos_weight'],
 objective=parameters['objective'],
 subsample=parameters['subsample'])

xgb_model.fit(
    X_train, 
    Y_train,
    eval_metric="auc", 
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)], 
    verbose=True, 
    early_stopping_rounds=10)

In [None]:
#Make predictions on training set
predictions = xgb_model.predict(X_valid)

#Print accuracy
accuracy = metrics.accuracy_score(predictions,Y_valid)
print("Validation Accuracy : %s" % "{0:.3%}".format(accuracy))

f1=f1_score(Y_valid, predictions, average=None)
print(f'Validation F1 Score: {f1}')

In [None]:
# X_test=test[train_numeric_predictors].drop(['QuoteNumber','QuoteConversion_Flag'], axis=1)
# test['QuoteConversion_Flag']=best_model.predict(X_test)
# test[['QuoteNumber', 'QuoteConversion_Flag']].to_csv('predictions.csv', index=False)

# XGB (Skopt)

In [None]:
# SETTINGS - CHANGE THESE TO GET SOMETHING MEANINGFUL
ITERATIONS = 1 # 1000

In [None]:
# Classifier
bayes_cv_tuner = BayesSearchCV(
    estimator = xgb.XGBClassifier(
        n_jobs = 1,
        objective = 'binary:logistic',
        eval_metric = 'auc',
#         early_stopping_rounds=2,
#         silent=1,
        tree_method='approx'
    ),
    search_spaces = {
         'learning_rate': (0.001, 0.3, 'uniform'),
         'min_child_weight': (0, 10),
         'max_depth': (2, 30),
#         'max_delta_step': (0, 20),
         'subsample': (0.5, 1.0, 'uniform'),
#         'colsample_bytree': (0.01, 1.0, 'uniform'),
#         'colsample_bylevel': (0.01, 1.0, 'uniform'),
#         'reg_lambda': (1e-5, 1000, 'uniform'),
        'reg_alpha': (1e-5, 1.0, 'uniform'),
        'gamma': (1e-9, 0.5, 'uniform'),
        'n_estimators': (2, 100),
        'scale_pos_weight': (1, 500, 'uniform')
    },
    scoring = 'roc_auc',
    cv = StratifiedKFold(
        n_splits=3,
        shuffle=True,
        random_state=42
    ),
    n_jobs = 3,
    n_iter = ITERATIONS,   
    verbose = 0,
    refit = True,
    random_state = 45
)

def status_print(optim_result):
    """Status callback durring bayesian hyperparameter search"""
    
    # Get all the models tested so far in DataFrame format
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)    
    
    # Get current parameters and the best parameters    
    best_params = pd.Series(bayes_cv_tuner.best_params_)
    print('Model #{}\nBest ROC-AUC: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner.best_score_, 4),
        bayes_cv_tuner.best_params_
    ))
    
    # Save all model results
    clf_name = bayes_cv_tuner.estimator.__class__.__name__
#     all_models.to_csv(clf_name+"_cv_results.csv")

In [None]:
# Fit the model
xgb_skopt = bayes_cv_tuner.fit(X_train, Y_train, callback=status_print)

In [None]:
#Make predictions on training set
predictions = xgb_skopt.predict(X_valid)

#Print accuracy
accuracy = metrics.accuracy_score(predictions,Y_valid)
print("Validation Accuracy : %s" % "{0:.3%}".format(accuracy))

f1=f1_score(Y_valid, predictions, average=None)
print(f'Validation F1 Score: {f1}')

# Stacking

In [None]:
# def stacking(df, [list_of_predictions], metric):
    
#     if metric=='average':


# Validation

In [None]:
X_test=test[train_numeric_predictors].drop(['QuoteNumber','QuoteConversion_Flag'], axis=1)
test['QuoteConversion_Flag']=best_model.predict(X_test)
test[['QuoteNumber', 'QuoteConversion_Flag']].to_csv('predictions.csv', index=False)