In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, \
    recall_score, f1_score, roc_auc_score, matthews_corrcoef

# Logistic Regression

In [2]:
train=pd.read_csv('data/train.csv')
valid=pd.read_csv('data/valid.csv')

df=pd.concat([train, valid])

In [19]:
df.describe()

Unnamed: 0,car_value_as_new,car_year,pol_holder_age,pol_holder_licence_length,car_top_speed,postcode_rating,credit_rating,criminal_convictions,large_collision
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,37529.0,2008.99397,53.761095,13.209535,139.28865,25.01494,24.97324,0.00402,0.20001
std,14262.633912,6.648848,20.536947,7.131568,13.287075,14.149966,14.127527,0.077678,0.400008
min,17000.0,1998.0,17.0,1.0,100.0,1.0,1.0,0.0,0.0
25%,26000.0,2003.0,36.0,7.0,130.0,13.0,13.0,0.0,0.0
50%,35000.0,2009.0,54.0,13.0,140.0,25.0,25.0,0.0,0.0
75%,47000.0,2015.0,71.0,19.0,150.0,37.0,37.0,0.0,0.0
max,100000.0,2020.0,90.0,25.0,200.0,49.0,49.0,2.0,1.0


In [20]:
cols=['car_value_as_new',
'car_year',
'pol_holder_age',
'pol_holder_licence_length',
'car_top_speed',
'postcode_rating',
'credit_rating',
'criminal_convictions']

In [21]:
X=df[cols]
y=df['large_collision']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
y_pred=logreg.predict(X_test)
y_pred_prob=logreg.predict_proba(X_test)
y_pred_prob=[p[1] for p in y_pred_prob]

In [24]:
print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('Precision Score:', precision_score(y_test, y_pred, average=None))
print('Recall Score:', recall_score(y_test, y_pred, average=None))
print('F1 Score:', f1_score(y_test, y_pred, average=None))
print('AUROC:', roc_auc_score(y_test, y_pred_prob))
print('MCC:', matthews_corrcoef(y_test, y_pred), '\n')

Accuracy Score: 0.8006666666666666
Precision Score: [0.80066667 0.        ]
Recall Score: [1. 0.]
F1 Score: [0.88930026 0.        ]
AUROC: 0.6327883240415595
MCC: 0.0 



  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [25]:
def model_performance_metrics(train_pred, ):
    ''' function to return the model performance metrics '''

    print(f"{model_name} Training Performance Metrics:\n")
    tn, fp, fn, tp = confusion_matrix(train_pred, y_train).ravel()
    print('True Negatives:', tn, 'False Positives:', fp, 'False Negatives:', fn, 'True Positives:', tp)
    print('Accuracy Score:', accuracy_score(y_train, train_pred))
    print('Precision Score:', precision_score(y_train, train_pred, average=None))
    print('Recall Score:', recall_score(y_train, train_pred, average=None))
    print('F1 Score:', f1_score(y_train, train_pred, average=None))
    print('AUROC:', roc_auc_score(y_train, train_prob))
    print('MCC:', matthews_corrcoef(y_train, train_pred), '\n')

# XGBoost

In [34]:
# split data into X and y
X = train[cols]
Y = train['large_collision']

In [36]:
# fit model no training data
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [45]:
# make predictions for test data
y_test=valid['large_collision']
y_pred = model.predict(valid[cols])
y_pred_prob = model.predict_proba(valid[cols])
y_pred_prob=[p[1] for p in y_pred_prob]

In [46]:
# evaluate predictions
print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('Precision Score:', precision_score(y_test, y_pred, average=None))
print('Recall Score:', recall_score(y_test, y_pred, average=None))
print('F1 Score:', f1_score(y_test, y_pred, average=None))
print('AUROC:', roc_auc_score(y_test, y_pred_prob))
print('MCC:', matthews_corrcoef(y_test, y_pred), '\n')

Accuracy Score: 0.81036
Precision Score: [0.81468434 0.66643812]
Recall Score: [0.98784737 0.09751693]
F1 Score: [0.89294827 0.17013828]
AUROC: 0.7162941136787359
MCC: 0.20265903449097106 

