In [1]:
# import necessary dependencies
import pandas as pd
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, \
    recall_score, f1_score, roc_auc_score, matthews_corrcoef

In [2]:
# location of train, test and results csvs
train='train.csv'
test='test.csv'
results='results.csv'

In [3]:
# read in train, test and results using pandas
train_df=pd.read_csv(train)
test_df=pd.read_csv(test)
results_df=pd.read_csv(results)

In [4]:
list(train_df.columns)

['pol_ref',
 'name',
 'address',
 'gender',
 'car_make',
 'car_value_as_new',
 'car_year',
 'pol_holder_age',
 'pol_employment_type',
 'pol_employment_industry',
 'pol_holder_licence_length',
 'car_top_speed',
 'postcode_rating',
 'credit_rating',
 'other_named_driver',
 'criminal_convictions',
 'car_modified_indicator',
 'car_colour',
 'collision_date',
 'collision_description',
 'large_collision']

In [5]:
# select the features to be used as inputs to the model
features=['car_value_as_new',
'car_year',
'pol_holder_age',
'pol_holder_licence_length',
'car_top_speed',
'postcode_rating',
'credit_rating',
'criminal_convictions']

# select the target variable
target='large_collision'

In [6]:
# split train data into X and y
X_train = train_df[features]
y_train = train_df[target]

In [7]:
# fit model no training data
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [8]:
# predict on test_df
y_pred = model.predict(train_df[features])

# actuals
y_test=train_df['large_collision']

# probability
y_pred_prob = model.predict_proba(test_df[features])
y_pred_prob=[p[1] for p in y_pred_prob]

In [9]:
# train results
print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('Precision Score:', precision_score(y_test, y_pred, average=None))
print('Recall Score:', recall_score(y_test, y_pred, average=None))
print('F1 Score:', f1_score(y_test, y_pred, average=None))
print('AUROC:', roc_auc_score(y_test, y_pred_prob))
print('MCC:', matthews_corrcoef(y_test, y_pred), '\n')

Accuracy Score: 0.81037
Precision Score: [0.81024724 0.81795355]
Recall Score: [0.99637609 0.06522827]
F1 Score: [0.89372362 0.12082155]
AUROC: 0.50099914208757
MCC: 0.19672292195138447 



In [10]:
# predict on test_df
y_pred = model.predict(test_df[features])

# actuals
y_test=test_df['large_collision']

# probability
y_pred_prob = model.predict_proba(test_df[features])
y_pred_prob=[p[1] for p in y_pred_prob]

In [11]:
# test results
print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('Precision Score:', precision_score(y_test, y_pred, average=None))
print('Recall Score:', recall_score(y_test, y_pred, average=None))
print('F1 Score:', f1_score(y_test, y_pred, average=None))
print('AUROC:', roc_auc_score(y_test, y_pred_prob))
print('MCC:', matthews_corrcoef(y_test, y_pred), '\n')

Accuracy Score: 0.79881
Precision Score: [0.80398759 0.43256815]
Recall Score: [0.99012102 0.03025438]
F1 Score: [0.88739891 0.05655334]
AUROC: 0.6024032728697895
MCC: 0.06942562442161904 



In [12]:
# predict on results_df
y_pred = model.predict(results_df[features])

# probability
y_pred_prob = model.predict_proba(results_df[features])
y_pred_prob=[p[1] for p in y_pred_prob]

In [13]:
# create output table with predictions
out_df=pd.DataFrame()
out_df['pol_ref']=results_df['pol_ref']
out_df['prediction']=y_pred
out_df['probability']=y_pred_prob

# save to csv and send predictions to Ben or Chelsea
out_df.to_csv('predictions.csv', index=False)