In [1]:
# import necessary dependencies
import pandas as pd
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, \
    recall_score, f1_score, roc_auc_score, matthews_corrcoef

In [2]:
# csv file names, relative to the working directory:
# training data, test data, and the rows to produce predictions for
train, test, results = 'train.csv', 'test.csv', 'results.csv'

In [None]:
# load each csv into its own DataFrame, in the order train -> test -> results
train_df, test_df, results_df = (
    pd.read_csv(csv_path) for csv_path in (train, test, results)
)

In [None]:
# show the column names available in the training data
train_df.columns.tolist()

In [None]:
# input columns fed to the model
features = [
    'car_value_as_new',
    'car_year',
    'pol_holder_age',
    'pol_holder_licence_length',
    'car_top_speed',
    'postcode_rating',
    'credit_rating',
    'criminal_convictions',
]

# column the model is trained to predict
target = 'large_collision'

In [None]:
# model inputs (feature columns) and labels (target column) from the training frame
X_train, y_train = train_df[features], train_df[target]

In [None]:
# create an XGBoost classifier with default settings and fit it on the training data
model = XGBClassifier()
model.fit(X=X_train, y=y_train)

In [None]:
# predict on train_df (in-sample predictions, reported as the "train results")
y_pred = model.predict(train_df[features])

# actuals -- these are the TRAINING labels; the name y_test is kept because
# the metrics cell that follows reads y_test / y_pred / y_pred_prob
y_test = train_df[target]

# probability -- scored on the same train_df rows as y_test so that AUROC
# compares labels and probabilities for the same observations
# (scoring test_df here would pair train labels with test probabilities)
y_pred_prob = model.predict_proba(train_df[features])
# keep column 1 of predict_proba for each row
# (presumably the positive class of a binary target -- confirm against model.classes_)
y_pred_prob = [p[1] for p in y_pred_prob]

In [None]:
# train results: each row is (label, scorer, second scorer argument, keyword options)
train_report = (
    ('Accuracy Score:', accuracy_score, y_pred, {}),
    ('Precision Score:', precision_score, y_pred, {'average': None}),
    ('Recall Score:', recall_score, y_pred, {'average': None}),
    ('F1 Score:', f1_score, y_pred, {'average': None}),
    ('AUROC:', roc_auc_score, y_pred_prob, {}),
)
# scorers are called one at a time so each line is printed as soon as it is computed
for label, scorer, estimate, options in train_report:
    print(label, scorer(y_test, estimate, **options))

# MCC is printed last; the extra '\n' argument leaves blank space after the report
print('MCC:', matthews_corrcoef(y_test, y_pred), '\n')

In [None]:
# class predictions for the test data
X_test = test_df[features]
y_pred = model.predict(X_test)

# labels recorded in the test data
y_test = test_df['large_collision']

# column 1 of predict_proba for each test row, kept as a plain list
y_pred_prob = list(model.predict_proba(X_test)[:, 1])

In [None]:
# test results: each row is (label, scorer, second scorer argument, keyword options)
test_report = (
    ('Accuracy Score:', accuracy_score, y_pred, {}),
    ('Precision Score:', precision_score, y_pred, {'average': None}),
    ('Recall Score:', recall_score, y_pred, {'average': None}),
    ('F1 Score:', f1_score, y_pred, {'average': None}),
    ('AUROC:', roc_auc_score, y_pred_prob, {}),
)
# scorers are called one at a time so each line is printed as soon as it is computed
for label, scorer, estimate, options in test_report:
    print(label, scorer(y_test, estimate, **options))

# MCC is printed last; the extra '\n' argument leaves blank space after the report
print('MCC:', matthews_corrcoef(y_test, y_pred), '\n')

In [None]:
# class predictions for the rows in results_df
X_results = results_df[features]
y_pred = model.predict(X_results)

# column 1 of predict_proba for each results row, kept as a plain list
y_pred_prob = list(model.predict_proba(X_results)[:, 1])

In [None]:
# assemble the output table in one step: policy reference, predicted class, probability
# (the pol_ref Series supplies the index; the other two columns are aligned by position)
out_df = pd.DataFrame({
    'pol_ref': results_df['pol_ref'],
    'prediction': y_pred,
    'probability': y_pred_prob,
})

# write the table without the index column, then send predictions to Ben or Chelsea
out_df.to_csv('predictions.csv', index=False)