In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime

#
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import imblearn

## Import raw data

In [2]:
df = pd.read_csv("DfTRoadSafety_Accidents_2014.csv")
df.head(2).T

Unnamed: 0,0,1
Accident_Index,201401BS70001,201401BS70002
Location_Easting_OSGR,524600,525780
Location_Northing_OSGR,179020,178290
Longitude,-0.206443,-0.189713
Latitude,51.4963,51.4895
Police_Force,1,1
Accident_Severity,3,3
Number_of_Vehicles,2,2
Number_of_Casualties,1,1
Date,09/01/2014,20/01/2014


### Convert date time 

In [4]:
def to_hour(time):
    try:
        hour = datetime.strptime(str(time), '%H:%M')
        return int(datetime.strftime(hour, '%H'))
    except Exception:
        return 0

df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['Hour_of_Day'] = df['Time'].apply(to_hour)
df["isWeekend"] = pd.to_datetime(df['Date'], format='%d/%m/%Y').apply(lambda x:  1.0 if x.weekday()>=5 else 0.0)

### Selected Features based on previous analysis

In [5]:
list1 = ["Accident_Severity", "Number_of_Vehicles"]#, "Number_of_Casualties"]
list2 = ["Day_of_Week" ,"Hour_of_Day"]
list3 = ["Weather_Conditions"]#, "Light_Conditions"]
list4 = ["Police_Force"] #,"Local_Authority_(District)"]
list5 = ["Location_Northing_OSGR","Location_Easting_OSGR"]
list6 = ["1st_Road_Number","Speed_limit"]#,"2nd_Road_Number",]
XColumns = list1 + list2 + list3 + list4 + list5 + list6
yColumns = ["Did_Police_Officer_Attend_Scene_of_Accident"]
print("XColumns :: {}".format(XColumns))
print("yColumns :: {}".format(yColumns))

XColumns :: ['Accident_Severity', 'Number_of_Vehicles', 'Day_of_Week', 'Hour_of_Day', 'Weather_Conditions', 'Police_Force', 'Location_Northing_OSGR', 'Location_Easting_OSGR', '1st_Road_Number', 'Speed_limit']
yColumns :: ['Did_Police_Officer_Attend_Scene_of_Accident']


In [7]:
indexColumns = set(["Accident_Index"])
columnY = ["Did_Police_Officer_Attend_Scene_of_Accident"]
object_columns = set(["Date","Time","Local_Authority_(Highway)","LSOA_of_Accident_Location"])
float64_columns = set(["Longitude","Latitude"])
highly_variable_features = set(["2nd_Road_Number","1st_Road_Number","Local_Authority_(District)","Location_Northing_OSGR","Location_Easting_OSGR"])
int64_columns = set(df.columns.tolist()++set(["isWeekend"])-indexColumns-object_columns-float64_columns
XColumns = list(int64_columns-set(columnY)-highly_variable_features)
yColumns = ["Did_Police_Officer_Attend_Scene_of_Accident"]

TypeError: unsupported operand type(s) for +: 'set' and 'set'

In [111]:
df[XColumns].nunique()

Special_Conditions_at_Site                  9
Light_Conditions                            5
Carriageway_Hazards                         7
Hour_of_Day                                24
Urban_or_Rural_Area                         2
2nd_Road_Class                              7
Day_of_Week                                 7
Junction_Detail                             9
Police_Force                               51
Number_of_Vehicles                         14
Speed_limit                                 6
Number_of_Casualties                       26
1st_Road_Class                              6
Weather_Conditions                          9
Accident_Severity                           3
Road_Surface_Conditions                     6
Junction_Control                            5
Month                                      12
Road_Type                                   6
Pedestrian_Crossing-Physical_Facilities     6
Count                                       1
Pedestrian_Crossing-Human_Control 

### Create Feature Encoding for both input features and target variables

In [112]:
X = df[XColumns]
y = df[yColumns].replace({2:0.0,1:1.0})
categorical_features = [c not in  ["Location_Northing_OSGR","Location_Easting_OSGR","1st_Road_Number"]  for c in XColumns ]
oneHotEncX = preprocessing.OneHotEncoder(categories="auto",sparse=False,categorical_features=None)
oneHotEncX.fit(X)

OneHotEncoder(categorical_features=None, categories='auto', drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=False)

### Split data into train and test data set 

In [113]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print("Train :: {}".format(y_train.Did_Police_Officer_Attend_Scene_of_Accident.value_counts()))
print("Test :: {}".format(y_test.Did_Police_Officer_Attend_Scene_of_Accident.value_counts()))

Train :: 1.0    80095
0.0    17940
Name: Did_Police_Officer_Attend_Scene_of_Accident, dtype: int64
Test :: 1.0    39512
0.0     8775
Name: Did_Police_Officer_Attend_Scene_of_Accident, dtype: int64


### Data Augmentation to reduce skewness in data

In [128]:
smt = imblearn.combine.SMOTEENN(sampling_strategy=0.8) #SMOTE()
XTrainSMOTEENN, yTrainSMOTEENN = smt.fit_sample(X_train, y_train.Did_Police_Officer_Attend_Scene_of_Accident)

In [126]:
sgd = SGDClassifier(loss="perceptron", penalty="elasticnet",learning_rate="adaptive",max_iter=1000,
                    alpha=0.0001, l1_ratio=0.15, fit_intercept=True,tol=0.01, shuffle=True, 
                    verbose=0, epsilon=0.1, n_jobs=4, random_state=None,eta0=0.0001, power_t=0.5, 
                    early_stopping=False,validation_fraction=0.1, n_iter_no_change=50, 
                    class_weight="balanced", warm_start=True, average=False)
sgd.fit(XTrainSMOTEENN,yTrainSMOTEENN)
# sgd.fit(oneHotEncX.transform(X_train),y_train)
prediction = sgd.predict(oneHotEncX.transform(X_test))
print("Balanced Accuracy :: {}".format(balanced_accuracy_score(y_test,prediction)))
print("Classification Report :: {}".format(classification_report(y_test,prediction,output_dict=False)))

  y = column_or_1d(y, warn=True)


Balanced Accuracy :: 0.5072248727928015
Classification Report ::               precision    recall  f1-score   support

         0.0       0.54      0.02      0.03      8775
         1.0       0.82      1.00      0.90     39512

    accuracy                           0.82     48287
   macro avg       0.68      0.51      0.47     48287
weighted avg       0.77      0.82      0.74     48287



In [129]:
##

print(confusion_matrix(y_test,prediction))



##

[[ 2482  6293]
 [ 6208 33304]]


array([[ 8775,     0],
       [39512,     0]])

In [26]:
pd.value_counts(y_test.Did_Police_Officer_Attend_Scene_of_Accident)
pd.value_counts(yTrainSMOTEENN)

2    39783
1    38043
dtype: int64

In [127]:
dtc = DecisionTreeClassifier()
# sgd.fit(XTrainSMOTEENN,yTrainSMOTEENN)
# dtc.fit(X_train,y_train)
# prediction = dtc.predict(X_test)
dtc.fit(oneHotEncX.transform(X_train),y_train)
prediction = dtc.predict(oneHotEncX.transform(X_test))
# print("Balanced Accuracy :: {}".format(balanced_accuracy_score(y_test,prediction)))
# print("Classification Report :: {}".format(classification_report(y_test,prediction,output_dict=False)))
print(confusion_matrix(y_test,prediction))
classification_report(y_test,prediction)

[[ 2482  6293]
 [ 6208 33304]]


'              precision    recall  f1-score   support\n\n         0.0       0.29      0.28      0.28      8775\n         1.0       0.84      0.84      0.84     39512\n\n    accuracy                           0.74     48287\n   macro avg       0.56      0.56      0.56     48287\nweighted avg       0.74      0.74      0.74     48287\n'

In [4]:
sgd_list = ["performance_matrix_sgd" + str(c+1) for c in range(14)]

In [5]:
sgd_list

['performance_matrix_sgd1',
 'performance_matrix_sgd2',
 'performance_matrix_sgd3',
 'performance_matrix_sgd4',
 'performance_matrix_sgd5',
 'performance_matrix_sgd6',
 'performance_matrix_sgd7',
 'performance_matrix_sgd8',
 'performance_matrix_sgd9',
 'performance_matrix_sgd10',
 'performance_matrix_sgd11',
 'performance_matrix_sgd12',
 'performance_matrix_sgd13',
 'performance_matrix_sgd14']