In [182]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.cluster import KMeans
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tensorflow.keras import regularizers
from keras import backend as K
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Models/post_EDA.csv")
data.shape

(192021, 16)

In [35]:
null = data.isnull().sum()
null_sorted = null.sort_values(ascending = False)
print(null_sorted)
data = data.dropna()

time_of_day       1
season            1
severity          0
speed_limit       0
intersection      0
weather           0
lighting          0
month             0
day_of_week       0
hour              0
pedestrian        0
vehicle_other     0
car               0
large_vehicles    0
rail_vehicle      0
two_wheeled       0
dtype: int64


In [36]:
X  = data.drop(columns = "severity")
Y = data['severity'] - 1

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state = 123, test_size = .3)

In [38]:
log_model = LogisticRegression()

In [39]:
scaler = StandardScaler()

In [40]:
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)

In [41]:
X_test_scaled = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)

In [42]:
X_train_final, y_train_final = SMOTE().fit_resample(X_train_scaled, y_train)
np.bincount(y_train_final)

array([87869, 87869, 87869])

# **Log Reg**

In [132]:
steps =  [('pca', PCA(n_components=3)), ('m', LogisticRegression())]
model = Pipeline(steps = steps)

In [133]:
model.fit(X_train_scaled, y_train)

Pipeline(steps=[('pca', PCA(n_components=3)), ('m', LogisticRegression())])

In [134]:
y_pred_log = model.predict(X_test_scaled)

In [135]:
print(classification_report(y_test, y_pred_log))

              precision    recall  f1-score   support

           0       0.67      0.95      0.78     37736
           1       0.44      0.09      0.15     18796
           2       0.00      0.00      0.00      1074

    accuracy                           0.65     57606
   macro avg       0.37      0.35      0.31     57606
weighted avg       0.58      0.65      0.56     57606



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [174]:
def create_model(neurons= 5, optimizer = 'Adam', activation = 'sigmoid', layer_count = 2, learning_rate = .003):
  model = Sequential()    
  model.add(Dense(neurons, activation = activation, input_shape = (15, ), kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4), bias_regularizer=regularizers.L2(1e-4), activity_regularizer=regularizers.L2(1e-5)))
  for i in range(layer_count - 1):    
    model.add(Dense(neurons, activation=activation, kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4), bias_regularizer=regularizers.L2(1e-4), activity_regularizer=regularizers.L2(1e-5)))
  model.add(Dense(1, activation='hard_sigmoid'))  
  model.compile(loss='categorical_crossentropy', optimizer= optimizer, metrics=['categorical_accuracy'])
  K.set_value(model.optimizer.learning_rate, learning_rate)
  return model

In [175]:
nn = create_model()
nn.fit(X_train_scaled, y_train, epochs = 200, verbose = 0)

<keras.callbacks.History at 0x7f7fafac0590>

In [176]:
y_pred_net = nn.predict(X_test_scaled)

In [178]:
y_pred_net = np.argmax(y_pred_net, axis=-1)

In [179]:
print(classification_report(y_test, y_pred_net))

              precision    recall  f1-score   support

           0       0.66      1.00      0.79     37736
           1       0.00      0.00      0.00     18796
           2       0.00      0.00      0.00      1074

    accuracy                           0.66     57606
   macro avg       0.22      0.33      0.26     57606
weighted avg       0.43      0.66      0.52     57606



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [147]:
rf = RandomForestClassifier()

In [148]:
rf.fit(X_train_scaled, y_train)

RandomForestClassifier()

In [149]:
y_pred_rf = rf.predict(X_test_scaled)

In [145]:
#y_pred_rf = np.argmax(y_pred_rf, axis=-1)

In [150]:
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.76      0.85      0.81     37736
           1       0.62      0.49      0.55     18796
           2       0.51      0.27      0.35      1074

    accuracy                           0.72     57606
   macro avg       0.63      0.54      0.57     57606
weighted avg       0.71      0.72      0.71     57606



In [183]:
dt = DecisionTreeClassifier()

In [184]:
dt.fit(X_train_scaled, y_train)

DecisionTreeClassifier()

In [185]:
y_pred_dt = dt.predict(X_test_scaled)

In [186]:
print(classification_report(y_test, y_pred_dt))

              precision    recall  f1-score   support

           0       0.76      0.85      0.80     37736
           1       0.61      0.47      0.53     18796
           2       0.39      0.27      0.32      1074

    accuracy                           0.72     57606
   macro avg       0.59      0.53      0.55     57606
weighted avg       0.70      0.72      0.70     57606



In [187]:
ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())

In [188]:
ada.fit(X_train_scaled, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier())

In [189]:
y_pred_ada = ada.predict(X_test_scaled)

In [190]:
print(classification_report(y_test, y_pred_ada))

              precision    recall  f1-score   support

           0       0.76      0.84      0.80     37736
           1       0.60      0.50      0.54     18796
           2       0.46      0.28      0.35      1074

    accuracy                           0.72     57606
   macro avg       0.61      0.54      0.57     57606
weighted avg       0.71      0.72      0.71     57606



In [191]:
from sklearn.neighbors import KNeighborsClassifier

In [193]:
knn = KNeighborsClassifier(n_neighbors = 5, metric = 'manhattan', weights = 'distance', leaf_size = 15)
knn.fit(X_train_scaled, y_train)

KNeighborsClassifier(leaf_size=15, metric='manhattan', weights='distance')

In [194]:
y_pred_knn = knn.predict(X_test_scaled)

In [195]:
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

           0       0.76      0.82      0.79     37736
           1       0.58      0.50      0.54     18796
           2       0.49      0.26      0.34      1074

    accuracy                           0.71     57606
   macro avg       0.61      0.53      0.56     57606
weighted avg       0.70      0.71      0.70     57606

