In [2]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as st
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import missingno as msno
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.cluster import KMeans
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

In [3]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/ML Project/output_data.csv")
data.shape

In [None]:
data.select_dtypes(include = ['object']).columns.tolist()

In [None]:
data.head()

In [None]:
null = data.isnull().sum()
null_sorted = null.sort_values(ascending = False)
print(null_sorted)

In [None]:
data['severity'].value_counts()

In [None]:
mapping = {"minor_injury" : 0, "serious_injury" : 1, "fatality" : 2}
data["severity"] = data["severity"].map(mapping)

In [None]:
correlation = data.corr()
plt.figure(figsize =(30, 20))
sns.heatmap(correlation, annot=True)

In [None]:
plt.title('Speed Limit with Severity of 1')
sns.distplot(data['speed_limit'][data['severity'] == 1], bins = 15)

In [None]:
plt.title('Speed Limit with Severity of 2')
sns.distplot(data['speed_limit'][data['severity'] == 2], bins = 15)

In [None]:
plt.title('Speed Limit with Severity of 3')
sns.distplot(data['speed_limit'][data['severity'] == 3], bins = 15)

In [None]:
data["severity"][data["intersection"] == 0].value_counts()

In [None]:
plt.title('Severity of Crash not at Intersection')
plt.pie(data["severity"][data["intersection"] == 0].value_counts(), labels = data["severity"].unique(), shadow = True)
plt.legend()
plt.show() 

In [None]:
data["severity"][data["intersection"] == 1].value_counts()

In [None]:
plt.title('Severity of Crash at Intersection')
plt.pie(data["severity"][data["intersection"] == 1].value_counts(), labels = data["severity"].unique(), shadow = True)
plt.legend()
plt.show() 

In [None]:
data = data.drop(columns = ['midblock'])

In [None]:
data['car'] = data["car_sedan"] + data["car_utility"] + data["car_van"] + data["car_4x4"]

In [None]:
data = data.drop(columns = ["car_sedan", "car_utility", "car_van", "car_4x4"])

In [None]:
data['car'] = data['car'] + data['car_station_wagon']

In [None]:
data = data.drop(columns = ['car_station_wagon'])

In [None]:
data['car'] = data['car'] + data['taxi']

In [None]:
data["large_vehicles"] = data['truck_small'] + data['truck_large'] + data['bus']

In [None]:
data = data.drop(columns = ["truck_small", "truck_large", "bus", "taxi"])

In [None]:
data['rail_vehicle'] = data['train'] + data ['tram']

In [None]:
data = data.drop(columns = ["train", "tram"])

In [None]:
data = data.drop(columns = ['bicycle', 'animals'])

In [None]:
data = data.drop(columns = ['road_sealed', 'road_wet', 'year', 'inanimate'])

In [None]:
data["two_wheeled"] = data['scooter'] + data['motor_cycle']

In [None]:
data = data.drop(columns = ["scooter", "motor_cycle"])

In [None]:
data['hour'].value_counts()

In [None]:
data.loc[(data['hour'] >= 0) & (data['hour'] < 8), 'time_of_day'] = 0
data.loc[(data['hour'] >= 8) & (data['hour'] < 16), 'time_of_day'] = 1
data.loc[(data['hour'] >= 16) & (data['hour'] <= 23), 'time_of_day'] = 2

In [None]:
data['month'].value_counts()

Using Austrailia seasons 0 = Spring, 1 = Summer, 2 = Fall, 3 = Winter

In [None]:
data.loc[(data['month'] >= 9) & (data['month'] <= 11), 'season'] = 0 #Spring Sept-Nov
data.loc[(data['month'] == 12) | (data['month'] <= 2 ), 'season'] = 1 #Summer Dec-Feb
data.loc[(data['month'] >= 3) & (data['month'] <= 5), 'season'] = 2 #Fall March-May
data.loc[(data['month'] >= 6) & (data['month'] <= 8), 'season'] = 3 # Winter June-Aug

In [None]:
correlation = data.corr()
plt.figure(figsize =(30, 20))
sns.heatmap(correlation, annot=True)

In [None]:
filename = 'post_EDA.csv'

data.to_csv(filename,index=False)

print('Saved file: ' + filename)

In [None]:
y = data['severity']
X = data.drop('severity', axis = 1)

In [None]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, random_state = 123, test_size = .3, stratify = y)

In [None]:
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, random_state = 123, test_size = .5, stratify = y_temp)

In [None]:
kmeans = KMeans(n_clusters = 3)

In [None]:
kmeans.fit(X_train, y_train)

In [None]:
y_pred = kmeans.predict(X_temp)

In [None]:
print(classification_report(y_temp, y_pred))