In [None]:
#Importing required libraries
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib as mpl
import matplotlib.pyplot as plt
#import warnings
#warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score,f1_score
from collections import Counter

# This section focuses on Cleaning the RedLight Camera Dataframe

In [None]:
# Load the red-light camera data from the database extracts into df_RL.
# The path is relative to the notebook's working directory.
df_RL=pd.read_csv("../DB_Extracts/red_light_camera.csv")
# Preview the first five rows as the cell output
df_RL.head(5)

In [None]:
#show column names for RedLight dataframe
list(df_RL.columns)

In [None]:
# Columns of the red-light camera extract that the analysis does not use
rl_unused_columns = [
    'id', 'INTERSECTION_ID', 'LINEAR_NAME_FULL_1', 'LINEAR_NAME_FULL_2', 'ID1',
    'X', 'Y', 'LONGITUDE', 'LATITUDE', 'OBJECTID', 'MID_BLOCK',
    'POLICE_DIVISION_2', 'POLICE_DIVISION_3', 'SIDE2', 'SIDE1', 'MI_PRINX',
    'PRIVATE_ACCESS', 'TCS', 'ADDITIONAL_INFO', 'POLICE_DIVISION_1',
    'WARD_1', 'WARD_3', 'WARD_2', 'WARD_4', 'MAIN',
]

# drop() returns a new frame, so df_RL itself is not modified here
df_RL_clean1 = df_RL.drop(columns=rl_unused_columns)
df_RL_clean1.head(5)

In [None]:
df_RL_clean1.info()

In [None]:
RL=df_RL_clean1.shape
print(RL)

In [None]:
#parse out geo-coordinates
# NOTE(review): `re` is imported here but never referenced in this cell; the
# regular expression is applied through pandas' .str.extract.
import re
# Capture the text between the first "(" and the next ")" of each geometry
# string (non-greedy match) and store it in a new geometry2 column.
df_RL_clean1["geometry2"] = df_RL_clean1["geometry"].str.extract(r"\((.*?)\)")

In [None]:
print(df_RL_clean1['geometry2'])

In [None]:
# Split geometry2 at the first comma into two columns: Longitude & Latitude.
# `n` is passed by keyword because pandas 2.0 made every Series.str.split
# argument after `pat` keyword-only; the positional form raises TypeError there.
df_RL_clean1[['Longitude', 'Latitude']] = df_RL_clean1['geometry2'].str.split(',', n=1, expand=True)

print(df_RL_clean1['Longitude'], df_RL_clean1['Latitude'])

In [None]:
## Standardize Lat & Long Length to match between the files
# Keep only the first 8 characters of each Longitude string. The vectorised
# .str accessor propagates missing values as NaN instead of raising the
# TypeError that plain Python slicing of a float NaN would.
df_RL_clean1['Longitude']=df_RL_clean1['Longitude'].str[:8]

In [None]:
# Keep the first 8 characters of each Latitude string, then remove any spaces
# from the truncated text (same order of operations as for the KSI frame).
# The .str accessor leaves missing values as NaN instead of raising TypeError.
df_RL_clean1['Latitude']=df_RL_clean1['Latitude'].str[:8].str.replace(" ", "", regex=False)

In [None]:
# Rebind df_RL to the cleaned frame. This is an alias, not a copy: df_RL and
# df_RL_clean1 now refer to the same object, and the frame that df_RL pointed
# to before this line is no longer reachable through that name.
df_RL=df_RL_clean1
df_RL.head(5)

In [None]:
# Drop the raw geometry column from the red-light dataframe
df_RL = df_RL.drop(columns=['geometry'])

In [None]:
# Re-order the columns for readability (only the listed columns are kept)
rl_column_order = [
    "RLC", "ACTIVATION_DATE", "ACTIVATION_TIME", "CLIENT_STREET_2", "CLIENT_STREET_1",
    "NAME", "DISTRICT", "geometry2", "Longitude", "Latitude",
]
df_RL = df_RL.loc[:, rl_column_order]
df_RL.head(5)

## This section focuses on Cleaning the Seriously Injured Dataframe

In [None]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

In [None]:
df_KSI=pd.read_csv("../DB_Extracts/seriously_injured_data.csv")
df_KSI.head(5)

In [None]:
# Number of rows and columns in each dataset
KSI=df_KSI.shape
print(KSI)

In [None]:
df_KSI.info()

In [None]:
# Per the original note, rows with a null INJURY_ID are only vehicle or
# property owners, so keep just the rows where INJURY_ID is present.
has_injury = df_KSI['INJURY_ID'].notna()
df_KSI_clean1 = df_KSI.loc[has_injury]

# Report the remaining (rows, columns)
KSI = df_KSI_clean1.shape
print(KSI)

In [None]:
# remove non-driver types

# involvement types (INVTYPE) to exclude
non_driver_types = [
    "Cyclist", "Cyclist Passenger", "Pedestrian - Not Hit", "In-Line Skater",
    "Motorcycle Passenger", "Other", "Other Property Owner", "Passenger",
    "Pedestrian", "Vehicle Owner", "Wheelchair",
]

# keep only the rows whose INVTYPE is NOT one of the excluded types
is_non_driver = df_KSI_clean1['INVTYPE'].isin(non_driver_types)
df_KSI_clean1 = df_KSI_clean1[~is_non_driver]

In [None]:
# count unique values in "INVTYPE" column

df_KSI_clean1.groupby(['INVTYPE']).count()

In [None]:
# count unique values in Accident Classification column

df_KSI_clean1.groupby(['ACCLASS']).count()

In [None]:
# storing unique value in a variable
unique_value = df_KSI_clean1["ACCNUM"].nunique()
  
# printing value
print(unique_value)

In [None]:
# Remove duplicate accident data - raw data has a row per person involved in the accident.

# Sort by ACCNUM and keep only the first row of each accident (keep='first').
# sort_values() returns a new frame rather than sorting df_KSI_clean1 in place,
# so this cell does not mutate its input and can be re-run safely. The explicit
# .copy() makes df_KSI_clean2 an independent frame, so the .loc assignments
# made on it in the following cells do not trigger SettingWithCopyWarning.
df_KSI_clean2 = (
    df_KSI_clean1
    .sort_values("ACCNUM")
    .drop_duplicates(subset="ACCNUM", keep="first")
    .copy()
)

df_KSI_clean2.head(5)

In [None]:
#show column names for df_KSI_clean2 dataframe
list(df_KSI_clean2.columns)

In [None]:
# Set IMPACTYPE to "Other" for the single record with ACCNUM 9002389784
# (per the original note, that record has a null IMPACTYPE).

df_KSI_clean2.loc[df_KSI_clean2['ACCNUM']==9002389784,"IMPACTYPE"]="Other"
# Display the record so the change can be checked in the cell output
df_KSI_clean2.loc[df_KSI_clean2['ACCNUM']==9002389784]

In [None]:
# Set RDSFCOND (road surface condition) to "Other" for the listed accident
# numbers; per the original note these records have a null value there.
rdsfcond_accnums = [
    7000795019, 7001983024, 8000303628, 8000662467, 8000874551,
    8000879182, 8000973371, 8008069034, 9000524060, 9002403050,
]

# one vectorised assignment covering every listed ACCNUM
rdsfcond_rows = df_KSI_clean2['ACCNUM'].isin(rdsfcond_accnums)
df_KSI_clean2.loc[rdsfcond_rows, "RDSFCOND"] = "Other"


In [None]:
# Set VISIBILITY to "Other" for the listed accident numbers; per the original
# note these records have a null value there.
visibility_accnums = [
    7001983024, 8000303628, 8000662467, 8000874551, 8000879182,
    8000973371, 8008069034, 9000419013, 9002403050,
]

# one vectorised assignment covering every listed ACCNUM
visibility_rows = df_KSI_clean2['ACCNUM'].isin(visibility_accnums)
df_KSI_clean2.loc[visibility_rows, "VISIBILITY"] = "Other"

In [None]:
# drop unneeded columns

ksi_unused_columns = [
    'OFFSET', 'DISTRICT', 'WARDNUM', 'DIVISION', 'ACCLOC', 'FATAL_NO',
    'INITDIR', 'PEDTYPE', 'PEDACT', 'PEDCOND', 'CYCLISTYPE', 'CYCACT',
    'CYCCOND', 'POLICE_DIVISION', 'HOOD_ID', 'NEIGHBOURHOOD',
]

# drop() returns a new frame; df_KSI_clean2 keeps all of its columns
df_KSI_clean3 = df_KSI_clean2.drop(columns=ksi_unused_columns)
df_KSI_clean3.head(5)

In [None]:
df_KSI_clean3.info()

In [None]:
#Convert all columns with Yes & Null Values to 1=Yes 0=Null

# Flag columns to convert
yes_no_columns = [
    'PEDESTRIAN', 'CYCLIST', 'AUTOMOBILE', 'MOTORCYCLE', 'TRUCK',
    'TRSN_CITY_VEH', 'EMERG_VEH', 'PASSENGER', 'SPEEDING', 'AG_DRIV',
    'REDLIGHT', 'ALCOHOL', 'DISABILITY',
]

# 'Yes' -> 1, anything else (including missing values) -> 0.
# NOTE: not idempotent - re-running this cell after the conversion turns every
# flag to 0, because the already-encoded 1s are no longer equal to 'Yes'.
for flag_col in yes_no_columns:
    df_KSI_clean3[flag_col] = df_KSI_clean3[flag_col].eq('Yes').astype('int64')

# Show all thirteen converted columns, DISABILITY included
print(df_KSI_clean3[yes_no_columns])

In [None]:
df_KSI_clean3.info()

In [None]:
#parse out geo-coordinates

# Capture the text between the first "(" and the next ")" of each geometry
# string (non-greedy match) and store it in a new geometry2 column.
df_KSI_clean3["geometry2"] = df_KSI_clean3["geometry"].str.extract(r"\((.*?)\)")


In [None]:
print(df_KSI_clean3['geometry2'])

In [None]:
# Split geometry2 at the first comma into two columns: Longitude & Latitude.
# `n` is passed by keyword because pandas 2.0 made every Series.str.split
# argument after `pat` keyword-only; the positional form raises TypeError there.
df_KSI_clean3[['Longitude', 'Latitude']] = df_KSI_clean3['geometry2'].str.split(',', n=1, expand=True)

print(df_KSI_clean3[['Longitude','Latitude']])

In [None]:
df_KSI_clean3.head(5)

In [None]:
# Standardize Lat & Long Length to match between the files

# Keep the first 8 characters of each coordinate string. The vectorised .str
# accessor propagates missing values as NaN instead of raising the TypeError
# that plain Python slicing of a float NaN would.
df_KSI_clean3['Longitude']=df_KSI_clean3['Longitude'].str[:8]

# Latitude is truncated first and only then stripped of spaces, matching the
# order of operations applied to the red-light camera frame.
df_KSI_clean3['Latitude']=df_KSI_clean3['Latitude'].str[:8].str.replace(" ", "", regex=False)


In [None]:
df_KSI_clean3.head(5)

In [None]:
# ROAD_CLASS_ID cannot be cast to int while it holds nulls, so discard those rows first
df_KSI_clean3 = df_KSI_clean3.dropna(subset=['ROAD_CLASS_ID'])

# convert 'INJURY_ID' and 'ROAD_CLASS_ID' to int in a single astype call
df_KSI_clean3 = df_KSI_clean3.astype({'INJURY_ID': int, 'ROAD_CLASS_ID': int})

# displaying the datatypes
#display(df_KSI_clean3.dtypes)

# displaying the info
#df_KSI_clean3.info()

In [None]:
#Remove rows where ACCLASS = Property Damage Only

# accident classes to exclude
excluded_acclass = ["Property Damage Only"]

# Build the mask from df_KSI_clean3 itself. A mask taken from df_KSI_clean1
# has a different index (more rows), which forces pandas to reindex the
# boolean key and emit a UserWarning.
is_excluded = df_KSI_clean3['ACCLASS'].isin(excluded_acclass)
df_KSI_clean3 = df_KSI_clean3[~is_excluded]

# count unique values in "ACCLASS" column

df_KSI_clean3.groupby(['ACCLASS']).count()


In [None]:
# discard every row that still has at least one missing value
df_KSI_clean3 = df_KSI_clean3.dropna(axis=0, how='any')
df_KSI_clean3.head(5)

## This section adds Redlight data to KSI dataframe

In [None]:
print(df_KSI_clean3[['ACCNUM','Longitude','Latitude']])

In [None]:
print(df_RL[['Longitude','Latitude']])

In [None]:
# Assign a copy of df_KSI_clean3 to a new Dataframe temporarily
test_df=df_KSI_clean3.copy()

In [None]:
#Create the redlight column and set all values to zero
test_df['IsRedlight']=0

In [None]:
#Create a dataframe which is the inner join of the Redlight camera dataframe and KSI dataframe on Latitude and Longitude.
geomerge_df=pd.merge(df_RL, df_KSI_clean3,how='inner',on=['Longitude','Latitude'])

In [None]:
# Shows how many unique accident numbers (ACCNUM) are in the merged dataframe geomerge_df
geomerge_df['ACCNUM'].nunique()

In [None]:
# Shows the (rows, columns) of geomerge_df. A row count larger than the number
# of unique ACCNUM values means some accident numbers appear more than once.
geomerge_df.shape

In [None]:
# Helper that flips the red-light flag from 0 to 1 for a matching accident number
def toggleredlight (AccidentNum=None):
    """Set IsRedlight to 1 on every test_df row whose ACCNUM equals AccidentNum.

    Works on the module-level test_df dataframe, modifying it in place, and
    returns None. A falsy AccidentNum (None, 0) is ignored.
    """
    if not AccidentNum:
        return
    matching_rows = test_df['ACCNUM'] == AccidentNum
    test_df.loc[matching_rows, "IsRedlight"] = 1

In [None]:
# Flag every test_df row whose ACCNUM appears in geomerge_df, i.e. every
# accident whose Longitude/Latitude matched a red-light camera in the merge.
# A single vectorised isin() lookup replaces calling toggleredlight once per
# merged row (each call scans all of test_df), and the cell no longer outputs
# a Series of None values.
test_df.loc[test_df['ACCNUM'].isin(geomerge_df['ACCNUM']), 'IsRedlight'] = 1

In [None]:
#print(test_df['IsRedlight'])

# Shows how many redlights mapped to our test_df
test_df.groupby(['IsRedlight']).count()

#pd.set_option("display.max_rows", None)
#test_df

In [None]:
# Bind the flagged frame to the next stage name, df_KSI_clean4.
# This is an alias, not a copy: df_KSI_clean4 and test_df refer to the same object.
df_KSI_clean4=test_df

In [None]:
#drop columns which can't be used in ML model

df_KSI_clean5=df_KSI_clean4.drop(['VEHTYPE','MANOEUVER','DRIVACT','DRIVCOND','PEDESTRIAN','CYCLIST','AUTOMOBILE','MOTORCYCLE','TRUCK','TRSN_CITY_VEH','EMERG_VEH','PASSENGER','INJURY_ID','ACCNUM','ObjectId','INVTYPE','INVAGE','YEAR','DATE','TIME','STREET1','STREET2','geometry','geometry2','Longitude','Latitude'],axis=1)
df_KSI_clean5

In [None]:
df_KSI_clean5.loc[df_KSI_clean5['ACCLASS']!='Fatal','REDLIGHT'].value_counts().plot.pie()

In [None]:
df_KSI_clean5.dtypes

In [None]:
#categorical data
categorical_cols = ['HOUR', 'ROAD_CLASS_ID','LOCCOORD','TRAFFCTL','VISIBILITY','LIGHT','RDSFCOND','IMPACTYPE'] 

#import pandas as pd
df_KSI_clean6 = pd.get_dummies(df_KSI_clean5, columns = categorical_cols)

In [None]:
#create a mapping dictionary, you can just enumerate the categories using a dictionary comprehension
#{col: {n: cat for n, cat in enumerate(df_KSI_clean4[col].astype('category').cat.categories)}
 #    for col in df_KSI_clean4}

#convert the columns to categoricals
#df_KSI_clean5=pd.DataFrame({col: df_KSI_clean4[col].astype('category').cat.codes for col in df_KSI_clean4}, index=df_KSI_clean4.index)


In [None]:
#df_KSI_clean6.info()

In [None]:
# from sklearn.preprocessing import LabelEncoder
#import pandas
#from sklearn.preprocessing import LabelEncoder

#le = LabelEncoder()

#df_KSI_clean5['ACCLASS'] = le.fit_transform(df_KSI_clean5['ACCLASS'])
#df_KSI_clean5

## Fixing Class Imbalance

In [None]:
df_KSI_clean6.groupby('ACCLASS').count()['SPEEDING']# / df_KSI_clean4.shape[0]

def encode_acclass(row_val):
    """
    Encode an accident-classification value as a binary target.

    Returns 0 when row_val is exactly 'Non-Fatal Injury' and 1 for any
    other value.
    """
    return 0 if row_val == 'Non-Fatal Injury' else 1
    
df_KSI_clean6['target']= df_KSI_clean6['ACCLASS'].apply(encode_acclass)

In [None]:
# selecting rows based on Accident Classification
Acclass1_df = df_KSI_clean6[df_KSI_clean6['target'] == 1]
Acclass0_df = df_KSI_clean6[df_KSI_clean6['target'] == 0]

print(Acclass1_df.shape)
print(Acclass0_df.shape)

In [None]:
# Resample the two classes: 2000 rows of class 0 drawn without replacement and
# 1000 rows of class 1 drawn with replacement.
# NOTE(review): neither `random` nor `sample` imported below is referenced in
# this cell; the rows are drawn by DataFrame.sample.
import random 
from random import sample

# A fixed random_state makes the draw reproducible across kernel restarts
Acclass0_df=Acclass0_df.sample(2000, random_state=1)
Acclass1_df=Acclass1_df.sample(1000, replace=True, random_state=1)

print(Acclass1_df.shape)
print(Acclass0_df.shape)

In [None]:
df_KSI_clean7=pd.concat([Acclass0_df, Acclass1_df])
df_KSI_clean7.shape

## Random Forest

In [None]:
def generate_model_report (y_actual, predictions):
    """Print accuracy, precision, recall and F1 score of predictions against y_actual.

    Each metric is printed on its own line, in that order; nothing is returned.
    """
    labelled_metrics = (
        ("Accuracy =", accuracy_score),
        ("Precision =", precision_score),
        ("Recall =", recall_score),
        ("F1 Score =", f1_score),
    )
    for label, metric in labelled_metrics:
        print(label, metric(y_actual, predictions))


In [None]:
#Split the data into Training & Testing

# Work on a copy of the one-hot encoded frame df_KSI_clean6.
# NOTE(review): the resampled frame df_KSI_clean7 built earlier is not used
# here - confirm that is intended.
KSI_df=df_KSI_clean6.copy()

# Create our features: every column except the label text and its encoding
X = KSI_df.drop(["ACCLASS", "target"], axis=1)

# Create our target as a NumPy array of the 0/1 values in the target column
y=KSI_df["target"].values

print(y)


# stratify=y keeps the class proportions of y in both splits; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1,stratify=y)
X_train.shape

In [None]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
balanced = BalancedRandomForestClassifier(n_estimators = 500, random_state=1)
balanced.fit(X_train, y_train)

In [None]:
# Calculated the balanced accuracy score
y_pred = balanced.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
generate_model_report(y_test, y_pred)

In [None]:
# List the features sorted in descending order by feature importance
dict(sorted(zip(X.columns,balanced.feature_importances_), reverse=True,key = lambda record:record[1]))

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

## Gradient Boosted Tree

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Create a classifier object
learning_rates = [0.01, 0.05, 0.1, 0.15, 0.20, 0.25, 0.30, 0.50]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=500,
                                            learning_rate=learning_rate,
                                            max_features=5,
                                            max_depth=3,
                                            random_state=0)

    # Fit the model
    classifier.fit(X_train, y_train)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            X_train,
            y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            X_test,
            y_test)))
    print()

In [None]:
# Choose a learning rate and create classifier
classifier = GradientBoostingClassifier(n_estimators=3000,
                                        learning_rate=0.5,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
classifier.fit(X_train, y_train)

# Make Prediction
predictions = classifier.predict(X_test)
outcome_df=pd.DataFrame({"Prediction": predictions, "Actual": y_test})
outcome_df

In [None]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

In [None]:
# Report on the gradient boosting predictions computed in this section.
# y_pred holds the earlier BalancedRandomForestClassifier output, so passing
# it here would just repeat the random-forest metrics.
generate_model_report(y_test, predictions)

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

# Displaying results
display(cm_df)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, predictions)

In [None]:
# Generate classification report
print("Classification Report")
print(classification_report(y_test, predictions))

## Unused code

In [None]:
# #from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()

# #'VEHTYPE','MANOEUVER','DRIVACT','DRIVCOND','LOCCOORD','TRAFFCTL','VISIBILITY','LIGHT','RDSFCOND','ACCLASS','IMPACTYPE']

# df_KSI_clean4 = df_KSI_clean3.copy()
# df_KSI_clean4['VEHTYPE_ENC'] = le.fit_transform(df_KSI_clean4['VEHTYPE'])
# df_KSI_clean4['MANOEUVER_ENC'] = le.fit_transform(df_KSI_clean4['MANOEUVER'])
# df_KSI_clean4['DRIVACT_ENC'] = le.fit_transform(df_KSI_clean4['DRIVACT'])
# df_KSI_clean4['DRIVCOND_ENC'] = le.fit_transform(df_KSI_clean4['DRIVCOND'])
# df_KSI_clean4['LOCCOORD_ENC'] = le.fit_transform(df_KSI_clean4['LOCCOORD'])
# df_KSI_clean4['TRAFFCTL_ENC'] = le.fit_transform(df_KSI_clean4['TRAFFCTL'])
# df_KSI_clean4['VISIBILITY_ENC'] = le.fit_transform(df_KSI_clean4['VISIBILITY'])
# df_KSI_clean4['LIGHT'] = le.fit_transform(df_KSI_clean4['LIGHT'])
# df_KSI_clean4['RDSFCOND'] = le.fit_transform(df_KSI_clean4['RDSFCOND'])
# df_KSI_clean4['ACCLASS'] = le.fit_transform(df_KSI_clean4['ACCLASS'])
# df_KSI_clean4['IMPACTYPE'] = le.fit_transform(df_KSI_clean4['IMPACTYPE'])
# df_KSI_clean4.head()

In [None]:
#Load Road class Table
df_RC_DB=pd.read_csv("../DB_Extracts/road_class.csv")
df_RC_DB.head(5)

In [None]:
#Load Injury table
df_Injury_DB=pd.read_csv("../DB_Extracts/injury.csv")
df_Injury_DB.head(5)