<a href="https://colab.research.google.com/github/aqualytics/INFT6201-Assignment3/blob/main/INFT6201_Analysis_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install --upgrade matplotlib
#!pip install plotly-geo
#!pip install geopandas==0.8.1
#!pip install pyshp==1.2.10
#!pip install shapely==1.6.3

In [None]:
# Basic libraries
import pandas as pd 
import numpy as np
import re

# Visualisation libraries
import matplotlib.pyplot as plt 
import seaborn as sns


# Modelling libraries
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.cluster import KMeans
from sklearn import svm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler

# Notebook-wide presentation defaults: seaborn 'ticks' theme for every plot
# and three decimal places when pandas renders floating point values.
sns.set_style('ticks')
pd.set_option('display.precision', 3)

In [None]:
# Functions
def farenheit_to_celsius(farenheit):
    """Convert a temperature from degrees Fahrenheit to degrees Celsius.

    Only subtraction and multiplication are used, so any operand that
    supports those operators with numbers can be passed.
    """
    degrees_above_freezing = farenheit - 32
    return degrees_above_freezing * (5 / 9)

def m_to_km(m):
    """Convert a distance from miles to kilometres.

    Despite the parameter name, ``m`` is a value in miles: the factor
    applied is 1.609344 km per mile.
    """
    km_per_mile = 1.609344
    return m * km_per_mile

def in_to_mm(i):
    """Convert a length from inches to millimetres (25.4 mm per inch)."""
    mm_per_inch = 25.4
    return i * mm_per_inch

# Function to retrieve performance metrics
def scores(name, y_test, y_pred):
    """Return ``[name, precision, recall, f1, accuracy]`` for one set of predictions.

    Precision, recall and F1 are computed with ``average='weighted'``;
    accuracy is the plain ``accuracy_score``.

    Parameters
    ----------
    name : label stored as the first element of the returned list.
    y_test : true labels.
    y_pred : predicted labels.
    """
    weighted_metrics = [
        metric(y_test, y_pred, average='weighted')
        for metric in (precision_score, recall_score, f1_score)
    ]
    return [name, *weighted_metrics, accuracy_score(y_test, y_pred)]

In [None]:
# Load data
# ny = pd.read_csv('ny_accidents.csv')

In [None]:
# Import data for Google Colab. Remove before submission
import os

# NOTE(review): an access token used to be hardcoded in this URL. Credentials
# must not live in a shared notebook, so the token is now read from the
# GITHUB_RAW_TOKEN environment variable; the previously committed token
# should be treated as leaked and revoked.
DATA_URL = 'https://raw.githubusercontent.com/aqualytics/INFT6201-Assignment3/main/ny_accidents.csv'
github_token = os.environ.get('GITHUB_RAW_TOKEN')

# Without a token the plain URL is requested, which only succeeds if the
# file is publicly readable.
url = f'{DATA_URL}?token={github_token}' if github_token else DATA_URL
df = pd.read_csv(url)


In [None]:
# Drop columns that won't be used in the analysis.
# errors='ignore' skips columns that are already gone, so the cell can be
# re-run safely without a bare `except` that would also hide real errors.
cols = ['Unnamed: 0','Number', 'Airport_Code', 'Timezone', 'Country', 'State', 'Civil_Twilight','Nautical_Twilight','Astronomical_Twilight','Turning_Loop']
df.drop(columns=cols, errors='ignore', inplace=True)

# Drop duplicates. drop_duplicates() returns a new frame by default, so the
# result has to be applied in place (the bare call was a no-op).
df.drop_duplicates(inplace=True)

# Drop rows with na values
df.dropna(inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36779 entries, 0 to 39536
Data columns (total 38 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 36779 non-null  object 
 1   Severity           36779 non-null  int64  
 2   Start_Time         36779 non-null  object 
 3   End_Time           36779 non-null  object 
 4   Start_Lat          36779 non-null  float64
 5   Start_Lng          36779 non-null  float64
 6   End_Lat            36779 non-null  float64
 7   End_Lng            36779 non-null  float64
 8   Distance.mi.       36779 non-null  float64
 9   Description        36779 non-null  object 
 10  Street             36779 non-null  object 
 11  Side               36779 non-null  object 
 12  City               36779 non-null  object 
 13  County             36779 non-null  object 
 14  Zipcode            36779 non-null  object 
 15  Weather_Timestamp  36779 non-null  object 
 16  Temperature.F.     367

In [None]:
# Pre-process of data
# NOTE: this cell modifies `df` in place and the unit conversions are not
# idempotent -- reload the data before running it a second time.

# Rename columns (strip the unit suffixes from the headers)
df.rename(columns = {'Distance.mi.':'Distance',
                     'Wind_Speed.mph.':'Wind_Speed',
                     'Temperature.F.':'Temperature',
                     'Wind_Chill.F.':'Wind_Chill',
                     'Humidity...':'Humidity',
                     'Pressure.in.':'Pressure',
                     'Visibility.mi.':'Visibility',
                     'Precipitation.in.':'Precipitation'}, inplace=True)

# Variable groups as lists
weather = ['Weather_Timestamp', 'Temperature', 'Wind_Chill', 'Humidity', 'Pressure','Visibility','Wind_Direction','Wind_Speed','Precipitation','Weather_Condition']
poi = ['Amenity','Bump','Crossing','Give_Way','Junction','No_Exit','Railway','Roundabout','Station','Stop','Traffic_Calming','Traffic_Signal']

# Convert dates & times to datetime type (unparseable values become NaT)
cols = ['Start_Time','End_Time','Weather_Timestamp']
df[cols] = df[cols].apply(pd.to_datetime, errors='coerce')

# Convert imperial units to metric
df[['Temperature', 'Wind_Chill']] = df[['Temperature', 'Wind_Chill']].apply(farenheit_to_celsius) # farenheit to celsius
df[['Visibility', 'Wind_Speed']] = df[['Visibility', 'Wind_Speed']].apply(m_to_km) # miles to kilometres
# Fix: precipitation is a length in inches, so it must go through in_to_mm.
# It was previously run through the temperature conversion, which produced
# negative "precipitation" values.
df['Precipitation'] = df['Precipitation'].apply(in_to_mm) # inches to mm

# Add severity_cat column.
# A dict lookup leaves unmatched codes as real NaN. np.select with string
# choices and a float NaN default either stringifies the default to 'nan' or
# raises a dtype-promotion TypeError, depending on the NumPy version.
severity_labels = {1: 'Short', 2: 'Medium-Short', 3: 'Medium-Long', 4: 'Long'}
df['severitycat'] = df['Severity'].map(severity_labels)

# Add incident_road_type column.
# Keywords are matched as whole words: plain substring matching made 'St'
# hit names such as 'State' or 'West' and 'Rd' hit 'Third'.
keywords = ['Ave', 'St', 'Rd', 'Blvd', 'Dr', 'Avenue', 'Parkway', 'Pkwy']
road_pattern = r'\b(?:' + '|'.join(keywords) + r')\b'
is_local_road = df['Street'].str.contains(road_pattern, case=False, na=False)
df['road_type'] = np.where(is_local_road, 'local_road', 'high_speed_road')

# Add weather_cat column to consolidate weather conditions.
# np.select keeps the FIRST condition that matches, so the "heavy" patterns
# must be listed before the generic rain/snow patterns; otherwise e.g.
# 'Heavy Rain' is always labelled 'Rain' and the heavy categories are
# (almost) never assigned.
weather_text = df['Weather_Condition']
weather_rules = [
    ('Clear',      'Clear|Fair'),
    ('Cloud',      'Cloud|Overcast'),
    ('Heavy_rain', 'Heavy Rain|Rain Shower|Heavy T-Storm|Heavy Thunderstorms'),
    ('Rain',       'Rain|Storm'),
    ('Heavy_Snow', 'Heavy Snow|Heavy Sleet|Heavy Ice Pellets|Snow Showers|Squalls'),
    ('Snow',       'Snow|Sleet|Ice'),
    ('Fog',        'Fog'),
]
conditions = [weather_text.str.contains(pattern, case=False, na=False) for _, pattern in weather_rules]
choices = [label for label, _ in weather_rules]
df['weather_cat'] = np.select(conditions, choices, default='Other') # anything unmatched is labelled 'Other'

# Add season category (meteorological seasons, northern hemisphere).
# Fix: the previous thresholds gave spring two months (Mar-Apr) and fall
# four (Aug-Nov). Rows without a valid Start_Time get NaN.
month_to_season = {12: 'winter', 1: 'winter', 2: 'winter',
                   3: 'spring', 4: 'spring', 5: 'spring',
                   6: 'summer', 7: 'summer', 8: 'summer',
                   9: 'fall', 10: 'fall', 11: 'fall'}
df['season'] = df['Start_Time'].dt.month.map(month_to_season)

# Add incident duration in whole minutes (fractional minutes are truncated)
df['duration'] = (df['End_Time'] - df['Start_Time']).dt.total_seconds().div(60).astype(int)

# Add traffic calming: True when either Traffic_Calming or Bump is True
df['Traffic_Calming_alt'] = df['Traffic_Calming']
df.loc[df['Bump'] == True, 'Traffic_Calming_alt'] = True

In [None]:
# Summarise weather data: descriptive statistics per numeric weather column,
# extended with skewness and kurtosis as two extra columns.
num_weather = ['Temperature', 'Wind_Chill', 'Humidity', 'Pressure','Visibility','Wind_Speed','Precipitation'] # weather columns containing numerical variables
weather_numeric = df[num_weather]
df_weather_stats = weather_numeric.describe().T.assign(
    Skewness=weather_numeric.skew(),
    Kurtosis=weather_numeric.kurt(),
)
display(df_weather_stats)

# Boxplot each variable

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,Skewness,Kurtosis
Temperature,36779.0,11.406,9.183,-24.444,4.444,11.111,18.333,35.556,0.029,-0.539
Wind_Chill,36779.0,9.998,10.708,-34.667,1.667,11.111,18.333,35.556,-0.146,-0.678
Humidity,36779.0,66.184,20.53,13.0,50.0,68.0,84.0,100.0,-0.222,-0.999
Pressure,36779.0,29.695,0.392,27.55,29.45,29.72,29.97,30.71,-0.726,1.506
Visibility,36779.0,14.524,4.32,0.0,16.093,16.093,16.093,32.187,-1.847,3.666
Wind_Speed,36779.0,14.602,9.435,0.0,8.047,12.875,20.921,64.374,0.679,0.736
Precipitation,36779.0,-17.775,0.015,-17.778,-17.778,-17.778,-17.778,-17.317,10.237,149.763


In [None]:
# Predictive Models
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import StandardScaler

# Prepare training and test data: every column not listed in `exclude`
# becomes a feature; Severity is the target.
exclude = ['ID', 'Start_Time', 'End_Time','Description', 'Street', 'Side','City','County','Zipcode','road_type','Weather_Timestamp', 'Wind_Direction','Severity', 'Weather_Condition', 'Sunrise_Sunset','weather_cat', 'severitycat','season']
df_feat = df.drop(columns=exclude)
df_target = df['Severity'].copy()

# Split training and test data (half of the rows are held out for testing)
X_train, X_test, y_train, y_test = train_test_split(
    df_feat,
    df_target,
    test_size=0.5,
    random_state=42,
)

# Balance training dataset -- only the training split is resampled
smo_tek = SMOTETomek(random_state=0)
X_train, y_train = smo_tek.fit_resample(X_train, y_train)

# Standardise features: the scaler is fitted on the (resampled) training data
# and the same transformation is then applied to the test data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Create df to store model performance
results = pd.DataFrame(columns=['Model', 'Precision', 'Recall', 'F1-Score', 'Accuracy'])

In [None]:
# Classification of severity
# Supervised Learning Algorithms
# Logistic regression: grid-search the regularisation strength C
parameters = dict(
    solver=['newton-cg'],
    penalty=['l2'],
    C=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
)

# 5-fold cross-validated search; candidates are ranked by weighted F1
grid = GridSearchCV(
    LogisticRegression(),
    param_grid=parameters,
    cv=5,
    scoring='f1_weighted',
)
grid.fit(X_train, y_train)

# Report the parameters of the best-scoring candidate
optimal_model = grid.best_estimator_
print('Best parameters are: ')
print(grid.best_params_)

# Predict on the test data with the search object (it delegates to the
# refitted best estimator) and store the metrics in the results df
grid_predictions = grid.predict(X_test)
results.loc[len(results)] = scores('Logistic Regression', y_test, grid_predictions)

In [None]:
# K-Nearest Neighbours: randomised search over n_neighbors in 1..19
parameters = {'n_neighbors': np.arange(1, 20)}

# 2-fold cross-validated randomised search; candidates are ranked by weighted F1
grid = RandomizedSearchCV(
    KNeighborsClassifier(),
    parameters,
    cv=2,
    random_state=1,
    scoring='f1_weighted',
)
grid.fit(X_train, y_train)

# Report the parameters of the best-scoring candidate
optimal_model = grid.best_estimator_
print('Best parameters are: ')
print(grid.best_params_)

# Predict on the test data and store the metrics in the results df
grid_predictions = grid.predict(X_test)
results.loc[len(results)] = scores('KNN', y_test, grid_predictions)

In [None]:
# SVM
# Defining parameter range for grid search.
# `gamma` is deliberately not searched: SVC only uses it for the 'rbf', 'poly'
# and 'sigmoid' kernels, so with kernel='linear' varying it just refits
# identical models and doubles the search time. Add it back if a non-linear
# kernel is added to the grid.
parameters = {
    "C": [0.1, 1],
    "kernel": ['linear'],
}

# 2-fold cross-validated grid search; candidates are ranked by weighted F1
grid = GridSearchCV(svm.SVC(), param_grid=parameters, cv=2, verbose=2, scoring='f1_weighted')
grid.fit(X_train, y_train)

# Report the parameters of the best-scoring candidate
optimal_model = grid.best_estimator_
print('Best parameters are: ')
print(grid.best_params_)

grid_predictions = grid.predict(X_test) # Perform prediction on test data
results.loc[len(results)] = scores('SVM', y_test, grid_predictions) # Store metrics to results df

In [None]:
# Ensemble learning algorithms (these classifiers are trained on the labels,
# i.e. they are supervised)
# Random Forest with default hyper-parameters and a fixed seed
mod = RandomForestClassifier(random_state=0)
mod.fit(X_train, y_train)                # train model
predicted_class = mod.predict(X_test)    # perform prediction

# store performance metrics to results df
results.loc[len(results)] = scores('Random Forest', y_test, predicted_class)

In [None]:
# AdaBoost with default hyper-parameters and a fixed seed
mod = AdaBoostClassifier(random_state=0)
mod.fit(X_train, y_train)                # train model
predicted_class = mod.predict(X_test)    # perform prediction

# store performance metrics to results df
results.loc[len(results)] = scores('AdaBoost', y_test, predicted_class)

In [None]:
# GradientBoosting with default hyper-parameters and a fixed seed
mod = GradientBoostingClassifier(random_state=0)
mod.fit(X_train, y_train)                # train model
predicted_class = mod.predict(X_test)    # perform prediction

# store performance metrics to results df
results.loc[len(results)] = scores('GradientBoosting', y_test, predicted_class)

In [None]:
# Output results of models: one row per classifier, metrics rounded to four
# decimals, rendered without the index column
display(
    results
    .round(4)
    .style
    .hide(axis='index')
)