# Issues faced:
* The size of the data set is very large
* The data is very unbalanced

# Importing necessary libraries

In [None]:
# Importing data processing libraries
import math
import numpy as np
from numpy import log
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import category_encoders as ce
from category_encoders.binary import BinaryEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion
import csv
from collections import Counter
from imblearn.over_sampling import SMOTE
import time
from nltk.corpus import stopwords
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import RepeatedStratifiedKFold

# Importing data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Model libraries 
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.naive_bayes import CategoricalNB, BernoulliNB, MultinomialNB, GaussianNB
import xgboost

# Model evaluation 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

# Hyperparameter tuning libraries
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import random


# Importing Data

In [None]:
# Function to import dataset
def importData(path="../input/us-accidents/US_Accidents_Dec20_Updated.csv",
               test_size=0.2, random_state=21):
    """Load the accidents CSV and split it into stratified train/test frames.

    Parameters
    ----------
    path : str
        Location of the CSV file. It must contain a "Severity" column,
        which is used as the stratification target.
    test_size : float
        Fraction of the rows placed in the test frame.
    random_state : int
        Seed of the shuffle, so the same split is produced on every call.

    Returns
    -------
    list
        ``[train, test]``, two DataFrames holding disjoint rows of the file.
    """
    data = pd.read_csv(path)
    split = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    # n_splits=1, so the generator yields exactly one (train, test) pair.
    train_index, test_index = next(split.split(data, data["Severity"]))
    # The splitter yields positional indices, so select rows by position.
    return [data.iloc[train_index], data.iloc[test_index]]

In [None]:
# Stratified splitting of the raw data into train and test frames
train, test = importData()

In [None]:
# Print a summary of the training frame.
train.info()

In [None]:
# Print a summary of the test frame.
test.info()

# Taking a quick look into the data

In [None]:
# train.head(10)

In [None]:
# n = len(train["Severity"])
# classes = [(clas,float(count)) for clas,count in Counter(train["Severity"]).items()]
# k = len(classes)

# H = -sum([ (count/n) * log((count/n)) for clas,count in classes]) #Shannon entropy

# print(H/log(k))

***Data is very imbalanced***

# Visualizing Imbalance

In [None]:
# severity_counts = train["Severity"].value_counts()

# plt.figure(figsize=(10, 8))
# plt.title("Histogram for the severity")
# sns.barplot(x = severity_counts.index,y = severity_counts.values)
# sns.set(style="darkgrid")
# sns.set_context("talk")
# plt.xlabel("Severity")
# plt.ylabel("Number of Accidents")
# plt.show()
# print(severity_counts)

# Checking correlation

In [None]:
# corr_matrix = train.corr()

# plt.figure(figsize=(12, 12))
# sns.heatmap(corr_matrix, vmin=-1, vmax=1, cmap="seismic")
# plt.gca().patch.set(hatch="X", edgecolor="#0080ff")
# plt.show()

***A few very highly correlated feature groups are:***
* Start_Lat and End_Lat
* Start_Lng and End_Lng
* Start_Lng and End_Lng with Temperature and Wind_Chill
* Temperature and Wind_Chill
* Humidity with Temperature and Wind_Chill
* Visibility and Humidity
* Traffic_Signal and Crossing
* Traffic_Calming and Bump

# Data Preprocessing

In [None]:
# train.info()

# Dropping Few Features
* ID
* End_Lat
* End_Lng
* Description
* Weather_Timestamp
* Country

In [None]:
class dropCompletelyUnnecessaryFeatures(BaseEstimator, TransformerMixin):
    """Transformer that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    arg : list of str, optional
        Column labels to drop. When None, every column of the frame passed
        to ``transform`` is dropped.
    """

    def __init__(self, arg=None):
        # Stored under the constructor parameter's own name so that
        # BaseEstimator.get_params() / sklearn.base.clone() can look it up.
        self.arg = arg

    @property
    def features(self):
        """Column labels to drop (alias of ``arg``)."""
        return self.arg

    @features.setter
    def features(self, value):
        self.arg = value

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, data, y=None):
        """Drop the configured columns from ``data`` in place and return it."""
        # `is None` rather than `== None`: `==` against an array-like is
        # element-wise and has no single truth value.
        features = list(data.columns.values) if self.features is None else self.features
        data.drop(features, axis=1, inplace=True)
        return data
    
# temp = dropCompletelyUnnecessaryFeatures(['ID', 'End_Lat','End_Lng', 'Description','Weather_Timestamp','Country'])
# temp.transform(train)

# Converting Start_Time and End_Time from object to float/int

In [None]:
class convertStartAndEndTime(BaseEstimator, TransformerMixin):
    """Replace the Start_Time/End_Time columns by numeric date parts and a duration."""

    def __init__(self, arg=None):
        self.arg = arg

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, df, y=None):
        """Expand the two timestamp columns of ``df`` in place and return it.

        Adds Year, Month, Weekday, Day, Hour and Minute (all taken from
        Start_Time) and Duration (End_Time minus Start_Time, in seconds),
        then removes Start_Time and End_Time.
        """
        # Parse both timestamp columns into datetimes.
        for stamp in ("Start_Time", "End_Time"):
            df[stamp] = pd.to_datetime(df[stamp])

        # Each new column is the lower-cased datetime accessor of Start_Time.
        for part in ("Year", "Month", "Weekday", "Day", "Hour", "Minute"):
            df[part] = getattr(df["Start_Time"].dt, part.lower())

        elapsed = df["End_Time"] - df["Start_Time"]
        df["Duration"] = elapsed.dt.total_seconds()

        df.drop(columns=["End_Time", "Start_Time"], inplace=True)
        return df
    
# temp = convertStartAndEndTime()
# temp.transform(train)
# train.head(10)

# Finding unique entries

In [None]:
# categorical_features = set(['Sunrise_Sunset','Civil_Twilight','Nautical_Twilight','Astronomical_Twilight','Weather_Condition',
#                            'Wind_Direction','Street','State','Side','City','County','Zipcode','Timezone','Airport_Code'])

# for cat in categorical_features:
#     train[cat] = train[cat].astype("category")

# train.info()

# print("\nUnique classes for each categorical feature:")
# for cat in categorical_features:
#     print("{:15s}".format(cat), "\t", len(train[cat].unique()))

# Checking missing entries

In [None]:
# train.isna().sum()

# Dropping a few features 
* County, State, Zipcode, Timezone, Airport_Code, Street (as we keep the location only at the "City" level)
* Number and Precipitation (as they contain many missing entries)
* Wind_Chill (As it contains many missing entries and is highly correlated with Temperature)

In [None]:
class dropFewMoreFeatures(BaseEstimator, TransformerMixin):
    """Transformer that removes a further fixed set of columns from a DataFrame.

    Parameters
    ----------
    arg : list of str, optional
        Column labels to drop. When None, every column of the frame passed
        to ``transform`` is dropped.
    """

    def __init__(self, arg=None):
        # Stored under the constructor parameter's own name so that
        # BaseEstimator.get_params() / sklearn.base.clone() can look it up.
        self.arg = arg

    @property
    def features(self):
        """Column labels to drop (alias of ``arg``)."""
        return self.arg

    @features.setter
    def features(self, value):
        self.arg = value

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, data, y=None):
        """Drop the configured columns from ``data`` in place and return it."""
        # `is None` rather than `== None`: `==` against an array-like is
        # element-wise and has no single truth value.
        features = list(data.columns.values) if self.features is None else self.features
        data.drop(features, axis=1, inplace=True)
        return data
    
# temp = dropFewMoreFeatures(['County', 'State','Zipcode', 'Timezone','Airport_Code','Street', 'Number', 'Precipitation(in)','Wind_Chill(F)'])
# temp.transform(train)

In [None]:
# train.describe()

***As the continuous features with missing values do not have much variance, we can fill their missing values with the column mean. Rows that have missing values in categorical variables are dropped.***

# Filling missing values

In [None]:
class fillNA(BaseEstimator, TransformerMixin):
    """Mean-impute selected columns, then drop rows that still contain NaN.

    Parameters
    ----------
    arg : list of str, optional
        Columns whose missing values are replaced by the column mean. When
        None, no imputation is performed and only the row drop is applied.
    """

    def __init__(self, arg=None):
        # Stored under the constructor parameter's own name so that
        # BaseEstimator.get_params() / sklearn.base.clone() can look it up.
        self.arg = arg

    @property
    def featuresToFill(self):
        """Columns to mean-impute (alias of ``arg``)."""
        return self.arg

    @featuresToFill.setter
    def featuresToFill(self, value):
        self.arg = value

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Impute and prune ``X`` in place and return it.

        The means are computed from the frame passed in here, not learned
        in ``fit``.
        """
        columns = self.featuresToFill
        # Indexing the frame with None (the constructor default) raises
        # KeyError, so imputation is skipped when no columns are configured.
        if columns is not None:
            X[columns] = X[columns].fillna(X[columns].mean())
        # Any row that still has a missing value in any column is removed.
        X.dropna(inplace=True)
        return X

# temp = fillNA(['Temperature(F)','Humidity(%)','Pressure(in)','Visibility(mi)','Wind_Speed(mph)'])
# temp.transform(train)
# train.info()

In [None]:
# train.info()

# Class for replacing boolean values

In [None]:
class replaceBoolean(BaseEstimator, TransformerMixin):
    """Transformer that rewrites True/False values as 1/0.

    Parameters
    ----------
    arg : list of str, optional
        Columns to convert. When None, every column of the frame passed to
        ``transform`` is processed.
    """

    def __init__(self, arg=None):
        # Stored under the constructor parameter's own name so that
        # BaseEstimator.get_params() / sklearn.base.clone() can look it up.
        self.arg = arg

    @property
    def features(self):
        """Columns to convert (alias of ``arg``)."""
        return self.arg

    @features.setter
    def features(self, value):
        self.arg = value

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Convert the configured columns of ``X`` in place and return it."""
        # `is None` rather than `== None`: `==` against an array-like is
        # element-wise and has no single truth value.
        features = list(X.columns.values) if self.features is None else self.features
        for feature in features:
            column = X[feature]
            if column.dtype == bool:
                # Cast the whole column at once: writing ints into a bool
                # column through a mask is an incompatible-dtype assignment,
                # which newer pandas releases deprecate.
                X[feature] = column.astype(int)
            else:
                # Non-bool columns: only cells equal to True/False are
                # rewritten, everything else is left untouched.
                X.loc[X[feature] == True, feature] = 1
                X.loc[X[feature] == False, feature] = 0
        return X

# Creating a preprocessing pipeline

In [None]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only a chosen subset of columns."""

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the configured ``columns``."""
        wanted = self.columns
        return X[wanted]

In [None]:
def preprocessData(path="../input/us-accidents/US_Accidents_Dec20_Updated.csv",
                   test_size=0.2, random_state=21):
    """Load, clean, encode and scale the accidents data, then split it.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    test_size : float
        Fraction of the processed rows placed in the test frame.
    random_state : int
        Seed of the stratified shuffle split.

    Returns
    -------
    list
        ``[train, test]`` DataFrames. The feature columns are float32 and
        labelled 1..n; the target is in the "Severity" column.
    """
    data = pd.read_csv(path)

    def customTransforms(data):
        """Run the hand-written cleaning transformers in order, then de-duplicate."""
        steps = (
            dropCompletelyUnnecessaryFeatures(['ID', 'End_Lat','End_Lng', 'Description','Weather_Timestamp','Country']),
            convertStartAndEndTime(),
            dropFewMoreFeatures(['County', 'State','Zipcode', 'Timezone','Airport_Code','Street', 'Number', 'Precipitation(in)','Wind_Chill(F)']),
            fillNA(['Temperature(F)','Humidity(%)','Pressure(in)','Visibility(mi)','Wind_Speed(mph)']),
            replaceBoolean(['Amenity','Bump','Crossing','Give_Way','Junction','No_Exit','Railway','Roundabout','Station','Stop','Traffic_Calming','Traffic_Signal','Turning_Loop']),
        )
        for step in steps:
            data = step.transform(data)
        data.drop_duplicates(inplace=True)
        return data

    # Binary-encode the listed columns, one-hot encode the day/night flags
    # and road side, standardize the numeric columns, and pass every
    # remaining column through unchanged.
    finalChangesPipeline = ColumnTransformer([
        ('binary_encoder', BinaryEncoder(cols = ['City','Wind_Direction','Weather_Condition'], return_df=True),['City','Wind_Direction','Weather_Condition']),
        ('one_hot_encoder',ce.OneHotEncoder(),['Sunrise_Sunset','Civil_Twilight','Nautical_Twilight','Astronomical_Twilight','Side']),
        ('scalar', StandardScaler(),['Start_Lat','Start_Lng','Distance(mi)','Temperature(F)','Humidity(%)','Pressure(in)','Visibility(mi)','Wind_Speed(mph)','Duration','Month','Weekday','Day','Hour','Minute','Year']),
    ],n_jobs = 1, verbose = True, remainder='passthrough')

    data = customTransforms(data)

    # Set the target aside as a plain list so it can be re-attached by
    # position after the features have been rebuilt as a new frame.
    y = data["Severity"].tolist()
    data.drop("Severity", axis = 1, inplace = True)

    # NOTE(review): the encoders and the scaler are fitted on the full data
    # set before the train/test split below, so test rows influence the
    # fitted statistics.
    data = finalChangesPipeline.fit_transform(data)

    # The transformed features come back without column names; label them 1..n.
    data = pd.DataFrame(data, columns = list(range(1, data.shape[1] + 1)))
    data = data.astype(dtype = np.float32)

    data["Severity"] = y
    split = StratifiedShuffleSplit(n_splits=1, test_size = test_size, random_state = random_state)
    # n_splits=1, so the generator yields exactly one (train, test) pair.
    train_index, test_index = next(split.split(data, data["Severity"]))
    # The splitter yields positional indices, so select rows by position.
    return [data.iloc[train_index], data.iloc[test_index]]

# Rebind train/test to the preprocessed frames returned by preprocessData().
train, test = preprocessData()

# Trying different models

# Random Forest

In [None]:
# Hyperparameters are the best ones recorded from the randomized search at the
# end of this notebook; random_state matches the seed used for that search so
# repeated runs are reproducible.
rnf_clf = RandomForestClassifier(max_depth=192, max_features=29, n_estimators=211, random_state=21)
# 0.8422375933500217

#Output
# Training Accuracy RandomForestClassifier 0.9990186502845451
# F1-Score for Training RandomForestClassifier 0.9990149141866291
# Classification Report for Training RandomForestClassifier               precision    recall  f1-score   support

#            1       1.00      1.00      1.00     16834
#            2       1.00      1.00      1.00   1247437
#            3       1.00      1.00      1.00    388821
#            4       1.00      0.98      0.99     71064

#     accuracy                           1.00   1724156
#    macro avg       1.00      0.99      1.00   1724156
# weighted avg       1.00      1.00      1.00   1724156

# Validation Accuracy RandomForestClassifier 0.8318067928730513
# F1-Score for Validation RandomForestClassifier 0.8200637863625027
# Classification Report for Validation RandomForestClassifier               precision    recall  f1-score   support

#            1       0.84      0.49      0.62      4209
#            2       0.85      0.95      0.89    311860
#            3       0.76      0.56      0.64     97205
#            4       0.75      0.40      0.52     17766

#     accuracy                           0.83    431040
#    macro avg       0.80      0.60      0.67    431040
# weighted avg       0.82      0.83      0.82    431040

# Extra Trees Classifier

In [None]:
# Default hyperparameters; seeded like the other randomized steps in this
# notebook so repeated runs are reproducible.
ext_clf = ExtraTreesClassifier(random_state=21)

#Output
# Training Accuracy ExtraTreesClassifier 0.999025610211605
# F1-Score for Training ExtraTreesClassifier 0.9990205684096188
# Classification Report for Training               precision    recall  f1-score   support

#            1       1.00      1.00      1.00     16834
#            2       1.00      1.00      1.00   1247437
#            3       1.00      1.00      1.00    388821
#            4       1.00      0.98      0.99     71064

#     accuracy                           1.00   1724156
#    macro avg       1.00      0.99      1.00   1724156
# weighted avg       1.00      1.00      1.00   1724156

# Validation Accuracy ExtraTreesClassifier 0.7990209725315516
# F1-Score for Validation ExtraTreesClassifier 0.7793423318222337
# Classification Report for Validation               precision    recall  f1-score   support

#            1       0.78      0.43      0.55      4209
#            2       0.82      0.94      0.87    311860
#            3       0.70      0.46      0.56     97205
#            4       0.75      0.25      0.38     17766

#     accuracy                           0.80    431040
#    macro avg       0.76      0.52      0.59    431040
# weighted avg       0.79      0.80      0.78    431040

# XGB Classifier

In [None]:
# XGBoost classifier constructed with the library's default hyperparameters.
xgb_clf = xgboost.XGBClassifier()

#Output
# Training Accuracy XGBClassifier 0.8285155171573801
# F1-Score for Training XGBClassifier 0.8190635931663616
# Classification Report for Training               precision    recall  f1-score   support

#            1       0.76      0.70      0.73     16834
#            2       0.86      0.93      0.89   1247437
#            3       0.73      0.57      0.64    388821
#            4       0.69      0.42      0.52     71064

#     accuracy                           0.83   1724156
#    macro avg       0.76      0.66      0.70   1724156
# weighted avg       0.82      0.83      0.82   1724156

# Validation Accuracy XGBClassifier 0.824072011878248
# F1-Score for Validation XGBClassifier 0.8143256111124326
# Classification Report for Validation               precision    recall  f1-score   support

#            1       0.74      0.67      0.70      4209
#            2       0.85      0.93      0.89    311860
#            3       0.72      0.56      0.63     97205
#            4       0.66      0.41      0.50     17766

#     accuracy                           0.82    431040
#    macro avg       0.74      0.64      0.68    431040
# weighted avg       0.81      0.82      0.81    431040

# Ada Boost Classifier

In [None]:
# Default hyperparameters; seeded like the other randomized steps in this
# notebook so repeated runs are reproducible.
adab_clf = AdaBoostClassifier(random_state=21)

#Output
# Training Accuracy AdaBoostClassifier 0.7255921157946265
# F1-Score for Training AdaBoostClassifier 0.6823642127666705
# Classification Report for Training               precision    recall  f1-score   support

#            1       0.37      0.07      0.12     16834
#            2       0.76      0.93      0.83   1247437
#            3       0.52      0.22      0.31    388821
#            4       0.25      0.13      0.17     71064

#     accuracy                           0.73   1724156
#    macro avg       0.48      0.34      0.36   1724156
# weighted avg       0.68      0.73      0.68   1724156

# Validation Accuracy AdaBoostClassifier 0.7255939123979213
# F1-Score for Validation AdaBoostClassifier 0.6824048586340324
# Classification Report for Validation               precision    recall  f1-score   support

#            1       0.41      0.08      0.13      4209
#            2       0.76      0.93      0.83    311860
#            3       0.52      0.22      0.31     97205
#            4       0.25      0.13      0.17     17766

#     accuracy                           0.73    431040
#    macro avg       0.48      0.34      0.36    431040
# weighted avg       0.68      0.73      0.68    431040

# Gradient Boosting Classifier

In [None]:
# Default hyperparameters; seeded like the other randomized steps in this
# notebook so repeated runs are reproducible.
gb_clf = GradientBoostingClassifier(random_state=21)

# Voting Classifier

In [None]:
# Ensemble that combines the random forest and the XGBoost classifier.
voting_clf = VotingClassifier(
    n_jobs=2,
    estimators=[
        ("rf", rnf_clf),
        ("xgb_clf", xgb_clf),
    ],
)

In [None]:
# Carve a stratified 20% validation set out of the training frame.
split = StratifiedShuffleSplit(n_splits=1, test_size = 0.2, random_state = 21)
# n_splits=1, so the generator yields exactly one pair of positional index arrays.
train_index, valid_index = next(split.split(train, train["Severity"]))
X_train = train.iloc[train_index]
X_valid = train.iloc[valid_index]

# Separate the target from the features. The drop is non-mutating, so the
# row selections taken from `train` above are never modified in place.
y_train = X_train['Severity'].copy()
X_train = X_train.drop(columns=["Severity"])
y_valid = X_valid['Severity'].copy()
X_valid = X_valid.drop(columns=["Severity"])

voting_clf.fit(X_train,y_train)
y_pred = voting_clf.predict(X_train)
y_valid_pred = voting_clf.predict(X_valid)

# Report accuracy, weighted F1 and the per-class report for both partitions.
model_name = voting_clf.__class__.__name__
for stage, y_true, y_hat in (("Training", y_train, y_pred), ("Validation", y_valid, y_valid_pred)):
    print(stage + " Accuracy", model_name, accuracy_score(y_true, y_hat))
    print("F1-Score for " + stage, model_name, f1_score(y_true, y_hat, average="weighted"))
    print("Classification Report for " + stage, classification_report(y_true, y_hat))
    
# for clf in (xgb):
#     clf.fit(X_train,y_train)
#     y_pred = clf.predict(X_train)
#     y_valid_pred = clf.predict(X_valid)
#     print("Training Accuracy", clf.__class__.__name__, accuracy_score(y_train, y_pred))
#     print("F1-Score for Training", clf.__class__.__name__, f1_score(y_train, y_pred, average="weighted"))
#     print("Classification Report for Training", clf.__class__.__name__, classification_report(y_train, y_pred))
#     print("Validation Accuracy", clf.__class__.__name__, accuracy_score(y_valid, y_valid_pred))
#     print("F1-Score for Validation", clf.__class__.__name__, f1_score(y_valid, y_valid_pred, average="weighted"))
#     print("Classification Report for Validation", clf.__class__.__name__, classification_report(y_valid, y_valid_pred))

In [None]:
# voting_clf.fit(X_train,y_train)
# y_pred = voting_clf.predict(X_train)
# y_valid_pred = voting_clf.predict(X_valid)
# print("Training Accuracy", voting_clf.__class__.__name__, accuracy_score(y_train, y_pred))
# print("F1-Score for Training", voting_clf.__class__.__name__, f1_score(y_train, y_pred, average="weighted"))
# print("Classification Report for Training", classification_report(y_train, y_pred))
# print("Validation Accuracy", voting_clf.__class__.__name__, accuracy_score(y_valid, y_valid_pred))
# print("F1-Score for Validation", voting_clf.__class__.__name__, f1_score(y_valid, y_valid_pred, average="weighted"))
# print("Classification Report for Validation", classification_report(y_valid, y_valid_pred))

In [None]:
# param_distribs = {
#     'n_estimators': randint(low=200, high=300),
#     'max_features': randint(low=30, high=60),
#     'max_depth':randint(low=185, high = 205)
# }

# forest_clf = RandomForestClassifier(random_state=21)
# forest_clf_search = RandomizedSearchCV(forest_clf, param_distributions=param_distribs, n_jobs = 2, return_train_score = True,
#                                 n_iter=5, cv=2, random_state=21)
# forest_clf_search.fit(X_train,y_train)

# print(forest_clf_search.best_params_)
# print(forest_clf_search.best_score_)
# pd.DataFrame(forest_clf_search.cv_results_)

# {'max_depth': 192, 'max_features': 29, 'n_estimators': 211}
# 0.8422375933500217