In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from matplotlib import pyplot as plt

In [3]:
#Read/Use Clean DataFrame
person_df = pd.read_csv("../person.CSV")
vehicle_df = pd.read_csv("../vehicle.CSV")
accident_df = pd.read_csv("../accident.CSV")
distract_df = pd.read_csv("../distract.CSV")

In [None]:
#Checking person dataframe
person_df.head()

In [4]:
#Only keeping certain columns from person dataframe
people_df = person_df[['CASENUM', 'PER_NO', 'SEXNAME', 'AGE', 'AIR_BAGNAME', 'DRINKINGNAME', 'REST_USENAME']]
people_df.head()

Unnamed: 0,CASENUM,PER_NO,SEXNAME,AGE,AIR_BAGNAME,DRINKINGNAME,REST_USENAME
0,201901174219,1,Male,39,Not Deployed,No (Alcohol Not Involved),Shoulder and Lap Belt Used
1,201901176655,1,Male,20,Not Deployed,Not Reported,Shoulder and Lap Belt Used
2,201901176655,2,Male,19,Not Deployed,Not Reported,None Used/Not Applicable
3,201901176655,3,Male,999,Not Deployed,Not Reported,Reported as Unknown
4,201901176655,4,Male,999,Not Deployed,Not Reported,Reported as Unknown


In [None]:
#Checking vehicle dataframe
vehicle_df.head()

In [5]:
#Creating new vehicle dataframe with only certain columns
new_vehicle_df = vehicle_df[['CASENUM', 'VEH_NO', 'MAKENAME', 'MAK_MODNAME', 'MOD_YEAR', 'TRAV_SPNAME']]
new_vehicle_df.head()

Unnamed: 0,CASENUM,VEH_NO,MAKENAME,MAK_MODNAME,MOD_YEAR,TRAV_SPNAME
0,201901174219,1,Ford,Ford Ranger,2006,Not Reported
1,201901176655,1,Dodge,Dodge Dart (2013 on. See model 001 for 1960-19...,2015,Not Reported
2,201901176655,2,Honda,Honda CR-V,2006,Not Reported
3,201901176667,1,Nissan/Datsun,Nissan/Datsun Xterra,2000,040 MPH
4,201901176667,2,Chevrolet,"Chevrolet C, K, R, V-series pickup/Silverado",1979,Not Reported


In [None]:
#Checking accident dataframe
accident_df.head()

In [6]:
#Creating new accident dataframe with only certain columns
new_accident_df = accident_df[['CASENUM', 'STRATUMNAME', 'REGIONNAME', 'URBANICITY', 'MONTHNAME', 'ALCOHOLNAME', 'HARM_EVNAME', 'WEATHERNAME',
                      'HOUR', 'REL_ROADNAME', 'MAN_COLLNAME', 'MAX_SEVNAME']]
new_accident_df.head()

Unnamed: 0,CASENUM,STRATUMNAME,REGIONNAME,URBANICITY,MONTHNAME,ALCOHOLNAME,HARM_EVNAME,WEATHERNAME,HOUR,REL_ROADNAME,MAN_COLLNAME,MAX_SEVNAME
0,201901174219,Stratum 10 - Other,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,January,No Alcohol Involved,Curb,Cloudy,8,On Median,The First Harmful Event was Not a Collision wi...,No Apparent Injury (O)
1,201901176655,Stratum 6 - LMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,January,Reported as Unknown,Motor Vehicle In-Transport,Clear,2,On Roadway,Angle,Possible Injury (C)
2,201901176667,Stratum 8 - NLMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",1,January,Alcohol Involved,Motor Vehicle In-Transport,Clear,5,On Roadway,Angle,Suspected Minor Injury (B)
3,201901176694,Stratum 9 - LMY PV No Injuries in Crash,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",1,January,No Alcohol Involved,Traffic Sign Support,Clear,4,On Roadside,The First Harmful Event was Not a Collision wi...,No Apparent Injury (O)
4,201901176702,Stratum 6 - LMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",1,January,No Alcohol Involved,Motor Vehicle In-Transport,Clear,9,On Roadway,Angle,Possible Injury (C)


In [None]:
#Checking distract dataframe
distract_df.head()

In [7]:
#Creating new distract dataframe with only certain columns
new_distract_df = distract_df[['CASENUM', 'VEH_NO', 'MDRDSTRDNAME']]
new_distract_df.head()

Unnamed: 0,CASENUM,VEH_NO,MDRDSTRDNAME
0,201901174219,1,Not Distracted
1,201901176655,1,Not Reported
2,201901176655,2,Not Reported
3,201901176667,1,"Inattention (Inattentive), Details Unknown"
4,201901176667,2,Not Reported


In [8]:
#Merging the new accident and vehicle dataframes together
data1_df = new_accident_df.merge(new_vehicle_df, on='CASENUM')
data1_df.head()

Unnamed: 0,CASENUM,STRATUMNAME,REGIONNAME,URBANICITY,MONTHNAME,ALCOHOLNAME,HARM_EVNAME,WEATHERNAME,HOUR,REL_ROADNAME,MAN_COLLNAME,MAX_SEVNAME,VEH_NO,MAKENAME,MAK_MODNAME,MOD_YEAR,TRAV_SPNAME
0,201901174219,Stratum 10 - Other,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,January,No Alcohol Involved,Curb,Cloudy,8,On Median,The First Harmful Event was Not a Collision wi...,No Apparent Injury (O),1,Ford,Ford Ranger,2006,Not Reported
1,201901176655,Stratum 6 - LMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,January,Reported as Unknown,Motor Vehicle In-Transport,Clear,2,On Roadway,Angle,Possible Injury (C),1,Dodge,Dodge Dart (2013 on. See model 001 for 1960-19...,2015,Not Reported
2,201901176655,Stratum 6 - LMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,January,Reported as Unknown,Motor Vehicle In-Transport,Clear,2,On Roadway,Angle,Possible Injury (C),2,Honda,Honda CR-V,2006,Not Reported
3,201901176667,Stratum 8 - NLMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",1,January,Alcohol Involved,Motor Vehicle In-Transport,Clear,5,On Roadway,Angle,Suspected Minor Injury (B),1,Nissan/Datsun,Nissan/Datsun Xterra,2000,040 MPH
4,201901176667,Stratum 8 - NLMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",1,January,Alcohol Involved,Motor Vehicle In-Transport,Clear,5,On Roadway,Angle,Suspected Minor Injury (B),2,Chevrolet,"Chevrolet C, K, R, V-series pickup/Silverado",1979,Not Reported


In [9]:
#Merging the new distract dataframe into information
data2_df = data1_df.merge(new_distract_df, on='CASENUM')
data2_df.head()

Unnamed: 0,CASENUM,STRATUMNAME,REGIONNAME,URBANICITY,MONTHNAME,ALCOHOLNAME,HARM_EVNAME,WEATHERNAME,HOUR,REL_ROADNAME,MAN_COLLNAME,MAX_SEVNAME,VEH_NO_x,MAKENAME,MAK_MODNAME,MOD_YEAR,TRAV_SPNAME,VEH_NO_y,MDRDSTRDNAME
0,201901174219,Stratum 10 - Other,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,January,No Alcohol Involved,Curb,Cloudy,8,On Median,The First Harmful Event was Not a Collision wi...,No Apparent Injury (O),1,Ford,Ford Ranger,2006,Not Reported,1,Not Distracted
1,201901176655,Stratum 6 - LMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,January,Reported as Unknown,Motor Vehicle In-Transport,Clear,2,On Roadway,Angle,Possible Injury (C),1,Dodge,Dodge Dart (2013 on. See model 001 for 1960-19...,2015,Not Reported,1,Not Reported
2,201901176655,Stratum 6 - LMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,January,Reported as Unknown,Motor Vehicle In-Transport,Clear,2,On Roadway,Angle,Possible Injury (C),1,Dodge,Dodge Dart (2013 on. See model 001 for 1960-19...,2015,Not Reported,2,Not Reported
3,201901176655,Stratum 6 - LMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,January,Reported as Unknown,Motor Vehicle In-Transport,Clear,2,On Roadway,Angle,Possible Injury (C),2,Honda,Honda CR-V,2006,Not Reported,1,Not Reported
4,201901176655,Stratum 6 - LMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,January,Reported as Unknown,Motor Vehicle In-Transport,Clear,2,On Roadway,Angle,Possible Injury (C),2,Honda,Honda CR-V,2006,Not Reported,2,Not Reported


In [10]:
#Merging people dataframe into final dataframe
data_df = data2_df.merge(people_df, on='CASENUM')
data_df.head()

Unnamed: 0,CASENUM,STRATUMNAME,REGIONNAME,URBANICITY,MONTHNAME,ALCOHOLNAME,HARM_EVNAME,WEATHERNAME,HOUR,REL_ROADNAME,...,MOD_YEAR,TRAV_SPNAME,VEH_NO_y,MDRDSTRDNAME,PER_NO,SEXNAME,AGE,AIR_BAGNAME,DRINKINGNAME,REST_USENAME
0,201901174219,Stratum 10 - Other,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,January,No Alcohol Involved,Curb,Cloudy,8,On Median,...,2006,Not Reported,1,Not Distracted,1,Male,39,Not Deployed,No (Alcohol Not Involved),Shoulder and Lap Belt Used
1,201901176655,Stratum 6 - LMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,January,Reported as Unknown,Motor Vehicle In-Transport,Clear,2,On Roadway,...,2015,Not Reported,1,Not Reported,1,Male,20,Not Deployed,Not Reported,Shoulder and Lap Belt Used
2,201901176655,Stratum 6 - LMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,January,Reported as Unknown,Motor Vehicle In-Transport,Clear,2,On Roadway,...,2015,Not Reported,1,Not Reported,2,Male,19,Not Deployed,Not Reported,None Used/Not Applicable
3,201901176655,Stratum 6 - LMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,January,Reported as Unknown,Motor Vehicle In-Transport,Clear,2,On Roadway,...,2015,Not Reported,1,Not Reported,3,Male,999,Not Deployed,Not Reported,Reported as Unknown
4,201901176655,Stratum 6 - LMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,January,Reported as Unknown,Motor Vehicle In-Transport,Clear,2,On Roadway,...,2015,Not Reported,1,Not Reported,4,Male,999,Not Deployed,Not Reported,Reported as Unknown


In [11]:
#Removing duplicate case number rows
data_df = data_df.drop_duplicates(subset=['CASENUM'])
data_df.head()

Unnamed: 0,CASENUM,STRATUMNAME,REGIONNAME,URBANICITY,MONTHNAME,ALCOHOLNAME,HARM_EVNAME,WEATHERNAME,HOUR,REL_ROADNAME,...,MOD_YEAR,TRAV_SPNAME,VEH_NO_y,MDRDSTRDNAME,PER_NO,SEXNAME,AGE,AIR_BAGNAME,DRINKINGNAME,REST_USENAME
0,201901174219,Stratum 10 - Other,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,January,No Alcohol Involved,Curb,Cloudy,8,On Median,...,2006,Not Reported,1,Not Distracted,1,Male,39,Not Deployed,No (Alcohol Not Involved),Shoulder and Lap Belt Used
1,201901176655,Stratum 6 - LMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,January,Reported as Unknown,Motor Vehicle In-Transport,Clear,2,On Roadway,...,2015,Not Reported,1,Not Reported,1,Male,20,Not Deployed,Not Reported,Shoulder and Lap Belt Used
25,201901176667,Stratum 8 - NLMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",1,January,Alcohol Involved,Motor Vehicle In-Transport,Clear,5,On Roadway,...,2000,040 MPH,1,"Inattention (Inattentive), Details Unknown",1,Male,37,Not Deployed,Yes (Alcohol Involved),Shoulder and Lap Belt Used
33,201901176694,Stratum 9 - LMY PV No Injuries in Crash,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",1,January,No Alcohol Involved,Traffic Sign Support,Clear,4,On Roadside,...,2016,040 MPH,1,Not Distracted,1,Male,26,Deployed- Front,No (Alcohol Not Involved),None Used/Not Applicable
34,201901176702,Stratum 6 - LMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",1,January,No Alcohol Involved,Motor Vehicle In-Transport,Clear,9,On Roadway,...,2001,005 MPH,1,Reported as Unknown if Distracted,1,Male,56,Not Deployed,No (Alcohol Not Involved),Shoulder and Lap Belt Used


In [None]:
#Checking columns
data_df.columns

In [12]:
#Deleting repeating columns
del data_df['VEH_NO_y']

In [13]:
#Checking Columns again
data_df.columns

Index(['CASENUM', 'STRATUMNAME', 'REGIONNAME', 'URBANICITY', 'MONTHNAME',
       'ALCOHOLNAME', 'HARM_EVNAME', 'WEATHERNAME', 'HOUR', 'REL_ROADNAME',
       'MAN_COLLNAME', 'MAX_SEVNAME', 'VEH_NO_x', 'MAKENAME', 'MAK_MODNAME',
       'MOD_YEAR', 'TRAV_SPNAME', 'MDRDSTRDNAME', 'PER_NO', 'SEXNAME', 'AGE',
       'AIR_BAGNAME', 'DRINKINGNAME', 'REST_USENAME'],
      dtype='object')

In [14]:
#Renaming columns
data_rename_df = data_df.rename(columns={"CASENUM": "Case Number", "STRATUMNAME": "Crash Description", "REGIONNAME": "Region", "URBANICITY": "Urban or City",
                       "MONTHNAME":"Month", "ALCOHOLNAME":"Alcohol Involved?", "HARM_EVNAME":"Harmful Event", "WEATHERNAME":"Weather",
                       "HOUR":"Hour of Day", "REL_ROADNAME":"Position on Road", "MAN_COLLNAME":"Type of Collision", "VEH_NO_x":"Number of Vehicles",
                       "MAKENAME":"Make of Vehicle", "MAK_MODNAME":"Make/Model of Vehicle", "MOD_YEAR":"Year of Vehicle",
                        "TRAV_SPNAME":"Speed", "MDRDSTRDNAME":"Type of Distraction", "PER_NO":"Number of Persons", "SEXNAME":"Gender",
                        "AGE":"Age", "AIR_BAGNAME":"Air Bag Deployment", "REST_USENAME":"Restraint Usage",
                        "MAX_SEVNAME":"Severe Injury?"})
data_rename_df.head()

Unnamed: 0,Case Number,Crash Description,Region,Urban or City,Month,Alcohol Involved?,Harmful Event,Weather,Hour of Day,Position on Road,...,Make/Model of Vehicle,Year of Vehicle,Speed,Type of Distraction,Number of Persons,Gender,Age,Air Bag Deployment,DRINKINGNAME,Restraint Usage
0,201901174219,Stratum 10 - Other,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,January,No Alcohol Involved,Curb,Cloudy,8,On Median,...,Ford Ranger,2006,Not Reported,Not Distracted,1,Male,39,Not Deployed,No (Alcohol Not Involved),Shoulder and Lap Belt Used
1,201901176655,Stratum 6 - LMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,January,Reported as Unknown,Motor Vehicle In-Transport,Clear,2,On Roadway,...,Dodge Dart (2013 on. See model 001 for 1960-19...,2015,Not Reported,Not Reported,1,Male,20,Not Deployed,Not Reported,Shoulder and Lap Belt Used
25,201901176667,Stratum 8 - NLMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",1,January,Alcohol Involved,Motor Vehicle In-Transport,Clear,5,On Roadway,...,Nissan/Datsun Xterra,2000,040 MPH,"Inattention (Inattentive), Details Unknown",1,Male,37,Not Deployed,Yes (Alcohol Involved),Shoulder and Lap Belt Used
33,201901176694,Stratum 9 - LMY PV No Injuries in Crash,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",1,January,No Alcohol Involved,Traffic Sign Support,Clear,4,On Roadside,...,Mazda Mazda3,2016,040 MPH,Not Distracted,1,Male,26,Deployed- Front,No (Alcohol Not Involved),None Used/Not Applicable
34,201901176702,Stratum 6 - LMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",1,January,No Alcohol Involved,Motor Vehicle In-Transport,Clear,9,On Roadway,...,Toyota Sequoia,2001,005 MPH,Reported as Unknown if Distracted,1,Male,56,Not Deployed,No (Alcohol Not Involved),Shoulder and Lap Belt Used


In [None]:
#Code to clean speed column
#??

In [15]:
#Code to remove rows with unreasonable ages
data_rename_df.drop(data_rename_df.index[data_rename_df['Age'] > 120], inplace=True)
print(data_rename_df['Age'])

0         39
1         20
25        37
33        26
34        56
          ..
607493    55
607494    23
607502    59
607503    22
607515    56
Name: Age, Length: 50793, dtype: int64


In [16]:
#Sorting data to check ages
data_rename_df.sort_values(by=['Age'], inplace=True)
data_rename_df.tail()

Unnamed: 0,Case Number,Crash Description,Region,Urban or City,Month,Alcohol Involved?,Harmful Event,Weather,Hour of Day,Position on Road,...,Make/Model of Vehicle,Year of Vehicle,Speed,Type of Distraction,Number of Persons,Gender,Age,Air Bag Deployment,DRINKINGNAME,Restraint Usage
500054,201902031496,Stratum 2 - Not a MV Occupant Any Injury,"Northeast (PA, NJ, NY, NH, VT, RI, MA, ME, CT)",1,September,Reported as Unknown,Pedalcyclist,Cloudy,17,On Roadway,...,Ford Unknown (light truck),2018,Not Reported,Not Reported,1,Female,99,Not a Motor Vehicle Occupant,Not Reported,Not a Motor Vehicle Occupant
542111,201902099325,Stratum 9 - LMY PV No Injuries in Crash,"Midwest (OH, IN, IL, MI, WI, MN, ND, SD, NE, I...",1,December,Reported as Unknown,Motor Vehicle In-Transport,Clear,11,On Roadway,...,Pontiac G6,2008,020 MPH,Not Distracted,1,Female,99,Not Deployed,Not Reported,Shoulder and Lap Belt Used
479836,201901999722,Stratum 9 - LMY PV No Injuries in Crash,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,November,No Alcohol Involved,Motor Vehicle In-Transport,Clear,11,On Roadway,...,Chrysler Pacifica (Light Trucks),2018,001 MPH,Not Reported,1,Male,99,Not Deployed,No (Alcohol Not Involved),Shoulder and Lap Belt Used
73698,201901371008,Stratum 10 - Other,"Midwest (OH, IN, IL, MI, WI, MN, ND, SD, NE, I...",1,March,No Alcohol Involved,Motor Vehicle In-Transport,Snow,7,On Roadway,...,Chevrolet S-10/T-10 Pickup,1994,Not Reported,Not Distracted,1,Male,117,Not Deployed,No (Alcohol Not Involved),Shoulder and Lap Belt Used
34862,201901290453,Stratum 9 - LMY PV No Injuries in Crash,"Midwest (OH, IN, IL, MI, WI, MN, ND, SD, NE, I...",1,February,No Alcohol Involved,Motor Vehicle In-Transport,Clear,15,On Roadway,...,Honda Odyssey,2016,Not Reported,Not Distracted,1,Female,118,Not Deployed,No (Alcohol Not Involved),Shoulder and Lap Belt Used


In [17]:
data_rename_df.drop(data_rename_df.index[data_rename_df['Year of Vehicle'] > 2025], inplace=True)

In [18]:
#Sorting data to check ages
data_rename_df.sort_values(by=['Year of Vehicle'], inplace=True)
data_rename_df.tail()

Unnamed: 0,Case Number,Crash Description,Region,Urban or City,Month,Alcohol Involved?,Harmful Event,Weather,Hour of Day,Position on Road,...,Make/Model of Vehicle,Year of Vehicle,Speed,Type of Distraction,Number of Persons,Gender,Age,Air Bag Deployment,DRINKINGNAME,Restraint Usage
542340,201902100238,Stratum 9 - LMY PV No Injuries in Crash,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",1,December,No Alcohol Involved,Motor Vehicle In-Transport,Rain,17,On Roadway,...,Toyota Corolla,2020,020 MPH,Not Distracted,1,Male,63,Not Deployed,No (Alcohol Not Involved),Shoulder and Lap Belt Used
570655,201902154187,Stratum 7 - M/H Truck or Bus,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",1,November,No Alcohol Involved,Motor Vehicle In-Transport,Rain,12,On Roadway,...,Volvo Medium/Heavy - CBE,2020,Not Reported,Not Reported,1,Male,36,Not Deployed,No (Alcohol Not Involved),Shoulder and Lap Belt Used
599992,201902234304,Stratum 9 - LMY PV No Injuries in Crash,"Northeast (PA, NJ, NY, NH, VT, RI, MA, ME, CT)",1,November,Reported as Unknown,Motor Vehicle In-Transport,Cloudy,17,On Roadway,...,Dodge Unknown (light truck),2020,Not Reported,Not Reported,1,Female,30,Not Deployed,Not Reported,Lap Belt Only Used
366282,201901831334,Stratum 4 - LMY PV Serious Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,September,No Alcohol Involved,Motor Vehicle In-Transport,Clear,5,On Roadway,...,Ford Explorer (For 2019 on. For model years 1...,2020,Reported as Unknown,Not Reported,1,Female,67,Deployed- Combination,No (Alcohol Not Involved),Shoulder and Lap Belt Used
559403,201902129484,Stratum 7 - M/H Truck or Bus,"Midwest (OH, IN, IL, MI, WI, MN, ND, SD, NE, I...",2,December,No Alcohol Involved,Motor Vehicle In-Transport,Rain,16,On Roadway,...,Kenworth Medium/Heavy - CBE,2020,Not Reported,Not Distracted,1,Male,55,Not Deployed,No (Alcohol Not Involved),Shoulder and Lap Belt Used


In [19]:
MachineLearningModel_df = data_rename_df.drop(columns=['Crash Description', 'Case Number', 'Make/Model of Vehicle', 'Speed',
                                                   'DRINKINGNAME'])
MachineLearningModel_df

Unnamed: 0,Region,Urban or City,Month,Alcohol Involved?,Harmful Event,Weather,Hour of Day,Position on Road,Type of Collision,Severe Injury?,Number of Vehicles,Make of Vehicle,Year of Vehicle,Type of Distraction,Number of Persons,Gender,Age,Air Bag Deployment,Restraint Usage
346517,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",1,August,No Alcohol Involved,Motor Vehicle In-Transport,Clear,21,On Roadway,Front-to-Rear,No Apparent Injury (O),1,Ford,1934,Not Reported,1,Male,65,Not Deployed,Shoulder and Lap Belt Used
582326,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",1,December,No Alcohol Involved,Motor Vehicle In-Transport,Clear,17,On Roadway,Front-to-Rear,No Apparent Injury (O),1,Chevrolet,1940,Not Reported,1,Male,19,Not Deployed,Shoulder and Lap Belt Used
513306,"West (MT, ID, WA, OR, CA, NV, NM, AZ, UT, CO, ...",1,October,No Alcohol Involved,Motor Vehicle In-Transport,Clear,8,On Roadway,Angle,Suspected Minor Injury (B),1,Jeep / Kaiser-Jeep / Willys- Jeep,1953,Not Reported,1,Male,85,Not Deployed,None Used/Not Applicable
245785,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,July,Reported as Unknown,Motor Vehicle In-Transport,Clear,16,On Roadway,Angle,No Apparent Injury (O),1,Chevrolet,1954,Not Distracted,1,Male,75,Not Deployed,Shoulder and Lap Belt Used
184789,"West (MT, ID, WA, OR, CA, NV, NM, AZ, UT, CO, ...",1,May,Reported as Unknown,Traffic Sign Support,Clear,23,On Roadside,The First Harmful Event was Not a Collision wi...,Suspected Serious Injury (A),1,Chevrolet,1954,Not Reported,1,Male,15,Not Deployed,None Used/Not Applicable
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542340,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",1,December,No Alcohol Involved,Motor Vehicle In-Transport,Rain,17,On Roadway,Front-to-Rear,No Apparent Injury (O),1,Toyota,2020,Not Distracted,1,Male,63,Not Deployed,Shoulder and Lap Belt Used
570655,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",1,November,No Alcohol Involved,Motor Vehicle In-Transport,Rain,12,On Roadway,Front-to-Rear,No Apparent Injury (O),1,Volvo,2020,Not Reported,1,Male,36,Not Deployed,Shoulder and Lap Belt Used
599992,"Northeast (PA, NJ, NY, NH, VT, RI, MA, ME, CT)",1,November,Reported as Unknown,Motor Vehicle In-Transport,Cloudy,17,On Roadway,Front-to-Rear,No Apparent Injury (O),1,Dodge,2020,Not Reported,1,Female,30,Not Deployed,Lap Belt Only Used
366282,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",2,September,No Alcohol Involved,Motor Vehicle In-Transport,Clear,5,On Roadway,Angle,Suspected Serious Injury (A),1,Ford,2020,Not Reported,1,Female,67,Deployed- Combination,Shoulder and Lap Belt Used


In [20]:
#Exporting a Dataframe as a csv
data_rename_df.to_csv('merged_data.csv')

MACHINE LEARNING MODEL BELOW

In [22]:
#Set features and target variables
X = MachineLearningModel_df.drop(columns=["Severe Injury?"])

#Target variable is whether or not accident included a fatality
y = MachineLearningModel_df['Severe Injury?']

In [24]:
# Generate our categorical variable lists
categories = MachineLearningModel_df.dtypes[MachineLearningModel_df.dtypes == "object"].index.tolist()
categories

['Region',
 'Month',
 'Alcohol Involved?',
 'Harmful Event',
 'Weather',
 'Position on Road',
 'Type of Collision',
 'Severe Injury?',
 'Make of Vehicle',
 'Type of Distraction',
 'Gender',
 'Air Bag Deployment',
 'Restraint Usage']

In [None]:
#Import dependencies for scaling the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#Train/Test/Split data into training and testing groups
X_train, X_test, y_train, y_test = train_test_split(X, y)

#Fit scaler to the training set of features
scaler = StandardScaler().fit(X_train)

#Transforming both the training and testing datasets into scaled data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
#Use RandomForestClassifier on scaled data
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=500).fit(X_train_scaled, y_train)

In [None]:
#Finding the most important features of the dataset
features1 = clf.feature_importances_
print(features1)
plt.bar(x = range(len(features1)), height=features1)
plt.show()

In [None]:
# Use the following code to plot the most important features
features = sorted(zip(X.columns, features1), key = lambda x: x[1])
cols = [f[0] for f in features]
width = [f[1] for f in features]

fig, ax = plt.subplots()

fig.set_size_inches(5,20)
plt.margins(y=0.001)

ax.barh(y=cols, width=width)

plt.show()

In [None]:
#Fitting most important features to a model
from sklearn.feature_selection import SelectFromModel
sel = SelectFromModel(clf)
sel.fit(X_train_scaled, y_train)
sel.get_support()

In [None]:
#Training and testing the selected features
X_selected_train, X_selected_test, y_train, y_test = train_test_split(sel.transform(X), y)

In [None]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy="auto").fit_resample(
    X_selected_train, y_train
)
Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_selected_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))