This data comes from https://www.nhtsa.gov/crash-data-systems/crash-report-sampling-system
Information about naming conventions and other details can be found in the CRSS Analytical User Manual: https://static.nhtsa.gov/nhtsa/downloads/CRSS/Manuals/Analytical%20User%20Manual/2016-2019%20CRSS%20Analytical%20User%20Manual%20-%20DOT%20HS%20813%20022.pdf

In [1]:
# import dependencies
import os
import re
from urllib.parse import quote_plus

import pandas as pd
import psycopg2
from sqlalchemy import create_engine

import config as creds

In [2]:
# Build the SQLAlchemy engine for the Postgres instance that holds the crash tables.
# The password comes from the local `config` module instead of being written here.
# It is percent-encoded before being placed in the URL: characters such as "@",
# ":" or "/" in a raw password would otherwise be parsed as URL delimiters.
# NOTE(review): this assumes creds.PGPASSWORD holds the raw (not already
# percent-encoded) password — confirm against config.py.
db_string = f"postgresql://postgres:{quote_plus(creds.PGPASSWORD)}@crashstats.crogg2abmvvo.us-east-2.rds.amazonaws.com/postgres"
engine = create_engine(db_string)

In [3]:
# Index the raw CSV exports: map each file's base name (the file name minus its
# four-character extension) to its path. Nothing is read from disk in this cell.
directory = r'Accidents_data'
files = {}
for filename in os.listdir(directory):
    # Case-insensitive extension test, so ".csv", ".CSV" and mixed-case variants
    # such as ".Csv" are all picked up; every other directory entry is ignored.
    if filename.lower().endswith(".csv"):
        files[filename[:-4]] = os.path.join(directory, filename)
# `dfs` starts out as a copy of the name -> path mapping; the loading cell
# replaces the paths with DataFrames.
dfs = files.copy()

In [4]:
# Load every indexed CSV into a DataFrame, keyed by the file's base name
# (e.g. dfs['vehicle']). The mapping is rebuilt from `files` rather than
# overwritten in place, so this cell can be re-run without handing an
# already-loaded DataFrame back to pd.read_csv.
# NOTE(review): the warning emitted by this cell looks like pandas' mixed-dtype
# DtypeWarning; passing dtype=... or low_memory=False would address it — confirm.
dfs = {key: pd.read_csv(path) for key, path in files.items()}

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Vehicle table: keep the case/vehicle keys plus the make, model and model-year fields.
cols_to_keep = [
    'CASENUM',
    'VEH_NO',
    'MAKENAME',
    'MAK_MODNAME',
    'MOD_YEAR',
]
# Select the columns and lower-case their headers in a single chain.
vehicle_subset = dfs['vehicle'].loc[:, cols_to_keep].rename(columns=str.lower)
vehicle_subset.head(3)

Unnamed: 0,casenum,veh_no,makename,mak_modname,mod_year
0,201901174219,1,Ford,Ford Ranger,2006
1,201901176655,1,Dodge,Dodge Dart (2013 on. See model 001 for 1960-19...,2015
2,201901176655,2,Honda,Honda CR-V,2006


In [6]:
# Accident table: keep the case key plus the descriptive "*NAME" fields and the hour.
cols_to_keep = [
    'CASENUM',
    'STRATUMNAME',
    'REGIONNAME',
    'URBANICITYNAME',
    'MONTHNAME',
    'HOUR',
    'ALCOHOLNAME',
    'WEATHERNAME',
    'MAN_COLLNAME',
    'HARM_EVNAME',
    'REL_ROADNAME',
    'MAX_SEVNAME',
]
# Select the columns and lower-case their headers in a single chain.
accident_subset = dfs['accident'].loc[:, cols_to_keep].rename(columns=str.lower)
accident_subset.head(3)

Unnamed: 0,casenum,stratumname,regionname,urbanicityname,monthname,hour,alcoholname,weathername,man_collname,harm_evname,rel_roadname
0,201901174219,Stratum 10 - Other,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",Rural Area,January,8,No Alcohol Involved,Cloudy,The First Harmful Event was Not a Collision wi...,Curb,On Median
1,201901176655,Stratum 6 - LMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",Rural Area,January,2,Reported as Unknown,Clear,Angle,Motor Vehicle In-Transport,On Roadway
2,201901176667,Stratum 8 - NLMY PV Minor Injury,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",Urban Area,January,5,Alcohol Involved,Clear,Angle,Motor Vehicle In-Transport,On Roadway


In [7]:
# Person table: keep the case/vehicle/person keys plus sex, age, air-bag and restraint fields.
cols_to_keep = [
    'CASENUM',
    'VEH_NO',
    'PER_NO',
    'SEXNAME',
    'AGE',
    'AIR_BAGNAME',
    'REST_USENAME',
]
# Select the columns and lower-case their headers in a single chain.
person_subset = dfs['person'].loc[:, cols_to_keep].rename(columns=str.lower)
person_subset.head(10)

Unnamed: 0,casenum,veh_no,per_no,sexname,age,air_bagname,rest_usename
0,201901174219,1,1,Male,39,Not Deployed,Shoulder and Lap Belt Used
1,201901176655,1,1,Male,20,Not Deployed,Shoulder and Lap Belt Used
2,201901176655,1,2,Male,19,Not Deployed,None Used/Not Applicable
3,201901176655,1,3,Male,999,Not Deployed,Reported as Unknown
4,201901176655,1,4,Male,999,Not Deployed,Reported as Unknown
5,201901176655,2,1,Female,42,Not Deployed,Shoulder and Lap Belt Used
6,201901176655,2,2,Male,47,Not Deployed,Shoulder and Lap Belt Used
7,201901176667,1,1,Male,37,Not Deployed,Shoulder and Lap Belt Used
8,201901176667,2,1,Male,50,Not Deployed,Shoulder and Lap Belt Used
9,201901176694,1,1,Male,26,Deployed- Front,None Used/Not Applicable


In [8]:
# Distraction table: keep the case/vehicle keys plus the distraction description.
cols_to_keep = ['CASENUM', 'VEH_NO', 'MDRDSTRDNAME']
distract_subset = (
    dfs['distract']
    .loc[:, cols_to_keep]
    # Give the coded column a readable name ...
    .rename(columns={"MDRDSTRDNAME": "Distraction"})
    # ... then lower-case every header, so it ends up as "distraction".
    .rename(columns=str.lower)
)
distract_subset.head(3)

Unnamed: 0,casenum,veh_no,distraction
0,201901174219,1,Not Distracted
1,201901176655,1,Not Reported
2,201901176655,2,Not Reported


In [14]:
# Write the vehicle subset to the "vehicle" table; an existing table of that
# name is replaced, and the DataFrame index is not written.
vehicle_subset.to_sql(
    name="vehicle",
    con=engine,
    if_exists='replace',
    index=False,
)

In [None]:
# Write the accident subset to the "accident" table; an existing table of that
# name is replaced, and the DataFrame index is not written.
accident_subset.to_sql(
    name="accident",
    con=engine,
    if_exists='replace',
    index=False,
)

In [None]:
# Write the distraction subset to the "distract" table; an existing table of
# that name is replaced, and the DataFrame index is not written.
distract_subset.to_sql(
    name="distract",
    con=engine,
    if_exists='replace',
    index=False,
)

In [None]:
# Write the person subset to the "person" table; an existing table of that
# name is replaced, and the DataFrame index is not written.
person_subset.to_sql(
    name="person",
    con=engine,
    if_exists='replace',
    index=False,
)

In [9]:
# Read the "vehicledist" table back from the database as a quick sanity check.
# NOTE(review): no cell shown here writes "vehicledist"; presumably it is built
# on the database side from the vehicle and distract tables — confirm.
test_data = pd.read_sql_table(table_name="vehicledist", con=engine)
test_data.head(3)

Unnamed: 0,casenum,veh_no,makename,mak_modname,mod_year,distraction
0,201901176655,1,Dodge,Dodge Dart (2013 on. See model 001 for 1960-19...,2015,Not Reported
1,201901176667,2,Chevrolet,"Chevrolet C, K, R, V-series pickup/Silverado",1979,Not Reported
2,201901176702,2,Toyota,Toyota Corolla,2018,Reported as Unknown if Distracted
