- We have the CRSS dataset in 
    - Big_Files/CRSS_2020_Update/
- In one directory for each year,
    - CRSS2016CSV
    - CRSS2017CSV
    - CRSS2018CSV
    - CRSS2019CSV
    - CRSS2020CSV    
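- In each year, the CRSS dataset comes in three main files (named in upper case, e.g. ACCIDENT.CSV, for 2016-2018 and in lower case, e.g. accident.csv, for 2019-2020),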
    - Accident.csv
    - Vehicle.csv
    - Person.csv
- Collect those and merge into three files,
    - Accident_Raw.csv
    - Vehicle_Raw.csv
    - Person_Raw.csv
- and also three files with category names,
    - Accident_Raw_with_Names.csv
    - Vehicle_Raw_with_Names.csv
    - Person_Raw_with_Names.csv


In [1]:
%%latex
\tableofcontents

<IPython.core.display.Latex object>

# Setup
## Import Libraries

In [6]:
# Standard-library modules used by the notebook.
import sys, copy, math, time, os

# Record the interpreter and library versions in the cell output.
print ('Python version: {}'.format(sys.version))

import numpy as np
print ('NumPy version: {}'.format(np.__version__))
# Print small floats in fixed-point rather than scientific notation.
np.set_printoptions(suppress=True)


import pandas as pd
print ('Pandas version:  {}'.format(pd.__version__))
# Let pandas display up to 500 rows before truncating.
pd.set_option('display.max_rows', 500)

# Library for reading Microsoft Access files
import pandas_access as mdb


# Set Randomness.  Copied from https://www.kaggle.com/code/abazdyrev/keras-nn-focal-loss-experiments
# NOTE(review): only the import is present in this cell; no seed is set here.
import random


Python version: 3.9.7 (default, Oct 22 2021, 13:24:00) 
[Clang 13.0.0 (clang-1300.0.29.3)]
NumPy version: 1.21.3
Pandas version:  1.2.4


# Import Data

### accident.csv from CRSS

In [7]:
def Import_Data_Accident(NAMES):
    """Read the CRSS accident file of every year 2016-2020 and stack them.

    The 2016-2018 directories hold ACCIDENT.CSV and the 2019-2020
    directories hold accident.csv, so the two groups are read in
    separate loops.

    Parameters:
        NAMES: if equal to 0, every column whose name contains 'NAME' is
            dropped from the result; any other value keeps all columns.

    Returns:
        One DataFrame with the rows of all years in year order.  The row
        index is not renumbered (concat is called without ignore_index).
    """
    print ('Import_Data_Accident()')

    prefix = '../../Big_Files/CRSS_2020_Update/CRSS'
    frames = []

    for year in ['2016','2017','2018']:
        temp = pd.read_csv(prefix + year + 'CSV/ACCIDENT.CSV', index_col=None)
        print (year, len(temp))
        frames.append(temp)

    for year in ['2019','2020']:
        temp = pd.read_csv(prefix + year + 'CSV/accident.csv', index_col=None)
        print (year, len(temp))
        frames.append(temp)

    # DataFrame.append was removed in pandas 2.0 and re-copied the
    # accumulated rows on every call; one concat does the same stacking.
    df = pd.concat(frames, sort=False)

    if NAMES==0:
        # Build the list first so no column is dropped while iterating.
        name_columns = [feature for feature in df.columns if 'NAME' in feature]
        df = df.drop(columns=name_columns)

    print (df.shape)
    print ()
    return df

### vehicle.csv from CRSS

In [8]:
def Import_Data_Vehicle(NAMES):
    """Read the CRSS vehicle file of every year 2016-2020 and stack them.

    The 2016-2018 directories hold VEHICLE.CSV; the 2019-2020 directories
    hold vehicle.csv, which is read with the latin1 encoding.  All files
    are read with low_memory=False.

    Parameters:
        NAMES: if equal to 0, every column whose name contains 'NAME' is
            dropped from the result; any other value keeps all columns.

    Returns:
        One DataFrame with the rows of all years in year order.  The row
        index is not renumbered (concat is called without ignore_index).
    """
    print ('Import_Data_Vehicle()')

    prefix = '../../Big_Files/CRSS_2020_Update/CRSS'
    frames = []

    for year in ['2016','2017','2018']:
        temp = pd.read_csv(prefix + year + 'CSV/VEHICLE.CSV', index_col=None, low_memory=False)
        print (year, len(temp))
        frames.append(temp)

    for year in ['2019','2020']:
        temp = pd.read_csv(prefix + year + 'CSV/vehicle.csv', index_col=None, encoding='latin1', low_memory=False)
        print (year, len(temp))
        frames.append(temp)

    # DataFrame.append was removed in pandas 2.0 and re-copied the
    # accumulated rows on every call; one concat does the same stacking.
    df = pd.concat(frames, sort=False)

    if NAMES==0:
        # Build the list first so no column is dropped while iterating.
        name_columns = [feature for feature in df.columns if 'NAME' in feature]
        df = df.drop(columns=name_columns)

    print (df.shape)
    print ()
    return df

### person.csv from CRSS

In [9]:
def Import_Data_Person(NAMES):
    """Read the CRSS person file of every year 2016-2020 and stack them.

    The 2016-2018 directories hold PERSON.CSV; the 2019-2020 directories
    hold person.csv, which is read with the latin1 encoding.

    Parameters:
        NAMES: if equal to 0, every column whose name contains 'NAME' is
            dropped from the result; any other value keeps all columns.

    Returns:
        One DataFrame with the rows of all years in year order.  The row
        index is not renumbered (concat is called without ignore_index).
    """
    print ('Import_Data_Person()')

    prefix = '../../Big_Files/CRSS_2020_Update/CRSS'
    frames = []

    for year in ['2016','2017','2018']:
        temp = pd.read_csv(prefix + year + 'CSV/PERSON.CSV', index_col=None)
        print (year, len(temp))
        frames.append(temp)

    for year in ['2019','2020']:
        temp = pd.read_csv(prefix + year + 'CSV/person.csv', index_col=None, encoding='latin1')
        print (year, len(temp))
        frames.append(temp)

    # DataFrame.append was removed in pandas 2.0 and re-copied the
    # accumulated rows on every call; one concat does the same stacking.
    df = pd.concat(frames, sort=False)

    if NAMES==0:
        # Build the list first so no column is dropped while iterating.
        name_columns = [feature for feature in df.columns if 'NAME' in feature]
        df = df.drop(columns=name_columns)

    print (df.shape)
    print ()
    return df

## Get Data
- Get_Data_from_Original() reads the (original) CRSS files from the CRSS directory, preprocesses them, and writes them to files in a folder outside this GitHub repo (because the files are too large for my subscription). It returns the three dataframes that include the category-name columns.
- The Get_Data_from_Temp_Files() reads the temp files and returns the dataframes.  I created this option for running repeatedly during writing and debugging, because it's much faster.

In [10]:
def Get_Data_from_Original():
    """Merge the original CRSS files and write the six merged CSV files.

    The source files are read twice: once with the 'NAME' columns dropped
    (written as <Table>_Raw.csv) and once with all columns kept (written
    as <Table>_Raw_with_Names.csv).  All output goes to ../../Big_Files/.

    Returns:
        (df_Accident, df_Vehicle, df_Person) from the second pass, i.e.
        the dataframes that still contain the 'NAME' columns.
    """
    print ('Get_Data_from_Original()')

    out_dir = '../../Big_Files/'
    tables = ('Accident', 'Vehicle', 'Person')
    readers = (Import_Data_Accident, Import_Data_Vehicle, Import_Data_Person)

    # Pass 1: coded columns only.
    raw = [reader(0) for reader in readers]
    for table, df in zip(tables, raw):
        df.to_csv(out_dir + table + '_Raw.csv', index=False)
    # Release the first pass before loading the wider second pass.
    del raw

    # Pass 2: coded columns plus their category-name columns.  The suffix
    # is spelled '_with_Names' to match the names Check_New_Files() reads;
    # the previous '_with_NAMES' spelling only resolved to the same file
    # on a case-insensitive filesystem.
    named = [reader(1) for reader in readers]
    for table, df in zip(tables, named):
        df.to_csv(out_dir + table + '_Raw_with_Names.csv', index=False)

    df_Accident, df_Vehicle, df_Person = named
    return df_Accident, df_Vehicle, df_Person

In [21]:
def Check_New_Files():
    """Re-read each merged CSV from ../../Big_Files/ and print its shape.

    Returns:
        0 after all six files have been read.
    """
    print ('Check_New_Files')

    folder = '../../Big_Files/'
    stems = (
        'Accident_Raw',
        'Vehicle_Raw',
        'Person_Raw',
        'Accident_Raw_with_Names',
        'Vehicle_Raw_with_Names',
        'Person_Raw_with_Names',
    )

    for stem in stems:
        path = folder + stem + '.csv'
        frame = pd.read_csv(path, low_memory=False)
        print (stem, frame.shape)

    return 0

In [12]:
# Build the merged CSV files from the original CRSS data and keep the returned dataframes.
df_Accident, df_Vehicle, df_Person = Get_Data_from_Original()

Get_Data_from_Original()
Import_Data_Accident()
2016 46511
2017 54969
2018 48443
2019 54409
2020 54745
(259077, 51)

Import_Data_Vehicle()
2016 82149
2017 97625
2018 86105
2019 96717
2020 94718
(457314, 97)

Import_Data_Person()
2016 117759
2017 138913
2018 120230
2019 135410
2020 131962
(644274, 67)

Import_Data_Accident()
2016 46511
2017 54969
2018 48443
2019 54409
2020 54745
(259077, 90)

Import_Data_Vehicle()
2016 82149
2017 97625
2018 86105
2019 96717
2020 94718
(457314, 184)

Import_Data_Person()
2016 117759
2017 138913
2018 120230
2019 135410
2020 131962
(644274, 117)



In [22]:
# Re-read the six written files and print their shapes for comparison with the import output.
Check_New_Files()

Check_New_Files
Accident_Raw (259077, 51)
Vehicle_Raw (457314, 97)
Person_Raw (644274, 67)
Accident_Raw_with_Names (259077, 90)
Vehicle_Raw_with_Names (457314, 184)
Person_Raw_with_Names (644274, 117)


0