In [1]:
%%latex
\tableofcontents

<IPython.core.display.Latex object>

# readme
## Directory Structure

We designed this code to fit in a GitHub repository with files under 100MB.  Many of the files we input are over this limit, and after preprocessing (selecting features, binning features) we saved the data as a .csv file of about 150 MB so we can tweak later code without having to run the preprocessing again.  We saved it again after imputing missing data.  To keep the files in our GitHub repository under 100 MB, we saved these into a different directory.

- CRSS Data Files
    - We use six years of data, 2016-2021. Once later years come out, they can be easily added. 
    - Each year's data is 100-200 MB
    - The files we're really interested in each year are these.  The names were uppercase until 2018, then lowercase, and also after 2018 the file sizes jumped.
        - accident.csv or ACCIDENT.csv, now about 30 MB
        - vehicle.csv or VEHICLE.csv, now about 180 MB
        - person.csv or PERSON.csv, now about 150 MB

- Big_Files
    - CRSS_2020_Update
        - CRSS2016CSV (22 files, 160 MB)
        - CRSS2017CSV (22 files, 189 MB)
        - CRSS2018CSV (22 files, 169 MB)
        - CRSS2019CSV (23 files, 633 MB)
        - CRSS2020CSV (29 files, 719 MB)
        - CRSS2021CSV (29 files, 736 MB)
        
        
    - *Intermediate .csv files*
- GitHub_Repository
    - Code_Files
        - Analyze_Proba
        - Confusion_Matrices
        - Images

# Setup

## Import Libraries

In [2]:
print ('Install Packages')

import sys, copy, math, time, os

print ('Python version: {}'.format(sys.version))

#from collections import Counter

import numpy as np
print ('NumPy version: {}'.format(np.__version__))
np.set_printoptions(suppress=True)

import scipy as sc
print ('SciPy version:  {}'.format(sc.__version__))

import tensorflow as tf
print ('TensorFlow version:  {}'.format(tf.__version__))
tf.config.run_functions_eagerly(True)
tf.data.experimental.enable_debug_mode()

from tensorflow import keras
print ('Keras version:  {}'.format(keras.__version__))

from keras import layers
import keras.backend as K
from keras.layers import IntegerLookup
from keras.layers import Normalization
from keras.layers import StringLookup
from keras.utils import get_custom_objects
from keras.utils import tf_utils

from keras.models import Sequential
from keras.layers import Dense

#from keras.wrappers.scikit_learn import KerasClassifier
from scikeras.wrappers import KerasClassifier

import pandas as pd
print ('Pandas version:  {}'.format(pd.__version__))
pd.set_option('display.max_rows', 500)

import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
#    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
})

import matplotlib.pyplot as plt
%matplotlib inline

# Library for reading Microsoft Access files
#import pandas_access as mdb

import sklearn
print ('SciKit-Learn version: {}'.format(sklearn.__version__))
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import class_weight

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

import imblearn
print ('Imbalanced-Learn version: {}'.format(imblearn.__version__))
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import RUSBoostClassifier
from imblearn.ensemble import EasyEnsembleClassifier

#!pip install pydot

# Set Randomness.  Copied from https://www.kaggle.com/code/abazdyrev/keras-nn-focal-loss-experiments
import random
#np.random.seed(42) # NumPy
#random.seed(42) # Python
#tf.random.set_seed(42) # Tensorflow

import warnings
warnings.filterwarnings('ignore')

print ('Finished Installing Packages')

Install Packages
Python version: 3.10.9 | packaged by conda-forge | (main, Feb  2 2023, 20:26:08) [Clang 14.0.6 ]
NumPy version: 1.24.2
SciPy version:  1.7.3




TensorFlow version:  2.11.0
Keras version:  2.11.0
Pandas version:  1.5.3
SciKit-Learn version: 1.2.2
Imbalanced-Learn version: 0.10.1
Finished Installing Packages


# Get Data and Preprocess

## Read CRSS Files
- We have the CRSS dataset in 
    - Big_Files/CRSS_2020_Update/
- In one directory for each year,
    - CRSS2016CSV
    - CRSS2017CSV
    - CRSS2018CSV
    - CRSS2019CSV
    - CRSS2020CSV    
    - CRSS2021CSV    
- In each year, the CRSS dataset comes in three main files, 
    - Accident.csv
    - Vehicle.csv
    - Person.csv
- Collect those and merge into three files,
    - Accident_Raw.csv
    - Vehicle_Raw.csv
    - Person_Raw.csv
- and also three files with category names,
    - Accident_Raw_with_Names.csv
    - Vehicle_Raw_with_Names.csv
    - Person_Raw_with_Names.csv


### accident.csv from CRSS

In [3]:
def Import_Data_Accident(NAMES, data_dir='../../Big_Files/CRSS_2020_Update'):
    """Read the yearly CRSS accident files and stack them into one dataframe.

    The 2016-2018 files are read from ``CRSS<year>CSV/ACCIDENT.CSV`` and the
    2019-2021 files from ``CRSS<year>CSV/accident.csv`` inside ``data_dir``.

    Parameters
    ----------
    NAMES : int
        If 0, every column whose name contains 'NAME' is dropped.
        Any other value keeps all columns.
    data_dir : str, optional
        Directory that holds the per-year ``CRSS<year>CSV`` folders.

    Returns
    -------
    pandas.DataFrame
        All years stacked in chronological order.  Each file's own row
        index is kept, so index labels repeat from one year to the next.
    """
    print ('Import_Data_Accident()')

    # File names are upper-case for the first three years, lower-case after.
    sources = [
        (['2016','2017','2018'], 'ACCIDENT.CSV'),
        (['2019','2020','2021'], 'accident.csv'),
    ]

    # Collect the yearly frames and concatenate once.  DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated rows on every call.
    frames = []
    for years, basename in sources:
        for year in years:
            filename = data_dir + '/CRSS' + year + 'CSV/' + basename
            temp = pd.read_csv(filename, index_col=None)
            print (year, len(temp))
            frames.append(temp)
    df = pd.concat(frames)

    if NAMES==0:
        # Drop all the category-name columns in one call.
        name_columns = [feature for feature in df.columns if 'NAME' in feature]
        df = df.drop(columns=name_columns)

    print (df.shape)
    print ()
    return df

### vehicle.csv from CRSS

In [4]:
def Import_Data_Vehicle(NAMES, data_dir='../../Big_Files/CRSS_2020_Update'):
    """Read the yearly CRSS vehicle files and stack them into one dataframe.

    The 2016-2018 files are read from ``CRSS<year>CSV/VEHICLE.CSV`` and the
    2019-2021 files from ``CRSS<year>CSV/vehicle.csv`` inside ``data_dir``.
    The later files are decoded as latin1.

    Parameters
    ----------
    NAMES : int
        If 0, every column whose name contains 'NAME' is dropped.
        Any other value keeps all columns.
    data_dir : str, optional
        Directory that holds the per-year ``CRSS<year>CSV`` folders.

    Returns
    -------
    pandas.DataFrame
        All years stacked in chronological order.  Each file's own row
        index is kept, so index labels repeat from one year to the next.
    """
    print ('Import_Data_Vehicle()')

    # (years, file name, read_csv options) for the two naming eras.
    sources = [
        (['2016','2017','2018'], 'VEHICLE.CSV', {'low_memory': False}),
        (['2019','2020','2021'], 'vehicle.csv', {'encoding': 'latin1', 'low_memory': False}),
    ]

    # Collect the yearly frames and concatenate once.  DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated rows on every call.
    frames = []
    for years, basename, read_options in sources:
        for year in years:
            filename = data_dir + '/CRSS' + year + 'CSV/' + basename
            temp = pd.read_csv(filename, index_col=None, **read_options)
            print (year, len(temp))
            frames.append(temp)
    df = pd.concat(frames)

    if NAMES==0:
        # Drop all the category-name columns in one call.
        name_columns = [feature for feature in df.columns if 'NAME' in feature]
        df = df.drop(columns=name_columns)

    print (df.shape)
    print ()
    return df

### person.csv from CRSS

In [5]:
def Import_Data_Person(NAMES, data_dir='../../Big_Files/CRSS_2020_Update'):
    """Read the yearly CRSS person files and stack them into one dataframe.

    The 2016-2018 files are read from ``CRSS<year>CSV/PERSON.CSV`` and the
    2019-2021 files from ``CRSS<year>CSV/person.csv`` inside ``data_dir``.
    The later files are decoded as latin1.

    Parameters
    ----------
    NAMES : int
        If 0, every column whose name contains 'NAME' is dropped.
        Any other value keeps all columns.
    data_dir : str, optional
        Directory that holds the per-year ``CRSS<year>CSV`` folders.

    Returns
    -------
    pandas.DataFrame
        All years stacked in chronological order.  Each file's own row
        index is kept, so index labels repeat from one year to the next.
    """
    print ('Import_Data_Person()')

    # (years, file name, read_csv options) for the two naming eras.
    sources = [
        (['2016','2017','2018'], 'PERSON.CSV', {}),
        (['2019','2020','2021'], 'person.csv', {'encoding': 'latin1'}),
    ]

    # Collect the yearly frames and concatenate once.  DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated rows on every call.
    frames = []
    for years, basename, read_options in sources:
        for year in years:
            filename = data_dir + '/CRSS' + year + 'CSV/' + basename
            temp = pd.read_csv(filename, index_col=None, **read_options)
            print (year, len(temp))
            frames.append(temp)
    df = pd.concat(frames)

    if NAMES==0:
        # Drop all the category-name columns in one call.
        name_columns = [feature for feature in df.columns if 'NAME' in feature]
        df = df.drop(columns=name_columns)

    print (df.shape)
    print ()
    return df

### Get Data
- Get_Data_from_Original() reads the original CRSS files from the CRSS directory, preprocesses them, writes them to files in a folder outside this GitHub repo (because the files are too large for my subscription), and returns the dataframes.
- Get_Data_from_Temp_Files() reads those temp files and returns the dataframes.  I created this option for running repeatedly while writing and debugging, because it's much faster.

In [6]:
def Get_Data_from_Original():
    """Build the raw accident, vehicle and person tables from the yearly CRSS files.

    Two sets of CSV files are written to ``../../Big_Files``:

    - ``<Table>_Raw.csv``: the stacked tables without the NAME columns
    - ``<Table>_Raw_with_Names.csv``: the same tables with the NAME columns

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_Accident, df_Vehicle, df_Person)`` without the NAME columns,
        i.e. the same tables that Get_Data_from_Temp_Files() reads back
        from the ``*_Raw.csv`` files.
    """
    print ('Get_Data_from_Original()')

    # Tables without the NAME columns; these are the ones returned.
    df_Accident = Import_Data_Accident(0)
    df_Vehicle = Import_Data_Vehicle(0)
    df_Person = Import_Data_Person(0)

    df_Accident.to_csv('../../Big_Files/Accident_Raw.csv', index=False)
    df_Vehicle.to_csv('../../Big_Files/Vehicle_Raw.csv', index=False)
    df_Person.to_csv('../../Big_Files/Person_Raw.csv', index=False)

    # Tables with the NAME columns are only written to disk.  They are not
    # bound to the df_* names above, so the returned frames stay the raw
    # ones, and only one with-names table is held in memory at a time.
    # The '_with_Names' spelling matches what Check_New_Files() reads.
    Import_Data_Accident(1).to_csv('../../Big_Files/Accident_Raw_with_Names.csv', index=False)
    Import_Data_Vehicle(1).to_csv('../../Big_Files/Vehicle_Raw_with_Names.csv', index=False)
    Import_Data_Person(1).to_csv('../../Big_Files/Person_Raw_with_Names.csv', index=False)

    return df_Accident, df_Vehicle, df_Person

#df_Accident, df_Vehicle, df_Person = Get_Data_from_Original()

In [7]:
def Check_New_Files():
    """Read each intermediate CSV in ../../Big_Files back and print its shape.

    Returns
    -------
    int
        Always 0.
    """
    print('Check_New_Files')
    stems = (
        'Accident_Raw',
        'Vehicle_Raw',
        'Person_Raw',
        'Accident_Raw_with_Names',
        'Vehicle_Raw_with_Names',
        'Person_Raw_with_Names',
    )
    for stem in stems:
        path = '../../Big_Files/{}.csv'.format(stem)
        shape = pd.read_csv(path, low_memory=False).shape
        print(stem, shape)

    return 0

#Check_New_Files()

In [8]:
def Get_Data_from_Temp_Files():
    """Load the three raw tables from their CSV files in ../../Big_Files.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_Acc, df_Veh, df_Per)`` read from Accident_Raw.csv,
        Vehicle_Raw.csv and Person_Raw.csv, in that order.
    """
    print('Get_Data')
    sources = (
        ('df_Acc', '../../Big_Files/Accident_Raw.csv'),
        ('df_Veh', '../../Big_Files/Vehicle_Raw.csv'),
        ('df_Per', '../../Big_Files/Person_Raw.csv'),
    )
    frames = [pd.read_csv(path, low_memory=False) for _, path in sources]

    # Report the size of each table before handing them back.
    for (label, _), frame in zip(sources, frames):
        print(label + '.shape = ', frame.shape)
    print()

    return tuple(frames)

#df_Acc, df_Veh, df_Per = Get_Data_from_Temp_Files()

## Drop Features

- We now have three dataframes from the Accident, Vehicle, and Person files.  
- Some features are repeated, so we will drop the ones in Vehicle or Person that appear in Accident, and drop those in Person that appear in Vehicle. 
- There are two repeated features we need to keep for merging the three dataframes:
    - CASENUM tells us to which accident the vehicle and person correspond
    - VEH_NO tells us which vehicle the person was in.
- Some features have no predictive power and/or resemble random numbers, like the VIN (Vehicle Identification Number) and the minute of the accident time.  
- For details on the features, see the *Crash Report Sampling System Analytical User's Manual 2016-2020.*

### Drop Repeated Features

In [9]:
def Drop_Repeated_Features(df_Acc, df_Veh, df_Per):
    """Drop columns that are repeated between the three tables.

    Columns of df_Veh that also appear in df_Acc are dropped, and columns of
    df_Per that also appear in df_Acc or df_Veh are dropped.  The merge keys
    are kept: CASENUM in both tables and VEH_NO in df_Per.

    df_Veh and df_Per are modified in place; df_Acc is left untouched.

    Parameters
    ----------
    df_Acc, df_Veh, df_Per : pandas.DataFrame
        Accident, vehicle and person tables.

    Returns
    -------
    tuple of pandas.DataFrame
        The same three objects ``(df_Acc, df_Veh, df_Per)``.

    Raises
    ------
    ValueError
        If a table lacks a merge key column (CASENUM in any table,
        VEH_NO in df_Veh or df_Per).
    """
    print ('Drop_Repeated_Features()')

    # Fail early with a readable message if a merge key is absent.
    required = (
        ('df_Acc', df_Acc, ['CASENUM']),
        ('df_Veh', df_Veh, ['CASENUM', 'VEH_NO']),
        ('df_Per', df_Per, ['CASENUM', 'VEH_NO']),
    )
    for label, frame, keys in required:
        absent = [key for key in keys if key not in frame.columns]
        if absent:
            raise ValueError('%s is missing merge key column(s): %s' % (label, absent))

    # Snapshot the column names before anything is dropped, so the person
    # table is compared against the full original vehicle columns.
    Acc_Cols = set(df_Acc.columns)
    Veh_Cols = set(df_Veh.columns)

    # We need to keep these for merging the dataframes.
    Keep_Veh = {'CASENUM'}
    Keep_Per = {'CASENUM', 'VEH_NO'}

    Drop_Veh = [x for x in df_Veh.columns if x in Acc_Cols and x not in Keep_Veh]
    Drop_Per = [
        x for x in df_Per.columns
        if (x in Acc_Cols or x in Veh_Cols) and x not in Keep_Per
    ]

    df_Veh.drop(columns=Drop_Veh, inplace=True)
    df_Per.drop(columns=Drop_Per, inplace=True)

    print ('df_Acc.shape = ', df_Acc.shape)
    print ('df_Veh.shape = ', df_Veh.shape)
    print ('df_Per.shape = ', df_Per.shape)
    print ()

    return df_Acc, df_Veh, df_Per
                                        

### Drop Irrelevant Features

In [10]:
def Drop_Irrelevant_Features(df_Acc, df_Veh, df_Per):
    """Remove a fixed list of unwanted columns from each of the three tables.

    All three frames are modified in place and then returned.  Every listed
    column must be present in its table; pandas raises KeyError otherwise.

    Parameters
    ----------
    df_Acc, df_Veh, df_Per : pandas.DataFrame
        Accident, vehicle and person tables.

    Returns
    -------
    tuple of pandas.DataFrame
        The same three objects ``(df_Acc, df_Veh, df_Per)``.
    """
    print('Drop_Irrelevant_Features')

    accident_columns = [
        'CF1', 'CF2', 'CF3',
        'MINUTE', 'MINUTE_IM',
        'PSU_VAR', 'PSUSTRAT', 'STRATUM',
        'WEATHER1', 'WEATHER2',
        'WEIGHT',
    ]

    # Vehicle features (not repeats from the accident table) that we don't
    # want to use, even for imputation, because they exist only for some
    # years or behave like random numbers.
    vehicle_columns = [
        'DR_SF1', 'DR_SF2', 'DR_SF3', 'DR_SF4',
        'GVWR', 'GVWR_FROM', 'GVWR_TO',
        'HAZ_ID',
        'ICFINALBODY',
        'MCARR_I1', 'MCARR_I2', 'MCARR_ID',
        'TRLR1GVWR', 'TRLR1VIN',
        'TRLR2GVWR', 'TRLR2VIN',
        'TRLR3GVWR', 'TRLR3VIN',
        'UNITTYPE',
        'V_CONFIG', 'V_Config',
        'VEH_SC1', 'VEH_SC2',
        'VIN',
        'VPICBODYCLASS', 'VPICMAKE', 'VPICMODEL',
    ]

    person_columns = [
        'ATST_TYP',
        'DRUGRES1', 'DRUGRES2', 'DRUGRES3',
        'DRUGTST1', 'DRUGTST2', 'DRUGTST3',
        'DSTATUS',
        'HELM_MIS', 'HELM_USE',
        'P_SF1', 'P_SF2', 'P_SF3',
        'STR_VEH',
    ]

    # Accident first, then vehicle, then person.
    plan = (
        (df_Acc, accident_columns),
        (df_Veh, vehicle_columns),
        (df_Per, person_columns),
    )
    for frame, unwanted in plan:
        frame.drop(columns=unwanted, inplace=True)

    for label, frame in (('df_Acc', df_Acc), ('df_Veh', df_Veh), ('df_Per', df_Per)):
        print(label + '.shape = ', frame.shape)
    print()

    return df_Acc, df_Veh, df_Per

## Merge Accident, Vehicle, and Person Dataframes

In [11]:
def Merge(df_Acc, df_Veh, df_Per):
    """Join the accident, vehicle and person tables into one dataframe.

    Vehicles are inner-joined to accidents on CASENUM, then persons are
    inner-joined to that result on CASENUM and VEH_NO.  Rows without a
    match on either side are dropped by the inner joins.  Shapes are
    printed after each step, followed by the head of the result.

    Returns
    -------
    pandas.DataFrame
        The merged table.
    """
    print('Merge()')
    print()

    def report(named_frames):
        # One line for the label and one for the shape, then a blank line.
        for label, frame in named_frames:
            print(label)
            print(frame.shape)
        print()

    # Step 1: attach each vehicle to its accident.
    acc_veh = df_Acc.merge(df_Veh, on=['CASENUM'], how="inner", sort=False)
    report([
        ('df_Acc.shape', df_Acc),
        ('df_Veh.shape', df_Veh),
        ('data.shape', acc_veh),
    ])

    # Step 2: attach each person to the vehicle they were in.
    data = acc_veh.merge(df_Per, on=['CASENUM', 'VEH_NO'], how="inner", sort=False)
    report([
        ('df_Acc.shape', df_Acc),
        ('df_Veh.shape', df_Veh),
        ('df_Per.shape', df_Per),
        ('data.shape', data),
    ])

    print()
    print(data.head())

    return data

## Drop Pedestrian Crashes

A vehicle hitting another vehicle, a tree, or something else large can result in sudden deceleration different enough from hard braking to trigger an automated notification, but an impact with a pedestrian or bicycle does not.  Our work needs to focus on crashes likely to trigger an automated notification, so we will drop pedestrian crashes from our dataset.  

In [12]:
def Remove_Pedestrian_Crashes(data):
    """Keep only the rows whose PEDS value is 0.

    Displays the PEDS value counts, prints how many rows have PEDS > 0,
    and returns the rows with PEDS == 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a PEDS column.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows with PEDS == 0.  The input frame
        is not modified.
    """
    print ('Remove_Pedestrian_Crashes()')
    display(data.PEDS.value_counts())

    # n counts rows of the table passed in, so the message reports rows.
    n = int((data.PEDS > 0).sum())
    print ('Removing %d rows from crashes that involve a pedestrian.' % n)

    # .copy() makes the result a standalone frame, so callers can add or
    # overwrite columns on it without a SettingWithCopyWarning.
    data = data[data.PEDS==0].copy()
    print ('data.shape: ', data.shape)
    return data

## Run:  Get Data and Preprocess

In [13]:
def Preprocess_Data():
    """Run the preprocessing pipeline and save the merged table.

    Steps: load the three raw tables from the temp files, drop repeated
    and irrelevant features, merge the tables, remove pedestrian crashes,
    recode HOSPITAL to 0/1, and write the result to
    ../../Big_Files/CRSS_Merged_Raw_Data.csv.  Nothing is returned.
    """
    print('Preprocess_Data()')

    # To rebuild from the yearly CRSS files instead of the temp files, use:
    #    tables = Get_Data_from_Original()
    tables = Get_Data_from_Temp_Files()
    tables = Drop_Repeated_Features(*tables)
    tables = Drop_Irrelevant_Features(*tables)

    merged = Merge(*tables)
    data = Remove_Pedestrian_Crashes(merged)

    # Bin the target variable.
    # Either the person went to the hospital or didn't; we don't care how
    # the person got to the hospital.  Codes 1-5 become 1, all others 0.
    transported_codes = [1, 2, 3, 4, 5]

    def went_to_hospital(code):
        if code in transported_codes:
            return 1
        return 0

    data['HOSPITAL'] = data['HOSPITAL'].apply(went_to_hospital)

    data.to_csv('../../Big_Files/CRSS_Merged_Raw_Data.csv', index=False)
    print('Finished Preprocess_Data()')

Preprocess_Data()

Preprocess_Data()
Get_Data
df_Acc.shape =  (313277, 51)
df_Veh.shape =  (553099, 98)
df_Per.shape =  (778008, 69)

Drop_Repeated_Features()
df_Acc.shape =  (313277, 51)
df_Vet.shape =  (553099, 84)
df_Per.shape =  (778008, 38)

Drop_Irrelevant_Features
df_Acc.shape =  (313277, 40)
df_Veh.shape =  (553099, 57)
df_Per.shape =  (778008, 24)

Merge()

df_Acc.shape
(313277, 40)
df_Veh.shape
(553099, 57)
data.shape
(553099, 96)

df_Acc.shape
(313277, 40)
df_Veh.shape
(553099, 57)
df_Per.shape
(778008, 24)
data.shape
(747342, 118)

        CASENUM  PSU   PJ  VE_TOTAL  VE_FORMS  PVH_INVL  PEDS  PERMVIT  \
0  201600014311   44  388         2         2         0     0        2   
1  201600014311   44  388         2         2         0     0        2   
2  201600014315   44  388         2         2         0     0        4   
3  201600014315   44  388         2         2         0     0        4   
4  201600014315   44  388         2         2         0     0        4   

   PERNOTMVIT  NUM_INJ  

0     713566
1      32648
2        974
3        110
4         32
6          8
5          1
11         1
7          1
8          1
Name: PEDS, dtype: int64

Removing 33776 crashes that involve a pedestrian.
data.shape:  (713566, 118)
Finished Preprocess_Data()
