- We have the CRSS dataset in 
    - Big_Files/CRSS_2020_Update/
        - Accident_Raw.csv
        - Vehicle_Raw.csv
        - Person_Raw.csv
- Upload the files
- Drop repeated features
- Drop features that are not useful (such as VINs) or that only appear in some years
- Discretize the features with too many values
- Merge the three dataframes

In [1]:
%%latex
\tableofcontents

<IPython.core.display.Latex object>

# Setup
## Import Libraries

In [2]:
import sys, copy, math, time, os

print ('Python version: {}'.format(sys.version))

import numpy as np
print ('NumPy version: {}'.format(np.__version__))
np.set_printoptions(suppress=True)


import pandas as pd
print ('Pandas version:  {}'.format(pd.__version__))
pd.set_option('display.max_rows', 500)

# Library for reading Microsoft Access files
import pandas_access as mdb


# Set Randomness.  Copied from https://www.kaggle.com/code/abazdyrev/keras-nn-focal-loss-experiments
import random


Python version: 3.9.7 (default, Oct 22 2021, 13:24:00) 
[Clang 13.0.0 (clang-1300.0.29.3)]
NumPy version: 1.21.3
Pandas version:  1.2.4


# Import Data

## Get Data
- Get_Data_from_Original() reads the original CRSS files from the CRSS directory, preprocesses them, writes the results to files in a folder outside this GitHub repo (because the files are too large for my subscription), and returns the dataframes.
- Get_Data_from_Temp_Files() reads those temp files and returns the dataframes.  I created this option for running repeatedly while writing and debugging, because it's much faster.

In [3]:
def Get_Data():
    print ('Get_Data')
    df_Acc = pd.read_csv('../../Big_Files/Accident_Raw.csv', low_memory=False)
    df_Veh = pd.read_csv('../../Big_Files/Vehicle_Raw.csv', low_memory=False)
    df_Per = pd.read_csv('../../Big_Files/Person_Raw.csv', low_memory=False)
    
    print ('df_Acc.shape = ', df_Acc.shape)
    print ('df_Veh.shape = ', df_Veh.shape)
    print ('df_Per.shape = ', df_Per.shape)
    print ()
    
    return df_Acc, df_Veh, df_Per

In [4]:
df_Acc, df_Veh, df_Per = Get_Data()


Get_Data
df_Acc.shape =  (259077, 51)
df_Veh.shape =  (457314, 97)
df_Per.shape =  (644274, 67)



## Drop Repeated Features

In [5]:
def Drop_Repeated_Features(df_Acc, df_Veh, df_Per):
    print ('Drop_Repeated_Features()')
    Acc_Cols = df_Acc.columns.tolist()
    Veh_Cols = df_Veh.columns.tolist()
    Per_Cols = df_Per.columns.tolist()
    
    Drop_Veh = [x for x in Veh_Cols if x in Acc_Cols]
    Drop_Per = [x for x in Per_Cols if (x in Acc_Cols or x in Veh_Cols)]
        
    """
    print ('Drop_Veh:')
    for item in Drop_Veh:
        print (item)
    print ()

    print ('Drop_Per:')
    for item in sorted(Drop_Per):
        print (item)
    print ()
    """    
    
    # We need to keep these for merging the dataframes.
    Drop_Veh.remove('CASENUM')
    Drop_Per.remove('CASENUM')
    Drop_Per.remove('VEH_NO')
    
    df_Veh.drop(columns=Drop_Veh, inplace=True)
    df_Per.drop(columns=Drop_Per, inplace=True)

    print ('df_Acc.shape = ', df_Acc.shape)
    print ('df_Vet.shape = ', df_Veh.shape)
    print ('df_Per.shape = ', df_Per.shape)
    print ()
    
    return df_Acc, df_Veh, df_Per
                                        

In [6]:
df_Acc, df_Veh, df_Per = Drop_Repeated_Features(df_Acc, df_Veh, df_Per)

Drop_Repeated_Features()
df_Acc.shape =  (259077, 51)
df_Vet.shape =  (457314, 83)
df_Per.shape =  (644274, 38)



## Drop Irrelevant Features

In [7]:
def Drop_Irrelevant_Features(df_Acc, df_Veh, df_Per):
    
    print ('Drop_Irrelevant_Features')
    
    Drop_Accident = [
        'CF1',
        'CF2',
        'CF3',
        'MINUTE',
        'MINUTE_IM',
        'PSU_VAR',
        'PSUSTRAT',
        'STRATUM',
        'WEATHER1',
        'WEATHER2',
        'WEIGHT',
    ]
    
    df_Acc.drop(columns=Drop_Accident, inplace=True)
    
    # List of features in df_Veh that aren't repeats from df_Acc 
    # that we don't want to use, even for imputation, because
    # they're only for some years or are like random numbers
    Drop_Vehicle = [
        'DR_SF1',
        'DR_SF2',
        'DR_SF3',
        'DR_SF4',
        'DR_ZIP',
        'GVWR',
        'GVWR_FROM',
        'GVWR_TO',
        'HAZ_ID',
        'ICFINALBODY',
        'MCARR_I1',
        'MCARR_I2',
        'MCARR_ID',
        'TRLR1GVWR',
        'TRLR1VIN',
        'TRLR2GVWR',
        'TRLR2VIN',
        'TRLR3GVWR',
        'TRLR3VIN',
        'UNITTYPE',
        'V_CONFIG',
        'V_Config',
        'VEH_SC1',
        'VEH_SC2',
        'VIN',
        'VPICBODYCLASS',
        'VPICMAKE',
        'VPICMODEL',
    ]
        
    df_Veh.drop(columns=Drop_Vehicle, inplace=True)
    
    Drop_Person = [
        'ATST_TYP',
        'DRUGRES1',
        'DRUGRES2',
        'DRUGRES3',
        'DRUGTST1',
        'DRUGTST2',
        'DRUGTST3',
        'DSTATUS',
        'HELM_MIS',
        'HELM_USE',
        'P_SF1',
        'P_SF2',
        'P_SF3',
        'STR_VEH',
    ]
    
    df_Per.drop(columns=Drop_Person, inplace=True)
    
    
    print ('df_Acc.shape = ', df_Acc.shape)
    print ('df_Veh.shape = ', df_Veh.shape)
    print ('df_Per.shape = ', df_Per.shape)
    print ()
    
    
    return df_Acc, df_Veh, df_Per

In [8]:
df_Acc, df_Veh, df_Per = Drop_Irrelevant_Features(df_Acc, df_Veh, df_Per)

Drop_Irrelevant_Features
df_Acc.shape =  (259077, 40)
df_Veh.shape =  (457314, 55)
df_Per.shape =  (644274, 24)



# Binning

## Tools

In [9]:
def Build_Individual_Feature_with_Dict(df, data, feature, A):
    D = {}
    for B in A:
        for b in B[1]:
            D[b] = B[0]

    data[feature] = df[feature].replace(D)
    
#    print (feature)
#    print (df[feature].value_counts())
#    print ('isna(): ', df[feature].isna().sum())
#    print (data[feature].value_counts())
#    print ()

    return data

In [10]:
def Analyze_Binning(df):
    print ('Analyze_Binning')
    Cols = df.columns.values.tolist()
    Cols = sorted(Cols)
    for feature in Cols:
        u =  len(df[feature].unique())
        print (feature, u)
        
    print ()
    return 0
    

## Bin Accident Dataframe

In [11]:
def Build_Accident_Dataset(df_Acc):
    print ('Build_Accident_Dataset()')
    data = pd.DataFrame()
    
    # Reference
    data['CASENUM'] = df_Acc['CASENUM']
    
    feature = 'HOUR_IM'
    A = [
        ['Early_Morn', [5,6]],
        ['Morning', [7,8,9,10]],
        ['Mid_Day', [11,12,13,14]],
        ['Rush_Hour', [15,16,17]],
        ['Early_Eve', [18,19]],
        ['Evening', [20,21,22]],
        ['Late_Nght',[23,0,1,2,3,4]],
        ['Unknown', [99]],
             ]
    
    feature = 'HOUR_IM'
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)
    feature = 'HOUR'
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)

    feature = 'INT_HWY'
    A = [
        [0, [0,9]],
        [1, [1]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)

    A =  [
        ['Dark', [2]],
        ['Dawn_Lighted', [3,4,6]],
        ['Dusk', [5]],
        ['Daylight', [1,7]],
        ['Unknown', [8,9]],
    ]
    feature = 'LGTCON_IM'
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)
    feature = 'LGT_COND'
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)

    feature = 'MONTH'
    A = [
        ['Winter', [1,2,3,12]],
        ['Spring_Fall', [4,5,10,11]],
        ['Summer', [6,7,8,9]],
    ]
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)

    feature = 'PEDS'
    B = [x for x in list(df_Acc[feature].unique()) if x not in [0,1]]
#    print (B)
    A = [
        ['0', [0]],
        ['1', [1]],
        ['Multiple', B]
    ]
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)

    feature = 'PERMVIT'
    B = [x for x in list(df_Acc[feature].unique()) if x not in [1,2]]
#    print (B)
    A = [
        ['1', [1]],
        ['2', [2]],
        ['Multiple', B]
    ]
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)

    feature = 'REL_ROAD'
    A =  [
        ['Not_on_Road', [2,3,4,5,6,8,10,12,98,99]],
        ['On_Road', [1,11]],
        ['Parking_Area', [7]],
    ]
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)

    A = [
        ['A', [2,5,6,19]],
        ['B', [1,7,16]],
        ['C', [4,8,18]],
        ['D', [3,17,20]],
        ['Unknown', [98,99]],
    ]
    feature = 'RELJCT2_IM'
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)
    feature = 'RELJCT2'
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)


    feature = 'SCH_BUS'
    A = [
        ['0', [0]],
        ['1', [1]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)


    feature = 'URBANICITY'
    A = [
        ['1', [1]],
        ['2', [2]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)

    feature = 'VE_TOTAL'
    B = [x for x in list(df_Acc[feature].unique()) if x not in [1,2,3]]
#    print (B)
    A = [
        ['1', [1]],
        ['2', [2]],
        ['3', [3]],
        ['Multiple', B]
    ]
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)

    A = [
        ['A', [3,5]],
        ['B', [1]],
        ['C', [2]],
        ['D', [10]],
        ['E', [4,6,7,8,11,12]],
        ['Unknown', [98,99]],
    ]
    feature = 'WEATHR_IM'
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)
    feature = 'WEATHER'
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)

    A = [
        ['Weekend', [1,7]],
        ['Weekday', [2,3,4,5,6]],
        ['Unknown', [9]],
    ]
    feature = 'WKDY_IM'
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)
    feature = 'DAY_WEEK'
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)


    feature = 'WRK_ZONE'
    A = [
        ['0', [0]],
        ['1', [1,4]],
        ['2', [2]],
        ['3', [3]],
    ]
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)

    
    feature = 'VE_FORMS'
    B = [x for x in list(df_Acc[feature].unique()) if x not in [1,2,3]]
#    print (B)
    A = [
        ['1', [1]],
        ['2', [2]],
        ['3', [3]],
        ['Multiple', B]
    ]
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)
    
    feature = 'PVH_INVL'
    B = [x for x in list(df_Acc[feature].unique()) if x not in [0]]
#    print (B)
    A = [
        ['0', [0]],
        ['1+', B]
    ]
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)
    
    feature = 'PERNOTMVIT'
    B = [x for x in list(df_Acc[feature].unique()) if x not in [0]]
#    print (B)
    A = [
        ['0', [0]],
        ['1+', B]
    ]
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)
    
    feature = 'NUM_INJ'
    B = [x for x in list(df_Acc[feature].unique()) if x not in [0,1,2,3,99]]
#    print (B)
    A = [
        ['0', [0]],
        ['1', [1]],
        ['2', [2]],
        ['3', [3]],
        ['4+', B],
        ['Unknown', [99]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)
    
    feature = 'NO_INJ_IM'
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)
        
    # Split into five bins, each about 20% of samples, ordered by correlation
    feature = 'PSU'
    A = [
        ['0', [15,75,34,57,40,66,76,80,52,64,68,60,50,10,24,55,47,49,31,]],
        ['1', [62,53,63,72,17,56,30,48,35,]],
        ['2', [65,82,25,32,83,78,12,45,58,13,]],
        ['3', [67,14,26,70,28,22,33,81,29,20,54,77,]],
        ['4', [27,61,39,41,51,59,38,37,46,44,]],
    ]
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)
    
    # Split into five bins, each about 20% of samples, ordered by correlation
    feature = 'PJ'
    A = [
        ['0', [3087,147,2904,3292,3069,47,3090,1225,2800,149,610,598,2705,1645,587,1688,2211,2171,2412,3089,2679,453,2139,2537,2764,1222,1801,4113,2514,2582,2722,2298,189,1741,1750,85,1766,1223,1684,2513,2775,171,4144,4056,1231,173,4047,2330,299,172,1362,1634,2793,1747,308,256,307,1692,1055,1838,96,1070,1227,1392,1678,1230,3262,3106,2735,3224,2160,91,1805,313,4147,2286,3076,1315,2586,1460,1757,1709,1802,1800,2591,542,209,46,305,4107,3122,565,1693,2001,1763,1811,2881,3077,268,1308,3247,1762,1804,2883,1219,4016,3073,2035,295,606,205,1829,260,718,4149,839,2592,4152,1764,1921,]],
        ['1', [618,1803,4135,234,250,2702,2087,1695,1733,526,2906,2905,206,2018,2854,2973,4125,2749,245,4015,174,297,2972,1053,306,311,2292,970,2598,1708,321,4150,1290,359,1723,257,2803,1197,92,2670,1207,458,322,1208,285,86,261,267,315,2811,455,1036,2809,1714,459,4148,232,4151,4138,1646,640,4141,161,4153,2034,1710,2365,3296,2607,591,2807,1259,]],
        ['2', [1736,2851,432,210,625,936,262,461,4114,3004,437,1191,1056,4142,45,2857,4143,1374,4055,1746,2509,1635,3294,4019,2364,457,97,4146,1482,1041,4145,1265,2091,1459,892,87,1114,1637,4139,893,456,2136,2682,2810,466,170,1919,516,2808,448,314,460,3070,3119,]],
        ['3', [464,3139,4028,3291,1075,2799,1069,2907,318,329,469,4045,123,1088,4093,1724,1052,652,1835,148,1925,1571,452,3245,2792,479,967,441,508,211,2825,4012,1040,2763,2137,1484,1699,3131,1283,2853,1255,3011,440,2199,2855,3248,567,2197,2666,4140,1117,2759,3010,310,2782,214,309,966,1920,1928,208,138,2152,159,1481,3202,3133,1573,2797,2819,1038,1098,1577,3017,590,1079,543,896,2802,130,1366,1278,721,650,1472,900,137,165,]],
        ['4', [454,1262,500,2092,1568,3019,1628,3246,162,506,1570,336,954,1050,505,90,269,1260,2812,1383,504,160,965,972,929,3159,4137,2717,2168,573,163,2151,1477,1369,1080,1933,3013,1163,1078,1361,1930,4136,386,341,687,382,569,571,1247,503,2687,2748,378,877,517,2755,509,641,136,338,905,568,578,2025,3124,525,514,3253,1721,515,1319,1381,362,334,3200,375,388,337,369,3201,1043,353,3203,360,4036,4029,2411,3209,]],
    ]
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)
    
    feature = 'MAN_COLL'
    A = [
        ['0', [2]],
        ['1', [0]],
        ['2', [6,8]],
        ['3', [1]],
        ['4', [7,11,9,10]],
        ['Unknown', [98,99]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)
    
    feature = 'MANCOL_IM'
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)
    
    # Grouped these by the groupings given in the Analytical Users Manual 
    # because 78% of the crashes were '12', crash with another vehicle.  
    feature = 'HARM_EV'
    A = [
        ['Non_Collision', [1,2,3,4,5,6,7,16,44,51,72]],
        ['Collision_with_MVIT', [12,54,55]],
        ['Collision_with_Object_Not_Fixed', [8,9,10,11,14,15,18,45,49,73,74,91]],
        ['Collision_with_Fixed_Object', [17,19,20,21,23,24,25,26,30,31,32,33,34,35,38,39,40,41,42,43,46,48,50,52,53,57,58,59,93]],
        ['Unknown', [98,99]],        
    ]
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)
    
    feature = 'EVENT1_IM'
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)

    feature = 'TYP_INT'
    A = [
        ['0', [11,10,3,]],
        ['1', [1]],
        ['2', [2]],
        ['3', [7,4,6,5,]],
        ['Unknown', [98,99]],        
    ]
    data = Build_Individual_Feature_with_Dict(df_Acc, data, feature, A)

    
    for feature in [
        'YEAR',
        'REGION',
        'ALCOHOL', 
        'ALCHL_IM',
        'MAX_SEV',
        'MAXSEV_IM',
        'RELJCT1',
        'RELJCT1_IM',
    ]:
        data[feature] = df_Acc[feature]

    print ('Need to Do:')
    A = data.columns.values.tolist()
    B = df_Acc.columns.values.tolist()
    C = [b for b in B if b not in A]
    for c in C:
        u = len(df_Acc[c].unique())
        print (c, u)
    
    print ()
    return data

## Bin Vehicle Dataframe

In [12]:
def Build_Vehicle_Dataset_Old_Version(df_Veh):
    print ('Build_Vehicle_Dataset()')
    data = pd.DataFrame()
    
    # Reference
    data['CASENUM'] = df_Veh['CASENUM']
    data['VEH_NO'] = df_Veh['VEH_NO']
    
    feature = 'ACC_TYPE'
    A = [
        ['0', [61,60,51,50,53,59,52,55,58,6,54,1,10,14,16,5,2,7,8,4,0,62,3,9,89,69,41,64,66,87,90,91,]],
        ['1', [83,34,35,88,68,65,86,30,82,38,73,39,]],
        ['2', [98,25,22,11,31,77,12,40,85,24,26,32,71,81,79,29,27,43,]],
        ['3', [21,33,42,48,75,72,80,15,78,28,76,44,45,84,49,]],
        ['4', [20,67,23,74,47,70,46,93,13,92,63,36,37,]],
        ['Unknown', [98,99]],        
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)

    feature = 'BDYTYP_IM'
    A = [
        ['0', [86,87,82,89,81,83,84,80,88,85,90,95,11,97,96,58,12,45,32,91,10,2,59,3,30,]],
        ['1', [4,]],
        ['2', [1,19,42,5,8,16,6,52,]],
        ['3', [14,]],
        ['4', [9,20,22,40,]],
        ['5', [34,31,15,29,39,55,92,17,21,50,93,48,7,28,51,61,67,63,62,66,65,78,64,72,60,71,73,94,41,13,]],
        ['Unknown', [98,99,49,79]],
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    feature = 'BODY_TYP'
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)

    feature = 'BUS_USE'
    # Modified from automated suggestion because '1' is 98.46%, 
    # and I wanted more differentiation in the others.  
    A = [
        ['0', [5,]], # Charter
        ['1', [0,]], # Not a bus
        ['2', [6,7,]], # Transit/Shuttle (slow vehicles)
        ['3', [8,1,4,]], # Modified, School, Intercity
        ['Unknown', [98,99]],
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    
    # Modified from automated suggestion because "0" is 94.7%
    feature = 'CARGO_BT'
    A = [
        ['0', [0,]], # Not applicable
        ['1', [22,10,]], # Bus/Log
        ['2', [5,2,4,12,8,1,3,11,7,6,9,]],
        ['Uknown', [96,97,98,99]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    
    feature = 'DEFORMED'
    A = [
        ['0', [6,]],
        ['1', [4,]],
        ['2', [2,]],
        ['3', [0,]],
        ['Unknown', [8,9]]
    ]    
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    
    
    feature = 'DR_PRES' # Does not give useful information, 
    # since '1' is 99.975% of samples
    
    feature = 'EMER_USE'
    # Does not give useful information, since '0' is 98.8278% of samples.

    data['FIRE_EXP'] = df_Veh['FIRE_EXP']

    # None of these give useful information, since '0' is 99.9604% of samples.
    feature = 'HAZ_CNO'
    feature = 'HAZ_INV'
    feature = 'HAZ_PLAC'
    feature = 'HAZ_REL'
    
    feature = 'HIT_RUN'
    A = [
        ['0', [0,]],
        ['1', [1,]],
        ['Unknown', [9,]],
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)

    feature = 'IMPACT1'
    A = [
        ['0', [0,14,61,9,81,3,]],
        ['1', [12,]],
        ['2', [62,11,]],
        ['3', [10,1,82,2,8,4,63,19,20,83,]],
        ['4', [6,]],
        ['5', [7,5,13,18,]],
        ['Unknown', [98,99]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)

    feature = 'J_KNIFE'
    A = [
        ['0', [0,]],
        ['1', [1,]],
        ['2', [2,3,]],
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    
    feature = 'M_HARM'
    A = [
        ['0', [74,10,1,5,21,42,32,35,19,46,39,30,6,93,20,45,23,3,58,52,34,2,25,26,24,33,31,44,17,38,41,43,7,91,40,48,57,59,53,]],
        ['1', [12,]],
        ['2', [14,16,55,49,18,50,72,11,73,54,51,8,15,9,]],
        ['Unknown', [98,99]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    feature = 'VEVENT_IM'
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)    
    
    feature = 'MAK_MOD'
    A = [
        ['0', [6010,76733,72704,71705,22001,43399,50031,2401,74706,30441,53702,71709,34705,99705,7017,20013,50709,99707,37733,37704,76709,20988,37702,53709,3884,12012,37709,73709,76703,98709,72709,98703,53705,98701,73704,76705,53706,58034,37706,98705,72706,73706,37703,76706,34709,50706,53734,73734,98706,73705,53704,76704,98704,37705,73703,53401,98702,50799,76701,99709,34706,77706,76702,71706,37739,37701,50705,76734,74705,53703,41401,42040,98733,99703,55032,98707,7470,9499,42053,69039,32054,38399,9037,99701,72705,93989,64031,73702,19006,76998,69038,6017,49055,7011,98734,20017,7004,55046,76739,22499,30032,35398,20038,20019,35053,74709,12008,94999,50399,12882,36398,31399,19027,12989,98907,42033,77709,12018,32047,21020,36399,41047,12998,13403,18402,14499,12015,52035,24008,69042,24002,54032,53999,59043,49441,22009,22023,19017,37734,24441,92989,12403,21017,98739,18003,22398,53036,2431,6398,21005,9019,10041,19019,23988,69398,69399,84998,45044,6444,42406,52040,7481,63035,67037,67399,65031,14037,49041,37035,48046,98982,13422,21002,35034,39036,54031,9038,14020,35056,52401,7898,20402,52471,52039,9442,55499,13482,52048,39032,20016,21441,21023,51051,6041,18441,39399,65399,53033,24003,20441,12013,20989,63498,55399,54499,13017,20471,41056,35404,63499,52399,12443,6052,20022,39035,2402,41044,20015,49398,38883,63403,19025,34703,63041,30399,13012,24001,19399,18401,55421,52999,54999,42031,20020,20004,18499,52403,35399,18026,55037,6044,7020,53034,6043,13002,6499,18399,12004,12032,49471,14443,20029,41402,67036,12424,20444,47036,42870,39038,34045,22032,7043,67033,59421,23471,34037,58499,25401,13401,47401,12399,63399,22403,36401,35050,37399,20443,35032,7444,20401,58043,41045,35471,20405,63037,22002,18025,20399,13005,63038,37033,22441,9020,62405,3499,52402,12016,12471,22016,37499,23401,35499,98398,14017,45040,14036,53499,7399,18019,63032,63033,35999,18002,63031,49399,52404,13999,20498,21021,14399,48044,14004,20406,20039,6018,35043,13399,42399,22018,14444,38401,18405,49050,62401,41499,35048,
58038,12021,20009,41035,20403,53040,24006,55441,12006,7498,19021,38421,51040,34038,45037,12441,99739,23883,58398,55403,22010,53739,89881,73739,32421,20036,12035,49044,49048,24009,14038,55045,55036,7025,20027,42423,52046,12037,18022,51399,20023,35039,14401,55422,19003,55042,]],
        ['1', [49051,14006,32059,20032,18007,63999,24007,59399,38402,55035,41399,22999,92988,30999,13001,7021,53399,21003,51042,51499,51039,24499,20037,12442,49033,22019,6999,53035,20002,22020,22022,18020,18023,22399,54037,30051,12003,52037,53405,35403,6042,20010,20007,24399,39031,49405,19422,20025,18404,7029,41037,55999,49049,55404,2408,35047,12017,62424,23441,42499,6421,42405,6054,21399,63398,42045,94989,58399,35402,37031,34399,49032,62499,41055,12473,58037,98998,12402,20404,7404,84983,2405,19431,2499,7402,20028,14021,54038,23431,52034,24999,58999,63034,]],
        ['2', [49999,52047,49043,30047,23399,12025,22401,20431,6442,42055,7471,38405,2482,32403,24011,22005,19401,18010,49034,55044,7039,51404,12425,42057,7024,51046,42043,7027,53404,7026,6051,34034,58042,30040,37039,35052,20445,12401,7472,58404,41049,37041,59033,2001,34999,34039,49040,23999,35051,41050,49056,6399,37032,55033,7461,24401,32999,12499,58032,12023,7442,54041,49499,35446,23402,19018,2407,49404,13015,23472,69054,20499,36038,59401,12027,63401,21401,7041,20999,20470,51034,99988,21022,3421,41471,24005,47035,59999,7499,19005,12422,54036,42042,37999,2404,]],
        ['3', [37402,58036,55402,19024,63402,19026,14402,49481,67035,7403,41999,12999,12444,48499,67032,51047,2406,20482,23423,39034,49046,13402,59031,35049,12498,34048,51049,22008,48999,18018,14003,45401,49035,12981,69035,6441,13013,42403,53402,63036,12024,49401,32399,54399,37404,35422,42401,45399,19421,54035,42999,34404,37422,2483,20473,7028,48401,35401,34402,34403,55401,42048,51043,35443,51045,38471,12988,58039,30042,49402,23481,41051,20423,20421,36402,49403,3431,30046,35472,39401,59405,35473,52472,51401,54044,37401,37471,59035,19480,59034,49052,12022,42044,19020,20481,49038,63039,49442,49472,30402,37403,63441,48399,21001,21999,32044,63040,30499,2403,23421,67031,48034,35481,7422,23499,58035,19022,54421,19023,48421,]],
        ['4', [2999,18999,49053,32499,20024,13014,49047,58422,32405,59032,59038,34035,55039,12481,38499,90981,34401,7999,39037,62423,41403,20461,54039,20398,13421,48038,12421,32052,49045,34049,32042,59403,37441,18421,19499,67034,6055,32048,24398,20001,7018,6014,19423,23422,30403,35421,51999,7482,7463,82461,42050,52499,37421,18024,29005,42047,37037,98983,55041,41053,58033,99998,62425,34047,32043,14999,58041,59499,42051,63422,34421,41441,69040,59402,23898,55398,7462,6016,82983,20026,49422,23461,41054,41052,48045,58403,82981,59040,31037,2422,49482,12870,6443,32049,51050,53041,54402,58044,23008,19014,42421,32045,22024,90982,39999,92983,12398,51041,58047,32422,12461,48047,34036,38882,62421,12423,59404,55040,12462,30052,34042,32051,48403,7443,20034,42461,20870,34499,32402,18398,51048,42039,59037,20021,58421,32401,20422,42054,84988,20880,3402,45421,19999,23880,53032,7019,51402,37038,31401,30443,55038,13016,13499,41421,30036,42404,98809,98988,63421,42398,37405,18004,58040,23498,20850,55043,98999,58402,24010,53481,98898,45042,84981,30421,49421,98884,45031,86881,38472,84999,86882,38999,58045,20881,35055,48402,58401,7880,12470,2498,82884,99989,34043,12898,94982,34044,82989,23882,38884,85881,45499,42058,12881,51881,84884,98850,82881,82870,85884,87884,84881,52882,39039,23881,35461,12880,20898,20040,98806,98808,37398,47399,98804,82999,51884,32040,94983,87881,12850,23870,62404,90989,86898,20882,19481,89890,20884,82898,59036,86884,84898,7881,99898,99399,97997,12884,99499,99999,99884,90988,98881,51898,85898,38898,23989,9017,98498,90983,29398,23981,23884,41398,48031,51882,54398,19398,7398,99881,73732,30398,87890,98890,98981,7005,13398,42036,82988,89882,42034,55034,34046,62422,69031,99890,30442,82982,23470,39398,82890,41498,32041,3482,47034,98882,92982,42402,25499,41043,22025,32050,14015,82462,84989,82883,84481,84982,94461,25999,45041,9999,51998,82498,42046,82850,7421,98908,7042,9002,18005,7884,51044,22017,21398,25441,2421,23890,20472,52498,93988,23466,38498,20890,62403,35898,23850,13
481,42038,20466,99850,85999,86999,98805,87999,33033,48043,29399,38473,9034,14039,3441,35441,36039,21499,87898,3999,7001,46039,12036,84890,12007,98598,7007,99870,41046,12033,20442,7013,25890,7870,35042,94981,34398,86890,99981,84883,38890,64033,85890,9399,42422,47398,98902,9001,38403,84850,20035,7015,35044,41034,53403,20981,7850,29001,54401,25498,54033,35038,82499,10037,51053,32055,20008,84498,1399,48398,48037,51890,82882,14031,49042,87882,43032,59039,36999,54034,9008,49498,45398,18001,62402,32404,51398,35498,42462,52898,18498,85883,98731,20883,47031,47037,98870,7002,42041,53701,45999,51989,82998,32046,38404,64032,54040,30048,32398,47999,67398,99599,42850,42056,10034,51403,25884,1001,34405,38881,62426,98904,99982,86883,89898,53733,20407,42037,7033,53398,32056,51988,93981,30033,30043,85882,34422,32058,1008,38988,51052,38441,12890,53031,98883,92981,30053,31036,21018,31999,49054,87883,13423,39499,31422,93983,89884,10045,32406,59042,39402,38474,30034,45045,32060,49057,55406,14398,42898,19009,84421,62427,48498,72707,18021,]],
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)

    feature = 'MAKE'
    A = [
        ['0', [74,76,71,72,50,73,77,43,53,98,64,65,21,9,52,22,14,18,92,24,37,39,63,]],
        ['1', [35,6,36,55,67,]],
        ['2', [20,]],
        ['3', [13,69,34,]],
        ['4', [49,]],
        ['5', [30,]],
        ['6', [12,]],
        ['7', [19,2,41,]],
        ['8', [58,7,42,54,47,93,23,59,25,48,38,62,3,32,29,51,31,45,90,10,94,86,89,84,85,82,87,97,33,46,1,]],
        ['Unknown', [99]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)

    feature = 'MAX_VSEV'
    A = [
        ['0', [3,5,6,4,2,]],
        ['1', [1,]],
        ['2', [0,]],
        ['Unknown', [9,]],
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    feature = 'MXVSEV_IM'
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)

    feature = 'MDLYR_IM'
    feature = 'MOD_YEAR'
    # These two we will deal with later.  
    # We will merge in 'YEAR' from Accident, then make a new feature, 'VEH_AGE'.
    
    feature = 'MODEL'
    A = [
        ['0', [709,703,701,706,704,705,702,707,799,733,734,907,739,12,56,11,16,4,19,471,424,9,22,29,50,6,3,20,2,21,59,37,18,7,43,13,]],
        ['1', [399,52,17,444,25,36,1,38,998,406,408,15,39,47,31,35,32,405,48,]],
        ['2', [431,33,27,989,23,445,34,40,404,44,425,446,988,26,51,401,407,28,]],
        ['3', [402,42,49,46,443,442,24,499,473,57,54,483,403,441,472,423,41,5,55,480,398,498,]],
        ['4', [481,421,45,422,53,470,14,482,463,461,10,8,999,983,982,870,462,981,883,809,882,880,58,881,806,808,804,884,898,850,997,890,732,908,466,805,598,902,731,599,426,904,474,60,427,]],
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    
    
    
    feature = 'NUM_INJV'
    B = [x for x in list(df_Veh[feature].unique()) if x not in [0,1,2,3,99]]
#    print (B)
    A = [
        ['0', [0]],
        ['1', [1]],
        ['2', [2]],
        ['3', [3]],
        ['4+', B],
        ['Unknown', [99]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    feature = 'NUMINJ_IM'
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)

    feature = 'NUMOCCS'
    B = [x for x in list(df_Veh[feature].unique()) if x not in [0,1,2,3,99]]
#    print (B)
    A = [
        ['0', [0]],
        ['1', [1]],
        ['2', [2]],
        ['3', [3]],
        ['4+', B],
        ['Unknown', [99]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)

    feature = 'P_CRASH1'
    A = [
        ['0', [14,0,7,3,17,6,]],
        ['1', [1,]],
        ['2', []],
        ['3', [11,]],
        ['4', [12,2,16,]],
        ['5', [5,]],
        ['6', [15,4,10,8,9,13,]],
        ['Unknown', [98,99]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    feature = 'PCRASH1_IM'
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    
    feature = 'P_CRASH2'
    A = [
        ['0', [8,9,12,13,14,54,1,4,6,5,62,2,66,3,55,]],
        ['1', [17,67,72,63,68,91,64,78,15,10,]],
        ['2', [71,19,18,92,21,65,90,51,70,59,]],
        ['3', [53,]],
        ['4', [73,60,74,87,61,89,]],
        ['5', [52,11,88,16,50,56,20,80,82,81,84,85,83,]],
        ['Unknown', [98,99]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    
    feature = 'P_CRASH3'
    A = [
        ['0', [15,6,7,0,11,10,8,9,12,]],
        ['1', [5,16,]],
        ['2', [1,]],
        ['Unknown', [98,99]],
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)

    feature = 'PCRASH4'
    A = [
        ['0', [7,3,4,5,2,0,]],
        ['1', [1,]],
        ['Unknown', [9]],
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    
    feature = 'PCRASH5'
    A = [
        ['0', [6,]],
        ['1', [4,]],
        ['2', [0,3,5,]],
        ['3', [2,]],
        ['4', [1,]],
        ['5', [7,]],
        ['Unknown', [9]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    
    # Reduce 'Location of Rollover' to 'Was there a rollover?'
    feature = 'ROLINLOC'
    A = [
        ['0', [7,3,6,1,5,4,2,9]],
        ['1', [0,]],
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    # Eliminate 'Cause of Rollover' because we would reduce it to the same information.
    feature = 'ROLLOVER'
    
    feature = 'SPEC_USE'
    A = [
        ['0', [19,4,10,1,5,]],
        ['1', [0,]],
        ['2', [3,20,8,6,21,13,2,22,7,23,12,11,]],
        ['Unknown', [98,99]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    
    feature = 'SPEEDREL'
    A = [
        ['0', [3,2,5,4,]],
        ['1', [0,]],
        ['Unknown', [8,9,]],
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    
    feature = 'TOW_VEH'
    A = [
        ['0', [3,]],
        ['1', [0,]],
        ['2', [6,5,1,2,4,]],
        ['Unknown', [9]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    
    feature = 'TOWED'
    A = [
        ['0', [2,]],
        ['1', [7,3,9,]],
        ['2', [8,]],
        ['3', [5,]],
        ['Unknown', [8,9]],
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)

    feature = 'TRAV_SP'
    B = [x for x in list(df_Veh[feature].unique()) if x > 89 and x<998]
    A = [
        ['0-9', range(0,10)],
        ['10-19', range (10,20)],
        ['20-29', range (20,30)],
        ['30-39', range (30,40)],
        ['40-49', range (40,50)],
        ['50-59', range (50,60)],
        ['60-69', range (60,70)],
        ['70-79', range (70,80)],
        ['80-89', range (80,90)],
        ['90+', B],
        ['Unknown', [998,999]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    
    feature = 'V_ALCH_IM'
    A = [
        ['0', [1,]],
        ['1', [2,]],
        ['Unknown', [8,9]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    feature = 'VEH_ALCH'
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    
    feature = 'VALIGN'
    A = [
        ['0', [3,2,4,]],
        ['1', [1,]],
        ['2', [0,]],
        ['Unknown', [8,9,]],
    ]
    data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)
    
    
    
    
    
    
    print ('Unfinished Features:')
    X = df_Veh.columns.values.tolist()
    Y = data.columns.values.tolist()
    Z = [x for x in X if x not in Y]
    for z in Z:
        print (z)
    print ()
    
    print ()
    return data
    


In [13]:
def Build_Vehicle_Dataset(df_Veh):
    """Build the binned vehicle dataframe from the pruned Vehicle table.

    The reference columns CASENUM and VEH_NO are copied unchanged.  Every
    other feature listed in ``binning`` below is handed, in order, to
    Build_Individual_Feature_with_Dict (defined elsewhere in this file)
    together with its list of ``[label, raw_codes]`` bins; that helper
    decides how the bins are applied and returns the updated dataframe.

    Parameters
    ----------
    df_Veh : pandas.DataFrame
        Vehicle table; must contain CASENUM, VEH_NO and every feature
        named in ``binning``.

    Returns
    -------
    pandas.DataFrame
        The dataframe returned by the last helper call.

    Notes
    -----
    - The trailing ``#  xx.xxxx %`` comments were carried over from the
      tool that generated the bins (presumably the share of rows per bin
      at generation time -- TODO confirm); they are not recomputed here
      and may be stale for features marked "Modified by hand".
    - The 'Unknowns' bin is listed last for every feature and may be empty.
    """
    print ('Build_Vehicle_Dataset()')
    data = pd.DataFrame()

    # Reference
    data['CASENUM'] = df_Veh['CASENUM']
    data['VEH_NO'] = df_Veh['VEH_NO']

    # Ordered table of (feature, bins).  Order matters only in that it is the
    # order in which the helper is called for each feature.
    binning = [
        ('ACC_TYPE', [
            ['0', [61,60,51,50,53,59,52,55,58,6,54,1,10,14,16,5,2,7,8,4,0,62,3,9,89,69,41,64,66,87,90,91,]], #  21.7211 %
            ['1', [83,34,35,88,68,65,86,30,82,38,73,39,]], #  20.5914 %
            ['2', [98,25,22,11,31,77,12,40,85,24,26,32,71,81,79,29,27,43,]], #  22.3576 %
            ['3', [21,33,42,48,75,72,80,15,78,28,76,44,45,84,49,]], #  22.3708 %
            ['4', [20,67,23,74,47,70,46,93,13,92,63,36,37,]], #  12.9592 %
            ['Unknowns', [99, ]]
        ]),

        ('BDYTYP_IM', [
            ['0', [86,87,82,89,81,83,84,80,88,85,90,95,11,97,96,58,12,45,32,91,10,2,59,3,30,]], #  7.1472 %
            ['1', [4,]], #  36.3414 %
            ['2', [1,19,42,5,8,16,6,52,]], #  10.832 %
            ['3', [14,]], #  15.7266 %
            ['4', [9,20,22,40,]], #  16.7815 %
            ['5', [34,31,15,29,39,55,92,17,21,50,93,48,7,28,51,61,67,63,62,66,65,78,64,72,60,71,73,94,41,13,]], #  13.1715 %
            ['Unknowns', []]
        ]),

        ('BODY_TYP', [
            ['0', [86,87,82,89,81,83,80,84,88,85,90,95,11,97,96,58,45,12,32,91,10,2,3,59,1,30,]], #  7.8638 %
            ['1', [4,]], #  36.434 %
            ['2', [19,42,5,8,16,6,]], #  9.943 %
            ['3', [14,]], #  15.5655 %
            ['4', [52,9,20,22,40,]], #  16.9082 %
            ['5', [34,31,15,29,39,55,92,17,21,50,93,48,28,7,51,61,67,63,62,66,65,78,64,72,60,71,73,94,41,13,]], #  13.2856 %
            ['Unknowns', [98, 99, 49, 79, ]]
        ]),

        ('BUS_USE', [
            ['0', [5,]], #  0.0207 %
            ['1', [0,]], #  99.5385 %
            ['2', [6,7,8,1,4,]], #  0.4407 %
            ['Unknowns', [98, 99, ]]
        ]),

        ('CARGO_BT', [ # Modified by hand
            ['0', [0,]], #  96.4126 %
            ['1', [22,10,5,2,4,12,8,1,97,3,96,11,7,6,9,]], #  3.5873 %
            ['Unknowns', [98, 99, ]]
        ]),

        ('DEFORMED', [ # Modified by hand
            ['0', [6,]], #  39.2768 %
            ['1', [4,]], #  25.1503 %
            ['2', [2,]], #  31.9667 %
            ['3', [0,]], #  3.6062 %
            ['Unknowns', [8, 9, ]]
        ]),

        ('DR_PRES', [ # Modified by hand
            ['0', [0,]], #  0.0216 %
            ['1', [1,]], #  99.9784 %
            ['Unknowns', [9, ]]
        ]),

        ('EMER_USE', [
            ['0', [6,5,]], #  0.1932 %
            ['1', [0,]], #  99.6845 %
            ['2', [4,3,2,]], #  0.1222 %
            ['Unknowns', [8, 9, ]]
        ]),

        ('FIRE_EXP', [ # Modified by hand
            ['0', [1,]], #  0.2076 %
            ['1', [0,]], #  99.7924 %
            ['Unknowns', []]
        ]),

        ('HAZ_CNO', [
            ['0', [9,]], #  0.0011 %
            ['1', [0,]], #  99.9774 %
            ['2', [1,2,8,3,4,6,5,]], #  0.0215 %
            ['Unknowns', [88, ]]
        ]),

        ('HAZ_INV', [ # Modified by hand
            ['0', [1,]], #  99.9604 %
            ['1', [2,]], #  0.0396 %
            ['Unknowns', []]
        ]),

        ('HAZ_PLAC', [ # Modified by hand
            ['0', [0,]], #  99.9654 %
            ['1', [2,1,]], #  0.0345 %
            ['Unknowns', [8, ]]
        ]),

        ('HAZ_REL', [
            ['0', [2,]], #  0.0066 %
            ['1', [0,]], #  99.9682 %
            ['2', [1,]], #  0.0252 %
            ['Unknowns', [8, ]]
        ]),

        ('HIT_RUN', [ # Modified by hand
            ['0', [0,]], #  94.8247 %
            ['1', [1,]], #  5.1753 %
            ['Unknowns', [9, ]]
        ]),

        ('IMPACT1', [
            ['0', [0,14,61,9,81,3,]], #  9.5695 %
            ['1', [12,]], #  42.3692 %
            ['2', [62,11,]], #  8.9371 %
            ['3', [10,1,82,2,8,4,63,19,20,83,]], #  12.8232 %
            ['4', [6,]], #  22.3991 %
            ['5', [7,5,13,18,]], #  3.9018 %
            ['Unknowns', [98, 99, ]]
        ]),

        ('IMPACT1_IM', [
            ['0', [0,14,61,9,81,3,]], #  9.5846 %
            ['1', [12,]], #  42.5618 %
            ['2', [62,11,]], #  8.9862 %
            ['3', [10,1,82,2,4,8,63,20,83,]], #  12.9432 %
            ['4', [6,]], #  22.0047 %
            ['5', [19,7,13,5,18,]], #  3.9193 %
            ['Unknowns', []]
        ]),

        ('J_KNIFE', [
            ['0', [2,]], #  0.0483 %
            ['1', [0,]], #  97.4523 %
            ['2', [3,1,]], #  2.4994 %
            ['Unknowns', []]
        ]),

        ('M_HARM', [
            ['0', [74,10,1,5,21,42,32,35,19,46,39,30,6,93,20,45,23,3,58,52,34,2,25,26,24,33,31,44,17,38,41,43,7,91,40,48,57,59,53,]], #  10.0639 %
            ['1', [12,]], #  81.0472 %
            ['2', [14,16,55,49,18,50,72,11,73,54,51,8,15,9,]], #  8.8889 %
            ['Unknowns', [98, 99, ]]
        ]),

        ('MAK_MOD', [
            ['0', [6010,76733,72704,71705,22001,43399,50031,2401,74706,30441,53702,71709,34705,99705,7017,20013,50709,99707,37733,37704,76709,20988,37702,53709,3884,12012,37709,73709,76703,98709,72709,98703,53705,98701,73704,76705,53706,58034,37706,98705,72706,73706,37703,76706,34709,50706,53734,73734,98706,73705,53704,76704,98704,37705,73703,53401,98702,50799,76701,99709,34706,77706,76702,71706,37739,37701,50705,76734,74705,53703,41401,42040,98733,99703,55032,98707,7470,9499,42053,69039,32054,38399,9037,99701,72705,93989,64031,73702,19006,76998,69038,6017,49055,7011,98734,20017,7004,55046,76739,22499,30032,35398,20038,20019,35053,74709,12008,94999,50399,12882,36398,31399,19027,12989,98907,42033,77709,12018,32047,21020,36399,41047,12998,13403,18402,14499,12015,52035,24008,69042,24002,54032,53999,59043,49441,22009,22023,19017,37734,24441,92989,12403,21017,98739,18003,22398,53036,2431,6398,21005,9019,10041,19019,23988,69398,69399,84998,45044,6444,42406,52040,7481,63035,67037,67399,65031,14037,49041,37035,48046,98982,13422,21002,35034,39036,54031,9038,14020,35056,52401,7898,20402,52471,52039,9442,55499,13482,52048,39032,20016,21441,21023,51051,6041,18441,39399,65399,53033,24003,20441,12013,20989,63498,55399,54499,13017,20471,41056,35404,63499,52399,12443,6052,20022,39035,2402,41044,20015,49398,38883,63403,19025,34703,63041,30399,13012,24001,19399,18401,55421,52999,54999,42031,20020,20004,18499,52403,35399,18026,55037,6044,7020,53034,6043,13002,6499,18399,12004,12032,49471,14443,20029,41402,67036,12424,20444,47036,42870,39038,34045,22032,7043,67033,59421,23471,34037,58499,25401,13401,47401,12399,63399,22403,36401,35050,37399,20443,35032,7444,20401,58043,41045,35471,20405,63037,22002,18025,20399,13005,63038,37033,22441,9020,62405,3499,52402,12016,12471,22016,37499,23401,35499,98398,14017,45040,14036,53499,7399,18019,63032,63033,35999,18002,63031,49399,52404,13999,20498,21021,14399,48044,14004,20406,20039,6018,35043,13399,42399,22018,14444,38401,18405,49050,62401,41499,35048,58038,12021,20009,41035,20403,53040,24006,55441,12006,7498,19021,38421,51040,34038,45037,12441,99739,23883,58398,55403,22010,53739,89881,73739,32421,20036,12035,49044,49048,24009,14038,55045,55036,7025,20027,42423,52046,12037,18022,51399,20023,35039,14401,55422,19003,55042,]], #  20.071 %
            ['1', [49051,14006,32059,20032,18007,63999,24007,59399,38402,55035,41399,22999,92988,30999,13001,7021,53399,21003,51042,51499,51039,24499,20037,12442,49033,22019,6999,53035,20002,22020,22022,18020,18023,22399,54037,30051,12003,52037,53405,35403,6042,20010,20007,24399,39031,49405,19422,20025,18404,7029,41037,55999,49049,55404,2408,35047,12017,62424,23441,42499,6421,42405,6054,21399,63398,42045,94989,58399,35402,37031,34399,49032,62499,41055,12473,58037,98998,12402,20404,7404,84983,2405,19431,2499,7402,20028,14021,54038,23431,52034,24999,58999,63034,]], #  19.9417 %
            ['2', [49999,52047,49043,30047,23399,12025,22401,20431,6442,42055,7471,38405,2482,32403,24011,22005,19401,18010,49034,55044,7039,51404,12425,42057,7024,51046,42043,7027,53404,7026,6051,34034,58042,30040,37039,35052,20445,12401,7472,58404,41049,37041,59033,2001,34999,34039,49040,23999,35051,41050,49056,6399,37032,55033,7461,24401,32999,12499,58032,12023,7442,54041,49499,35446,23402,19018,2407,49404,13015,23472,69054,20499,36038,59401,12027,63401,21401,7041,20999,20470,51034,99988,21022,3421,41471,24005,47035,59999,7499,19005,12422,54036,42042,37999,2404,]], #  20.9377 %
            ['3', [37402,58036,55402,19024,63402,19026,14402,49481,67035,7403,41999,12999,12444,48499,67032,51047,2406,20482,23423,39034,49046,13402,59031,35049,12498,34048,51049,22008,48999,18018,14003,45401,49035,12981,69035,6441,13013,42403,53402,63036,12024,49401,32399,54399,37404,35422,42401,45399,19421,54035,42999,34404,37422,2483,20473,7028,48401,35401,34402,34403,55401,42048,51043,35443,51045,38471,12988,58039,30042,49402,23481,41051,20423,20421,36402,49403,3431,30046,35472,39401,59405,35473,52472,51401,54044,37401,37471,59035,19480,59034,49052,12022,42044,19020,20481,49038,63039,49442,49472,30402,37403,63441,48399,21001,21999,32044,63040,30499,2403,23421,67031,48034,35481,7422,23499,58035,19022,54421,19023,48421,]], #  19.0517 %
            ['4', [2999,18999,49053,32499,20024,13014,49047,58422,32405,59032,59038,34035,55039,12481,38499,90981,34401,7999,39037,62423,41403,20461,54039,20398,13421,48038,12421,32052,49045,34049,32042,59403,37441,18421,19499,67034,6055,32048,24398,20001,7018,6014,19423,23422,30403,35421,51999,7482,7463,82461,42050,52499,37421,18024,29005,42047,37037,98983,55041,41053,58033,99998,62425,34047,32043,14999,58041,59499,42051,63422,34421,41441,69040,59402,23898,55398,7462,6016,82983,20026,49422,23461,41054,41052,48045,58403,82981,59040,31037,2422,49482,12870,6443,32049,51050,53041,54402,58044,23008,19014,42421,32045,22024,90982,39999,92983,12398,51041,58047,32422,12461,48047,34036,38882,62421,12423,59404,55040,12462,30052,34042,32051,48403,7443,20034,42461,20870,34499,32402,18398,51048,42039,59037,20021,58421,32401,20422,42054,84988,20880,3402,45421,19999,23880,53032,7019,51402,37038,31401,30443,55038,13016,13499,41421,30036,42404,98809,98988,63421,42398,37405,18004,58040,23498,20850,55043,98999,58402,24010,53481,98898,45042,84981,30421,49421,98884,45031,86881,38472,84999,86882,38999,58045,20881,35055,48402,58401,7880,12470,2498,82884,99989,34043,12898,94982,34044,82989,23882,38884,85881,45499,42058,12881,51881,84884,98850,82881,82870,85884,87884,84881,52882,39039,23881,35461,12880,20898,20040,98806,98808,37398,47399,98804,82999,51884,32040,94983,87881,12850,23870,62404,90989,86898,20882,19481,89890,20884,82898,59036,86884,84898,7881,99898,99399,97997,12884,99499,99999,99884,90988,98881,51898,85898,38898,23989,9017,98498,90983,29398,23981,23884,41398,48031,51882,54398,19398,7398,99881,73732,30398,87890,98890,98981,7005,13398,42036,82988,89882,42034,55034,34046,62422,69031,99890,30442,82982,23470,39398,82890,41498,32041,3482,47034,98882,92982,42402,25499,41043,22025,32050,14015,82462,84989,82883,84481,84982,94461,25999,45041,9999,51998,82498,42046,82850,7421,98908,7042,9002,18005,7884,51044,22017,21398,25441,2421,23890,20472,52498,93988,23466,38498,20890,62403,35898,23850,13481,42038,20466,99850,85999,86999,98805,87999,33033,48043,29399,38473,9034,14039,3441,35441,36039,21499,87898,3999,7001,46039,12036,84890,12007,98598,7007,99870,41046,12033,20442,7013,25890,7870,35042,94981,34398,86890,99981,84883,38890,64033,85890,9399,42422,47398,98902,9001,38403,84850,20035,7015,35044,41034,53403,20981,7850,29001,54401,25498,54033,35038,82499,10037,51053,32055,20008,84498,1399,48398,48037,51890,82882,14031,49042,87882,43032,59039,36999,54034,9008,49498,45398,18001,62402,32404,51398,35498,42462,52898,18498,85883,98731,20883,47031,47037,98870,7002,42041,53701,45999,51989,82998,32046,38404,64032,54040,30048,32398,47999,67398,99599,42850,42056,10034,51403,25884,1001,34405,38881,62426,98904,99982,86883,89898,53733,20407,42037,7033,53398,32056,51988,93981,30033,30043,85882,34422,32058,1008,38988,51052,38441,12890,53031,98883,92981,30053,31036,21018,31999,49054,87883,13423,39499,31422,93983,89884,10045,32406,59042,39402,38474,30034,45045,32060,49057,55406,14398,42898,19009,84421,62427,48498,72707,18021,]], #  19.9996 %
            ['Unknowns', []]
        ]),

        ('MAKE', [
            ['0', [74,76,71,72,50,73,77,43,53,98,64,65,21,9,52,22,14,18,92,24,37,39,63,]], #  28.1315 %
            ['1', [35,6,36,55,67,]], #  6.162 %
            ['2', [20,]], #  12.6429 %
            ['3', [13,69,34,]], #  2.0354 %
            ['4', [49,]], #  11.7578 %
            ['5', [30,]], #  1.5107 %
            ['6', [12,]], #  13.4181 %
            ['7', [19,2,]], #  5.6145 %
            ['8', [41,58,7,42,54,47,93,23,59,25,48,38,62,3,32,29,51,31,45,90,10,94,86,89,84,85,82,87,97,33,46,1,]], #  18.727 %
            ['Unknowns', [99, ]]
        ]),

        ('MAX_VSEV', [ # Modified by hand
            ['0', [3,5,6,4,2,]], #  17.1638 %
            ['1', [1,]], #  17.3546 %
            ['2', [0,]], #  65.4815 %
            ['Unknowns', [9, ]]
        ]),

        ('MDLYR_IM', [
            ['0', [1929,1947,1962,1968,1951,1956,1974,1982,1978,1955,1953,1960,1959,1950,1970,1986,1975,1985,1981,1966,1965,1987,1983,1973,1931,1977,1979,1984,1964,1976,1991,1980,1971,1988,1993,1992,1990,1994,1998,1997,1995,1989,1996,2002,1999,2001,2000,2005,]], #  22.8558 %
            ['1', [2003,2020,2004,2019,2006,]], #  17.6131 %
            ['2', [2007,1969,1940,2016,2017,2018,]], #  21.4618 %
            ['3', [2009,2008,2015,]], #  18.3831 %
            ['4', [2013,2014,2021,2012,1967,2011,2010,1957,1972,1948,1952,1928,1932,1933,1963,1954,1958,1934,1961,]], #  19.6865 %
            ['Unknowns', []]
        ]),

        ('MOD_YEAR', [
            ['0', [1929,1947,1962,1968,1951,1956,1974,1982,1978,1955,1953,1960,1959,1950,1970,1986,1975,1985,1981,1966,1965,1987,1983,1973,1931,1977,1979,1984,1964,1976,1991,1980,1971,1988,1993,1992,1990,1994,1998,1995,1997,1996,1989,2002,2001,1999,2000,2005,]], #  22.8303 %
            ['1', [2003,2004,2020,2006,2019,]], #  17.5677 %
            ['2', [2007,2009,1969,1940,2016,2008,]], #  22.0677 %
            ['3', [2017,2018,2015,]], #  18.0075 %
            ['4', [2013,2014,2021,2012,2011,2010,1967,1957,1972,1948,1952,1928,1932,1933,1963,1954,1958,1934,1961,]], #  19.5268 %
            ['Unknowns', [9998, 9999, ]]
        ]),

        ('MODEL', [
            ['0', [709,703,701,706,704,705,702,707,799,733,734,907,739,12,56,11,16,4,19,471,424,9,22,29,50,6,3,20,2,21,59,37,18,7,43,13,]], #  20.8933 %
            ['1', [399,52,17,444,25,36,1,38,998,406,408,15,39,47,31,35,32,405,48,]], #  19.3475 %
            ['2', [431,33,27,989,23,445,34,40,404,44,425,446,988,26,51,401,407,28,]], #  22.7449 %
            ['3', [402,42,49,46,443,442,24,499,473,57,54,483,403,441,472,423,41,5,55,480,398,498,]], #  21.1888 %
            ['4', [481,421,45,422,53,470,14,482,463,461,10,8,999,983,982,870,462,981,883,809,882,880,58,881,806,808,804,884,898,850,997,890,732,908,466,805,598,902,731,599,426,904,474,60,427,]], #  15.8254 %
            ['Unknowns', []]
        ]),

        ('MXVSEV_IM', [ # Modified by hand
            ['0', [3,5,4,2,6,]], #  17.1044 %
            ['1', [1,]], #  17.4622 %
            ['2', [0,]], #  65.4333 %
            ['Unknowns', []]
        ]),

        ('NUM_INJV', [ # Modified by hand
            ['0', [26,8,11,7,5,6,4,9,3,2,]], #  10.333 %
            ['1', [1,]], #  24.1729 %
            ['2', [10,14,12,]], #  0.0098 %
            ['3', [0,]], #  65.4842 %
            ['Unknowns', [99, ]]
        ]),

        ('NUMINJ_IM', [ # Modified by hand
            ['0', [26,8,11,7,6,5,4,9,3,2,]], #  10.2074 %
            ['1', [1,]], #  24.3463 %
            ['2', [10,14,12,]], #  0.0095 %
            ['3', [0,]], #  65.4369 %
            ['Unknowns', []]
        ]),

        ('NUMOCCS', [
            ['0', [59,26,35,31,37,10,14,33,13,8,20,27,11,]], #  0.2623 %
            ['1', [2,]], #  24.6466 %
            ['2', [6,7,21,]], #  1.1684 %
            ['3', [1,]], #  54.7647 %
            ['4', [12,9,]], #  0.0809 %
            ['5', [3,]], #  10.4213 %
            ['6', [38,5,17,4,19,16,34,25,28,24,43,49,23,15,29,22,18,40,32,55,53,50,44,51,30,39,41,75,47,95,52,54,62,60,56,58,46,65,57,48,36,45,77,]], #  8.6562 %
            ['Unknowns', [99, ]]
        ]),

        ('P_CRASH1', [ # Modified by hand
            ['0', [14,0,7,3,17,6,]], #  6.8626 %
            ['1', [1,]], #  50.4172 %
            ['2', [11,]], #  10.3122 %
            ['3', [12,98,2,16,]], #  6.6395 %
            ['4', [5,]], #  15.4783 %
            ['5', [15,4,10,8,9,13,]], #  10.2902 %
            ['Unknowns', [99, ]]
        ]),

        ('P_CRASH2', [
            ['0', [8,9,12,13,14,54,1,4,6,5,62,2,66,3,55,]], #  25.2772 %
            ['1', [17,67,72,63,68,91,64,78,15,]], #  14.9584 %
            ['2', [10,71,98,19,18,92,21,65,90,51,70,59,]], #  7.7214 %
            ['3', [53,]], #  19.6503 %
            ['4', [73,60,74,87,61,89,52,]], #  15.3052 %
            ['5', [11,88,16,50,56,20,80,82,81,84,85,83,]], #  17.0873 %
            ['Unknowns', [99, ]]
        ]),

        ('P_CRASH3', [ # Modified by hand
            ['0', [15,98,6,7,0,11,10,8,9,12,5,]], #  15.3542 %
            ['1', [16,]], #  13.4695 %
            ['2', [1,]], #  71.1762 %
            ['Unknowns', [99, ]]
        ]),

        ('PCRASH1_IM', [
            ['0', [14,0,7,3,17,6,]], #  6.8303 %
            ['1', [1,]], #  50.6172 %
            ['2', [11,]], #  10.2666 %
            ['3', [12,98,2,16,]], #  6.6712 %
            ['4', [5,]], #  15.2315 %
            ['5', [15,4,10,8,9,13,]], #  10.3832 %
            ['Unknowns', []]
        ]),

        ('PCRASH4', [ # Modified by hand
            ['0', [7,3,4,5,2,0,]], #  3.8805 %
            ['1', [1,]], #  96.1194 %
            ['Unknowns', [9, ]]
        ]),

        ('PCRASH5', [ # Modified by hand
            ['0', [6,]], #  0.1972 %
            ['1', [4,]], #  10.8203 %
            ['2', [0,3,5,2]], #  10.7314 %
            ['3', [1,]], #  76.6633 %
            ['4', [7,]], #  1.5877 %
            ['Unknowns', [9, ]]
        ]),

        ('ROLINLOC', [ # Modified by hand
            ['0', [7,3,6,1,5,4,2,]], #  2.9048 %
            ['1', [0,]], #  97.0952 %
            ['Unknowns', [9, ]]
        ]),

        ('ROLLOVER', [ # Modified by hand
            ['0', [1,9,2,]], #  2.951 %
            ['1', [0,]], #  97.0489 %
            ['Unknowns', []]
        ]),

        ('SPEC_USE', [
            ['0', [19,4,10,1,5,]], #  0.5028 %
            ['1', [0,]], #  98.7952 %
            ['2', [3,20,8,6,21,13,2,22,7,23,12,11,]], #  0.7021 %
            ['Unknowns', [98, 99, ]]
        ]),

        ('SPEEDREL', [ # Modified by hand
            ['0', [3,2,5,8,4,]], #  6.1165 %
            ['1', [0,]], #  93.8834 %
            ['Unknowns', [9, ]]
        ]),

        ('TOW_VEH', [
            ['0', [3,]], #  0.0021 %
            ['1', [0,]], #  97.4427 %
            ['2', [6,5,1,2,4,]], #  2.5551 %
            ['Unknowns', [9, ]]
        ]),

        ('TOWED', [ # Modified by hand
            ['0', [2,]], #  33.0112 %
            ['1', [7,]], #  8.3598 %
            ['2', [5,]], #  58.629 %
            ['Unknowns', [8, 9, ]]
        ]),

        ('TRAV_SP', [
            ['0', [103,86,121,122,111,130,110,120,97,112,82,108,76,91,100,83,90,102,84,118,105,115,85,74,80,95,92,93,96,71,89,52,87,145,114,75,42,44,58,81,29,59,63,48,77,60,51,70,50,66,43,73,55,997,78,68,57,53,65,45,46,67,62,33,54,88,24,37,79,38,72,69,]], #  23.8671 %
            ['1', [40,39,27,64,56,47,35,11,32,61,30,19,22,31,12,]], #  16.8789 %
            ['2', [25,23,28,26,49,16,34,98,13,18,20,17,15,9,]], #  11.0565 %
            ['3', [0,]], #  32.6952 %
            ['4', [10,8,99,41,7,1,5,14,2,3,6,36,4,21,101,150,139,104,125,]], #  15.5016 %
            ['Unknowns', [998, 999, ]]
        ]),

        ('V_ALCH_IM', [ # Modified by hand
            ['0', [1,]], #  3.044 %
            ['1', [2,]], #  96.956 %
            ['Unknowns', []]
        ]),

        ('VALIGN', [
            ['0', [3,2,4,]], #  8.3849 %
            ['1', [1,]], #  88.9094 %
            ['2', [0,]], #  2.7058 %
            ['Unknowns', [8, 9, ]]
        ]),

        ('VEH_ALCH', [ # Modified by hand
            ['0', [1,8,]], #  3.0238 %
            ['1', [2,]], #  96.9762 %
            ['Unknowns', [9, ]]
        ]),

        ('VEVENT_IM', [
            ['0', [74,10,1,5,21,42,32,19,35,46,39,30,4,93,20,3,45,23,58,52,2,34,6,25,26,24,33,31,44,38,17,41,43,7,91,40,48,57,59,53,]], #  10.0888 %
            ['1', [12,]], #  81.0178 %
            ['2', [49,16,14,55,18,50,72,11,73,54,51,8,15,9,]], #  8.8932 %
            ['Unknowns', []]
        ]),

        ('VNUM_LAN', [
            ['0', [2,]], #  44.9531 %
            ['1', [4,]], #  14.8614 %
            ['2', [3,]], #  18.2764 %
            ['3', [5,]], #  10.9793 %
            ['4', [7,1,6,0,]], #  10.9298 %
            ['Unknowns', [8, 9, ]]
        ]),

        ('VPROFILE', [
            ['0', [6,5,4,3,]], #  5.9771 %
            ['1', [1,]], #  83.0272 %
            ['2', [2,0,]], #  10.9956 %
            ['Unknowns', [8, 9, ]]
        ]),

        ('VSPD_LIM', [ # Modified by hand
            ['0', [90,55,]], #  10.9312 %
            ['1', [80,75,70,50,65,]], #  14.7318 %
            ['2', [45,]], #  21.5203 %
            ['3', [60,]], #  1.9729 %
            ['4', [40,]], #  11.2788 %
            ['5', [35,]], #  19.1915 %
            # NOTE(review): bin '6' has no codes although its generated share
            # was 9.1917 %; presumably its code was moved into bin '7' by hand,
            # so the percentages on '6' and '7' are stale -- confirm intent.
            ['6', []], #  9.1917 %
            ['7', [25,30,20,15,0,10,5,]], #  11.1816 %
            ['Unknowns', [98, 99, ]]
        ]),

        ('VSURCOND', [ # Modified by hand
            ['0', [5,11,7,8,6,]], #  0.2729 %
            ['1', [1,]], #  81.6571 %
            ['2', [2,]], #  13.4651 %
            ['3', [10,4,3,0,]], #  4.6048 %
            ['Unknowns', [98, 99, ]]
        ]),

        ('VTCONT_F', [
            ['0', [1,]], #  0.1033 %
            ['1', [0,]], #  62.2729 %
            ['2', [4,]], #  0.006 %
            ['3', [3,]], #  37.5464 %
            ['4', [2,]], #  0.0713 %
            ['Unknowns', [8, 9, ]]
        ]),

        ('VTRAFCON', [ # Modified by hand
            ['0', [29,40,28,9,4,98,65,]], #  2.0442 %
            ['1', [0,]], #  62.2529 %
            ['2', [3,]], #  24.9677 %
            ['3', [1,20,7,8,23,50,2,21,]], #  10.7352 %
            ['Unknowns', [97, 99, ]]
        ]),

        ('VTRAFWAY', [
            ['0', [1,]], #  45.1904 %
            ['1', [2,]], #  17.174 %
            ['2', [5,]], #  5.757 %
            ['3', [3,]], #  23.3378 %
            ['4', [6,4,0,]], #  8.541 %
            ['Unknowns', [8, 9, ]]
        ]),
    ]

    # One helper call per feature, in table order.
    for feature, A in binning:
        data = Build_Individual_Feature_with_Dict(df_Veh, data, feature, A)

    return data


In [15]:
def Main():
    """Run the notebook pipeline on the CRSS tables.

    Loads the accident, vehicle and person tables, drops the repeated and
    the irrelevant features from all three, then rebuilds the accident and
    the vehicle tables, calling Analyze_Binning on each one before and
    after it is rebuilt.  The person table is loaded and pruned but not
    rebuilt here.  Nothing is returned.
    """
    def analyze_build_analyze(frame, builder):
        # Report on the frame as it stands, rebuild it, report on the result.
        Analyze_Binning(frame)
        rebuilt = builder(frame)
        Analyze_Binning(rebuilt)

    accidents, vehicles, persons = Get_Data()
    accidents, vehicles, persons = Drop_Repeated_Features(
        accidents, vehicles, persons
    )
    accidents, vehicles, persons = Drop_Irrelevant_Features(
        accidents, vehicles, persons
    )

    analyze_build_analyze(accidents, Build_Accident_Dataset)
    analyze_build_analyze(vehicles, Build_Vehicle_Dataset)
    
    
# Notebook entry point: run the load / prune / bin pipeline defined in Main().
Main()

Get_Data
df_Acc.shape =  (259077, 51)
df_Veh.shape =  (457314, 97)
df_Per.shape =  (644274, 67)

Drop_Repeated_Features()
df_Acc.shape =  (259077, 51)
df_Vet.shape =  (457314, 83)
df_Per.shape =  (644274, 38)

Drop_Irrelevant_Features
df_Acc.shape =  (259077, 40)
df_Veh.shape =  (457314, 55)
df_Per.shape =  (644274, 24)

Analyze_Binning
ALCHL_IM 2
ALCOHOL 4
CASENUM 259077
DAY_WEEK 7
EVENT1_IM 54
HARM_EV 56
HOUR 25
HOUR_IM 24
INT_HWY 3
LGTCON_IM 7
LGT_COND 9
MANCOL_IM 9
MAN_COLL 11
MAXSEV_IM 8
MAX_SEV 9
MONTH 12
NO_INJ_IM 18
NUM_INJ 20
PEDS 10
PERMVIT 26
PERNOTMVIT 10
PJ 422
PSU 60
PVH_INVL 11
REGION 4
RELJCT1 4
RELJCT1_IM 3
RELJCT2 15
RELJCT2_IM 13
REL_ROAD 13
SCH_BUS 2
TYP_INT 11
URBANICITY 2
VE_FORMS 13
VE_TOTAL 13
WEATHER 13
WEATHR_IM 11
WKDY_IM 7
WRK_ZONE 5
YEAR 5

Build_Accident_Dataset()
Need to Do:

Analyze_Binning
ALCHL_IM 2
ALCOHOL 4
CASENUM 259077
DAY_WEEK 2
EVENT1_IM 4
HARM_EV 5
HOUR 8
HOUR_IM 7
INT_HWY 2
LGTCON_IM 4
LGT_COND 5
MANCOL_IM 5
MAN_COLL 6
MAXSEV_IM 8
MAX_SEV 9
MO