- CRSS where each sample corresponds to one person, not to one accident.  
- This one corresponds better to evaluating per phone.

In [1]:
%%latex
\tableofcontents

<IPython.core.display.Latex object>

# Setup
## Import Libraries

In [2]:
import sys, copy, math, time

print ('Python version: {}'.format(sys.version))

from IPython.display import display, HTML

from collections import Counter

import numpy as np
print ('NumPy version: {}'.format(np.__version__))
np.set_printoptions(suppress=True)

import pandas as pd
print ('Pandas version:  {}'.format(pd.__version__))
pd.set_option('display.max_rows', 500)



Python version: 3.9.7 (default, Oct 22 2021, 13:24:00) 
[Clang 13.0.0 (clang-1300.0.29.3)]
NumPy version: 1.24.0
Pandas version:  1.5.2


## Import Data

### person.csv from CRSS

In [3]:
def Import_Data_Person():
    print ('Import_Data_Person()')

    df = pd.DataFrame([])
    for year in ['2016','2017','2018']:
        filename = '../../Big_Files/CRSS_2020_Update/CRSS' + year + 'CSV/PERSON.CSV'
#        filename = '../../CRSS/CRSS' + year + 'CSV/PERSON.CSV'
        temp = pd.read_csv(filename, index_col=None)
        print (year, len(temp))
        df = df.append(temp)

    for year in ['2019','2020']:
        filename = '../../Big_Files/CRSS_2020_Update/CRSS' + year + 'CSV/person.csv'
#        filename = '../../CRSS/CRSS' + year + 'CSV/person.csv'
        temp = pd.read_csv(filename, index_col=None, encoding='latin1')
        print (year, len(temp))
        df = df.append(temp)

#    for feature in df:
#        print (feature)
    

#    for feature in df:
#        if 'NAME' in feature:
#            df.drop(columns=[feature], inplace=True)

    print (df.shape)
    print ()
    return df

### All Data

In [4]:
def Import_Data():
    print ('Import_Data()')
    df_Person = Import_Data_Person()
    
    print ()
    return df_Person

# Build Database

## Build_Person_Dataset

In [6]:
def Build_Person_Dataset(df_Person):
    print ('Build_Person_Dataset')
    data = pd.DataFrame()
    
    F = [
        'CASENUM',
        'VEH_NO',
        'AIR_BAG',
        'ALC_RES',
        'ALC_STATUS',
        'DRINKING',
        'DRUGS',
        'EJECT_IM',
        'EJECTION',
        'INJ_SEV',
        'INJSEV_IM',
        'LOCATION',
#        'PER_NO',
        'PER_TYP',
        'PERALCH_IM',
        'REST_MIS',
        'REST_USE',
        'SEAT_IM',
        'SEAT_POS',
        'SEX_IM',
        'SEX',
    ]

    for f in F:
        data[f] = df_Person[f]
        
    data['HOSPITAL'] = df_Person['HOSPITAL'].apply(lambda x:1 if x in [1,2,3,4,5] else 0)

    data = data[data.VEH_NO != 0]

    data = data.reindex(sorted(data.columns), axis=1)    
    
    print ()
    return data

## Feature Names

In [7]:
def Feature_Names(data, Named_Features):
    print ('Feature_Names')
    D = {}
    for f in Named_Features:
        g = f + 'NAME'
        A = pd.concat([data[f],data[g]], axis=1)
        A.drop_duplicates(inplace=True)
        A.dropna(inplace=True)
#        print (f)
#        print (len(A))
#        print (A.head())
#        print ()
        B = dict(zip(A[f],A[g]))
        D[f] = B
#        print (B)
#        print ()
#    print (D)
    print ()
    return D
        

In [13]:
def Remove_Unknowns_in_Feature(data, feature):
    
    Unknowns = {
        'AIR_BAG': [98,99,],
        'ALC_RES': [999,],
        'ALC_STATUS': [8,9,],
        'CASENUM': [],
        'DRINKING': [8,9,],
        'DRUGS': [8,9,],
        'EJECT_IM': [],
        'EJECTION': [7,9,],
        'HELM_USE': [98,99,],
        'HOSPITAL': [],
        'INJ_SEV': [9,],
        'INJSEV_IM': [],
        'LOCATION': [98,99,],
        'PER_NO': [],
        'PER_TYP': [],
        'PERALCH_IM': [],
        'REST_MIS': [],
        'REST_USE': [98,99,],
        'SEAT_IM': [],
        'SEAT_POS': [98,99,],
        'SEX': [8,9,],
        'SEX_IM': [],
        'VEH_NO': [],
    }
    
#    print ('Remove_Unknowns_in_Feature ', feature, Unknowns[feature], len(data))
    data_temp = data[~data[feature].isin(Unknowns[feature])]
#    print ('Remove_Unknowns_in_Feature ', feature, Unknowns[feature], len(data_temp))
#    print ()
    
    return data_temp, Unknowns[feature]

## Correlation

In [17]:
def Correlation(data, target, feature, value, name):
    TN = 0
    FP = 0
    FN = 0
    TP = 0
    contingency_matrix = pd.crosstab(data[target], data[feature])
    cm = contingency_matrix.values.tolist()
    if len(cm)==2 and len(cm[0])==2:
        corr = cm[1][1] / (cm[0][1] + cm[1][1])
        per = (cm[0][1] + cm[1][1])/(cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1])
    else:
        corr = 0
        per = 0
    per = round(per*100,4)
    corr = round(corr*100,4)
#    print ("    - ", feature)
#    print ("    - ", value)
#    print ("    - ", name)
#    print (contingency_matrix)
#    print ('        - per = ', per)
#    print ("        - corr = ", corr)
#    print ()
    return (per, corr)

def Correlation_by_Value(data, target, feature, Feature_Names_Dict, Unknowns):
# I decided against the np.unique because it treats each nan as a separate entry.
#    V = np.unique(data[feature].values) 
    V = data[feature].unique()
#    print (V)
    B = []

    for value in V:
        A = pd.DataFrame()
        A[feature] = data[feature].apply(lambda x: 1 if x==value else 0)
        A[target] = data[target]
        if feature in Feature_Names_Dict:
            if value in Feature_Names_Dict[feature]:
                name = Feature_Names_Dict[feature][value]
            else:
                name=str(value)
        else:
            name = str(value)
#        if len(name)>30:
#            name = name[:30]
        per, corr = Correlation(A, target, feature, value, name)
        B.append([feature, value, name, per, corr])
#    print (feature)
    B = sorted(B, key=lambda x:x[4], reverse=True)
    for b in B:
        c = b[1]
        try:
            c = int(c)
        except:
            c=c
        else:
            c = int(c)
#        print (c, end=',')
#    print ()
#    print ()

    # Print grouped into 100/p blocks of same size
    print ("    feature = '%s'" % feature)
    print ('    A = [')
    p = 20
    s = 0.0
    s2 = 0.0
    n=0
    print ("        ['%d', [" % n , end='')
    for b in B:
        t = s + b[3]
        if b[3]<10:
            s2 = s2 + b[3]
        q = int(s/p)
        r = int((t-0.001)/p)
        if r>q or b[3]>10:
            print ("]], # ", round(s2,4), '%')
            s2 = 0.0
            n += 1
            print ("        ['%d', [" % n , end='')
        s = t
        
        c = b[1]
        try:
            c = int(c)
        except:
            c=c
        else:
            c = int(c)
        print (c, end=',')
        if b[3]>10:
            print ("]], # ", round(b[3],4), '%')
            s2=0.0
            n += 1
            print ("        ['%d', [" % n , end='')
    print ("]], # ", round(s2,4), '%')
    print ("        ['Unknowns', [", end='')
    for u in Unknowns:
        print (u, end=', ')
    print ("]]" )
    print ('    ]')
    print ('    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)')
    print ()
    
    C = pd.DataFrame(B)
    C.columns = ['Feature', 'Code', 'Name', 'Per', 'Corr']
#    C.drop(C[C['Per'] < 0.1].index, inplace=True)
#    print (C)
    display(C)

    TeX = open('../Correlation/Correlation_' + feature + '.tex', 'w')
    E = [c for c in B if c[3]>=0.0]
    
        
    
    for c in E:
        a = c[0]
        b = c[1]
        d = c[2]
        e = "{:.4f}".format(c[3])
        f = "{:.4f}".format(c[4])
        TeX.write('\t & \\verb|%s| & %s & %s & %s & %s \\cr\n' % (a,b,d,e,f))
    

    TeX = open('../Correlation/Correlation_Ordered_' + feature + '.tex', 'w')
    E = sorted(B, key=lambda x:x[1], reverse=False)

    
    for c in E:
        a = c[0]
        b = c[1]
        d = c[2]
        e = "{:.4f}".format(c[3])
        f = "{:.4f}".format(c[4])
        TeX.write('\t & \\verb|%s| & %s & %s & %s & %s \\cr\n' % (a,b,d,e,f))
    

    print ()
    return B

def Correlation_All(data, target, Feature_Names_Dict):
    print ('Correlation_All')
    
    C = []
    for feature in data:
        data_temp, Unknowns = Remove_Unknowns_in_Feature(data, feature)
        U = data_temp[feature].unique()
#        print (feature, len(U))
        if len(U)<10000:
            B = Correlation_by_Value(
                data_temp, target, feature, Feature_Names_Dict, Unknowns
            )
            for b in B:
                C.append(b)
#            print ()
#        print ()
#    for c in C:
#        print (c)
#    print ()
    C = sorted(C, key=lambda x:x[4], reverse=True)
    D = pd.DataFrame(C)
    D.columns = ['Feature', 'Code', 'Name', 'Per', 'Corr']
    print (D)
    print ()
    
    D.drop(D[D['Per'] < 0.5].index, inplace=True)
    print (D)
    print ()
    
    TeX = open('../Correlation/Correlation.tex', 'w')
    E = [c for c in C if c[3]>=0.5]
    
    for c in E:
        a = c[0]
        b = c[1]
        d = c[2]
        e = "{:.4f}".format(c[3])
        f = "{:.4f}".format(c[4])
        TeX.write('\\verb|%s| & %s & %s & %s & %s \\cr\n' % (a,b,d,e,f))
    
    return 0

    

In [62]:
def PreDiscretize(df):
    print ('PreDiscretize')
    feature = 'ALC_RES'
    interval = 50
    A = [*range(-1,941,interval)]
    A = A + [994,995,996,997,998,999]
    L = []
    for x in range (-1,940,interval):
        s = str(x+1).zfill(3) + '-' + str(x+interval).zfill(3)
        L.append(s)
    L = L + ['995', '996', '997', '998', '999']
    print (len(A), len(L))
    print (A)
    print (L)
    New = pd.cut(
        df[feature], 
        A, 
        labels=L
    )
    print ()
    Crosstabs = pd.crosstab(df[feature], New)
    print ()
    display (Crosstabs)
    
    df[feature] = New
    
    print (df[feature].value_counts())
    
    return df
    

# Main()

In [64]:
def Main():
    target = 'HOSPITAL'
    df_Person = Import_Data()
    data = Build_Person_Dataset(df_Person)
    
    print ('Features in data, with Number of Unique Values and Number of Blank Values')
    for feature in data:
        U = data[feature].unique()
        s = data[feature].isna().sum()
        print (feature, len(U), s)
    print ()
    
    
    print ('Features in df_Person with Names')
    F = []
    for feature in df_Person:
        if 'NAME' in feature:
            f = feature[:-4]
            if f in data:
                F.append(f)
    F = sorted(F)
    for f in F:
        print ("        '%s'," % f)
    print ()
    
    Named_Features_df_Person = [
        'AIR_BAG',
        'ALC_RES',
        'ALC_STATUS',
        'DRINKING',
        'DRUGS',
        'EJECTION',
        'EJECT_IM',
        'HOSPITAL',
        'INJSEV_IM',
        'INJ_SEV',
        'LOCATION',
        'PERALCH_IM',
        'PER_TYP',
        'REST_MIS',
        'REST_USE',
        'SEAT_IM',
        'SEAT_POS',
        'SEX',
        'SEX_IM',
    ]

    Feature_Names_Dict = Feature_Names(df_Person, Named_Features_df_Person)

#    print (Feature_Names_Dict)

    PreDiscretize(data)

    Correlation_All(data, target, Feature_Names_Dict)


Main()

Import_Data()
Import_Data_Person()
2016 117759


  df = df.append(temp)


2017 138913
2018 120230


  df = df.append(temp)
  df = df.append(temp)


2019 135410


  df = df.append(temp)


2020 131962


  df = df.append(temp)


(644274, 117)


Build_Person_Dataset

Features in data, with Number of Unique Values and Number of Blank Values
AIR_BAG 11 0
ALC_RES 335 0
ALC_STATUS 5 0
CASENUM 258904 0
DRINKING 4 0
DRUGS 4 0
EJECTION 7 0
EJECT_IM 5 0
HOSPITAL 2 0
INJSEV_IM 7 0
INJ_SEV 8 0
LOCATION 1 0
PERALCH_IM 2 0
PER_NO 75 0
PER_TYP 4 0
REST_MIS 3 0
REST_USE 20 0
SEAT_IM 23 0
SEAT_POS 29 0
SEX 4 0
SEX_IM 2 0
VEH_NO 15 0

Features in df_Person with Names
        'AIR_BAG',
        'ALC_RES',
        'ALC_STATUS',
        'DRINKING',
        'DRUGS',
        'EJECTION',
        'EJECT_IM',
        'HOSPITAL',
        'INJSEV_IM',
        'INJ_SEV',
        'LOCATION',
        'PERALCH_IM',
        'PER_TYP',
        'REST_MIS',
        'REST_USE',
        'SEAT_IM',
        'SEAT_POS',
        'SEX',
        'SEX_IM',

Feature_Names

PreDiscretize
25 24
[-1, 49, 99, 149, 199, 249, 299, 349, 399, 449, 499, 549, 599, 649, 699, 749, 799, 849, 899, 994, 995, 996, 997, 998, 999]
['000-049', '050-099', '100-149', '150-19

ALC_RES,000-049,050-099,100-149,150-199,200-249,250-299,300-349,350-399,400-449,450-499,500-549,600-649,650-699,750-799,900-949,995,996,997,998,999
ALC_RES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,1709,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


996        504546
995        105852
997          5448
000-049      1862
999           939
150-199       728
100-149       618
200-249       493
050-099       305
250-299       191
998           143
300-349        55
350-399        18
900-949        15
400-449         4
600-649         1
750-799         1
650-699         1
500-549         1
450-499         1
800-849         0
850-899         0
550-599         0
700-749         0
Name: ALC_RES, dtype: int64
Correlation_All
    feature = 'AIR_BAG'
    A = [
        ['0', [8,1,3,9,7,2,0,]], #  19.879 %
        ['1', [20,]], #  80.1172 %
        ['2', [28,]], #  0.0038 %
        ['Unknowns', [98, 99, ]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)



Unnamed: 0,Feature,Code,Name,Per,Corr
0,AIR_BAG,8,Deployed- Combination,4.5944,45.1759
1,AIR_BAG,1,Deployed- Front,6.393,37.4622
2,AIR_BAG,3,Deployed- Curtain (roof),0.2088,37.1894
3,AIR_BAG,9,Deployment- Unknown Location,5.8875,36.5269
4,AIR_BAG,7,"Deployed- Other (Knee, air belt, etc.)",0.0288,31.0559
5,AIR_BAG,2,"Deployed- Side (door, seatback)",0.8347,30.6389
6,AIR_BAG,0,0,1.9318,25.0857
7,AIR_BAG,20,Not Deployed,80.1172,9.4815
8,AIR_BAG,28,28,0.0038,0.0



    feature = 'ALC_RES'
    A = [
        ['0', [650-699,450-499,500-549,900-949,350-399,997,300-349,998,250-299,050-099,400-449,100-149,200-249,000-049,150-199,999,]], #  1.7422 %
        ['1', [996,]], #  81.2183 %
        ['2', []], #  0.0 %
        ['3', [995,]], #  17.0393 %
        ['4', [750-799,600-649,]], #  0.0004 %
        ['Unknowns', [999, ]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)



Unnamed: 0,Feature,Code,Name,Per,Corr
0,ALC_RES,650-699,650-699,0.0002,100.0
1,ALC_RES,450-499,450-499,0.0002,100.0
2,ALC_RES,500-549,500-549,0.0002,100.0
3,ALC_RES,900-949,900-949,0.0024,60.0
4,ALC_RES,350-399,350-399,0.0029,55.5556
5,ALC_RES,997,997,0.877,50.826
6,ALC_RES,300-349,300-349,0.0089,36.3636
7,ALC_RES,998,998,0.023,34.965
8,ALC_RES,250-299,250-299,0.0307,28.2723
9,ALC_RES,050-099,050-099,0.0491,26.8852



    feature = 'ALC_STATUS'
    A = [
        ['0', [2,1,]], #  1.9717 %
        ['1', [0,]], #  98.0283 %
        ['2', []], #  0.0 %
        ['Unknowns', [8, 9, ]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)



Unnamed: 0,Feature,Code,Name,Per,Corr
0,ALC_STATUS,2,Test Given,1.9215,39.4031
1,ALC_STATUS,1,1,0.0502,29.0698
2,ALC_STATUS,0,Test Not Given,98.0283,14.0061



    feature = 'DRINKING'
    A = [
        ['0', [1,]], #  3.0974 %
        ['1', [0,]], #  96.9026 %
        ['2', []], #  0.0 %
        ['Unknowns', [8, 9, ]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)



Unnamed: 0,Feature,Code,Name,Per,Corr
0,DRINKING,1,Yes (Alcohol Involved),3.0974,38.6786
1,DRINKING,0,No (Alcohol Not Involved),96.9026,13.4891



    feature = 'DRUGS'
    A = [
        ['0', [1,]], #  1.1139 %
        ['1', [0,]], #  98.8861 %
        ['2', []], #  0.0 %
        ['Unknowns', [8, 9, ]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)



Unnamed: 0,Feature,Code,Name,Per,Corr
0,DRUGS,1,Yes (drugs involved),1.1139,48.1834
1,DRUGS,0,No (drugs not involved),98.8861,13.8658



    feature = 'EJECTION'
    A = [
        ['0', [1,3,8,2,]], #  3.2104 %
        ['1', [0,]], #  96.7896 %
        ['2', []], #  0.0 %
        ['Unknowns', [7, 9, ]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)



Unnamed: 0,Feature,Code,Name,Per,Corr
0,EJECTION,1,Totally Ejected,0.3277,68.5446
1,EJECTION,3,Ejected - Unknown Degree,0.0073,65.1163
2,EJECTION,8,Not Applicable,2.7826,61.6278
3,EJECTION,2,Partially Ejected,0.0928,53.9595
4,EJECTION,0,Not Ejected,96.7896,13.5697



    feature = 'EJECT_IM'
    A = [
        ['0', [1,3,8,2,]], #  3.055 %
        ['1', [0,]], #  96.9451 %
        ['2', []], #  0.0 %
        ['Unknowns', []]
    ]
    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)



Unnamed: 0,Feature,Code,Name,Per,Corr
0,EJECT_IM,1,Totally Ejected,0.3199,66.4318
1,EJECT_IM,3,Ejected - Unknown Degree,0.0069,65.1163
2,EJECT_IM,8,Not Applicable,2.6369,61.2478
3,EJECT_IM,2,Partially Ejected,0.0913,51.6755
4,EJECT_IM,0,Not Ejected,96.9451,12.9036



    feature = 'HOSPITAL'
    A = [
        ['0', []], #  0.0 %
        ['1', [0,]], #  14.3886 %
        ['2', []], #  0.0 %
        ['3', [1,]], #  14.3886 %
        ['4', []], #  0.0 %
        ['Unknowns', []]
    ]
    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)



Unnamed: 0,Feature,Code,Name,Per,Corr
0,HOSPITAL,0,Not Transported for Treatment,14.3886,100.0
1,HOSPITAL,1,EMS Air,14.3886,100.0



    feature = 'INJSEV_IM'
    A = [
        ['0', [3,5,2,6,4,]], #  13.7916 %
        ['1', [1,]], #  14.6972 %
        ['2', []], #  0.0 %
        ['3', [0,]], #  71.5113 %
        ['4', []], #  0.0 %
        ['Unknowns', []]
    ]
    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)



Unnamed: 0,Feature,Code,Name,Per,Corr
0,INJSEV_IM,3,Suspected Serious Injury (A),4.8002,89.2622
1,INJSEV_IM,5,"Injured, Severity Unknown",0.2782,65.4514
2,INJSEV_IM,2,Suspected Minor Injury (B),7.9611,57.2307
3,INJSEV_IM,6,Died Prior to Crash*,0.0031,52.6316
4,INJSEV_IM,4,Fatal Injury (K),0.749,41.6291
5,INJSEV_IM,1,Possible Injury (C),14.6972,34.3749
6,INJSEV_IM,0,No Apparent Injury (O),71.5113,0.0



    feature = 'INJ_SEV'
    A = [
        ['0', [3,5,2,6,4,]], #  13.8659 %
        ['1', [1,]], #  14.5494 %
        ['2', []], #  0.0 %
        ['3', [0,]], #  71.5847 %
        ['4', []], #  0.0 %
        ['Unknowns', [9, ]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)



Unnamed: 0,Feature,Code,Name,Per,Corr
0,INJ_SEV,3,Suspected Serious Injury (A),4.8266,91.6805
1,INJ_SEV,5,"Injured, Severity Unknown",0.2626,70.9841
2,INJ_SEV,2,Suspected Minor Injury (B),8.0164,58.6911
3,INJ_SEV,6,Died Prior to Crash*,0.0032,52.6316
4,INJ_SEV,4,Fatal Injury (K),0.7571,42.5991
5,INJ_SEV,1,Possible Injury (C),14.5494,35.864
6,INJ_SEV,0,No Apparent Injury (O),71.5847,0.0



    feature = 'LOCATION'
    A = [
        ['0', [0,]], #  0.0 %
        ['Unknowns', [98, 99, ]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)



Unnamed: 0,Feature,Code,Name,Per,Corr
0,LOCATION,0,Occupant of a Motor Vehicle,0,0



    feature = 'PERALCH_IM'
    A = [
        ['0', [1,]], #  2.4886 %
        ['1', [0,]], #  97.5114 %
        ['2', []], #  0.0 %
        ['Unknowns', []]
    ]
    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)



Unnamed: 0,Feature,Code,Name,Per,Corr
0,PERALCH_IM,1,Yes (Alcohol Involved),2.4886,35.8021
1,PERALCH_IM,0,No (Alcohol Not Involved),97.5114,13.8421



    feature = 'PER_NO'
    A = [
        ['0', [16,17,18,19,20,21,22,23,24,25,26,15,7,9,]], #  0.0969 %
        ['1', [2,]], #  17.4846 %
        ['2', [8,6,10,13,]], #  0.2535 %
        ['3', [1,]], #  73.7435 %
        ['4', [5,3,4,12,11,14,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,]], #  8.4233 %
        ['Unknowns', []]
    ]
    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)



Unnamed: 0,Feature,Code,Name,Per,Corr
0,PER_NO,16,16,0.0003,50.0
1,PER_NO,17,17,0.0003,50.0
2,PER_NO,18,18,0.0003,50.0
3,PER_NO,19,19,0.0003,50.0
4,PER_NO,20,20,0.0003,50.0
5,PER_NO,21,21,0.0003,50.0
6,PER_NO,22,22,0.0003,50.0
7,PER_NO,23,23,0.0003,50.0
8,PER_NO,24,24,0.0003,50.0
9,PER_NO,25,25,0.0003,50.0



    feature = 'PER_TYP'
    A = [
        ['0', [9,]], #  0.0203 %
        ['1', [2,]], #  26.1757 %
        ['2', []], #  0.0 %
        ['3', [1,]], #  73.4507 %
        ['4', [3,]], #  0.3533 %
        ['Unknowns', []]
    ]
    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)



Unnamed: 0,Feature,Code,Name,Per,Corr
0,PER_TYP,9,Unknown Occupant Type in a Motor Vehicle In- T...,0.0203,18.254
1,PER_TYP,2,Passenger of a Motor Vehicle In-Transport,26.1757,15.3417
2,PER_TYP,1,Driver of a Motor Vehicle In-Transport,73.4507,14.0737
3,PER_TYP,3,Occupant of a Motor Vehicle Not In- Transport,0.3533,9.0205



    feature = 'REST_MIS'
    A = [
        ['0', [7,1,]], #  7.2463 %
        ['1', [0,]], #  92.7536 %
        ['2', []], #  0.0 %
        ['Unknowns', []]
    ]
    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)



Unnamed: 0,Feature,Code,Name,Per,Corr
0,REST_MIS,7,None Used/Not Applicable,6.5321,25.8828
1,REST_MIS,1,"Yes, Indication of Mis-Use",0.7142,15.2581
2,REST_MIS,0,No Indication of Mis-Use,92.7536,13.5724



    feature = 'REST_USE'
    A = [
        ['0', [16,5,17,19,20,7,6,29,0,2,1,97,]], #  7.5268 %
        ['1', [3,]], #  85.8489 %
        ['2', [12,8,11,10,4,]], #  6.6243 %
        ['Unknowns', [98, 99, ]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)



Unnamed: 0,Feature,Code,Name,Per,Corr
0,REST_USE,16,16,0.0219,65.3226
1,REST_USE,5,5,0.3134,61.9989
2,REST_USE,17,17,0.4542,60.4597
3,REST_USE,19,19,0.6565,60.027
4,REST_USE,20,None Used/Not Applicable,3.9344,52.1723
5,REST_USE,7,7,0.5247,49.747
6,REST_USE,6,Racing-Style Harness Used,0.0012,42.8571
7,REST_USE,29,29,0.052,42.1769
8,REST_USE,0,0,0.0619,22.8571
9,REST_USE,2,Lap Belt Only Used,0.8653,16.0941



    feature = 'SEAT_IM'
    A = [
        ['0', [56,52,55,51,54,53,38,18,28,12,21,]], #  4.862 %
        ['1', [13,]], #  14.5985 %
        ['2', []], #  0.0 %
        ['3', [11,]], #  73.6819 %
        ['4', [23,22,33,32,42,31,50,43,41,48,]], #  6.8577 %
        ['Unknowns', []]
    ]
    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)



Unnamed: 0,Feature,Code,Name,Per,Corr
0,SEAT_IM,56,Appended to a Motor Vehicle for Motion,0.001,66.6667
1,SEAT_IM,52,Other Passenger in unenclosed passenger or car...,0.0116,43.0556
2,SEAT_IM,55,Riding on Exterior of Vehicle,0.0171,38.6792
3,SEAT_IM,51,Other Passenger in enclosed passenger or cargo...,0.2009,30.609
4,SEAT_IM,54,Trailing Unit,0.006,29.7297
5,SEAT_IM,53,"Other Passenger in passenger or cargo area, un...",0.009,25.0
6,SEAT_IM,38,"Third Seat, Other",0.004,24.0
7,SEAT_IM,18,"Front Seat, Other",0.0146,20.8791
8,SEAT_IM,28,"Second Seat, Other",0.0307,20.4188
9,SEAT_IM,12,"Front Seat, Middle",0.387,17.4709



    feature = 'SEAT_POS'
    A = [
        ['0', [56,52,55,51,54,53,38,18,28,12,29,49,]], #  0.7622 %
        ['1', [13,]], #  13.8574 %
        ['2', [21,19,39,]], #  4.1646 %
        ['3', [11,]], #  74.6203 %
        ['4', [23,22,33,42,32,50,31,43,41,48,]], #  6.5957 %
        ['Unknowns', [98, 99, ]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)



Unnamed: 0,Feature,Code,Name,Per,Corr
0,SEAT_POS,56,Appended to a Motor Vehicle for Motion,0.001,66.6667
1,SEAT_POS,52,Other Passenger in unenclosed passenger or car...,0.0117,43.0556
2,SEAT_POS,55,Riding on Exterior of Vehicle,0.017,39.4231
3,SEAT_POS,51,Other Passenger in enclosed passenger or cargo...,0.1976,30.8581
4,SEAT_POS,54,Trailing Unit,0.006,29.7297
5,SEAT_POS,53,"Other Passenger in passenger or cargo area, un...",0.009,25.4545
6,SEAT_POS,38,"Third Seat, Other",0.0039,25.0
7,SEAT_POS,18,"Front Seat, Other",0.0109,23.8806
8,SEAT_POS,28,"Second Seat, Other",0.0284,21.2644
9,SEAT_POS,12,"Front Seat, Middle",0.3684,18.4152



    feature = 'SEX'
    A = [
        ['0', []], #  0.0 %
        ['1', [2,]], #  45.5467 %
        ['2', []], #  0.0 %
        ['3', [1,]], #  54.4533 %
        ['4', []], #  0.0 %
        ['Unknowns', [8, 9, ]]
    ]
    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)



Unnamed: 0,Feature,Code,Name,Per,Corr
0,SEX,2,Female,45.5467,16.0196
1,SEX,1,Male,54.4533,14.0402



    feature = 'SEX_IM'
    A = [
        ['0', []], #  0.0 %
        ['1', [2,]], #  45.5047 %
        ['2', []], #  0.0 %
        ['3', [1,]], #  54.4953 %
        ['4', []], #  0.0 %
        ['Unknowns', []]
    ]
    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)



Unnamed: 0,Feature,Code,Name,Per,Corr
0,SEX_IM,2,Female,45.5047,15.4391
1,SEX_IM,1,Male,54.4953,13.5114



    feature = 'VEH_NO'
    A = [
        ['0', [13,7,8,]], #  0.0282 %
        ['1', [1,]], #  54.261 %
        ['2', [9,]], #  0.0043 %
        ['3', [2,]], #  40.1642 %
        ['4', [6,10,3,5,4,11,12,14,15,]], #  5.5421 %
        ['Unknowns', []]
    ]
    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)



Unnamed: 0,Feature,Code,Name,Per,Corr
0,VEH_NO,13,13,0.001,66.6667
1,VEH_NO,7,7,0.0179,18.018
2,VEH_NO,8,8,0.0093,15.5172
3,VEH_NO,1,1,54.261,15.0323
4,VEH_NO,9,9,0.0043,14.8148
5,VEH_NO,2,2,40.1642,14.0937
6,VEH_NO,6,6,0.0486,13.245
7,VEH_NO,10,10,0.0014,11.1111
8,VEH_NO,3,3,4.4652,10.3645
9,VEH_NO,5,5,0.1801,10.0089



        Feature     Code                                               Name  \
0       ALC_RES  650-699                                            650-699   
1       ALC_RES  450-499                                            450-499   
2       ALC_RES  500-549                                            500-549   
3      HOSPITAL        0                      Not Transported for Treatment   
4      HOSPITAL        1                                            EMS Air   
5       INJ_SEV        3                       Suspected Serious Injury (A)   
6     INJSEV_IM        3                       Suspected Serious Injury (A)   
7       INJ_SEV        5                          Injured, Severity Unknown   
8      EJECTION        1                                    Totally Ejected   
9       SEAT_IM       56             Appended to a Motor Vehicle for Motion   
10     SEAT_POS       56             Appended to a Motor Vehicle for Motion   
11       VEH_NO       13                           