- CRSS where each sample corresponds to one person, not to one accident.  
- This one corresponds better to evaluating per phone.

In [1]:
%%latex
\tableofcontents

<IPython.core.display.Latex object>

# Setup
## Import Libraries

In [2]:
import sys, copy, math, time

print ('Python version: {}'.format(sys.version))

from IPython.display import display, HTML

from collections import Counter

import numpy as np
print ('NumPy version: {}'.format(np.__version__))
np.set_printoptions(suppress=True)

import pandas as pd
print ('Pandas version:  {}'.format(pd.__version__))
pd.set_option('display.max_rows', 500)



Python version: 3.9.16 (main, Dec  7 2022, 10:02:13) 
[Clang 14.0.0 (clang-1400.0.29.202)]
NumPy version: 1.24.0
Pandas version:  1.5.2


## Import Data

### person.csv from CRSS

In [3]:
def Import_Data_Person(base_dir='../../Big_Files/CRSS_2020_Update'):
    """Load the CRSS person files for 2016-2020 into one DataFrame.

    Parameters
    ----------
    base_dir : str, optional
        Directory holding the per-year ``CRSS<year>CSV`` folders.  The
        default is the location this notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        The yearly files stacked in year order.  Each file's own row index
        is kept (no ``ignore_index``), so index labels repeat across years.
    """
    print ('Import_Data_Person()')

    # (year, file name, encoding).  The 2016-2018 files are upper-case and are
    # read with the default encoding; the 2019-2020 files are lower-case and
    # are read as latin1.
    sources = [
        ('2016', 'PERSON.CSV', None),
        ('2017', 'PERSON.CSV', None),
        ('2018', 'PERSON.CSV', None),
        ('2019', 'person.csv', 'latin1'),
        ('2020', 'person.csv', 'latin1'),
    ]

    frames = []
    for year, basename, encoding in sources:
        filename = base_dir + '/CRSS' + year + 'CSV/' + basename
        temp = pd.read_csv(filename, index_col=None, encoding=encoding)
        print (year, len(temp))
        frames.append(temp)

    # DataFrame.append is deprecated (and removed in pandas 2.0); a single
    # concat also avoids re-copying the accumulated frame on every iteration.
    df = pd.concat(frames)

    print (df.shape)
    print ()
    return df

### All Data

In [4]:
def Import_Data():
    """Load every input table used by the notebook.

    Only the person table is loaded here, so this simply announces itself,
    delegates to ``Import_Data_Person`` and returns that DataFrame.
    """
    print ('Import_Data()')
    person_records = Import_Data_Person()

    print ()
    return person_records

# Build Database

## Build_Person_Dataset

In [5]:
def Build_Person_Dataset(df_Person):
    """Assemble the working dataset from the raw person table.

    Copies a fixed set of columns from ``df_Person``, adds a 0/1 ``HOSPITAL``
    column (1 when the raw ``HOSPITAL`` code is one of 1-5, else 0), drops the
    rows whose ``VEH_NO`` is 0, and returns the frame with its columns in
    alphabetical order.
    """
    print ('Build_Person_Dataset')

    # 'PER_NO' is intentionally not part of the selection.
    selected_columns = [
        'CASENUM', 'VEH_NO',
        'AGE', 'AGE_IM',
        'AIR_BAG',
        'ALC_RES', 'ALC_STATUS',
        'DRINKING', 'DRUGS',
        'EJECT_IM', 'EJECTION',
        'INJ_SEV', 'INJSEV_IM',
        'LOCATION',
        'PER_TYP', 'PERALCH_IM',
        'REST_MIS', 'REST_USE',
        'SEAT_IM', 'SEAT_POS',
        'SEX_IM', 'SEX',
    ]
    data = df_Person[selected_columns].copy()

    flagged_codes = (1, 2, 3, 4, 5)

    def _flag(code):
        # Collapse the raw HOSPITAL code to a binary indicator.
        if code in flagged_codes:
            return 1
        return 0

    data['HOSPITAL'] = df_Person['HOSPITAL'].apply(_flag)

    # Keep only rows attached to a vehicle number other than 0.
    has_vehicle = data['VEH_NO'] != 0
    data = data.loc[has_vehicle]

    alphabetical = sorted(data.columns)
    data = data.reindex(columns=alphabetical)

    print ()
    return data

## Feature Names

In [6]:
def Feature_Names(data, Named_Features):
    """Build a code -> label lookup for every feature in ``Named_Features``.

    For a feature ``f`` the labels are read from the companion column
    ``f + 'NAME'`` of ``data``.  Duplicate (code, label) pairs and pairs with
    a missing entry are discarded before the lookup is built.

    Returns a dict of the form ``{feature: {code: label, ...}, ...}``.
    """
    print ('Feature_Names')
    lookup = {}
    for code_column in Named_Features:
        label_column = code_column + 'NAME'
        pairs = data[[code_column, label_column]].drop_duplicates().dropna()
        lookup[code_column] = {
            code: label
            for code, label in zip(pairs[code_column], pairs[label_column])
        }
    print ()
    return lookup
        

In [7]:
def Remove_Unknowns_in_Feature(data, feature):
    """Drop the rows whose ``feature`` value is one of its listed unknown codes.

    Returns a tuple ``(filtered_data, unknown_codes)`` where ``unknown_codes``
    is the list of codes that was filtered out for ``feature``.  A feature
    that has no entry in the table below raises ``KeyError``.
    """
    # Codes filtered out per feature; an empty list means nothing is removed.
    unknown_codes_by_feature = {
        'AGE': [998, 999],
        'AGE_IM': [],
        'AIR_BAG': [98, 99],
        'ALC_RES': [999],
        'ALC_STATUS': [8, 9],
        'CASENUM': [],
        'DRINKING': [8, 9],
        'DRUGS': [8, 9],
        'EJECT_IM': [],
        'EJECTION': [7, 9],
        'HELM_USE': [98, 99],
        'HOSPITAL': [],
        'INJ_SEV': [9],
        'INJSEV_IM': [],
        'LOCATION': [98, 99],
        'PER_NO': [],
        'PER_TYP': [],
        'PERALCH_IM': [],
        'REST_MIS': [],
        'REST_USE': [98, 99],
        'SEAT_IM': [],
        'SEAT_POS': [98, 99],
        'SEX': [8, 9],
        'SEX_IM': [],
        'VEH_NO': [],
    }

    unknown_codes = unknown_codes_by_feature[feature]
    is_unknown = data[feature].isin(unknown_codes)
    known_rows = data[~is_unknown]
    return known_rows, unknown_codes

## Correlation

In [8]:
def Correlation(data, target, feature, value, name):
    """Summarise how a binary ``feature`` column relates to ``target``.

    A crosstab of ``data[target]`` (rows) against ``data[feature]`` (columns)
    is built and two percentages, each rounded to 4 decimals, are returned:

    * ``per``  - share of all rows that fall in the second feature column.
    * ``corr`` - among the rows in the second feature column, the share that
      fall in the second target row.  This is a conditional rate, not a
      statistical correlation coefficient.

    ``value`` and ``name`` are accepted for the caller's convenience and are
    not used in the computation.

    A full 2x2 table is read positionally.  A table that is missing a row or
    a column (for instance when every row has the feature set, so only one
    column exists) is padded with zero counts on the labels 0 and 1, so that
    case reports its real share and rate instead of 0 / 0.  Tables with more
    than two levels on either axis still return ``(0, 0)``.
    """
    table = pd.crosstab(data[target], data[feature])
    n_rows, n_cols = table.shape

    if n_rows > 2 or n_cols > 2:
        # Not a binary-vs-binary table.
        return (0, 0)

    if (n_rows, n_cols) != (2, 2):
        # Degenerate table: assumes both columns are coded 0/1.  Labels other
        # than 0/1 end up as all-zero counts and therefore as (0, 0).
        table = table.reindex(index=[0, 1], columns=[0, 1], fill_value=0)

    cm = table.values.tolist()
    with_feature = cm[0][1] + cm[1][1]
    total = cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1]

    corr = cm[1][1] / with_feature if with_feature else 0
    per = with_feature / total if total else 0

    per = round(per*100,4)
    corr = round(corr*100,4)
    return (per, corr)

def _code_for_display(code):
    """Return ``code`` as an int when it converts cleanly, otherwise unchanged."""
    try:
        return int(code)
    except (TypeError, ValueError, OverflowError):
        # e.g. NaN, None or a non-numeric label: show it as it is.
        return code


def _write_correlation_rows(path, rows):
    """Write one LaTeX table row per [feature, code, name, per, corr] entry."""
    # The context manager guarantees the file is flushed and closed.
    with open(path, 'w') as tex:
        for a, b, d, per, corr in rows:
            e = "{:.4f}".format(per)
            f = "{:.4f}".format(corr)
            tex.write('\t & \\verb|%s| & %s & %s & %s & %s \\cr\n' % (a,b,d,e,f))


def Correlation_by_Value(data, target, feature, Feature_Names_Dict, Unknowns):
    """Evaluate every distinct value of ``feature`` against ``target``.

    For each unique value a 0/1 indicator column is built and passed to
    ``Correlation``; the results are collected as
    ``[feature, value, name, per, corr]`` rows, where ``name`` comes from
    ``Feature_Names_Dict`` when available and is ``str(value)`` otherwise.

    Side effects:
    * prints a Python-literal template that groups the codes into blocks,
      followed by the ``Unknowns`` codes;
    * displays the rows as a DataFrame;
    * writes ``../Correlation/Correlation_<feature>.tex`` (rows ordered by
      ``corr``, descending) and ``../Correlation/Correlation_Ordered_<feature>.tex``
      (rows ordered by code).

    Returns the list of rows sorted by ``corr`` in descending order.
    """
    # Series.unique() is used rather than np.unique because np.unique treats
    # each NaN as a separate entry.
    V = data[feature].unique()
    names_for_feature = Feature_Names_Dict.get(feature, {})
    B = []

    for value in V:
        A = pd.DataFrame()
        A[feature] = data[feature].apply(lambda x: 1 if x==value else 0)
        A[target] = data[target]
        name = names_for_feature.get(value, str(value))
        per, corr = Correlation(A, target, feature, value, name)
        B.append([feature, value, name, per, corr])
    B = sorted(B, key=lambda x:x[4], reverse=True)

    # Print the codes grouped into blocks: a new block starts whenever the
    # running percentage crosses a multiple of ``block_size``, and any value
    # covering more than ``solo_threshold`` percent gets a block of its own.
    block_size = 20
    solo_threshold = 10
    print ("    feature = '%s'" % feature)
    print ('    A = [')
    running = 0.0      # cumulative percentage over all values so far
    block_sum = 0.0    # percentage accumulated in the block being printed
    n = 0
    print ("        ['%d', [" % n , end='')
    for b in B:
        share = b[3]
        after = running + share
        # NOTE: a share of exactly ``solo_threshold`` is neither summed into
        # the block nor given its own block; kept as in the original logic.
        if share < solo_threshold:
            block_sum = block_sum + share
        q = int(running/block_size)
        r = int((after-0.001)/block_size)
        if r>q or share > solo_threshold:
            print ("]], # ", round(block_sum,4), '%')
            block_sum = 0.0
            n += 1
            print ("        ['%d', [" % n , end='')
        running = after

        print (_code_for_display(b[1]), end=',')
        if share > solo_threshold:
            print ("]], # ", round(share,4), '%')
            block_sum = 0.0
            n += 1
            print ("        ['%d', [" % n , end='')
    print ("]], # ", round(block_sum,4), '%')
    print ("        ['Unknowns', [", end='')
    for u in Unknowns:
        print (u, end=', ')
    print ("]]" )
    print ('    ]')
    print ('    data = Build_Individual_Feature_with_Dict(df_Per, data, feature, A)')
    print ()

    # Passing ``columns=`` also works when B is empty, where assigning
    # ``.columns`` after construction would raise a length mismatch.
    C = pd.DataFrame(B, columns=['Feature', 'Code', 'Name', 'Per', 'Corr'])
    display(C)

    _write_correlation_rows(
        '../Correlation/Correlation_' + feature + '.tex',
        [c for c in B if c[3]>=0.0],
    )
    _write_correlation_rows(
        '../Correlation/Correlation_Ordered_' + feature + '.tex',
        sorted(B, key=lambda x:x[1], reverse=False),
    )

    print ()
    return B

def Correlation_All(data, target, Feature_Names_Dict):
    """Run ``Correlation_by_Value`` for every column of ``data``.

    For each column the unknown codes are first removed with
    ``Remove_Unknowns_in_Feature``; columns with 10000 or more distinct
    values are skipped.  All resulting ``[feature, code, name, per, corr]``
    rows are sorted by ``corr`` (descending), printed in full, printed again
    without the rows whose ``per`` is below 0.5, and the rows with
    ``per >= 0.5`` are written to ``../Correlation/Correlation.tex``.

    Always returns 0.
    """
    print ('Correlation_All')

    max_unique_values = 10000
    min_per = 0.5

    C = []
    for feature in data:
        data_temp, Unknowns = Remove_Unknowns_in_Feature(data, feature)
        U = data_temp[feature].unique()
        if len(U) < max_unique_values:
            C.extend(
                Correlation_by_Value(
                    data_temp, target, feature, Feature_Names_Dict, Unknowns
                )
            )

    C = sorted(C, key=lambda x:x[4], reverse=True)
    # ``columns=`` keeps this working when no rows were produced at all.
    D = pd.DataFrame(C, columns=['Feature', 'Code', 'Name', 'Per', 'Corr'])
    print (D)
    print ()

    D = D[~(D['Per'] < min_per)]
    print (D)
    print ()

    # The context manager guarantees the file is flushed and closed.
    with open('../Correlation/Correlation.tex', 'w') as TeX:
        for c in C:
            if c[3] >= min_per:
                a = c[0]
                b = c[1]
                d = c[2]
                e = "{:.4f}".format(c[3])
                f = "{:.4f}".format(c[4])
                TeX.write('\\verb|%s| & %s & %s & %s & %s \\cr\n' % (a,b,d,e,f))

    return 0

    

In [9]:
def PreDiscretize(df):
    """Replace ``df['ALC_RES']`` with a binned, labelled version of itself.

    The bin edges run from -1 in steps of 50, followed by the single-code
    edges 994..999.  The column is overwritten on ``df`` itself and the same
    frame is returned.  Along the way the edges, the labels, a crosstab of
    old against new values, and the new value counts are shown.

    NOTE(review): with these edges the label '900-949' is attached to the
    bin (899, 994] - confirm that this is intended.
    """
    print ('PreDiscretize')
    feature = 'ALC_RES'
    interval = 50

    A = list(range(-1, 941, interval)) + list(range(994, 1000))
    L = [
        '{:03d}-{:03d}'.format(start + 1, start + interval)
        for start in range(-1, 940, interval)
    ]
    L.extend(str(code) for code in range(995, 1000))

    print (len(A), len(L))
    print (A)
    print (L)

    New = pd.cut(df[feature], A, labels=L)
    print ()
    Crosstabs = pd.crosstab(df[feature], New)
    print ()
    display (Crosstabs)

    df[feature] = New

    print (df[feature].value_counts())

    return df
    

# Main()

In [10]:
def Main(run_correlation=False):
    """Load the data, build the dataset and report on its features.

    Parameters
    ----------
    run_correlation : bool, optional
        When False (the default, and the notebook's long-standing behaviour)
        the function stops after printing the code -> name dictionary.  When
        True it continues with ``PreDiscretize`` and ``Correlation_All``;
        those statements previously sat behind an unconditional ``return``
        and could never execute.
        NOTE(review): that path was disabled in the saved notebook - confirm
        it still runs end-to-end before relying on it.

    Returns
    -------
    int
        Always 0.
    """
    target = 'HOSPITAL'
    df_Person = Import_Data()
    data = Build_Person_Dataset(df_Person)

    print ('Features in data, with Number of Unique Values and Number of Blank Values')
    for feature in data:
        U = data[feature].unique()
        s = data[feature].isna().sum()
        print (feature, len(U), s)
    print ()

    print ('Features in df_Person with Names')
    # A companion column is '<feature>NAME'; test the suffix explicitly so the
    # four characters stripped below are always that 'NAME' suffix.
    F = sorted(
        feature[:-4]
        for feature in df_Person
        if feature.endswith('NAME') and feature[:-4] in data
    )
    for f in F:
        print ("        '%s'," % f)
    print ()

    Named_Features_df_Person = [
        'AGE',
        'AGE_IM',
        'AIR_BAG',
        'ALC_RES',
        'ALC_STATUS',
        'DRINKING',
        'DRUGS',
        'EJECTION',
        'EJECT_IM',
        'HOSPITAL',
        'INJSEV_IM',
        'INJ_SEV',
        'LOCATION',
        'PERALCH_IM',
        'PER_TYP',
        'REST_MIS',
        'REST_USE',
        'SEAT_IM',
        'SEAT_POS',
        'SEX',
        'SEX_IM',
    ]

    Feature_Names_Dict = Feature_Names(df_Person, Named_Features_df_Person)

    print (Feature_Names_Dict)

    if not run_correlation:
        return 0

    data = PreDiscretize(data)

    Correlation_All(data, target, Feature_Names_Dict)

    return 0


Main()

Import_Data()
Import_Data_Person()
2016 117759


  df = df.append(temp)


2017 138913
2018 120230


  df = df.append(temp)
  df = df.append(temp)


2019 135410


  df = df.append(temp)


2020 131962


  df = df.append(temp)


(644274, 117)


Build_Person_Dataset

Features in data, with Number of Unique Values and Number of Blank Values
AGE 118 0
AGE_IM 116 0
AIR_BAG 11 0
ALC_RES 335 0
ALC_STATUS 5 0
CASENUM 258904 0
DRINKING 4 0
DRUGS 4 0
EJECTION 7 0
EJECT_IM 5 0
HOSPITAL 2 0
INJSEV_IM 7 0
INJ_SEV 8 0
LOCATION 1 0
PERALCH_IM 2 0
PER_TYP 4 0
REST_MIS 3 0
REST_USE 20 0
SEAT_IM 23 0
SEAT_POS 29 0
SEX 4 0
SEX_IM 2 0
VEH_NO 15 0

Features in df_Person with Names
        'AGE',
        'AGE_IM',
        'AIR_BAG',
        'ALC_RES',
        'ALC_STATUS',
        'DRINKING',
        'DRUGS',
        'EJECTION',
        'EJECT_IM',
        'HOSPITAL',
        'INJSEV_IM',
        'INJ_SEV',
        'LOCATION',
        'PERALCH_IM',
        'PER_TYP',
        'REST_MIS',
        'REST_USE',
        'SEAT_IM',
        'SEAT_POS',
        'SEX',
        'SEX_IM',

Feature_Names

{'AGE': {39: '39 Years', 20: '20 Years', 19: '19 Years', 999: 'Reported as Unknown', 42: '42 Years', 47: '47 Years', 37: '37 Years', 50: '50

0