- In the CRSS dataset, we want to do two things.  
    - Impute unknown values
    - Bin into fewer categories
- Does the order of operations matter?
- General Strategy
    - Pull the features I want to use in the Accident data file
        - If the feature has been imputed by CRSS, pull the unimputed version
    - For each feature, count the number of samples with unknown values and record the proportion, p(feature)
    - Delete any records with unknown values in any features.  Call this dataframe df_A.
    - Create a deep copy of df_A with binned values to be ground truth.  Call this dataframe df_B.
    - For each feature, delete the value for p(feature) of the records. Call this dataframe df_C.
    - Bin Then Impute
        - Bin the values in df_C; call it df_D.
        - Impute blank values in df_D; call it df_E
        - For each feature, for the samples that were blank, make a crosstab between df_B and df_E
    - Impute Then Bin
        - Impute blank values in df_C; call it df_F.
        - Bin the values in df_F; call it df_G
        - For each feature, for the samples that were blank, make a crosstab between df_B and df_G
    - Find an appropriate metric
    

In [1]:
%%latex
\tableofcontents

<IPython.core.display.Latex object>

# Setup
## Import Libraries

In [2]:
import sys, copy, math, time, os

print ('Python version: {}'.format(sys.version))

import numpy as np
print ('NumPy version: {}'.format(np.__version__))
np.set_printoptions(suppress=True)


import pandas as pd
print ('Pandas version:  {}'.format(pd.__version__))
pd.set_option('display.max_rows', 500)

# Library for reading Microsoft Access files
import pandas_access as mdb


# Set Randomness.  Copied from https://www.kaggle.com/code/abazdyrev/keras-nn-focal-loss-experiments
import random


Python version: 3.9.16 (main, Dec  7 2022, 10:02:13) 
[Clang 14.0.0 (clang-1400.0.29.202)]
NumPy version: 1.24.0
Pandas version:  1.5.2


# Import Data

### accident.csv from CRSS

In [3]:
def Import_Data_Accident(NAMES):
    """Read the yearly CRSS accident files (2016-2020) into one dataframe.

    The 2016-2018 folders are read from ``ACCIDENT.CSV`` and the 2019-2020
    folders from ``accident.csv``.  The yearly frames are stacked in year
    order; each keeps its own row index, so index labels repeat across years.

    Parameters
    ----------
    NAMES : int
        If 0, every column whose name contains 'NAME' is dropped.
        Any other value keeps all columns.

    Returns
    -------
    pandas.DataFrame
        The stacked rows of all five years.
    """
    print ('Import_Data_Accident()')

    # (year, file name) in the order the files are read.
    sources = [(year, 'ACCIDENT.CSV') for year in ['2016','2017','2018']]
    sources += [(year, 'accident.csv') for year in ['2019','2020']]

    frames = []
    for year, basename in sources:
        filename = '../../CRSS/CRSS' + year + 'CSV/' + basename
        temp = pd.read_csv(filename, index_col=None)
        print (year, len(temp))
        frames.append(temp)

    # DataFrame.append was removed in pandas 2.0 and re-copied the growing
    # frame on every call; a single concat gives the same stacked result.
    df = pd.concat(frames, sort=False)

    if NAMES==0:
        # Collect the columns first instead of dropping while iterating.
        name_columns = [feature for feature in df if 'NAME' in feature]
        df = df.drop(columns=name_columns)

    print (df.shape)
    print ()
    return df

### vehicle.csv from CRSS

In [4]:
def Import_Data_Vehicle(NAMES):
    """Read the yearly CRSS vehicle files (2016-2020) into one dataframe.

    The 2016-2018 folders are read from ``VEHICLE.CSV``; the 2019-2020
    folders are read from ``vehicle.csv`` with ``encoding='latin1'``.  All
    files are read with ``low_memory=False``.  The yearly frames are stacked
    in year order; each keeps its own row index.

    Parameters
    ----------
    NAMES : int
        If 0, every column whose name contains 'NAME' is dropped.
        Any other value keeps all columns.

    Returns
    -------
    pandas.DataFrame
        The stacked rows of all five years.
    """
    print ('Import_Data_Vehicle()')

    # (year, file name, extra read_csv options) in the order the files are read.
    sources = [(year, 'VEHICLE.CSV', {}) for year in ['2016','2017','2018']]
    sources += [(year, 'vehicle.csv', {'encoding': 'latin1'}) for year in ['2019','2020']]

    frames = []
    for year, basename, options in sources:
        filename = '../../CRSS/CRSS' + year + 'CSV/' + basename
        temp = pd.read_csv(filename, index_col=None, low_memory=False, **options)
        print (year, len(temp))
        frames.append(temp)

    # DataFrame.append was removed in pandas 2.0 and re-copied the growing
    # frame on every call; a single concat gives the same stacked result.
    df = pd.concat(frames, sort=False)

    if NAMES==0:
        # Collect the columns first instead of dropping while iterating.
        name_columns = [feature for feature in df if 'NAME' in feature]
        df = df.drop(columns=name_columns)

    print (df.shape)
    print ()
    return df

### person.csv from CRSS

In [5]:
def Import_Data_Person(NAMES):
    """Read the yearly CRSS person files (2016-2020) into one dataframe.

    The 2016-2018 folders are read from ``PERSON.CSV``; the 2019-2020
    folders are read from ``person.csv`` with ``encoding='latin1'``.  The
    yearly frames are stacked in year order; each keeps its own row index.

    Parameters
    ----------
    NAMES : int
        If 0, every column whose name contains 'NAME' is dropped.
        Any other value keeps all columns.

    Returns
    -------
    pandas.DataFrame
        The stacked rows of all five years.
    """
    print ('Import_Data_Person()')

    # (year, file name, extra read_csv options) in the order the files are read.
    sources = [(year, 'PERSON.CSV', {}) for year in ['2016','2017','2018']]
    sources += [(year, 'person.csv', {'encoding': 'latin1'}) for year in ['2019','2020']]

    frames = []
    for year, basename, options in sources:
        filename = '../../CRSS/CRSS' + year + 'CSV/' + basename
        temp = pd.read_csv(filename, index_col=None, **options)
        print (year, len(temp))
        frames.append(temp)

    # DataFrame.append was removed in pandas 2.0 and re-copied the growing
    # frame on every call; a single concat gives the same stacked result.
    df = pd.concat(frames, sort=False)

    if NAMES==0:
        # Collect the columns first instead of dropping while iterating.
        name_columns = [feature for feature in df if 'NAME' in feature]
        df = df.drop(columns=name_columns)

    print (df.shape)
    print ()
    return df

## Get Data
- `Get_Data_from_Original()` reads the original CRSS files from the CRSS directory, preprocesses them, writes them to files in a folder outside this GitHub repo (because the files are too large for my subscription), and returns the dataframes.
- `Get_Data_from_Temp_Files()` reads those temp files and returns the dataframes.  I created this option for running repeatedly while writing and debugging, because it is much faster.

In [6]:
def Get_Data_from_Original():
    """Rebuild the cached CSV files in ../../Big_Files from the CRSS files.

    Two passes are made over the accident, vehicle and person importers:
    first with NAMES=0 (columns containing 'NAME' dropped), written to
    Accident.csv / Vehicle.csv / Person.csv, then with NAMES=1 (all columns
    kept), written to the matching *_with_NAMES.csv files.

    Returns
    -------
    tuple of pandas.DataFrame
        (df_Accident, df_Vehicle, df_Person) from the second pass, i.e. the
        frames that still contain the 'NAME' columns.
        NOTE(review): Get_Data_from_Temp_Files() returns the frames without
        the 'NAME' columns -- confirm which of the two is intended here.
    """
    print ('Get_Data_from_Original()')

    # Pass 1: frames without the 'NAME' columns.
    df_Accident = Import_Data_Accident(0)
    df_Vehicle = Import_Data_Vehicle(0)
    df_Person = Import_Data_Person(0)

    df_Accident.to_csv('../../Big_Files/Accident.csv', index=False)
    df_Vehicle.to_csv('../../Big_Files/Vehicle.csv', index=False)
    df_Person.to_csv('../../Big_Files/Person.csv', index=False)

    # Pass 2: frames with every column kept.
    df_Accident = Import_Data_Accident(1)
    df_Vehicle = Import_Data_Vehicle(1)
    df_Person = Import_Data_Person(1)

    df_Accident.to_csv('../../Big_Files/Accident_with_NAMES.csv', index=False)
    df_Vehicle.to_csv('../../Big_Files/Vehicle_with_NAMES.csv', index=False)
    # Was 'Person.csv_with_NAMES': the suffix landed after the extension,
    # unlike the two sibling files above.
    df_Person.to_csv('../../Big_Files/Person_with_NAMES.csv', index=False)

    return df_Accident, df_Vehicle, df_Person

In [7]:
def Get_Data_from_Temp_Files():
    """Load the cached accident, vehicle and person CSV files.

    Reads Accident.csv, Vehicle.csv (with ``low_memory=False``) and
    Person.csv from ../../Big_Files, prints the number of accident rows,
    and returns the three frames as (accident, vehicle, person).
    """
    print ('Get_Data_from_Temp_File')

    accident = pd.read_csv('../../Big_Files/Accident.csv')
    vehicle = pd.read_csv('../../Big_Files/Vehicle.csv', low_memory=False)
    person = pd.read_csv('../../Big_Files/Person.csv')

    accident_rows = len(accident)
    print ('len(df_Accident) = ', accident_rows)
    print ()

    loaded = (accident, vehicle, person)
    return loaded

In [8]:
#df_Accident, df_Vehicle, df_Person = Get_Data_from_Original()

In [9]:
# Load the three frames from the cached CSV files in ../../Big_Files
# (the slower rebuild from the original CRSS files is the commented-out call above).
df_Accident, df_Vehicle, df_Person = Get_Data_from_Temp_Files()

Get_Data_from_Temp_File
len(df_Accident) =  259077



# Accident Dataset

- This information is in the Accident_Dataset_Information.xlsx file
- It does not include the 2021 data

|  Feature  |  Meaning  |  Number of Values  |  Number of Missing Values  | Continuous, Categorical, Count, Mixed, Transfer, or Drop | Values Signifying "Missing" | Number of Samples Signified as "Missing" |  Notes  | 
 | --- | --- | --- | --- | --- | --- | --- | --- |
 | ALCHL_IM | ALCOHOL Imputed | 2 | 0 | Categorical |  |  |  | 
 | ALCOHOL | Alcohol Involved in Crash  | 4 | 0 | Categorical | [9] | 59889 | Derived Data Element | 
 | CASENUM |  | 259077 | 0 | Transfer |  |  |  | 
 | CF1 |  | 23 | 54745 | Drop |  |  | Discontinued | 
 | CF2 |  | 17 | 54745 | Drop |  |  | Discontinued | 
 | CF3 |  | 11 | 54745 | Drop |  |  | Discontinued | 
 | DAY_WEEK |  | 7 | 0 | Categorical | [9] | 0 |  | 
 | EVENT1_IM | HARM_EV Imputed | 54 | 0 | Categorical |  |  |  | 
 | HARM_EV | First Harmful Event  | 56 | 0 | Categorical | [98,99] | 166 |  | 
 | HOUR |  | 25 | 0 | Categorical | [99] | 1127 |  | 
 | HOUR_IM |  | 24 | 0 | Categorical |  |  |  | 
 | INT_HWY | Interstate Highway | 3 | 0 | Categorical | [9] | 25 |  | 
 | LGTCON_IM | LGT_COND Imputed | 7 | 0 | Categorical |  |  |  | 
 | LGT_COND | Light Condition | 9 | 0 | Categorical | [8,9] | 2309 |  | 
 | MANCOL_IM | MAN_COLL Imputed | 9 | 0 | Categorical |  |  |  | 
 | MAN_COLL | Manner of Collision of the First Harmful Event  | 11 | 0 | Categorical | [98,99] | 1012 |  | 
 | MAXSEV_IM | MAX_SEV Imputed | 8 | 0 | Categorical |  |  | Derived Data Element | 
 | MAX_SEV | Maximum Severity in Crash | 9 | 0 | Categorical | [9] | 4480 | Derived Data Element | 
 | MINUTE |  | 61 | 0 | Categorical | [99] | 1127 |  | 
 | MINUTE_IM |  | 60 | 0 | Categorical |  |  |  | 
 | MONTH |  | 12 | 0 | Categorical |  |  |  | 
 | NO_INJ_IM | NUM_INJ Imputed | 18 | 0 | Count |  |  | Derived Data Element | 
 | NUM_INJ | Number Injured in Crash | 20 | 0 | Count | [99] | 4480 | Change 98 to 0; derived data element | 
 | PEDS | Number of persons not in motor vehicles | 10 | 0 | Count |  |  |  | 
 | PERMVIT | Number of Persons in Motor Vehicles in Transport  | 26 | 0 | Count |  |  |  | 
 | PERNOTMVIT | Number of Persons Not in Motor Vehicles in Transport  | 10 | 0 | Count |  |  |  | 
 | PJ |  | 422 | 0 | Drop |  |  |  | 
 | PSU |  | 60 | 0 | Drop |  |  |  | 
 | PSUSTRAT |  | 25 | 0 | Drop |  |  |  | 
 | PSU_VAR |  | 67 | 0 | Drop |  |  |  | 
 | PVH_INVL | Number of Parked/Working Vehicles in the Crash  | 11 | 0 | Count |  |  |  | 
 | REGION |  | 4 | 0 | Drop |  |  |  | 
 | RELJCT1 | Relation to Junction-Within Interchange Area  | 4 | 0 | Categorical | [8,9] | 65920 |  | 
 | RELJCT1_IM | RELJCT1 Imputed | 3 | 54409 | Categorical |  |  |  | 
 | RELJCT2 | Relation to Junction-Specific Location  | 15 | 0 | Categorical | [98,99] | 19721 |  | 
 | RELJCT2_IM | RELJCT2 Imputed | 13 | 0 | Categorical |  |  |  | 
 | REL_ROAD | Relation to Trafficway  | 13 | 0 | Categorical | [98,99] | 190 |  | 
 | SCH_BUS |  | 2 | 0 | Categorical |  |  |  | 
 | STRATUM |  | 9 | 0 | Drop |  |  |  | 
 | TYP_INT | Type of Intersection  | 11 | 0 | Categorical | [98,99] | 26650 |  | 
 | URBANICITY |  | 2 | 0 | Categorical |  |  |  | 
 | VE_FORMS | Number of Motor Vehicles in Transport  | 13 | 0 | Count |  |  |  | 
 | VE_TOTAL | Number of vehicles in crash | 13 | 0 | Count |  |  |  | 
 | WEATHER |  | 13 | 0 | Categorical | [98,99] | 13284 |  | 
 | WEATHER1 |  | 14 | 54745 | Drop |  |  | Discontinued | 
 | WEATHER2 |  | 14 | 54745 | Drop |  |  | Discontinued | 
 | WEATHR_IM | WEATHER Imputed | 11 | 0 | Categorical |  |  |  | 
 | WEIGHT | Case weight | 8816 | 0 | Drop |  |  |  | 
 | WKDY_IM | DAY_WEEK Imputed | 7 | 0 | Categorical |  |  |  | 
 | WRK_ZONE | Work Zone | 5 | 0 | Categorical |  |  |  | 
 | YEAR |  | 5 | 0 | Categorical |  |  |  | 

![image.png](attachment:image.png)

# Organize Data

In [10]:
# Feature tables for the three CRSS files.
# Each entry is [feature name, codes that mean "unknown", number of distinct
# values, number of samples carrying an unknown code].  The tables are sorted
# by feature name and printed.

Accident_Features = [
    # Features that CRSS imputed; use unimputed
    ['DAY_WEEK', [9], 7, 0], # Accident
    ['HOUR', [99], 25, 1127], # Accident
    ['LGT_COND', [8,9], 9, 2309],
    ['RELJCT1', [8,9], 4, 65920],
    ['RELJCT2', [98,99], 15, 19721],
    ['WEATHER', [98,99], 13, 13284],
    # Features with no unknown or missing values
    ['MONTH', [], 12, 0],
    ['PEDS', [], 10, 0],
    ['PERMVIT', [], 26, 0],
    ['PERNOTMVIT', [], 10, 0],
    ['PVH_INVL', [], 11, 0],
    ['REGION', [], 4, 0],
    ['SCH_BUS', [], 2, 0],
    ['URBANICITY', [], 2, 0],
    ['VE_FORMS', [], 13, 0], # Count
    ['VE_TOTAL', [], 13, 0], # Count
    ['WRK_ZONE', [], 5, 0],
    # Features with unknown values, Not imputed by CRSS
    ['INT_HWY', [9], 3, 25],
    ['REL_ROAD', [98,99], 13, 190],
    ['TYP_INT', [98,99], 11, 26650],
]

Accident_Features = sorted(Accident_Features, key=lambda x:x[0])
for feature in Accident_Features:
    print (feature)
print ()

Vehicle_Features = [
    # Features that CRSS imputed; use unimputed
    ['BODY_TYP', [98,99,49,79], 73, 18524],
    ['MOD_YEAR', [9998,9999], 83, 18524],
    # Features with no unknown or missing values
    ['MODEL', [], 140, 0],
    # Features with unknown values, Not imputed by CRSS
    # (MOD_YEAR used to be repeated here; the duplicate entry made any
    # per-feature loop process that column twice, so it is listed once above.)
    ['MAKE', [99], 70, 12901],
    ['VALIGN', [8,9], 7, 31554],
    ['VNUM_LAN', [8,9], 10, 127387], # Count
    ['VPROFILE', [8,9], 9, 62776],
    ['VSPD_LIM', [98,99], 20, 62649],
    ['VTRAFCON', [97,99], 19, 30151],
    ['VTRAFWAY', [8,9], 9, 83513],
]

Vehicle_Features = sorted(Vehicle_Features, key=lambda x:x[0])
for feature in Vehicle_Features:
    print (feature)
print ()

Person_Features = [
    # Features that CRSS imputed; use unimputed
    ['AGE', [998,999], 188, 41087], # Person
    ['SEX', [8,9], 4, 26143],
    # Features with no unknown or missing values
    ['PER_TYP', [], 13, 0],
    # Features with unknown values, Not imputed by CRSS
    ['HOSPITAL', [8,9], 9, 13522],
]

Person_Features = sorted(Person_Features, key=lambda x:x[0])
for feature in Person_Features:
    print (feature)
print ()



['DAY_WEEK', [9], 7, 0]
['HOUR', [99], 25, 1127]
['INT_HWY', [9], 3, 25]
['LGT_COND', [8, 9], 9, 2309]
['MONTH', [], 12, 0]
['PEDS', [], 10, 0]
['PERMVIT', [], 26, 0]
['PERNOTMVIT', [], 10, 0]
['PVH_INVL', [], 11, 0]
['REGION', [], 4, 0]
['RELJCT1', [8, 9], 4, 65920]
['RELJCT2', [98, 99], 15, 19721]
['REL_ROAD', [98, 99], 13, 190]
['SCH_BUS', [], 2, 0]
['TYP_INT', [98, 99], 11, 26650]
['URBANICITY', [], 2, 0]
['VE_FORMS', [], 13, 0]
['VE_TOTAL', [], 13, 0]
['WEATHER', [98, 99], 13, 13284]
['WRK_ZONE', [], 5, 0]

['BODY_TYP', [98, 99, 49, 79], 73, 18524]
['MAKE', [99], 70, 12901]
['MODEL', [], 140, 0]
['MOD_YEAR', [9998, 9999], 83, 18524]
['MOD_YEAR', [9998, 9999], 83, 18524]
['VALIGN', [8, 9], 7, 31554]
['VNUM_LAN', [8, 9], 10, 127387]
['VPROFILE', [8, 9], 9, 62776]
['VSPD_LIM', [98, 99], 20, 62649]
['VTRAFCON', [97, 99], 19, 30151]
['VTRAFWAY', [8, 9], 9, 83513]

['AGE', [998, 999], 188, 41087]
['HOSPITAL', [8, 9], 9, 13522]
['PER_TYP', [], 13, 0]
['SEX', [8, 9], 4, 26143]



# def Erase Proportional Number of Samples from Each Feature

In [11]:
def Erase_Proportional(df, df_Original, df_Features, seed=None):
    """Blank out values in df in proportion to the unknowns of df_Original.

    For every entry F of df_Features, F[0] is a column name and F[3] is
    taken as the number of unknown samples among the N rows of df_Original.
    The same proportion, scaled to the n rows of df and rounded half up, is
    blanked in that column by writing the empty string '' into randomly
    chosen rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to blank.  It is modified in place and also returned.
    df_Original : pandas.DataFrame
        Only its number of rows (N) is used.
    df_Features : list
        Entries shaped like [name, ..., ..., unknown_count].
    seed : optional
        If given, rows are drawn from a private ``random.Random(seed)`` so
        the result is repeatable.  If None (default), the module-level
        ``random`` generator is used.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.

    Raises
    ------
    ZeroDivisionError
        If df_Original has no rows.
    """
    N = df_Original.shape[0]
    n = df.shape[0]
    print ("N = ", N, "n = ", n)
    print ()
    rng = random if seed is None else random.Random(seed)
    for F in df_Features:
        feature = F[0]
        nUnknown = int(F[3]*n/N + 0.5)
        print ()
        print (feature, n, N, df.shape[0], F[3], nUnknown)
        if nUnknown>0:
            # Never ask for more rows than df has.
            positions = rng.sample(range(n), min(nUnknown, n))
            # Blank by position, not by label: with .loc a frame whose index
            # is not 0..n-1 would gain new rows instead of being blanked.
            # Casting to object first lets the column hold '' next to numbers.
            column = df[feature].astype(object)
            column.iloc[positions] = ''
            df[feature] = column

    return df
        

In [12]:
def Erase_Proportional_Test():
    """Run Erase_Proportional on a small random frame and print each stage.

    A 20x4 frame of random integers in 0..4 is built, with 0 playing the
    role of the "unknown" code.  The number of zeros per column is recorded,
    every row containing a zero is removed, and Erase_Proportional is then
    asked to blank the matching proportion of the remaining rows.
    """
    names = list('ABCD')
    df_Original = pd.DataFrame(np.random.randint(0,5,size=(20, 4)), columns=names)
    df = df_Original.copy(deep=True)

    # Count the zeros of every column before any row is removed.
    df_Features = [[name, 0, 0, len(df[df[name]==0])] for name in names]

    # Remove, column by column, the rows that still hold a zero.
    for name in names:
        zero_rows = df[df[name]==0].index
        df.drop(zero_rows, inplace=True)
    df = df.reset_index(drop=True)

    for shown in (df_Original, df_Features, df):
        print (shown)
        print ()
    df_New = Erase_Proportional(df, df_Original, df_Features)

    print ('df_New Again')
    print ()
    print (df_New)


Erase_Proportional_Test()

    A  B  C  D
0   2  3  2  3
1   1  3  3  3
2   4  3  0  3
3   3  0  2  0
4   1  1  3  4
5   2  1  3  3
6   0  0  3  3
7   0  2  0  0
8   4  1  2  1
9   0  3  2  2
10  1  4  0  3
11  2  0  1  0
12  4  4  2  3
13  0  1  1  3
14  4  4  1  0
15  2  2  0  4
16  1  4  1  1
17  4  0  1  0
18  0  3  3  4
19  0  0  4  4

[['A', 0, 0, 6], ['B', 0, 0, 5], ['C', 0, 0, 4], ['D', 0, 0, 5]]

   A  B  C  D
0  2  3  2  3
1  1  3  3  3
2  1  1  3  4
3  2  1  3  3
4  4  1  2  1
5  4  4  2  3
6  1  4  1  1

N =  20 n =  7


A 7 20 7 6 2

B 7 20 7 5 2

C 7 20 7 4 1

D 7 20 7 5 2
df_New Again

   A  B  C  D
0  2  3  2  3
1     3  3  3
2  1     3   
3  2  1  3  3
4  4  1  2  1
5     4     3
6  1     1   


# def Binning

In [13]:
def Build_Individual_Feature_with_Dict(df, feature, A):
    """Recode one column of df according to a list of bins.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; the column is overwritten in df.
    feature : str
        Name of the column to recode.
    A : list
        Bins shaped like [label, codes, new_value].  Every value listed in
        ``codes`` is replaced by ``new_value``; the label is not used here.
        Values of the column that appear in no bin are left unchanged.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.

    The mapping and the value counts before and after recoding are printed.
    """
    # Flatten the bins into {old code: new value}; later bins win on repeats.
    D = {b: B[2] for B in A for b in B[1]}

    print (feature)
    print (D)
    print (df[feature].value_counts())
    print ('isna(): ', df[feature].isna().sum())

    # Assign the result back to the frame.  The previous
    # df[feature].replace(D, inplace=True) was a chained in-place update on a
    # temporary Series, which leaves df untouched under pandas Copy-on-Write.
    df[feature] = df[feature].replace(D)

    print (df[feature].value_counts())
    print ()

    return df
    

In [14]:
def Test_Build_Individual_Feature_with_Dict():
    """Recode column A of a small random frame and print it before and after.

    The frame has 20 rows of random integers in 0..9.  Column E is set from
    column A before the recoding so the old and new values can be compared
    side by side in the printed output.  Always returns 0.
    """
    df = pd.DataFrame(np.random.randint(0,10,size=(20, 4)), columns=list('ABCD'))
    df['E'] = df['A']

    # [label, original values, recoded value]
    bins = [
        ['Low',[0,1,2], 10],
        ['Medium',[3,4,5,6], 20],
        ['High',[7,8,9], 30],
    ]

    print (df)
    print ()
    df = Build_Individual_Feature_with_Dict(df, 'A', bins)
    print (df)
    print ()

    return 0

Test_Build_Individual_Feature_with_Dict()


    A  B  C  D  E
0   7  2  0  0  7
1   2  5  3  0  2
2   0  0  7  5  0
3   4  5  3  6  4
4   6  6  7  4  6
5   6  9  7  0  6
6   5  9  5  6  5
7   1  6  8  2  1
8   0  3  4  3  0
9   3  2  0  4  3
10  3  9  1  7  3
11  5  6  3  6  5
12  6  0  1  6  6
13  5  6  5  4  5
14  2  6  5  3  2
15  9  6  7  2  9
16  2  8  7  8  2
17  4  6  9  8  4
18  6  9  2  3  6
19  2  1  0  0  2

A
{0: 10, 1: 10, 2: 10, 3: 20, 4: 20, 5: 20, 6: 20, 7: 30, 8: 30, 9: 30}
2    4
6    4
5    3
0    2
4    2
3    2
7    1
1    1
9    1
Name: A, dtype: int64
isna():  0
20    11
10     7
30     2
Name: A, dtype: int64

     A  B  C  D  E
0   30  2  0  0  7
1   10  5  3  0  2
2   10  0  7  5  0
3   20  5  3  6  4
4   20  6  7  4  6
5   20  9  7  0  6
6   20  9  5  6  5
7   10  6  8  2  1
8   10  3  4  3  0
9   20  2  0  4  3
10  20  9  1  7  3
11  20  6  3  6  5
12  20  0  1  6  6
13  20  6  5  4  5
14  10  6  5  3  2
15  30  6  7  2  9
16  10  8  7  8  2
17  20  6  9  8  4
18  20  9  2  3  6
19  10  1  0  0  2



0

In [15]:
def Bin_Accident_Dataset(df_Accident):
    """Recode the accident features into a smaller set of category codes.

    Every feature in the plan below is passed, in order, to
    Build_Individual_Feature_with_Dict together with its bins.  A bin is
    [label, original codes, new code].  A bin whose codes are ``REST``
    collects every value currently present in the column that none of the
    feature's other bins lists.

    The frame handed in is the one handed to the helper each time, and the
    helper's return value is what is returned at the end.
    """
    print ('Bin_Accident_Dataset()')

    REST = None  # marker: "all values of the column not listed by the other bins"

    binning_plan = [
        ('DAY_WEEK', [
            ['Weekend', [1,7], 0],
            ['Weekday', [2,3,4,5,6], 1],
        ]),
        ('HOUR', [
            ['Early_Morn', [5,6], 0],
            ['Morning', [7,8,9,10], 1],
            ['Mid_Day', [11,12,13,14], 2],
            ['Rush_Hour', [15,16,17], 3],
            ['Early_Eve', [18,19], 4],
            ['Evening', [20,21,22], 5],
            ['Late_Nght',[23,0,1,2,3,4], 6],
        ]),
        ('INT_HWY', [
            ['No', [0], 0],
            ['Yes', [1], 1],
            ['Missing', [9], 999],
        ]),
        ('LGT_COND', [
            ['Dark', [2], 0],
            ['Dawn_Lighted', [3,4,6], 1],
            ['Dusk', [5], 2],
            ['Daylight', [1,7], 3],
        ]),
        ('MONTH', [
            ['Winter', [1,2,3,12], 0],
            ['Spring_Fall', [4,5,10,11], 1],
            ['Summer', [6,7,8,9], 2],
        ]),
        # PEDS is derived
        # "This data element is the number of Person Forms (Not a Motor Vehicle Occupant)
        # that are applicable to this case (i.e., non-occupants)."
        # It is turned from a count variable into a binary variable.
        ('PEDS', [
            ['No', [0], 0],
            ['Yes', REST, 1],
        ]),
        # Derived from PERSON file
        # "[Number of] Persons in Motor Vehicle in Transport"
        # NOTE(review): for PERMVIT, PERNOTMVIT and PVH_INVL a value of 0, if
        # present, falls into the 'Multiple' bin -- confirm this is intended.
        ('PERMVIT', [
            ['1', [1], 0],
            ['2', [2], 1],
            ['Multiple', REST, 2],
        ]),
        ('PERNOTMVIT', [
            ['1', [1], 0],
            ['2', [2], 1],
            ['Multiple', REST, 2],
        ]),
        ('PVH_INVL', [
            ['1', [1], 0],
            ['2', [2], 1],
            ['Multiple', REST, 2],
        ]),
        ('REL_ROAD', [
            ['Not_on_Road', [2,3,4,5,6,8,10,12], 0],
            ['On_Road', [1,11], 1],
            ['Parking_Area', [7], 2],
            ['Missing/Unknown', [98,99], 999],
        ]),
        ('REGION', [
            ['Northeast', [1], 0],
            ['Midwest', [2], 1],
            ['South', [3], 2],
            ['West', [4], 3],
        ]),
        ('RELJCT1', [
            ['No', [0], 0],
            ['Yes', [1], 1],
            ['Missing', [8,9], 999],
        ]),
        ('RELJCT2', [
            ['A', [2,5,6,19], 0],
            ['B', [1,7,16], 1],
            ['C', [4,8,18], 2],
            ['D', [3,17,20], 3],
        ]),
        ('SCH_BUS', [
            ['No', [0], 0],
            ['Yes', [1], 1],
        ]),
        ('TYP_INT', [
            ['Not an Intersection', [1], 0],
            ['Intersection', [2,3,4,7,10,11], 1],
            ['Roundabout', [5,6], 2],
            ['Unknown', [98,99], 999],
        ]),
        ('URBANICITY', [
            ['Urban', [1], 0],
            ['Rural', [2], 1],
        ]),
        # Derived from VEHICLE file
        ('VE_FORMS', [
            ['1', [1], 0],
            ['2', [2], 1],
            ['3', [3], 2],
            ['Multiple', REST, 3],
        ]),
        ('VE_TOTAL', [
            ['1', [1], 0],
            ['2', [2], 1],
            ['3', [3], 2],
            ['Multiple', REST, 3],
        ]),
        ('WEATHER', [
            ['A', [3,5], 0],
            ['B', [1], 1],
            ['C', [2], 2],
            ['D', [10], 3],
            ['E', [4,6,7,8,11,12], 4],
        ]),
        ('WRK_ZONE', [
            ['0', [0], 0],
            ['1', [1,2,3,4], 1],
        ]),
    ]

    for feature, bins in binning_plan:
        if any(codes is REST for _label, codes, _new in bins):
            # Resolve the catch-all bin from the values present right now.
            listed = [code for _label, codes, _new in bins if codes is not REST for code in codes]
            remaining = [x for x in list(df_Accident[feature].unique()) if x not in listed]
            A = [[label, remaining if codes is REST else codes, new] for label, codes, new in bins]
        else:
            A = bins
        df_Accident = Build_Individual_Feature_with_Dict(df_Accident, feature, A)

    print ('len(df_Accident) binned = ', len(df_Accident))
    print ()
    return df_Accident

# Plan

- In the CRSS dataset, we want to do two things.  
    - Impute unknown values
    - Bin into fewer categories
- Does the order of operations matter?
- General Strategy
    - Pull the features I want to use in the Accident data file
        - If the feature has been imputed by CRSS, pull the unimputed version
    - For each feature, count the number of samples with unknown values and record the proportion, p(feature)
    - Delete any records with unknown values in any features.  Call this dataframe df_A.
    - Create a deep copy of df_A with binned values to be ground truth.  Call this dataframe df_B.
    - Repeat this part twice with different random seeds for the deletion and imputation
        - For each feature, delete the value for p(feature) of the records. Call this dataframe df_C.
        - Bin Then Impute
            - Bin the values in df_C; call it df_D.
            - Impute blank values in df_D; call it df_E
        - Impute Then Bin
            - Impute blank values in df_C; call it df_F.
            - Bin the values in df_F; call it df_G

## Create df_A:  df_Accident with Unknown Values Removed

In [16]:
# df_A: the selected accident features, restricted to the records that carry
# no "unknown" code in any of them.
df_A = df_Accident[[F[0] for F in Accident_Features]].copy()

# Flag the rows to remove with a boolean mask.  Dropping by index label, as
# was done before, removes every row that shares a label with a flagged row,
# which loses extra records whenever df_Accident has repeated index labels
# (the frames built by Get_Data_from_Original() keep each year's own index).
has_unknown = pd.Series(False, index=df_A.index)
for F in Accident_Features:
    feature = F[0]
    Unknown = F[1]
    has_unknown = has_unknown | df_A[feature].isin(Unknown)

df_A = df_A[~has_unknown.to_numpy()].reset_index(drop=True)

# Number of distinct values left in every feature.
for feature in df_A:
    print (feature, len(df_A[feature].unique()))

print ('len(df_A) = ', len(df_A))

DAY_WEEK 7
HOUR 24
INT_HWY 2
LGT_COND 7
MONTH 12
PEDS 10
PERMVIT 25
PERNOTMVIT 10
PVH_INVL 11
REGION 4
RELJCT1 2
RELJCT2 13
REL_ROAD 11
SCH_BUS 2
TYP_INT 9
URBANICITY 2
VE_FORMS 13
VE_TOTAL 13
WEATHER 11
WRK_ZONE 5
len(df_A) =  167363


## Create df_B with binned values to be ground truth

In [17]:
# df_B: binned ground truth.  Bin_Accident_Dataset recodes the frame it is
# given, so it is handed a deep copy and df_A keeps its original codes.
df_B = Bin_Accident_Dataset(df_A.copy(deep=True))


Bin_Accident_Dataset()
DAY_WEEK
{1: 0, 7: 0, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1}
6    27868
5    25585
4    24886
3    24691
2    23677
7    22212
1    18444
Name: DAY_WEEK, dtype: int64
isna():  0
1    126707
0     40656
Name: DAY_WEEK, dtype: int64

HOUR
{5: 0, 6: 0, 7: 1, 8: 1, 9: 1, 10: 1, 11: 2, 12: 2, 13: 2, 14: 2, 15: 3, 16: 3, 17: 3, 18: 4, 19: 4, 20: 5, 21: 5, 22: 5, 23: 6, 0: 6, 1: 6, 2: 6, 3: 6, 4: 6}
17    14405
16    13207
15    13032
18    11118
14    10750
12     9523
13     9511
7      8374
19     8039
11     7915
8      7826
10     6717
20     6663
9      6579
21     5853
6      5179
22     4694
23     3736
0      3033
5      2938
1      2428
2      2196
3      1840
4      1807
Name: HOUR, dtype: int64
isna():  0
3    40644
2    37699
1    29496
4    19157
5    17210
6    15040
0     8117
Name: HOUR, dtype: int64

INT_HWY
{0: 0, 1: 1, 9: 999}
0    148955
1     18408
Name: INT_HWY, dtype: int64
isna():  0
0    148955
1     18408
Name: INT_HWY, dtype: int64

LGT_COND
{2: 0, 3:

## Create df_C from df_A with Erased Samples

In [18]:
# df_C / df_C2: two copies of df_A with values blanked at random.
# The plan calls for two runs with different random seeds, and
# Erase_Proportional draws its rows from the module-level `random`
# generator, so that generator is seeded before each run.  Without a seed
# the blanked rows change on every execution of this cell.
# NOTE(review): the seed values are arbitrary.  The IVEware files read later
# in the notebook were presumably produced from an earlier, unseeded run and
# would need to be regenerated from these frames -- confirm.
random.seed(1)
df_C = df_A.copy(deep=True)
df_C = Erase_Proportional(df_C, df_Accident, Accident_Features)

random.seed(2)
df_C2 = df_A.copy(deep=True)
df_C2 = Erase_Proportional(df_C2, df_Accident, Accident_Features)

print (df_C.head(20))


N =  259077 n =  167363


DAY_WEEK 167363 259077 167363 0 0

HOUR 167363 259077 167363 1127 728

INT_HWY 167363 259077 167363 25 16

LGT_COND 167363 259077 167363 2309 1492

MONTH 167363 259077 167363 0 0

PEDS 167363 259077 167363 0 0

PERMVIT 167363 259077 167363 0 0

PERNOTMVIT 167363 259077 167363 0 0

PVH_INVL 167363 259077 167363 0 0

REGION 167363 259077 167363 0 0

RELJCT1 167363 259077 167363 65920 42584

RELJCT2 167363 259077 167363 19721 12740

REL_ROAD 167363 259077 167363 190 123

SCH_BUS 167363 259077 167363 0 0

TYP_INT 167363 259077 167363 26650 17216

URBANICITY 167363 259077 167363 0 0

VE_FORMS 167363 259077 167363 0 0

VE_TOTAL 167363 259077 167363 0 0

WEATHER 167363 259077 167363 13284 8581

WRK_ZONE 167363 259077 167363 0 0
N =  259077 n =  167363


DAY_WEEK 167363 259077 167363 0 0

HOUR 167363 259077 167363 1127 728

INT_HWY 167363 259077 167363 25 16

LGT_COND 167363 259077 167363 2309 1492

MONTH 167363 259077 167363 0 0

PEDS 167363 259077 167363 0 0

PERMVI

# Bin Before Imputing
- Bin the values in df_C; call it df_D.
- Impute blank values in df_D; call it df_E
- For each feature, for the samples that were blank, make a crosstab between df_B and df_E


## df_D is df_C binned

In [19]:
# df_D / df_D2: binned deep copies of the two erased frames, each written as
# a tab-separated file right after it is binned.
# (An earlier run used the file prefix OoO_10_19_22 instead of OoO_11_01_22.)
df_D = Bin_Accident_Dataset(df_C.copy(deep=True))
df_D.to_csv(
    '../../Big_Files/OoO_11_01_22_Accident_df_D.txt',
    index=False,
    sep='\t',
)

df_D2 = Bin_Accident_Dataset(df_C2.copy(deep=True))
df_D2.to_csv(
    '../../Big_Files/OoO_11_01_22_Accident_df_D2.txt',
    index=False,
    sep='\t',
)


Bin_Accident_Dataset()
DAY_WEEK
{1: 0, 7: 0, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1}
6    27868
5    25585
4    24886
3    24691
2    23677
7    22212
1    18444
Name: DAY_WEEK, dtype: int64
isna():  0
1    126707
0     40656
Name: DAY_WEEK, dtype: int64

HOUR
{5: 0, 6: 0, 7: 1, 8: 1, 9: 1, 10: 1, 11: 2, 12: 2, 13: 2, 14: 2, 15: 3, 16: 3, 17: 3, 18: 4, 19: 4, 20: 5, 21: 5, 22: 5, 23: 6, 0: 6, 1: 6, 2: 6, 3: 6, 4: 6}
17    14334
16    13155
15    12976
18    11077
14    10704
12     9489
13     9463
7      8327
19     8007
11     7886
8      7785
10     6692
20     6631
9      6550
21     5830
6      5154
22     4673
23     3719
0      3021
5      2928
1      2421
2      2187
3      1830
4      1796
        728
Name: HOUR, dtype: int64
isna():  0
3    40465
2    37542
1    29354
4    19084
5    17134
6    14974
0     8082
       728
Name: HOUR, dtype: int64

INT_HWY
{0: 0, 1: 1, 9: 999}
0    148940
1     18407
         16
Name: INT_HWY, dtype: int64
isna():  0
0    148940
1     18407
         16


1    128890
0     33297
2      5053
        123
Name: REL_ROAD, dtype: int64

REGION
{1: 0, 2: 1, 3: 2, 4: 3}
3    94046
2    36047
4    22686
1    14584
Name: REGION, dtype: int64
isna():  0
2    94046
1    36047
3    22686
0    14584
Name: REGION, dtype: int64

RELJCT1
{0: 0, 1: 1, 8: 999, 9: 999}
0    119786
      42584
1      4993
Name: RELJCT1, dtype: int64
isna():  0
0    119786
      42584
1      4993
Name: RELJCT1, dtype: int64

RELJCT2
{2: 0, 5: 0, 6: 0, 19: 0, 1: 1, 7: 1, 16: 1, 4: 2, 8: 2, 18: 2, 3: 3, 17: 3, 20: 3}
1     85288
2     28313
3     23401
      12740
8     10606
5      1754
18     1602
20     1326
4      1261
19      494
7       257
6       162
17      135
16       24
Name: RELJCT2, dtype: int64
isna():  0
1    85569
0    30723
3    24862
2    13469
     12740
Name: RELJCT2, dtype: int64

SCH_BUS
{0: 0, 1: 1}
0    166495
1       868
Name: SCH_BUS, dtype: int64
isna():  0
0    166495
1       868
Name: SCH_BUS, dtype: int64

TYP_INT
{1: 0, 2: 1, 3: 1, 4: 1, 7: 1, 

## Do the Imputation in IVEware
- df_D to df_E
- df_D2 to df_E2

## df_E is df_D (binned) with Missing Values Imputed

In [20]:
# df_E / df_E2: read from the CSV files of the 11_01_22 run
# (an earlier run used the prefix OoO_10_19_22).
df_E, df_E2 = (
    pd.read_csv('../../Big_Files/OoO_11_01_22_Accident_df_E.csv'),
    pd.read_csv('../../Big_Files/OoO_11_01_22_Accident_df_E2.csv'),
)


# Impute Before Binning
- Impute blank values in df_C; call it df_F.
- Bin the values in df_F; call it df_G
- For each feature, for the samples that were blank, make a crosstab between df_B and df_G


In [21]:
# Write the two erased, unbinned frames as tab-separated files
# (an earlier run used the prefix OoO_10_19_22).
df_C.to_csv(
    '../../Big_Files/OoO_11_01_22_Accident_df_C.txt',
    index=False,
    sep='\t',
)
df_C2.to_csv(
    '../../Big_Files/OoO_11_01_22_Accident_df_C2.txt',
    index=False,
    sep='\t',
)

## Do the Imputation in IVEware
- df_C to df_F
- df_C2 to df_F2

## df_F is df_C (unbinned) with Missing Values Imputed
## df_G is df_F binned

In [22]:
# df_F / df_F2: read from the CSV files of the 11_01_22 run
# (an earlier run used the prefix OoO_10_19_22).
df_F = pd.read_csv('../../Big_Files/OoO_11_01_22_Accident_df_F.csv')
df_F2 = pd.read_csv('../../Big_Files/OoO_11_01_22_Accident_df_F2.csv')

# df_G / df_G2: binned versions.  Bin_Accident_Dataset recodes the frame it
# is given, so it is handed deep copies; binning df_F directly made df_G and
# df_F the same recoded object and lost the unbinned values.
# NOTE(review): the recorded output of this cell shows DAY_WEEK in df_F
# holding only 0 and 1 before binning, which looks like the file already
# contains binned codes -- verify the df_F files before trusting df_G.
df_G = Bin_Accident_Dataset(df_F.copy(deep=True))
df_G2 = Bin_Accident_Dataset(df_F2.copy(deep=True))
print ('len(df_G) = ', len(df_G))

Bin_Accident_Dataset()
DAY_WEEK
{1: 0, 7: 0, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1}
1    126707
0     40656
Name: DAY_WEEK, dtype: int64
isna():  0
0    167363
Name: DAY_WEEK, dtype: int64

HOUR
{5: 0, 6: 0, 7: 1, 8: 1, 9: 1, 10: 1, 11: 2, 12: 2, 13: 2, 14: 2, 15: 3, 16: 3, 17: 3, 18: 4, 19: 4, 20: 5, 21: 5, 22: 5, 23: 6, 0: 6, 1: 6, 2: 6, 3: 6, 4: 6}
3    40646
2    37696
1    29470
4    19169
5    17215
6    15049
0     8118
Name: HOUR, dtype: int64
isna():  0
6    135099
0     32264
Name: HOUR, dtype: int64

INT_HWY
{0: 0, 1: 1, 9: 999}
0    148952
1     18411
Name: INT_HWY, dtype: int64
isna():  0
0    148952
1     18411
Name: INT_HWY, dtype: int64

LGT_COND
{2: 0, 3: 1, 4: 1, 6: 1, 5: 2, 1: 3, 7: 3}
3    113441
1     30155
0     19893
2      3874
Name: LGT_COND, dtype: int64
isna():  0
1    113441
3     30155
0     23767
Name: LGT_COND, dtype: int64

MONTH
{1: 0, 2: 0, 3: 0, 12: 0, 4: 1, 5: 1, 10: 1, 11: 1, 6: 2, 7: 2, 8: 2, 9: 2}
2    57134
1    55713
0    54516
Name: MONTH, dtype: int64


## df_H is df_D with missing values filled with the mode

In [23]:
# Mode-imputation baseline: df_H / df_H2 are copies of the binned frames
# df_D / df_D2 in which every blank ('') entry of a feature is replaced by
# that feature's most frequent observed value.
df_H = pd.DataFrame([])
df_H2 = pd.DataFrame([])
for feature in df_D:
    for binned, filled in ((df_D, df_H), (df_D2, df_H2)):
        column = binned[feature]
        # Blanks are stored as '' rather than NaN, so mode(dropna=True) would
        # still count them; restrict to observed values so that '' can never
        # be chosen as the fill value.
        mode = column[column != ''].mode(dropna=True)
        if mode.empty:
            # Every value is blank: there is nothing to impute from, so the
            # column is carried over unchanged.
            filled[feature] = column.copy()
            continue
        m = mode.tolist()[0]
        filled[feature] = column.replace({'': m})
    # Show the df_D distribution before and the df_H distribution after.
    print (feature)
    print ()
    print (df_D[feature].value_counts())
    print ()
    print (df_H[feature].value_counts())
    print ()


DAY_WEEK

1    126707
0     40656
Name: DAY_WEEK, dtype: int64

1    126707
0     40656
Name: DAY_WEEK, dtype: int64

HOUR

3    40465
2    37542
1    29354
4    19084
5    17134
6    14974
0     8082
       728
Name: HOUR, dtype: int64

3    41193
2    37542
1    29354
4    19084
5    17134
6    14974
0     8082
Name: HOUR, dtype: int64

INT_HWY

0    148940
1     18407
         16
Name: INT_HWY, dtype: int64

0    148956
1     18407
Name: INT_HWY, dtype: int64

LGT_COND

3    112406
1     29876
0     19744
2      3845
       1492
Name: LGT_COND, dtype: int64

3    113898
1     29876
0     19744
2      3845
Name: LGT_COND, dtype: int64

MONTH

2    57134
1    55713
0    54516
Name: MONTH, dtype: int64

2    57134
1    55713
0    54516
Name: MONTH, dtype: int64

PEDS

0    155076
1     12287
Name: PEDS, dtype: int64

0    155076
1     12287
Name: PEDS, dtype: int64

PERMVIT

1    63036
2    56717
0    47610
Name: PERMVIT, dtype: int64

1    63036
2    56717
0    47610
Name: PERMVIT, dt

# Analysis

## Plan
- We have these dataframes:
    - df_B is ground truth for binning
    - df_C is our record of which values we deleted
    - df_E is Bin Before Imputing
    - df_G is Impute before Binning
- For each feature:
    - Make df_BA to be samples in df_B whose values in df_C are blank
    - Make df_EA to be samples in df_E whose values in df_C are blank
    - Make df_GA to be samples in df_G whose values in df_C are blank
    - Make crosstabs between (df_BA and df_EA) and (df_BA and df_GA) and (df_EA and df_GA)
    - Count correct imputation; divide by number of samples to give proportion correct
- After the second round of imputation:
    - Make df_BB to be samples in df_B whose values in df_C2 are blank
    - Make df_EB to be samples in df_E2 whose values in df_C2 are blank
    - Make df_GB to be samples in df_G2 whose values in df_C2 are blank
    - Make crosstabs between (df_BB and df_EB) and (df_BB and df_GB) and (df_EA and df_EB) and (df_GA and df_GB)
    - Count correct imputation; divide by number of samples to give proportion correct


In [24]:
def Crosstabs(df_C1, df_C2, df_1, df_2, text_1, text_2, feature):
    df_1A = df_1[feature]
    df_2A = df_2[feature]
    df_1A = df_1A[df_C1[feature] == '']
    df_2A = df_2A[df_C2[feature] == '']
    CT = pd.crosstab(df_1A, df_2A, rownames = [text_1], colnames = [text_2])
    A = CT.values.tolist()
    s = 0
    S = 0
    for i in range (len(A)):
        for j in range (len(A[0])):
            S += A[i][j]
            if i==j:
                s += A[i][j]
    print (feature, text_1, text_2, s, S)
    print ()
    print (CT)
    print ()
    if S>0:
        print (s, S, round(s/S*100,2), '%')
    print ()
    print (CT.to_latex())
#    print ()
    print ()
    return s, S

In [32]:
def Compare_OoO():
    """Print per-feature crosstabs and summary scores for both orderings.

    Reads the module-level ``Accident_Features`` list and the frames
    ``df_B``, ``df_C``/``df_C2``, ``df_E``/``df_E2`` and ``df_G``/``df_G2``.
    For every feature entry whose fourth element is non-zero it calls
    ``Crosstabs`` to compare, on the blanked-out samples:

    * ground truth (df_B) against Bin-then-Impute (df_E, df_E2),
    * ground truth (df_B) against Impute-then-Bin (df_G, df_G2),
    * Bin-then-Impute against Impute-then-Bin, for each round,
    * Bin-then-Impute round 1 against round 2.

    The agreement ratio of each comparison is summed over features and
    printed both as a sum and as a mean percentage.  Returns None.
    """

    def _ratio(hits, total):
        # Every ratio is guarded the same way: an empty crosstab contributes
        # nothing instead of raising ZeroDivisionError.
        return hits / total if total > 0 else 0

    def _mean_percent(ratio_sum):
        # n is the number of features compared; with none, report 0.0.
        return ratio_sum / n * 100 if n > 0 else 0.0

    for feature in Accident_Features:
        print (feature[0])
    print ()

    # Running sums of per-feature agreement ratios, one per comparison.
    Sa = 0
    Sb = 0
    Sc = 0
    Sd = 0
    Se = 0
    Sf = 0
    Sg = 0

    # Raw [feature, correct, total] records for the ground-truth comparisons.
    StuffA = []
    StuffB = []
    StuffC = []
    StuffD = []

    n = 0
    for Feature in Accident_Features:
        feature = Feature[0]
        print (Feature)
        # Presumably Feature[3] is the count of unknown values for the
        # feature (TODO confirm); entries with 0 are skipped.
        if Feature[3]!=0:
            n += 1

            a, A = Crosstabs(df_C, df_C, df_B, df_E, 'Ground_Truth_1', 'Bin_Impute_1', feature)
            b, B = Crosstabs(df_C2, df_C2, df_B, df_E2, 'Ground_Truth_2', 'Bin_Impute_2', feature)
            Sa += _ratio(a, A)
            Sb += _ratio(b, B)
            StuffA.append([feature, a, A])
            StuffB.append([feature, b, B])

            c, C = Crosstabs(df_C, df_C, df_B, df_G, 'Ground_Truth_1', 'Impute_Bin_1', feature)
            d, D = Crosstabs(df_C2, df_C2, df_B, df_G2, 'Ground_Truth_2', 'Impute_Bin_2', feature)
            Sc += _ratio(c, C)
            Sd += _ratio(d, D)
            StuffC.append([feature, c, C])
            StuffD.append([feature, d, D])

            e, E = Crosstabs(df_C, df_C, df_E, df_G, 'Bin_Impute_1', 'Impute_Bin_1', feature)
            f, F = Crosstabs(df_C2, df_C2, df_E2, df_G2, 'Bin_Impute_2', 'Impute_Bin_2', feature)
            Se += _ratio(e, E)
            Sf += _ratio(f, F)

            # The two rounds blank different rows, so this table can be empty.
            g, G = Crosstabs(df_C, df_C2, df_E, df_E2, 'Bin_Impute_1', 'Bin_Impute_2', feature)
            Sg += _ratio(g, G)

            # Mode-imputation baseline, currently disabled
            # (would also need Sh = Si = 0 above):
#            h, H = Crosstabs(df_C, df_C, df_B, df_H, 'Ground_Truth_1', 'Impute_to_Mode_1', feature)
#            i, I = Crosstabs(df_C2, df_C2, df_B, df_H2, 'Ground_Truth_2', 'Impute_to_Mode_2', feature)
#            Sh += h/H
#            Si += i/I

    print ('Bin_Impute: ', Sa, Sb, _mean_percent(Sa), _mean_percent(Sb))
    print ()
    for s in StuffA:
        print (s)
    print ()
    for s in StuffB:
        print (s)
    print ()
    print ('Impute_Bin: ', Sc, Sd, _mean_percent(Sc), _mean_percent(Sd))
    for s in StuffC:
        print (s)
    print ()
    for s in StuffD:
        print (s)
    print ()
    print ('Bin_Impute to Impute_Bin: ', Se, Sf, _mean_percent(Se), _mean_percent(Sf))
    print ('Bin_Impute_1 to Bin_Impute_2: ', Sg, _mean_percent(Sg))
#    print ('Impute_to_Mode: ', Sh, Si, Sh/n*100, Si/n*100)

    print ()
    


In [33]:
# Run the full comparison; every crosstab and the summary scores are printed
# to the cell output.
Compare_OoO()

DAY_WEEK
HOUR
INT_HWY
LGT_COND
MONTH
PEDS
PERMVIT
PERNOTMVIT
PVH_INVL
REGION
RELJCT1
RELJCT2
REL_ROAD
SCH_BUS
TYP_INT
URBANICITY
VE_FORMS
VE_TOTAL
WEATHER
WRK_ZONE

['DAY_WEEK', [9], 7, 0]
['HOUR', [99], 25, 1127]
HOUR Ground_Truth_1 Bin_Impute_1 211 728

Bin_Impute_1     0   1   2   3   4   5   6
Ground_Truth_1                            
0                5   4   1   5   7  10   3
1                1  39  41  50   7   3   1
2                4  39  54  50   9   1   0
3                9  32  51  52  20  10   5
4                4   7   7  12  16  15  12
5               10   2   3   2  15  26  18
6                7   2   0   5  12  21  19

211 728 28.98 %

\begin{tabular}{lrrrrrrr}
\toprule
Bin\_Impute\_1 &   0 &   1 &   2 &   3 &   4 &   5 &   6 \\
Ground\_Truth\_1 &     &     &     &     &     &     &     \\
\midrule
0              &   5 &   4 &   1 &   5 &   7 &  10 &   3 \\
1              &   1 &  39 &  41 &  50 &   7 &   3 &   1 \\
2              &   4 &  39 &  54 &  50 &   9 &   1 & 

  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())


 Bin_Impute_1 Impute_Bin_1 10 16

Impute_Bin_1   0  1
Bin_Impute_1       
0             10  4
1              2  0

10 16 62.5 %

\begin{tabular}{lrr}
\toprule
Impute\_Bin\_1 &   0 &  1 \\
Bin\_Impute\_1 &     &    \\
\midrule
0            &  10 &  4 \\
1            &   2 &  0 \\
\bottomrule
\end{tabular}


INT_HWY Bin_Impute_2 Impute_Bin_2 15 16

Impute_Bin_2   0
Bin_Impute_2    
0             15
1              1

15 16 93.75 %

\begin{tabular}{lr}
\toprule
Impute\_Bin\_2 &   0 \\
Bin\_Impute\_2 &     \\
\midrule
0            &  15 \\
1            &   1 \\
\bottomrule
\end{tabular}


INT_HWY Bin_Impute_1 Bin_Impute_2 0 0

Empty DataFrame
Columns: []
Index: []


\begin{tabular}{l}
\toprule
Empty DataFrame
Columns: Int64Index([], dtype='int64', name='Bin\_Impute\_2')
Index: Int64Index([], dtype='int64', name='Bin\_Impute\_1') \\
\bottomrule
\end{tabular}


['LGT_COND', [8, 9], 9, 2309]
LGT_COND Ground_Truth_1 Bin_Impute_1 1074 1492

Bin_Impute_1     0    1   2    3
Ground_Truth_1        

  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())


RELJCT1 Ground_Truth_2 Impute_Bin_2 39350 42584

Impute_Bin_2        0     1
Ground_Truth_2             
0               39306  1595
1                1639    44

39350 42584 92.41 %

\begin{tabular}{lrr}
\toprule
Impute\_Bin\_2 &      0 &     1 \\
Ground\_Truth\_2 &        &       \\
\midrule
0              &  39306 &  1595 \\
1              &   1639 &    44 \\
\bottomrule
\end{tabular}


RELJCT1 Bin_Impute_1 Impute_Bin_1 39477 42584

Impute_Bin_1      0     1
Bin_Impute_1             
0             39423  1552
1              1555    54

39477 42584 92.7 %

\begin{tabular}{lrr}
\toprule
Impute\_Bin\_1 &      0 &     1 \\
Bin\_Impute\_1 &        &       \\
\midrule
0            &  39423 &  1552 \\
1            &   1555 &    54 \\
\bottomrule
\end{tabular}


RELJCT1 Bin_Impute_2 Impute_Bin_2 39379 42584

Impute_Bin_2      0     1
Bin_Impute_2             
0             39309  1569
1              1636    70

39379 42584 92.47 %

\begin{tabular}{lrr}
\toprule
Impute\_Bin\_2 &      0 &     

  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())


REL_ROAD Ground_Truth_1 Impute_Bin_1 102 123

Impute_Bin_1     0   1
Ground_Truth_1        
0               16  12
1                6  86
2                3   0

102 123 82.93 %

\begin{tabular}{lrr}
\toprule
Impute\_Bin\_1 &   0 &   1 \\
Ground\_Truth\_1 &     &     \\
\midrule
0              &  16 &  12 \\
1              &   6 &  86 \\
2              &   3 &   0 \\
\bottomrule
\end{tabular}


REL_ROAD Ground_Truth_2 Impute_Bin_2 105 123

Impute_Bin_2     0   1
Ground_Truth_2        
0               16   6
1                8  89
2                4   0

105 123 85.37 %

\begin{tabular}{lrr}
\toprule
Impute\_Bin\_2 &   0 &   1 \\
Ground\_Truth\_2 &     &     \\
\midrule
0              &  16 &   6 \\
1              &   8 &  89 \\
2              &   4 &   0 \\
\bottomrule
\end{tabular}


REL_ROAD Bin_Impute_1 Impute_Bin_1 105 123

Impute_Bin_1   0   1
Bin_Impute_1        
0             16   9
1              6  89
2              3   0

105 123 85.37 %

\begin{tabular}{lrr}
\toprule
Impute\

  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())


WEATHER Ground_Truth_2 Bin_Impute_2 4796 8581

Bin_Impute_2     0     1    2    3    4
Ground_Truth_2                         
0                0    55    3    9    2
1               32  4510  592  918  148
2                5   580   82  112   24
3                6   973  131  199   18
4                1   127   17   32    5

4796 8581 55.89 %

\begin{tabular}{lrrrrr}
\toprule
Bin\_Impute\_2 &   0 &     1 &    2 &    3 &    4 \\
Ground\_Truth\_2 &     &       &      &      &      \\
\midrule
0              &   0 &    55 &    3 &    9 &    2 \\
1              &  32 &  4510 &  592 &  918 &  148 \\
2              &   5 &   580 &   82 &  112 &   24 \\
3              &   6 &   973 &  131 &  199 &   18 \\
4              &   1 &   127 &   17 &   32 &    5 \\
\bottomrule
\end{tabular}


WEATHER Ground_Truth_1 Impute_Bin_1 4730 8581

Impute_Bin_1      0     1    2    4
Ground_Truth_1                     
0                 3    45    4    2
1               923  4633  588  138
2               129

  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())


In [35]:
# Ad-hoc inspection of a single feature: switch the active line to look at a
# different one.
feature = 'WEATHER'
#feature = 'HOUR'
#feature = 'REL_ROAD'
#feature = 'TYP_INT'

# Raw and normalized value distributions in the source data (df_Accident),
# the complete-case frame (df_A) and the binned ground truth (df_B).
print (df_Accident.shape[0])
for frame in (df_Accident, df_A, df_B):
    print (frame[feature].value_counts())
    print ()
    print (frame[feature].value_counts(normalize=True))
    print ()

# (mask for left, mask for right, left frame, right frame, left label, right label)
comparisons = (
    (df_C,  df_C,  df_B,  df_B,  'Ground_Truth_1', 'Ground_Truth_1'),
    (df_C2, df_C2, df_B,  df_B,  'Ground_Truth_2', 'Ground_Truth_2'),
    (df_C,  df_C,  df_B,  df_E,  'Ground_Truth_1', 'Bin_Impute_1'),
    (df_C2, df_C2, df_B,  df_E2, 'Ground_Truth_2', 'Bin_Impute_2'),
    (df_C,  df_C,  df_B,  df_G,  'Ground_Truth_1', 'Impute_Bin_1'),
    (df_C2, df_C2, df_B,  df_G2, 'Ground_Truth_2', 'Impute_Bin_2'),
    (df_C,  df_C,  df_E,  df_G,  'Bin_Impute_1', 'Impute_Bin_1'),
    (df_C2, df_C2, df_E2, df_G2, 'Bin_Impute_2', 'Impute_Bin_2'),
    (df_C,  df_C2, df_E,  df_E2, 'Bin_Impute_1', 'Bin_Impute_2'),
    (df_C,  df_C,  df_B,  df_H,  'Ground_Truth_1', 'Impute_to_Mode_1'),
)
for mask_1, mask_2, left, right, label_1, label_2 in comparisons:
    Crosstabs(mask_1, mask_2, left, right, label_1, label_2, feature)

# Kept as the cell's trailing bare expression so the notebook still displays
# its (correct, total) return value.
Crosstabs(df_C2, df_C2, df_B, df_H2, 'Ground_Truth_2', 'Impute_to_Mode_2', feature)



259077
1     179070
10     36853
2      23303
98     12636
4       4531
5       1053
99       648
3        382
6        178
8        163
11       130
12        88
7         42
Name: WEATHER, dtype: int64

1     0.691184
10    0.142247
2     0.089946
98    0.048773
4     0.017489
5     0.004064
99    0.002501
3     0.001474
6     0.000687
8     0.000629
11    0.000502
12    0.000340
7     0.000162
Name: WEATHER, dtype: float64

1     121525
10     25244
2      15679
4       3350
5        793
3        310
6        142
8        119
11       112
12        62
7         27
Name: WEATHER, dtype: int64

1     0.726116
10    0.150834
2     0.093683
4     0.020016
5     0.004738
3     0.001852
6     0.000848
8     0.000711
11    0.000669
12    0.000370
7     0.000161
Name: WEATHER, dtype: float64

1    121525
3     25244
2     15679
4      3812
0      1103
Name: WEATHER, dtype: int64

1    0.726116
3    0.150834
2    0.093683
4    0.022777
0    0.006590
Name: WEATHER, dtype: float64

WEATHER Gro

  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())
  print (CT.to_latex())


(69, 8581)