# Count Samples with Missing Features

In [1]:
%%latex
\tableofcontents

<IPython.core.display.Latex object>

# Setup
## Import Libraries

In [2]:
import sys, copy, math, time, os

print ('Python version: {}'.format(sys.version))

import numpy as np
print ('NumPy version: {}'.format(np.__version__))
np.set_printoptions(suppress=True)


import pandas as pd
print ('Pandas version:  {}'.format(pd.__version__))
pd.set_option('display.max_rows', 500)

# Library for reading Microsoft Access files
import pandas_access as mdb


# Set Randomness.  Copied from https://www.kaggle.com/code/abazdyrev/keras-nn-focal-loss-experiments
import random


Python version: 3.9.16 (main, Dec  7 2022, 10:02:13) 
[Clang 14.0.0 (clang-1400.0.29.202)]
NumPy version: 1.24.0
Pandas version:  1.5.2


# Import Data

## Get Data
- The Get_Data_from_Original() reads the (original) CRSS files from the CRSS directory, preprocesses it, and writes it to files in a folder outside this GitHub repo (because the files are too large for my subscription), and returns the dataframes.
- The Get_Data_from_Temp_Files() reads the temp files and returns the dataframes.  I created this option for running repeatedly during writing and debugging, because it's much faster.

In [3]:
def Get_Data():
    print ('Get_Data')
    Data = pd.read_csv('../../Big_Files/CRSS_Discretized_All_12_22_22.csv', low_memory=False)
    print ('Data.shape = ', Data.shape)
    return Data

In [4]:
Data = Get_Data()
display(Data)


Get_Data
Data.shape =  (619027, 107)


Unnamed: 0,CASENUM,HOUR_IM,HOUR,INT_HWY,LGTCON_IM,LGT_COND,MONTH,PEDS,PERMVIT,REL_ROAD,...,PER_NO,PER_TYP,REST_MIS,REST_USE,SEAT_IM,SEAT_POS,SEX,SEX_IM,VEH_AGE,VEH_AGE_IM
0,201600014311,2,2,0,3,3,0,0,1,1,...,1,2,1,1,2,3,1,1,3.0,5
1,201600014311,2,2,0,3,3,0,0,1,1,...,1,2,1,1,2,3,1,1,0.0,4
2,201600014315,4,4,0,1,1,0,0,2,1,...,1,2,1,1,2,3,1,1,4.0,5
3,201600014315,4,4,0,1,1,0,0,2,1,...,1,2,1,1,2,3,0,0,2.0,5
4,201600014315,4,4,0,1,1,0,0,2,1,...,2,1,1,1,0,0,1,1,2.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619022,202003020795,3,3,0,0,0,1,0,2,1,...,1,2,1,1,2,3,1,1,4.0,5
619023,202003020807,6,6,0,1,Unknown,0,0,2,1,...,1,2,1,1,2,3,0,0,1.0,5
619024,202003020807,6,6,0,1,Unknown,0,0,2,1,...,2,1,1,1,1,1,0,0,1.0,5
619025,202003020807,6,6,0,1,Unknown,0,0,2,1,...,3,1,1,1,3,4,1,1,1.0,5


# Filter to Features we will Use

In [11]:
def Thin_Features(data):
    print ('Thin_Features()')

    Merge = [
        'CASENUM',
        'VEH_NO',
        'PER_NO',        
    ]

    Accident = [
        'DAY_WEEK',
        'HOUR',
        'INT_HWY',
        'LGT_COND',
        'MONTH',
#        'PEDS',
        'PERMVIT',
        'PERNOTMVIT',
        'PJ',
        'PSU',
        'PVH_INVL',
        'REGION',
        'REL_ROAD',
        'RELJCT1',
        'RELJCT2',
        'SCH_BUS',
        'TYP_INT',
        'URBANICITY',
        'VE_FORMS',
        'VE_TOTAL',
        'WEATHER',
        'WRK_ZONE',
        'YEAR',
    ]
    
    Vehicle = [
        'BODY_TYP',
        'BUS_USE',
        'EMER_USE',
        'MAKE',
#        'MOD_YEAR',
        'MODEL',
        'NUMOCCS',
        'VALIGN',
        'VNUM_LAN',
        'VPROFILE',
        'VSPD_LIM',
#        'VSURCOND',
        'VTRAFCON',
        'VTRAFWAY',
    ]
    
    Person = [
        'AGE',
        'LOCATION',
        'PER_TYP',
        'SEX',
        'HOSPITAL',    
    ]

#    Engineered = [
#        'VEH_AGE',
#        'AGE_x_SEX',
#        'AGE_x_SCH_BUS'
#    ]
    
    # Put features in alphabetical order
    Features = Accident + Vehicle + Person # + Engineered
    Features = sorted(Features)
#    Features = Merge + Features
    for feature in Features:
        print (feature)
    print ()
    
    data = data.filter(Features, axis=1)
    
    print ('data.shape: ', data.shape)
    
    print ('End Thin_Features()')
    print ()
        
    return data

def Test_Thin_Features():
    data = Get_Data()
    data = Thin_Features(data)
#    for feature in data:
#        display(data[feature].value_counts())
    return data
        
Data = Test_Thin_Features()
display(Data)

Get_Data
Data.shape =  (619027, 107)
Thin_Features()
AGE
BODY_TYP
BUS_USE
DAY_WEEK
EMER_USE
HOSPITAL
HOUR
INT_HWY
LGT_COND
LOCATION
MAKE
MODEL
MONTH
NUMOCCS
PERMVIT
PERNOTMVIT
PER_TYP
PJ
PSU
PVH_INVL
REGION
RELJCT1
RELJCT2
REL_ROAD
SCH_BUS
SEX
TYP_INT
URBANICITY
VALIGN
VE_FORMS
VE_TOTAL
VNUM_LAN
VPROFILE
VSPD_LIM
VTRAFCON
VTRAFWAY
WEATHER
WRK_ZONE
YEAR

data.shape:  (619027, 37)
End Thin_Features()



Unnamed: 0,AGE,BODY_TYP,BUS_USE,DAY_WEEK,EMER_USE,HOSPITAL,HOUR,INT_HWY,LGT_COND,MAKE,...,VALIGN,VE_FORMS,VE_TOTAL,VPROFILE,VSPD_LIM,VTRAFCON,VTRAFWAY,WEATHER,WRK_ZONE,YEAR
0,3,1,1,1,1,0,2,0,3,0,...,1,2,2,1,7,Unknown,Unknown,1,0,2016
1,2,2,1,1,1,0,2,0,3,4,...,1,2,2,1,7,Unknown,Unknown,1,0,2016
2,1,5,1,1,1,0,4,0,1,8,...,1,2,2,2,7,Unknown,Unknown,1,0,2016
3,1,0,1,1,1,0,4,0,1,6,...,1,2,2,2,7,Unknown,Unknown,1,0,2016
4,1,0,1,1,1,0,4,0,1,6,...,1,2,2,2,7,Unknown,Unknown,1,0,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619022,2,5,1,1,1,0,3,0,0,2,...,1,2,2,0,1,1,0,1,0,2020
619023,1,3,1,1,1,0,6,0,Unknown,2,...,0,2,2,1,5,Unknown,0,1,0,2020
619024,1,3,1,1,1,0,6,0,Unknown,2,...,0,2,2,1,5,Unknown,0,1,0,2020
619025,1,3,1,1,1,0,6,0,Unknown,2,...,0,2,2,1,5,Unknown,0,1,0,2020


# Count Occurrences of "Unknown" in Rows

In [9]:
#Data['Unknown'] = Data.apply(lambda x: x.str.contains("Unknown").sum(), axis=1)
Data['Unknown'] = Data.isin(['Unknown']).sum(1)
display(Data)
print (Data.Unknown.value_counts())
Data.Unknown.value_counts(normalize=True)


Unnamed: 0,AGE,BODY_TYP,BUS_USE,DAY_WEEK,EMER_USE,HOSPITAL,HOUR,INT_HWY,LGT_COND,MAKE,...,VE_FORMS,VE_TOTAL,VPROFILE,VSPD_LIM,VTRAFCON,VTRAFWAY,WEATHER,WRK_ZONE,YEAR,Unknown
0,3,1,1,1,1,0,2,0,3,0,...,2,2,1,7,Unknown,Unknown,1,0,2016,2
1,2,2,1,1,1,0,2,0,3,4,...,2,2,1,7,Unknown,Unknown,1,0,2016,2
2,1,5,1,1,1,0,4,0,1,8,...,2,2,2,7,Unknown,Unknown,1,0,2016,3
3,1,0,1,1,1,0,4,0,1,6,...,2,2,2,7,Unknown,Unknown,1,0,2016,3
4,1,0,1,1,1,0,4,0,1,6,...,2,2,2,7,Unknown,Unknown,1,0,2016,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619022,2,5,1,1,1,0,3,0,0,2,...,2,2,0,1,1,0,1,0,2020,0
619023,1,3,1,1,1,0,6,0,Unknown,2,...,2,2,1,5,Unknown,0,1,0,2020,2
619024,1,3,1,1,1,0,6,0,Unknown,2,...,2,2,1,5,Unknown,0,1,0,2020,2
619025,1,3,1,1,1,0,6,0,Unknown,2,...,2,2,1,5,Unknown,0,1,0,2020,2


0     327658
1     125463
2      71208
3      46054
4      24856
5      10408
6       5465
7       3742
8       2033
9       1133
10       612
11       217
12       113
13        40
14        19
15         3
16         2
17         1
Name: Unknown, dtype: int64


0     0.529311
1     0.202678
2     0.115032
3     0.074397
4     0.040153
5     0.016813
6     0.008828
7     0.006045
8     0.003284
9     0.001830
10    0.000989
11    0.000351
12    0.000183
13    0.000065
14    0.000031
15    0.000005
16    0.000003
17    0.000002
Name: Unknown, dtype: float64

# Count Missing per Feature

In [18]:
for feature in Data:
    if 'Unknown' in Data[feature].unique():
        print (feature, Data[feature].value_counts()['Unknown'])
    else:
        print (feature, 0)

AGE 39525
BODY_TYP 21859
BUS_USE 6690
DAY_WEEK 0
EMER_USE 5320
HOSPITAL 12304
HOUR 1684
INT_HWY 0
LGT_COND 3814
MAKE 13328
MODEL 0
MONTH 0
NUMOCCS 22142
PERMVIT 0
PERNOTMVIT 0
PER_TYP 0
PJ 0
PSU 0
PVH_INVL 0


  if 'Unknown' in Data[feature].unique():
  if 'Unknown' in Data[feature].unique():
  if 'Unknown' in Data[feature].unique():
  if 'Unknown' in Data[feature].unique():
  if 'Unknown' in Data[feature].unique():


REGION 0
RELJCT1 0
RELJCT2 43534
REL_ROAD 0
SCH_BUS 0
SEX 25579
TYP_INT 67861
URBANICITY 0
VALIGN 42739
VE_FORMS 0
VE_TOTAL 0
VPROFILE 84506
VSPD_LIM 83155
VTRAFCON 39795
VTRAFWAY 111189
WEATHER 28669
WRK_ZONE 0
YEAR 0


  if 'Unknown' in Data[feature].unique():
  if 'Unknown' in Data[feature].unique():
  if 'Unknown' in Data[feature].unique():
