In [163]:
import pandas as pd

In [167]:
# Schema table: tab-separated, with the first column used as the index.
schema = pd.read_csv(
    '/mnt/nas/natality/natality_schema_table.txt',
    sep='\t',
    index_col=0,
)
# Schema notes kept in a spreadsheet (presumably maintained by hand — confirm).
schema_manual = pd.read_excel(
    '/mnt/nas/natality/schema_notes.xlsx'
)

In [53]:
import os
def get_schema_from_year(year, base_dir='/mnt/nas/natality/'):
    """
    Find the dictionary (``.dct``) file for one year and parse it.

    Parameters
    ----------
    year : int or str
        Name of the per-year sub-directory under ``base_dir``.
    base_dir : str, optional
        Directory holding one sub-directory per year. Defaults to the
        location this notebook uses everywhere else.

    Returns
    -------
    tuple or None
        ``(column_df, columns, column_widths)`` exactly as returned by
        ``read_dct_file`` for the first ``.dct`` file (in sorted name order)
        in the year directory, or ``None`` when the directory does not exist
        or holds no ``.dct`` file.
    """
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.isdir(year_dir):
        return None
    # os.listdir returns entries in arbitrary order; sort so the same file is
    # always picked if a directory ever contains more than one dictionary.
    for f in sorted(os.listdir(year_dir)):
        if f.endswith('.dct'):
            return read_dct_file(os.path.join(year_dir, f))
    # Directory exists but has no dictionary file.
    return None
def decode_baby_col(row):
    """
    Parse one ``_column(...)`` line of a ``.dct`` file into a field record.

    The line is expected to look like::

        _column(9)  int  dob_yy  %4f  "Birth Year"

    i.e. a start position in parentheses, a dtype, a field name, a
    ``%<width><letter>`` format and an optional quoted description.

    Parameters
    ----------
    row : str
        A single line of the dictionary file.

    Returns
    -------
    dict
        ``start`` (int, as written in the file), ``end`` (``start`` plus the
        width taken from the format, i.e. exclusive), ``dtype``, ``name`` and
        ``desc`` (description with surrounding quotes removed, ``''`` when
        the line has none).

    Raises
    ------
    AssertionError
        If the text between ``)`` and ``%`` is not exactly two tokens.
    ValueError
        If the start position or width is not numeric, or the line has no
        ``%`` format.
    IndexError
        If the line contains no ``(``.
    """
    row = row.strip()
    # Everything up to the first ')' is the `_column(<start>` part.
    position, _, remaining = row.partition(')')
    start = int(position.split('(')[1].strip())

    # Split on the FIRST '%' only, so a '%' inside the description survives.
    spec, _, fmt_and_desc = remaining.strip(' ').partition('%')
    types = spec.split()
    assert len(types) == 2, types
    dtype, name = types

    # The format is the first whitespace-delimited token after '%'; whatever
    # follows it is the description. Taking the token (rather than cutting at
    # the first 'f') keeps this correct for '%4s'-style formats and for
    # descriptions that themselves contain the letter 'f'.
    fmt_parts = fmt_and_desc.split(None, 1)
    if not fmt_parts:
        raise ValueError(
            'no %format found in column definition: {0!r}'.format(row)
        )
    fmt = fmt_parts[0]
    # Drop the trailing format letter and keep the integer part of the width,
    # so '4f', '4s' and '10.2f' give 4, 4 and 10.
    num_c = int(float(fmt[:-1]))

    end_of_str = fmt_parts[1] if len(fmt_parts) > 1 else ''
    end_of_str = end_of_str.strip(' \r\n')
    return {
        'start': start,
        'end': start + num_c,
        'dtype': dtype,
        'name': name,
        'desc': end_of_str.strip('"')
    }

def read_dct_file(dct):
    """
    Read a dct file, which defines the fields encoded in the main text files.

    Every line that begins with ``_column`` (ignoring leading whitespace) is
    parsed with ``decode_baby_col``. The parsed positions are then turned
    into a gap-free list of names and widths: whenever a field does not start
    where the previous one ended, a filler column named ``blank<i>`` is
    inserted (``i`` is the 0-based index of the field that follows the gap).

    Parameters
    ----------
    dct : str
        Path of the ``.dct`` file.

    Returns
    -------
    col_data_df : pandas.DataFrame
        One row per ``_column`` line, with the keys produced by
        ``decode_baby_col`` (``start``, ``end``, ``dtype``, ``name``,
        ``desc``).
    columns : list of str
        Field names in file order, including the ``blank<i>`` fillers.
    column_widths : list of int
        Width of each entry of ``columns``, in the same order (presumably
        intended for ``pandas.read_fwf(widths=..., names=...)`` — confirm
        against the caller).
    """
    with open(dct) as r:
        # lstrip() so indented `_column` lines are not silently skipped;
        # decode_baby_col strips the line itself before parsing.
        col_data = [
            decode_baby_col(line)
            for line in r
            if line.lstrip().startswith('_column')
        ]

    col_data_df = pd.DataFrame(col_data)

    columns = []
    column_widths = []
    # The first field is expected to begin at position 1.
    start = 1
    for i, c in enumerate(col_data):
        s = c['start']
        e = c['end']
        if s != start:
            # Gap between the previous field's end and this field's start.
            # NOTE: if a field starts BEFORE the previous one ended, this
            # width is negative; overlapping fields are not corrected here.
            column_widths.append(s - start)
            columns.append('blank{0}'.format(i))
        columns.append(c['name'])
        column_widths.append(e - s)
        start = e
    return col_data_df, columns, column_widths

In [106]:
# Build {schema column -> {index label -> description}} from the schema table
# (columns are looked up by year string, e.g. schema_by_year['2018']).
# Missing entries are dropped; entries whose description is the placeholder
# 'No Desc' map to their own label instead.
schema_by_year = {}
for c in schema:
    tmp = schema[c].dropna()
    schema_by_year[c] = {}
    # Series.items() replaces Series.iteritems(), which was removed in
    # pandas 2.0.
    for idx, val in tmp.items():
        schema_by_year[c][idx] = idx if val == 'No Desc' else val
            

In [110]:
# Load only the first 10,000 rows of the processed 2018 file (tab-separated,
# first column used as the index).
df = pd.read_csv(
    '/mnt/nas/natality/processed/Nat_2018_processed.txt',
    sep='\t',
    index_col=0,
    nrows=10000,
)
# Disabled: would rename the columns using the 2018 schema mapping.
#df = df.rename(columns=schema_by_year['2018'])

In [123]:
# Flag the columns with fewer than 100 missing values in the loaded sample,
# then display only the flagged ones.
col_not_null = (df.isnull().sum(axis=0) < 100)
# The Series is already boolean, so it can be used as its own mask
# (no `== True` comparison needed).
col_not_null[col_not_null]

dob_yy          True
dob_mm          True
dob_tt          True
dob_wk          True
bfacil          True
f_facility      True
bfacil3         True
mager           True
mager14         True
mager9          True
mbstate_rec     True
restatus        True
mrace31         True
mrace6          True
mrace15         True
mbrace          True
mhisp_r         True
f_mhisp         True
mracehisp       True
mar_p           True
dmar            True
f_mar_p         True
meduc           True
f_meduc         True
fagecomb        True
fagerec11       True
frace31         True
frace6          True
frace15         True
fhisp_r         True
                ... 
no_abnorm       True
ca_anen         True
ca_mnsb         True
ca_cchd         True
ca_cdh          True
ca_omph         True
ca_gast         True
f_ca_anen       True
f_ca_menin      True
f_ca_heart      True
f_ca_hernia     True
f_ca_ompha      True
f_ca_gastro     True
ca_limb         True
ca_cleft        True
ca_clpal        True
ca_downs     

In [159]:
# Frequency of each distinct value in the `restatus` column.
df['restatus'].value_counts()

1    7973
2    2001
3      23
4       3
Name: restatus, dtype: int64

In [158]:
# Show the rows where `f_facility` is 0, restricted to `bfacil`, `bfacil3`
# and a handful of other `f_*` columns for side-by-side comparison.
df[df['f_facility'] == 0][['bfacil', 'f_facility', 'f_tpcv', 'f_meduc', 'f_mpcb', 'f_mhisp', 'bfacil3']]

Unnamed: 0,bfacil,f_facility,f_tpcv,f_meduc,f_mpcb,f_mhisp,bfacil3
3669,1,0,0,0,0,0,1
4431,1,0,1,1,1,1,1
7620,1,0,0,0,0,0,1
7890,1,0,1,1,1,1,1
9302,1,0,0,0,0,0,1


In [153]:
# Frequency of each distinct value in the `f_mhisp` column.
# NOTE(review): the saved output under this cell is labelled `Name: f_meduc`,
# so it was produced by a different expression than the one shown here;
# re-run the cell to refresh it.
df['f_mhisp'].value_counts()

1    9997
0       3
Name: f_meduc, dtype: int64

In [138]:
# Look up the 2018 schema entry for the `f_facility` column.
schema_by_year['2018']['f_facility']

'Reporting Flag for Birth Place 0 Non-Reporting'

In [133]:
# Frequency of each distinct value in the `ca_limb` column.
df['ca_limb'].value_counts()

N    9994
Y       6
Name: ca_limb, dtype: int64