In [1]:
import pandas as pd
import sys
import json

In [2]:
def read_data(f_name='../Data/codebook_21.txt'):
    """Read the codebook text file and return its lines.

    Every line is returned exactly as it appears in the file, trailing
    newline included. The newline is kept on purpose: the parsing helpers in
    this notebook recognise a section break as a row that is just a newline
    character, and strip the newline themselves where they need to.

    Parameters
    ----------
    f_name : str, optional
        Path of the codebook file. Defaults to the path this notebook has
        always read from.

    Returns
    -------
    list of str
        One entry per line of the file, in file order.
    """
    # The earlier `row.strip('/n')` never removed newlines: it removed literal
    # '/' and 'n' characters from both ends of each row, which silently
    # corrupted rows that begin (or, on a final unterminated line, end) with
    # one of those characters.
    with open(f_name) as f_pointer:
        return f_pointer.readlines()

Based on eyeballing the codebook, it looks like all of our data is in a numerical format. Let's try to read everything into a set of nested dictionaries.

In [7]:
def is_int(row, meta_heading):
    """Return True when the first character of ``row`` parses as an integer.

    Only ``row[0]`` is examined, so any row that starts with a digit passes,
    whatever follows it.

    Parameters
    ----------
    row : str
        A line from the codebook.
    meta_heading : str
        Name of the section being parsed. It is not used by the check; the
        parameter is kept so existing callers keep working.

    Returns
    -------
    bool
        True if ``int(row[0])`` succeeds, False otherwise.
    """
    try:
        int(row[0])
    except (IndexError, ValueError, TypeError):
        # IndexError: empty row. ValueError: first character is not a digit.
        # TypeError: row cannot be indexed or its first item is not convertible.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return False
    return True
      
def is_metaheading(row):
    """Tell whether ``row`` is a meta-heading line.

    A row qualifies when it contains a tab character and its first
    character is alphabetic.
    """
    # Without a tab the row can never be a meta heading; this also means
    # an empty row is rejected before its first character is inspected.
    if '\t' not in row:
        return False
    return row[0].isalpha()
    
def is_subheader(row, meta_heading):
    """Heuristically decide whether ``row`` is a sub-heading within a section.

    A row counts as a sub-heading when it either starts with a letter and
    contains no tab, or contains a '-' or ':' anywhere in its text.

    Parameters
    ----------
    row : str
        A line from the codebook.
    meta_heading : str
        Name of the section the row belongs to. Sections listed below are
        known to have no sub-headings, so every row in them is rejected.

    Returns
    -------
    bool
    """
    # these are meta headings where the keys have tabs
    # this throws off the logic below, but b/c we know from the codebook that they have no subheadings
    # we automatically return False
    headings_without_subheadings = (
        'FARM', 'UNITSSTR', 'RELATE', 'RELATED', 'WKSWORK2', 'MULTGEND', 'METRO',
        'WORKEDYR', 'SCHLTYPE', 'VETVIETN', 'EMPSTATD', 'MIGRATE1D', 'LANGUAGE',
        'GQ', 'MIGPLAC1',
    )
    if meta_heading in headings_without_subheadings:
        return False

    # An empty row has no first character to inspect (indexing it used to
    # raise IndexError); it cannot be a sub-heading.
    if not row:
        return False

    starts_with_letter = row[0].isalpha()
    has_no_tab = '\t' not in row
    has_dash_or_colon = ('-' in row) or (':' in row)

    return (starts_with_letter and has_no_tab) or has_dash_or_colon

def is_header_break(row):
    """Report whether ``row`` is a blank separator line.

    Only a row consisting of exactly one newline character matches.
    """
    blank_line = '\n'
    return row == blank_line

# This skips subheadings right now by a list
# But are there any subheadings that we want stored in the old way right now? Unknown. 
def create_codebook_store(data):
    """Parse codebook rows into a dictionary of per-section code tables.

    Rows are classified with ``is_metaheading``, ``is_subheader``, ``is_int``
    and ``is_header_break`` (in that order of precedence):

    * a meta-heading row starts a new section, named by the text before its
      first tab;
    * a sub-heading row starts a group inside the current section;
    * a row starting with a digit is a code: its first tab-separated field is
      the integer key and its last field is the label;
    * a blank row records the current section in the result.

    Resulting shape of each section:

    * no sub-headings seen: ``{code: label}``
    * sub-headings, ordinary section: ``{sub_heading: {code: label}}``
    * sub-headings, section listed in ``sub_headings_direct``:
      ``{code: sub_heading}``

    Parameters
    ----------
    data : iterable of str
        Codebook lines, each still carrying its trailing newline.

    Returns
    -------
    dict
        Maps section name to that section's table. A section is recorded when
        its closing blank row, or the end of ``data``, is reached.

    Raises
    ------
    ValueError
        If a row starts with a digit but its first tab-separated field is not
        a valid integer.
    """
    store = {}

    # Sections whose codes are mapped straight to the current sub-heading
    # text instead of being nested underneath it.
    sub_headings_direct = ['REGION', 'GQ', 'ACREHOUS', 'BPL']

    # Initialised up front so rows that appear before the first meta heading
    # cannot hit an unbound local.
    meta_heading = None
    meta_dict = None
    sub_heading = None
    has_subheading = False
    sub_heading_direct_placement = False

    for row in data:

        # find the meta heading and create that dictionary
        if is_metaheading(row):
            meta_heading = row.split('\t')[0]
            meta_dict = {}
            has_subheading = False
            # Recomputed for every section. Previously the flag was only ever
            # switched on, so once one direct-placement section had been seen
            # every later section was treated as direct-placement too.
            sub_heading_direct_placement = meta_heading in sub_headings_direct
            continue

        # Rows ahead of the first meta heading belong to no section.
        if meta_heading is None:
            continue

        # find the sub headings and create inner dictionaries
        if is_subheader(row, meta_heading):
            sub_heading = row.strip('\n')
            has_subheading = True
            if not sub_heading_direct_placement:
                meta_dict[sub_heading] = {}
            continue

        # find the integers and add them, along with their meanings, to the inner dictionary
        if is_int(row, meta_heading):
            fields = row.split('\t')
            # The field contains no tab after the split and int() ignores
            # surrounding whitespace, so no stripping is needed (the old
            # strip('/t') removed literal '/' and 't' characters instead).
            key = int(fields[0])
            value = fields[-1].strip('\n')

            if not has_subheading:
                meta_dict[key] = value
            elif sub_heading_direct_placement:
                meta_dict[key] = sub_heading
            else:
                meta_dict[sub_heading][key] = value
            continue

        # find the header break
        if is_header_break(row):
            store[meta_heading] = meta_dict
            has_subheading = False

    # Record the final section even when the input does not end with a blank
    # row; re-storing an already recorded section is harmless.
    if meta_heading is not None:
        store[meta_heading] = meta_dict

    return store

# Load the raw codebook lines, then parse them into the per-section dictionary.
data = read_data()
store = create_codebook_store(data)

In [9]:
def write_store_to_disk(d, f_name):
    """Serialize ``d`` to JSON and write it to ``f_name``.

    Parameters
    ----------
    d : dict
        JSON-serializable object to save. JSON object keys are always
        strings, so integer keys are written (and later read back) as strings.
    f_name : str
        Destination path; an existing file is overwritten.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If ``d`` contains values that ``json.dumps`` cannot serialize.
    OSError
        If the file cannot be opened or written.
    """
    # Serialize before opening the file so a serialization failure does not
    # truncate an existing file.
    j_file = json.dumps(d)
    # The context manager closes the handle even if the write raises.
    with open(f_name, "w") as f:
        f.write(j_file)

# Persist the parsed codebook as JSON in the same Data directory the codebook was read from.
write_store_to_disk(store, '../Data/codebook.json')

In [10]:
## This is tripping the is_subheader function, need to rectify
store['BPL']

{1: 'Alabama',
 2: 'Alaska',
 4: 'Arizona',
 5: 'Arkansas',
 6: 'California',
 8: 'Colorado',
 9: 'Connecticut',
 10: 'Delaware',
 11: 'District of Columbia',
 12: 'Florida',
 13: 'Georgia',
 15: 'Hawaii',
 16: 'Idaho',
 17: 'Illinois',
 18: 'Indiana',
 19: 'Iowa',
 20: 'Kansas',
 21: 'Kentucky',
 22: 'Louisiana',
 23: 'Maine',
 24: 'Maryland',
 25: 'Massachusetts',
 26: 'Michigan',
 27: 'Minnesota',
 28: 'Mississippi',
 29: 'Missouri',
 30: 'Montana',
 31: 'Nebraska',
 32: 'Nevada',
 33: 'New Hampshire',
 34: 'New Jersey',
 35: 'New Mexico',
 36: 'New York',
 37: 'North Carolina',
 38: 'North Dakota',
 39: 'Ohio',
 40: 'Oklahoma',
 41: 'Oregon',
 42: 'Pennsylvania',
 44: 'Rhode Island',
 45: 'South Carolina',
 46: 'South Dakota',
 47: 'Tennessee',
 48: 'Texas',
 49: 'Utah',
 50: 'Vermont',
 51: 'Virginia',
 53: 'Washington',
 54: 'West Virginia',
 55: 'Wisconsin',
 56: 'Wyoming',
 90: 'Native American',
 99: 'United States, ns',
 100: 'American Samoa',
 105: 'Guam',
 110: 'Puerto Rico