In [1]:
import os
from functools import reduce
import pickle
import pandas as pd

In [2]:
#################################
WORK_DIR = 'C:/Users/school/Desktop/DATA_PREP_DIR/'
IGNORE_PICKLES = False # refreshes data from scratch when True
#################################

# Directories that exist
SF_DIR = os.path.join(WORK_DIR, '2019_acs/sf')
SF_TEMPLATE_DIR = os.path.join(WORK_DIR, '2019_acs/sf_template')

# Directories created by this script
INT_DIR = os.path.join(WORK_DIR, 'INTERMEDIATE_DATA')

try:
    os.makedirs(INT_DIR)
except Exception as e:
    print(e)

[WinError 183] Cannot create a file when that file already exists: 'C:/Users/school/Desktop/DATA_PREP_DIR/INTERMEDIATE_DATA'


In [3]:
def save_pkl(file_path, data):
    with open(file_path, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def read_pkl(file_path):
    with open(file_path, 'rb') as handle:
        data = pickle.load(handle)
        return data

Summary files start with e, m, or g, indicating:

* e: estimates
* m: margins of error
* g: geography files

In [4]:
sf_files = os.listdir(SF_DIR)
# verify present files fall into the three categories, as denoted by first letter of filename
for first_letter in [x[0] for x in sf_files]:
    assert first_letter in ['e', 'm', 'g']

#### Build 'Estimates' Data Dict

In [5]:
# Build data dictionary of encoded column name: description
SF_DATA_DICT_dict = {}
for f in os.listdir(SF_TEMPLATE_DIR):
    if not f.startswith('seq'):
        continue
    file_path = os.path.join(SF_TEMPLATE_DIR, f)
    template_data = pd.read_excel(file_path, sheet_name='e')
    mapping = dict(zip(template_data.columns, template_data.iloc[0]))
    SF_DATA_DICT_dict.update(mapping)
SF_DATA_DICT = pd.Series(SF_DATA_DICT_dict).reset_index()
SF_DATA_DICT.columns = ['column', 'description']
SF_DATA_DICT.to_pickle(os.path.join(INT_DIR, 'SF_DATA_DICT.pkl'))
SF_DATA_DICT

Unnamed: 0,column,description
0,FILEID,FILEID
1,FILETYPE,FILETYPE
2,STUSAB,STUSAB
3,CHARITER,CHARITER
4,SEQUENCE,SEQUENCE
...,...,...
35528,C23002C_023,SEX BY AGE BY EMPLOYMENT STATUS FOR THE POPULA...
35529,C23002C_024,SEX BY AGE BY EMPLOYMENT STATUS FOR THE POPULA...
35530,C23002C_025,SEX BY AGE BY EMPLOYMENT STATUS FOR THE POPULA...
35531,C23002C_026,SEX BY AGE BY EMPLOYMENT STATUS FOR THE POPULA...


#### Build 'Estimates' table

In [6]:
# First, prepare dict of {<int>seq: column names} for estimates to speed up lookups
COL_NAME_LU = {}
for template in os.listdir(SF_TEMPLATE_DIR):
    if not template.startswith('seq'):
        continue
    seq = int(template.split('.')[0][3:])
    columns = pd.read_excel(os.path.join(SF_TEMPLATE_DIR, template), sheet_name='e').columns
    COL_NAME_LU[seq] = columns

In [None]:
if IGNORE_PICKLES:
    # build dataframes by state, then later all data will be concatenated
    # This makes things more efficient due to the large volume of tables.
    RESULTS = {}
    sf_e_files = [x for x in sf_files if x[0]=='e']
    for i, f in enumerate(sf_e_files):
        print('\rPercent complete: {}%'.format(round((float(i+1)/len(sf_e_files))*100., 2)), end='')
        # get filetype
        ftype = f[0]
        # get path to file for reading
        file_path = os.path.join(SF_DIR, f)
        # get state
        state = f[6:8]
        # get sequence
        seq = int(f[8:12])
        # look up column names for this file
        columns = COL_NAME_LU[seq]
        # read the data and set the appropriate column names
        data = pd.read_csv(file_path, header=None, names=columns)
        # drop unnecessary columns
        data.drop(columns=['FILEID', 'FILETYPE', 'CHARITER', 'SEQUENCE'], inplace=True)
        # set index for concatenation - needed because `.merge()` on
        # this many files takes FOREVER.
        data.set_index(['STUSAB','LOGRECNO'], inplace=True)

        # merge data for current state
        if state not in RESULTS.keys():
            RESULTS[state] = data
        else:
            RESULTS[state] = pd.concat([RESULTS[state], data], axis=1)


    save_pkl(os.path.join(INT_DIR, 'RESULTS.pkl'), RESULTS)
    print('\nDone!\nNumber of states processed: {}'.format(len(RESULTS)))
else:
    print('Loading previous result...')
    RESULTS = read_pkl(os.path.join(INT_DIR, 'RESULTS.pkl'))
    print('Done')

Percent complete: 2.44%

In [None]:
print('Concatenating results...')

if IGNORE_PICKLES:
    # Reset indicies
    for i, (state, df) in enumerate(RESULTS.items()):
        df.reset_index(inplace=True)

    # Concatenate tables from different states
    ESTIMATES = pd.concat([df for df in RESULTS.values()])
    ESTIMATES.to_pickle(os.path.join(INT_DIR, 'ESTIMATES.pkl'))
else:
    print('Loading previous result...')
    ESTIMATES = read_pkl(os.path.join(INT_DIR, 'ESTIMATES.pkl'))
    print('Done')    

del RESULTS
ESTIMATES

#### Bring in Geo Data

In [None]:
GEO_PATH = os.path.join(WORK_DIR, '2019_acs/geo/1_year_Mini_Geo.xlsx')
geo = pd.read_excel(GEO_PATH, sheet_name=None)
geo[list(geo.keys())[5]]

In [None]:
# Combine all sheets from geo file into single table
for i, (sheet, data) in enumerate(geo.items()):
    print('\rPercent complete: {}%'.format(round((float(i+1)/len(geo))*100., 2)), end='')
    # rename columns so state and log rec no are the same names
    data.rename(columns={
        'State': 'STUSAB',
        'Logical Record Number': 'LOGRECNO',
        'Geography ID': 'GEOID',
        'Geography Name': 'GEONAME'}, inplace=True)
    # set the state abbrev to lowercase to match the summary data for the join
    data.STUSAB = data.STUSAB.str.strip().str.lower()

geo_prep = pd.concat(geo.values())
geo_prep

In [None]:
# Merge geo data into ESTIMATES
ESTIMATES_GEO = ESTIMATES.sort_index(axis=1).merge(geo_prep, on=['STUSAB', 'LOGRECNO'], how='left')
ESTIMATES_GEO

In [None]:
# reorder columns
info_cols = ['STUSAB', 'LOGRECNO', 'FILEID', 'FILETYPE', 'CHARITER', 'SEQUENCE', 'GEOID', 'GEONAME']
all_other_cols = sorted(list(set(ESTIMATES_GEO.columns)-set(info_cols)))
ESTIMATES_GEO = ESTIMATES_GEO[info_cols + all_other_cols]
ESTIMATES_GEO

In [None]:
from collections import defaultdict
counts = defaultdict(int)
for col in ESTIMATES_GEO.columns:
    counts[col] += 1

for k,v in counts.items():
    if v > 1:
        print(k)