In [1]:
import os
from functools import reduce
import pickle
import pandas as pd

In [2]:
#################################
WORK_DIR = '/media/school/project/temp_2018-5-year/2018_acs'
RECALC_DATA = True # refreshes data from scratch when True, reads pickles when False
#################################

# Directories that exist
SF_DIR = os.path.join(WORK_DIR, 'Tracts_Block_Groups_Only')
SF_TEMPLATE_DIR = os.path.join(WORK_DIR, '2018_5yr_Summary_FileTemplates')

# Directories created by this script
INT_DIR = os.path.join(WORK_DIR, 'INTERMEDIATE_DATA_2/')
RES_DIR = os.path.join(WORK_DIR, 'RESULTS_2/')

try:
    os.makedirs(INT_DIR)
except Exception as e:
    print(e)
    
try:
    os.makedirs(RES_DIR)
except Exception as e:
    print(e)

[Errno 17] File exists: '/media/school/project/temp_2018-5-year/2018_acs/INTERMEDIATE_DATA_2/'
[Errno 17] File exists: '/media/school/project/temp_2018-5-year/2018_acs/RESULTS_2/'


In [3]:
def save_pkl(file_path, data):
    with open(file_path, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def read_pkl(file_path):
    with open(file_path, 'rb') as handle:
        data = pickle.load(handle)
        return data

Summary files start with e, m, or g, indicating:

* e: estimates
* m: margins of error
* g: geography files

In [4]:
print(SF_DIR)
sf_files = os.listdir(SF_DIR)
# verify present files fall into the three categories, as denoted by first letter of filename
for first_letter in [x[0] for x in sf_files]:
    assert first_letter in ['e', 'm', 'g']

/media/school/project/temp_2018-5-year/2018_acs/Tracts_Block_Groups_Only


#### Build 'Estimates' Data Dict

In [5]:
# Build data dictionary of encoded column name: description
SF_DATA_DICT_dict = {}
for f in os.listdir(SF_TEMPLATE_DIR):
    if not f.startswith('seq'):
        continue
    file_path = os.path.join(SF_TEMPLATE_DIR, f)
    template_data = pd.read_excel(file_path, sheet_name='e')
    mapping = dict(zip(template_data.columns, template_data.iloc[0]))
    SF_DATA_DICT_dict.update(mapping)
SF_DATA_DICT = pd.Series(SF_DATA_DICT_dict).reset_index()
SF_DATA_DICT.columns = ['column', 'description']
SF_DATA_DICT.to_pickle(os.path.join(INT_DIR, 'SF_DATA_DICT.pkl'))
SF_DATA_DICT

Unnamed: 0,column,description
0,FILEID,FILEID
1,FILETYPE,FILETYPE
2,STUSAB,STUSAB
3,CHARITER,CHARITER
4,SEQUENCE,SEQUENCE
...,...,...
26997,B99088_004,ALLOCATION OF TRAVEL TIME TO WORK FOR WORKPLAC...
26998,B99088_005,ALLOCATION OF TRAVEL TIME TO WORK FOR WORKPLAC...
26999,B99089_001,ALLOCATION OF VEHICLES AVAILABLE FOR WORKERS F...
27000,B99089_002,ALLOCATION OF VEHICLES AVAILABLE FOR WORKERS F...


#### Build 'Estimates' table

In [6]:
# First, prepare dict of {<int>seq: column names} for estimates to speed up lookups
COL_NAME_LU = {}
for template in os.listdir(SF_TEMPLATE_DIR):
    if not template.startswith('seq'):
        continue
    seq = int(template.split('.')[0][3:])
    columns = pd.read_excel(os.path.join(SF_TEMPLATE_DIR, template), sheet_name='e').columns
    COL_NAME_LU[seq] = columns

In [None]:
if RECALC_DATA:
    # build dataframes by state, then later all data will be concatenated
    # This makes things more efficient due to the large volume of tables.
    RESULTS = {}
    sf_e_files = [x for x in sf_files if x[0]=='e']
    for i, f in enumerate(sf_e_files):
        print('\rPercent complete: {}%'.format(round((float(i+1)/len(sf_e_files))*100., 2)), end='')
        # get filetype
        ftype = f[0]
        # get path to file for reading
        file_path = os.path.join(SF_DIR, f)
        # get state
        state = f[6:8]
        # get sequence
        seq = int(f[8:12])
        # look up column names for this file
        columns = COL_NAME_LU[seq]
        # read the data and set the appropriate column names
        data = pd.read_csv(file_path, header=None, names=columns, low_memory=False)
        # drop unnecessary columns
        data.drop(columns=['FILEID', 'FILETYPE', 'CHARITER', 'SEQUENCE'], inplace=True)
        # set index for concatenation - needed because `.merge()` on
        # this many files takes FOREVER.
        data.set_index(['STUSAB','LOGRECNO'], inplace=True)

        # merge data for current state
        if state not in RESULTS.keys():
            RESULTS[state] = data
        else:
            RESULTS[state] = pd.concat([RESULTS[state], data], axis=1)


    save_pkl(os.path.join(INT_DIR, 'RESULTS.pkl'), RESULTS)
    print('\nDone!\nNumber of states processed: {}'.format(len(RESULTS)))
else:
    print('Loading previous result...')
    RESULTS = read_pkl(os.path.join(INT_DIR, 'RESULTS.pkl'))
    print('Done')

Percent complete: 83.72%

In [None]:
print('Concatenating results...')

if RECALC_DATA:
    # Reset indicies
    for i, (state, df) in enumerate(RESULTS.items()):
        df.reset_index(inplace=True)

    # Concatenate tables from different states
    ESTIMATES = pd.concat([df for df in RESULTS.values()])
    ESTIMATES.to_pickle(os.path.join(INT_DIR, 'ESTIMATES.pkl'))
else:
    print('Loading previous result...')
    ESTIMATES = pd.read_pickle(os.path.join(INT_DIR, 'ESTIMATES.pkl'))
    print('Done')    

del RESULTS # free up some memory, we don't need those anymore.
ESTIMATES

## Geo Data

#### Build 'Geo' Data Dict

In [None]:
# Build data dictionary of encoded column name: description
file_path = os.path.join(SF_TEMPLATE_DIR, '2019_SFGeoFileTemplate.xlsx')
template_data = pd.read_excel(file_path)
GEO_COLS = template_data.columns
GEO_DATA_DICT_dict = dict(zip(GEO_COLS, template_data.iloc[0]))
    
GEO_DATA_DICT = pd.Series(GEO_DATA_DICT_dict).reset_index()
GEO_DATA_DICT.columns = ['column', 'description']
GEO_DATA_DICT.to_pickle(os.path.join(INT_DIR, 'GEO_DATA_DICT.pkl'))
GEO_DATA_DICT.head(10)

In [None]:
if RECALC_DATA:
    GEO_TABLE = pd.DataFrame()
    sf_g_files = [x for x in sf_files if x[0]=='g' and x.endswith('.csv')]
    columns = GEO_COLS

    for i, f in enumerate(sf_g_files):
        print('\rPercent complete: {}%'.format(round((float(i+1)/len(sf_g_files))*100., 2)), end='')
        # get filetype
        ftype = f[0]
        # get path to file for reading
        file_path = os.path.join(SF_DIR, f)
        # get state
        state = f.split('.')[0][-2:]

        # read the data and set the appropriate column names
        try:
            data = pd.read_csv(file_path, header=None, names=columns, encoding="ISO-8859-1")
        except Exception as e:
            print(e)
            print('\nFailure on file {}\n'.format(f))
            continue

        GEO_TABLE = GEO_TABLE.append(data)


    save_pkl(os.path.join(INT_DIR, 'GEO_TABLE.pkl'), GEO_TABLE)
    print('\nDone!')
else:
    print('Loading previous result...')
    GEO_TABLE = read_pkl(os.path.join(INT_DIR, 'GEO_TABLE.pkl'))
    print('Done')
    
GEO_TABLE = GEO_TABLE[['STUSAB', 'LOGRECNO']+list(set(GEO_TABLE.columns)-set(['STUSAB', 'LOGRECNO']))]
GEO_TABLE.STUSAB = GEO_TABLE.STUSAB.str.strip().str.lower()
GEO_TABLE.columns = [x.replace('.', '') for x in GEO_TABLE.columns]
GEO_TABLE

In [None]:
GEO_TABLE.iloc[2]

---

#### Gazeteer File

In [None]:
GAZ_TABLE = pd.read_csv('/media/school/project/temp_2018-5-year/gaz_tract_2018/2018_Gaz_tracts_national.txt'), sep='\t')
GAZ_TABLE.rename(columns={'USPS': 'STUSAB'}, inplace=True)
GAZ_TABLE.STUSAB = GAZ_TABLE.STUSAB.str.strip().str.lower()
GAZ_TABLE

In [None]:
set(GAZ_TABLE.GEOID.values).intersection(set(GEO_TABLE.GEOID.values))

In [None]:
for state in GAZ_TABLE.STUSAB.unique():
    print('')
    print(state)
    print('GAZ: {}'.format(GAZ_TABLE[GAZ_TABLE.STUSAB==state].GEOID.nunique()))
    print('GEO: {}'.format(GEO_TABLE[GEO_TABLE.STUSAB==state].GEOID.nunique()))

---

In [None]:
ESTIMATES_DATA_DICT = SF_DATA_DICT[~SF_DATA_DICT.column.isin(['FILEID', 'FILETYPE', 'CHARITER', 'SEQUENCE'])].reset_index(drop=True)
ESTIMATES_DATA_DICT.loc[ESTIMATES_DATA_DICT.column=='STUSAB', 'description'] = 'State ID'
ESTIMATES_DATA_DICT.loc[ESTIMATES_DATA_DICT.column=='LOGRECNO', 'description'] = 'Logical Record Number'
ESTIMATES_DATA_DICT.head(2)

In [None]:
ESTIMATES.head(2)

In [None]:
GEO_DATA_DICT = GEO_DATA_DICT[~GEO_DATA_DICT.column.isin(['FILEID', 'FILETYPE', 'CHARITER', 'SEQUENCE'])].reset_index(drop=True)
GEO_DATA_DICT.loc[GEO_DATA_DICT.column=='STUSAB', 'description'] = 'State ID'
GEO_DATA_DICT.loc[GEO_DATA_DICT.column=='LOGRECNO', 'description'] = 'Logical Record Number'
GEO_DATA_DICT.head(2)

In [None]:
GEO_TABLE.head(2)

In [None]:
GAZ_TABLE.head(2)

In [None]:
OUTPUTS = {
    'cen_20191_estimates_dd': ESTIMATES_DATA_DICT,
    'cen_20191_estimates': ESTIMATES,
    'cen_20191_geo_dd': GEO_DATA_DICT,
    'cen_20191_geo': GEO_TABLE,
    'cen_20191_gaz': GAZ_TABLE,
}

for f, df in OUTPUTS.items():
    print('Writing {} table'.format(f))
    #df.to_csv(os.path.join(RES_DIR, '{}.csv'.format(f)), index=None)
    df.to_pickle(os.path.join(RES_DIR, '{}.pkl'.format(f)))