In [1]:
import os
from functools import reduce
import pickle
import pandas as pd

In [2]:
#################################
WORK_DIR = '/media/school/project/temp_2018-5-year/2018_acs'
RECALC_DATA = True # refreshes data from scratch when True, reads pickles when False
#################################

# Directories that exist
SF_DIR = os.path.join(WORK_DIR, 'All_Geographies_Not_Tracts_Block_Groups')
SF_TEMPLATE_DIR = os.path.join(WORK_DIR, '2018_5yr_Summary_FileTemplates')

# Directories created by this script
INT_DIR = os.path.join(WORK_DIR, 'INTERMEDIATE_DATA')
RES_DIR = os.path.join(WORK_DIR, 'RESULTS/')

try:
    os.makedirs(INT_DIR)
except Exception as e:
    print(e)
    
try:
    os.makedirs(RES_DIR)
except Exception as e:
    print(e)

[Errno 17] File exists: '/media/school/project/temp_2018-5-year/2018_acs/INTERMEDIATE_DATA'
[Errno 17] File exists: '/media/school/project/temp_2018-5-year/2018_acs/RESULTS/'


In [3]:
def save_pkl(file_path, data):
    with open(file_path, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def read_pkl(file_path):
    with open(file_path, 'rb') as handle:
        data = pickle.load(handle)
        return data

Summary files start with e, m, or g, indicating:

* e: estimates
* m: margins of error
* g: geography files

In [4]:
print(SF_DIR)
sf_files = os.listdir(SF_DIR)
# verify present files fall into the three categories, as denoted by first letter of filename
for first_letter in [x[0] for x in sf_files]:
    assert first_letter in ['e', 'm', 'g']

/media/school/project/temp_2018-5-year/2018_acs/All_Geographies_Not_Tracts_Block_Groups


#### Build 'Estimates' Data Dict

In [5]:
# Build data dictionary of encoded column name: description
SF_DATA_DICT_dict = {}
for f in os.listdir(SF_TEMPLATE_DIR):
    if not f.startswith('seq'):
        continue
    file_path = os.path.join(SF_TEMPLATE_DIR, f)
    template_data = pd.read_excel(file_path, sheet_name='e')
    mapping = dict(zip(template_data.columns, template_data.iloc[0]))
    SF_DATA_DICT_dict.update(mapping)
SF_DATA_DICT = pd.Series(SF_DATA_DICT_dict).reset_index()
SF_DATA_DICT.columns = ['column', 'description']
SF_DATA_DICT.to_pickle(os.path.join(INT_DIR, 'SF_DATA_DICT.pkl'))
SF_DATA_DICT

Unnamed: 0,column,description
0,FILEID,FILEID
1,FILETYPE,FILETYPE
2,STUSAB,STUSAB
3,CHARITER,CHARITER
4,SEQUENCE,SEQUENCE
...,...,...
26997,B99088_004,ALLOCATION OF TRAVEL TIME TO WORK FOR WORKPLAC...
26998,B99088_005,ALLOCATION OF TRAVEL TIME TO WORK FOR WORKPLAC...
26999,B99089_001,ALLOCATION OF VEHICLES AVAILABLE FOR WORKERS F...
27000,B99089_002,ALLOCATION OF VEHICLES AVAILABLE FOR WORKERS F...


#### Build 'Estimates' table

In [6]:
# First, prepare dict of {<int>seq: column names} for estimates to speed up lookups
COL_NAME_LU = {}
for template in os.listdir(SF_TEMPLATE_DIR):
    if not template.startswith('seq'):
        continue
    seq = int(template.split('.')[0][3:])
    columns = pd.read_excel(os.path.join(SF_TEMPLATE_DIR, template), sheet_name='e').columns
    COL_NAME_LU[seq] = columns

In [7]:
# if RECALC_DATA:
#     # build dataframes by state, then later all data will be concatenated
#     # This makes things more efficient due to the large volume of tables.
#     RESULTS = {}
#     sf_e_files = [x for x in sf_files if x[0]=='e']
#     for i, f in enumerate(sf_e_files):
#         print('\rPercent complete: {}%'.format(round((float(i+1)/len(sf_e_files))*100., 2)), end='')
#         # get filetype
#         ftype = f[0]
#         # get path to file for reading
#         file_path = os.path.join(SF_DIR, f)
#         # get state
#         state = f[6:8]
#         # get sequence
#         seq = int(f[8:12])
#         # look up column names for this file
#         columns = COL_NAME_LU[seq]
#         # read the data and set the appropriate column names
#         data = pd.read_csv(file_path, header=None, names=columns, low_memory=False)
#         # drop unnecessary columns
#         data.drop(columns=['FILEID', 'FILETYPE', 'CHARITER', 'SEQUENCE'], inplace=True)
#         # set index for concatenation - needed because `.merge()` on
#         # this many files takes FOREVER.
#         data.set_index(['STUSAB','LOGRECNO'], inplace=True)

#         # merge data for current state
#         if state not in RESULTS.keys():
#             RESULTS[state] = data
#         else:
#             RESULTS[state] = pd.concat([RESULTS[state], data], axis=1)


#     save_pkl(os.path.join(INT_DIR, 'RESULTS.pkl'), RESULTS)
#     print('\nDone!\nNumber of states processed: {}'.format(len(RESULTS)))
# else:
#     print('Loading previous result...')
#     RESULTS = read_pkl(os.path.join(INT_DIR, 'RESULTS.pkl'))
#     print('Done')

In [8]:
import numpy as np

def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
            # Print current column type
            #print("******************************")
            #print("Column: ",col)
            #print("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all(): 
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
            # Print new column type
            #print("dtype after: ",props[col].dtype)
            #print("******************************")
    
    # Print final result
    #print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [9]:
sf_e_files = [x for x in sf_files if x[0]=='e']
states = set()
for i, f in enumerate(sf_e_files):
    # get state
    state = f[6:8]
    # get sequence
    states.update([state])

# build dataframes by state, then later all data will be concatenated
# This makes things more efficient due to the large volume of tables.
for ST in states:
    print(ST+'\n')
    RESULTS = {}
    sf_e_files = [x for x in sf_files if x[0]=='e']
    for i, f in enumerate(sf_e_files):
        print('\rPercent complete: {}%'.format(round((float(i+1)/len(sf_e_files))*100., 2)), end='')
        # get filetype
        ftype = f[0]
        # get path to file for reading
        file_path = os.path.join(SF_DIR, f)
        # get state
        state = f[6:8]
        if state != ST:
            continue
        # get sequence
        seq = int(f[8:12])
        # look up column names for this file
        columns = COL_NAME_LU[seq]
        # read the data and set the appropriate column names
        data = pd.read_csv(file_path, header=None, names=columns, low_memory=False)
        # drop unnecessary columns
        data.drop(columns=['FILEID', 'FILETYPE', 'CHARITER', 'SEQUENCE'], inplace=True)
        # set index for concatenation - needed because `.merge()` on
        # this many files takes FOREVER.
        data.set_index(['STUSAB','LOGRECNO'], inplace=True)

        # merge data for current state
        if ST not in RESULTS.keys():
            RESULTS[ST] = data
        else:
            RESULTS[ST] = pd.concat([RESULTS[ST], data], axis=1)
    print('')
    RESULTS[ST], _ = reduce_mem_usage(RESULTS[ST])
    save_pkl(os.path.join(INT_DIR, 'STATE__{}.pkl'.format(ST)), RESULTS)
    print('')



ct

Percent complete: 100.0%
Memory usage of properties dataframe is : 410.9174966812134  MB
Memory usage is:  220.1667022705078  MB
This is  53.57929609926331 % of the initial size

oh

Percent complete: 100.0%
Memory usage of properties dataframe is : 1883.4229431152344  MB
Memory usage is:  1060.9062881469727  MB
This is  56.32862719576963 % of the initial size

ks

Percent complete: 100.0%
Memory usage of properties dataframe is : 1347.2735967636108  MB
Memory usage is:  712.0222883224487  MB
This is  52.849123595448766 % of the initial size

wv

Percent complete: 100.0%
Memory usage of properties dataframe is : 523.7910432815552  MB
Memory usage is:  266.61621856689453  MB
This is  50.9012557558338 % of the initial size

in

Percent complete: 100.0%
Memory usage of properties dataframe is : 1262.8244104385376  MB
Memory usage is:  692.3289947509766  MB
This is  54.823852708909335 % of the initial size

dc

Percent complete: 100.0%
Memory usage of properties dataframe is : 10.29874


mt

Percent complete: 100.0%
Memory usage of properties dataframe is : 566.8395309448242  MB
Memory usage is:  282.86998748779297  MB
This is  49.90300994291934 % of the initial size

ca

Percent complete: 100.0%
Memory usage of properties dataframe is : 1892.6917562484741  MB
Memory usage is:  1143.5949726104736  MB
This is  60.4216174575202 % of the initial size

la

Percent complete: 100.0%
Memory usage of properties dataframe is : 876.6238632202148  MB
Memory usage is:  471.6411361694336  MB
This is  53.8019960393154 % of the initial size

tn

Percent complete: 100.0%
Memory usage of properties dataframe is : 1005.1514053344727  MB
Memory usage is:  550.5414962768555  MB
This is  54.771996870825454 % of the initial size

nm

Percent complete: 100.0%
Memory usage of properties dataframe is : 575.9023704528809  MB
Memory usage is:  301.0022964477539  MB
This is  52.26620203196078 % of the initial size

us

Percent complete: 100.0%
Memory usage of properties dataframe is : 9002.16076

In [10]:
RESULTS

{'ne':                  B27001_001  B27001_002  B27001_003  B27001_004  B27001_005  \
 STUSAB LOGRECNO                                                               
 ne     1            1875468      932067       81044       77390        3654   
        2            1385192      683229       62585       59784        2801   
        3             490276      248838       18459       17606         853   
        4            1543638      765582       68302       65332        2970   
        5            1220185      603887       54477       52117        2360   
 ...                     ...         ...         ...         ...         ...   
        7234            3130        1600          86          80           6   
        7235             699         349          20          16           4   
        7236            8636        4221         349         344           5   
        7237            1992        1068          53          53           0   
        7238               0      

In [None]:
print('Concatenating results...')
INT_STATE_FILES = [x for x in os.listdir(INT_DIR) if x.startswith('STATE__')]
ESTIMATES = pd.DataFrame()

# Reset indicies
for i, STATE_FILE in enumerate(INT_STATE_FILES):
    df = list(pd.read_pickle(os.path.join(INT_DIR, STATE_FILE)).values())[0]
    df.reset_index(inplace=True)

    # Concatenate tables from different states
    ESTIMATES = pd.concat([ESTIMATES, df])
    print(ESTIMATES.shape)    
ESTIMATES.to_pickle(os.path.join(INT_DIR, 'ESTIMATES.pkl'))

del RESULTS # free up some memory, we don't need those anymore.
ESTIMATES

Concatenating results...
(3964, 26998)
(15663, 26998)
(17658, 26998)
(23324, 26998)
(31519, 26998)
(36592, 26998)
(39614, 26998)
(40243, 26998)
(42958, 26998)
(52102, 26998)
(55145, 26998)
(56936, 26998)
(58162, 26998)
(62382, 26998)
(66044, 26998)
(73582, 26998)
(76535, 26998)
(78082, 26998)
(82340, 26998)
(91661, 26998)
(100850, 26998)
(102814, 26998)
(108568, 26998)
(111364, 26998)
(114147, 26998)
(157852, 26998)
(162732, 26998)
(167913, 26998)


## Geo Data

#### Build 'Geo' Data Dict

In [None]:
# Build data dictionary of encoded column name: description
file_path = os.path.join(SF_TEMPLATE_DIR, '2019_SFGeoFileTemplate.xlsx')
template_data = pd.read_excel(file_path)
GEO_COLS = template_data.columns
GEO_DATA_DICT_dict = dict(zip(GEO_COLS, template_data.iloc[0]))
    
GEO_DATA_DICT = pd.Series(GEO_DATA_DICT_dict).reset_index()
GEO_DATA_DICT.columns = ['column', 'description']
GEO_DATA_DICT.to_pickle(os.path.join(INT_DIR, 'GEO_DATA_DICT.pkl'))
GEO_DATA_DICT.head(10)

In [None]:
if RECALC_DATA:
    GEO_TABLE = pd.DataFrame()
    sf_g_files = [x for x in sf_files if x[0]=='g' and x.endswith('.csv')]
    columns = GEO_COLS

    for i, f in enumerate(sf_g_files):
        print('\rPercent complete: {}%'.format(round((float(i+1)/len(sf_g_files))*100., 2)), end='')
        # get filetype
        ftype = f[0]
        # get path to file for reading
        file_path = os.path.join(SF_DIR, f)
        # get state
        state = f.split('.')[0][-2:]

        # read the data and set the appropriate column names
        try:
            data = pd.read_csv(file_path, header=None, names=columns, encoding="ISO-8859-1")
        except Exception as e:
            print(e)
            print('\nFailure on file {}\n'.format(f))
            continue

        GEO_TABLE = GEO_TABLE.append(data)


    save_pkl(os.path.join(INT_DIR, 'GEO_TABLE.pkl'), GEO_TABLE)
    print('\nDone!')
else:
    print('Loading previous result...')
    GEO_TABLE = read_pkl(os.path.join(INT_DIR, 'GEO_TABLE.pkl'))
    print('Done')
    
GEO_TABLE = GEO_TABLE[['STUSAB', 'LOGRECNO']+list(set(GEO_TABLE.columns)-set(['STUSAB', 'LOGRECNO']))]
GEO_TABLE.STUSAB = GEO_TABLE.STUSAB.str.strip().str.lower()
GEO_TABLE.columns = [x.replace('.', '') for x in GEO_TABLE.columns]
GEO_TABLE

In [None]:
GEO_TABLE.iloc[2]

---

#### Gazeteer File

In [None]:
GAZ_TABLE = pd.read_csv('/media/school/project/temp_2018-5-year/gaz_tract_2018/2018_Gaz_tracts_national.txt'), sep='\t')
GAZ_TABLE.rename(columns={'USPS': 'STUSAB'}, inplace=True)
GAZ_TABLE.STUSAB = GAZ_TABLE.STUSAB.str.strip().str.lower()
GAZ_TABLE

In [None]:
set(GAZ_TABLE.GEOID.values).intersection(set(GEO_TABLE.GEOID.values))

In [None]:
for state in GAZ_TABLE.STUSAB.unique():
    print('')
    print(state)
    print('GAZ: {}'.format(GAZ_TABLE[GAZ_TABLE.STUSAB==state].GEOID.nunique()))
    print('GEO: {}'.format(GEO_TABLE[GEO_TABLE.STUSAB==state].GEOID.nunique()))

---

In [None]:
ESTIMATES_DATA_DICT = SF_DATA_DICT[~SF_DATA_DICT.column.isin(['FILEID', 'FILETYPE', 'CHARITER', 'SEQUENCE'])].reset_index(drop=True)
ESTIMATES_DATA_DICT.loc[ESTIMATES_DATA_DICT.column=='STUSAB', 'description'] = 'State ID'
ESTIMATES_DATA_DICT.loc[ESTIMATES_DATA_DICT.column=='LOGRECNO', 'description'] = 'Logical Record Number'
ESTIMATES_DATA_DICT.head(2)

In [None]:
ESTIMATES.head(2)

In [None]:
GEO_DATA_DICT = GEO_DATA_DICT[~GEO_DATA_DICT.column.isin(['FILEID', 'FILETYPE', 'CHARITER', 'SEQUENCE'])].reset_index(drop=True)
GEO_DATA_DICT.loc[GEO_DATA_DICT.column=='STUSAB', 'description'] = 'State ID'
GEO_DATA_DICT.loc[GEO_DATA_DICT.column=='LOGRECNO', 'description'] = 'Logical Record Number'
GEO_DATA_DICT.head(2)

In [None]:
GEO_TABLE.head(2)

In [None]:
GAZ_TABLE.head(2)

In [None]:
OUTPUTS = {
    'cen_20191_estimates_dd': ESTIMATES_DATA_DICT,
    'cen_20191_estimates': ESTIMATES,
    'cen_20191_geo_dd': GEO_DATA_DICT,
    'cen_20191_geo': GEO_TABLE,
    'cen_20191_gaz': GAZ_TABLE,
}

for f, df in OUTPUTS.items():
    print('Writing {} table'.format(f))
    #df.to_csv(os.path.join(RES_DIR, '{}.csv'.format(f)), index=None)
    df.to_pickle(os.path.join(RES_DIR, '{}.pkl'.format(f)))