In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import re
from functools import reduce

In [2]:
# Show up to 150 columns when a wide census frame is displayed.
pd.set_option("display.max_columns", 150)

## Bring in Datasets
Read every CSV file found in the tract and block-group data folders.

In [3]:
# READ AND RENAME FILES
# Root folder of the census extracts. It can be overridden with the
# CENSUS_DATA_DIR environment variable so the notebook is not tied to one
# machine; the default is the original location.
CENSUS_ROOT = os.environ.get(
    'CENSUS_DATA_DIR',
    '/home/mirabel/Dropbox (GaTech)/CDS-2019-AlbanyHub/Census')
PATH1 = os.path.join(CENSUS_ROOT, 'tract_data')
PATH2 = os.path.join(CENSUS_ROOT, 'blockgroup_data')


def _read_census_dir(root):
    """Read every census table stored under ``root``.

    Sub-folders contribute their ``*_with_ann.csv`` file under the folder
    name; loose ``.csv`` files contribute under their file name without the
    extension. Other loose files are skipped, because the key is built by
    cutting a four-character extension and the content goes to ``read_csv``.

    Returns a dict mapping table name -> DataFrame.
    """
    frames = {}
    for entry in os.listdir(root):
        entry_path = os.path.join(root, entry)
        if os.path.isdir(entry_path):
            for file in os.listdir(entry_path):
                if file.endswith('_with_ann.csv'):
                    frames[entry] = pd.read_csv(os.path.join(entry_path, file))
        elif entry.lower().endswith('.csv'):
            frames[entry[:-4]] = pd.read_csv(entry_path)
    return frames


# The keys are the names of the files/folders, and the values are the
# dataframes read from them. Block-group data is loaded first, then tract data.
df_all = {}
for census_root in (PATH2, PATH1):
    df_all.update(_read_census_dir(census_root))


In [4]:
# Fix tract 2017 age data: relabel its columns by matching the description in
# row 0 against the 2010 metadata file, drop columns with no match, and add an
# all-NaN column for every metadata label that is still missing.
master_col_names = pd.read_csv(PATH1+'/10_tract_age/ACS_10_5YR_S0101_metadata.csv', header=None)
master_col_names.columns = ['label', 'desc']
# description -> column label
name_dict = dict(zip(master_col_names['desc'], master_col_names['label']))

df = df_all['17_tract_age'].copy()
print(df.shape)

# Work with positional column names so each column is addressed unambiguously.
df.columns = range(len(df.columns))
matched = {c: name_dict[df.loc[0, c]] for c in df.columns if df.loc[0, c] in name_dict}
unmatched = [c for c in df.columns if c not in matched]
df = df.drop(columns=unmatched).rename(columns=matched)

# Labels present in the metadata but absent from this table become NaN columns,
# appended in metadata order.
missing = [label for label in dict.fromkeys(name_dict.values()) if label not in df.columns]
padding = pd.DataFrame(np.nan, index=df.index, columns=missing)
df = pd.concat([df, padding], axis=1)

df_all['17_tract_age'] = df
print(df.shape)
print(df_all['14_tract_age'].shape)

(44, 459)
(44, 219)
(44, 219)


In [5]:
# Fix tract employment data: bring the 2010-2014 tables onto the 2017 column
# labels by matching the description stored in row 0 of each table.
master_col_names = pd.read_csv(PATH1+'/17_tract_emp/ACS_17_5YR_S2301_metadata.csv', header=None)
master_col_names.columns = ['label', 'desc']
# description -> 2017 column label
name_dict = dict(zip(master_col_names['desc'], master_col_names['label']))
# Every distinct 2017 label, in metadata order.
all_labels = list(dict.fromkeys(name_dict.values()))

for year in range(10, 15):
    table_key = str(year)+'_tract_emp'
    df = df_all[table_key].copy()
    print(year)
    print(df.shape)
    # old column name -> 2017 label, for columns whose description is known
    RENAME = {c: name_dict[df.loc[0, c]] for c in df.columns if df.loc[0, c] in name_dict}
    unmatched = [c for c in df.columns if c not in RENAME]
    df = df.drop(columns=unmatched).rename(columns=RENAME)
    # Labels that this year does not provide are added as all-NaN columns.
    missing = [label for label in all_labels if label not in df.columns]
    padding = pd.DataFrame(np.nan, index=df.index, columns=missing)
    df = pd.concat([df, padding], axis=1)
    df_all[table_key] = df
    print(df.shape)


10
(44, 243)
(44, 283)
11
(44, 243)
(44, 283)
12
(44, 243)
(44, 283)
13
(44, 243)
(44, 283)
14
(44, 243)
(44, 283)


In [6]:
# One (initially empty) list of frames per census table type. The type is the
# table name with its first three characters (the "YY_" part) removed.
df_types = {table_name[3:]: [] for table_name in df_all}

# Short prefix put in front of each table type's value columns so that columns
# from different tables stay distinguishable after merging.
type_prefix = {
    'blockgroup_age': 'ab',
    'blockgroup_employment': 'e',
    'blockgroup_population': 'p',
    'blockgroup_income': 'i',
    'blockgroup_race': 'rb',
    'blockgroup_vacancy': 'v',
    'blockgroup_owner_renter': 'or',
    'tract_age': 'at',
    'tract_race': 'rt',
    'tract_emp': 'et',
    'tract_medinc': 'm',
}

## Clean up fields and add a column for year

In [7]:
# Convert the "GEO.display-label" column into separate geography columns,
# add a Year column, and clean the census tract / block group labels.
# Raw strings keep "\d" a regex token, and the dot is escaped so it only
# matches a literal decimal point.
re_string1 = r'Census Tract (\d+(?:\.\d+)?)'
re_string2 = r'Block Group (\d+(?:\.\d+)?)'


def _extract_number(label, pattern):
    """Return the number captured by ``pattern`` inside ``label``.

    Raises ValueError when ``label`` does not match, instead of failing later
    with an AttributeError on ``None``.
    """
    match = re.search(pattern, label)
    if match is None:
        raise ValueError('label %r does not match pattern %r' % (label, pattern))
    return match.group(1)


def _normalize_tract(tract):
    """Bring a tract number in line with the other data source.

    'xxx.yy' becomes 'xxxyy' when the integer part has 1-3 digits, and a
    number without a decimal part of 1-4 digits gets '00' appended. Anything
    else is returned unchanged.
    """
    if '.' in tract:
        whole, fraction = tract.split('.')[:2]
        if 1 <= len(whole) <= 3:
            return whole + fraction
        return tract
    if 1 <= len(tract) <= 4:
        return tract + '00'
    return tract


for k, v in list(df_all.items()):
    if 'blockgroup' in k:
        # "Block Group x, Census Tract y, county..." -> three columns
        geo_parts = ['block', 'tract', 'county']
        leading = ['tract', 'block', 'county', 'Year']
    elif 'tract' in k:
        # "Census Tract y, county..." -> two columns
        geo_parts = ['tract', 'county']
        leading = ['tract', 'county', 'Year']
    else:
        # Unknown table level: there is no 'tract' column to clean, so skip
        # the table instead of failing on the steps below.
        print('ERR')
        continue

    # Add Year column: the first two characters of the table name are the year.
    v = v.assign(Year=int("20" + k[:2]))

    # Split geo display label
    pieces = v["GEO.display-label"].str.split(", ", expand=True)
    for position, column in enumerate(geo_parts):
        v[column] = pieces[position]
    v = v.drop(columns='GEO.display-label')

    # Row 0 holds the column descriptions, so only the rows after it are cleaned.
    data_rows = v.index[1:]
    if 'block' in geo_parts:
        # Get block number out of 'Block Group xx.xx'
        v.loc[data_rows, 'block'] = v.loc[data_rows, 'block'].map(
            lambda label: _extract_number(label, re_string2))
    # Get tract number out of 'Census Tract xx.xx', then normalise it.
    v.loc[data_rows, 'tract'] = v.loc[data_rows, 'tract'].map(
        lambda label: _normalize_tract(_extract_number(label, re_string1)))

    # Reorder columns: keep the first two columns, then the geography columns
    # and Year (which were appended at the end), then everything else.
    cols = list(v.columns)
    v = v.reindex(columns=cols[:2] + leading + cols[2:-len(leading)])
    if 'block' in geo_parts:
        v = v.rename(columns={'block': 'blockgroup'})
    df_all[k] = v

In [8]:
# Combine all years in long format per census type.
# The frames are collected in a fresh dict rather than appended to df_types,
# so the cell can be re-run: df_types ends up holding DataFrames, which the
# original append step could not handle on a second run.
frames_by_type = {}
for k, v in df_all.items():
    table_type = k[3:]
    # Add the table-type prefix to every value column.
    pre = type_prefix[table_type]+'_'
    if 'tract' in k:
        first_value_col = 5   # columns before this are id/geography/Year
    elif 'blockgroup' in k:
        first_value_col = 6   # block group tables have one more geography column
    else:
        raise ValueError('cannot tell whether %r is a tract or blockgroup table' % k)
    prefixed = v.rename(columns={c: pre+c for c in v.columns[first_value_col:]})
    frames_by_type.setdefault(table_type, []).append(prefixed)

# Concatenate all the different years, renumbering the rows from 0.
df_types = {
    table_type: pd.concat(frames, sort=False, ignore_index=True)
    for table_type, frames in frames_by_type.items()
}

In [9]:
# Combine the census table types side by side (wide format), separately for
# block groups and tracts.
BLOCK = []
TRACT = []
for table_type, frame in df_types.items():
    if 'blockgroup' in table_type:
        ordered = frame.sort_values(by=['Year', 'tract', 'blockgroup'])
        BLOCK.append(ordered.set_index(['GEO.id', 'Year']))
    if 'tract' in table_type:
        ordered = frame.sort_values(by=['Year', 'tract'], ascending=False)
        TRACT.append(ordered.set_index('GEO.id'))

df_b = pd.concat(BLOCK, axis=1)
# Geography columns appear once per table; keep only the first occurrence.
unique_block_cols = ~df_b.columns.duplicated()
df_b = df_b.loc[:, unique_block_cols]

df_t = pd.concat(TRACT, axis=1)
unique_tract_cols = ~df_t.columns.duplicated()
df_t = df_t.loc[:, unique_tract_cols]

## Drop unnecessary columns and make column names descriptive

In [10]:
# Drop "Margin of error" columns.
# Each frame is filtered in a single drop call; the earlier version dropped one
# column at a time in place while iterating over the columns.
block_moe_cols = [c for c in df_b.columns if 'HD02' in c]
df_b = df_b.drop(columns=block_moe_cols)

tract_moe_cols = [c for c in df_t.columns if 'MOE' in c]
df_t = df_t.drop(columns=tract_moe_cols)

In [11]:
# Rename some block-group columns by hand.
# NOTE(review): 'EmployedCivlLabor' looks like a misspelling of
# 'EmployedCivilLabor'; it is kept as-is because the name ends up in
# CensusBlock.csv — confirm nothing reads it before changing.
HAND_RENAMES = {
    # population
    'p_HD01_VD01': 'TotalPopulation',
    # employment
    'e_HD01_VD01': 'TotalLabor',
    'e_HD01_VD02': 'TotalInLaborforce',
    'e_HD01_VD03': 'TotalCivilLabor',
    'e_HD01_VD04': 'EmployedCivlLabor',
    'e_HD01_VD05': 'UnemployedCivilLabor',
    'e_HD01_VD06': 'TotalArmedForces',
    'e_HD01_VD07': 'TotalNotInLaborforce',
    # vacancy
    'v_HD01_VD01': 'TotalHomes',
    'v_HD01_VD02': 'TotalOccupiedHomes',
    'v_HD01_VD03': 'TotalVacantHomes',
    # owner / renter
    'or_HD01_VD02': 'TotalOwnedHomes',
    'or_HD01_VD03': 'TotalRentedHomes',
}
df_b = df_b.rename(columns=HAND_RENAMES)

# Copies of the rows whose GEO.id index label is 'Id'; later cells read the
# column descriptions from them.
labelsb = df_b.loc['Id'].copy()
labelst = df_t.loc['Id'].copy()

In [12]:
# Replacements applied, in this order, to the squashed column descriptions to
# get short column names. Order matters: longer phrases come before the
# shorter phrases they contain (e.g. 'NotHispanicOrLatino' before
# 'HispanicOrLatino', '00000' before '0000').
wordreplace= {
    'Employment':'Emp',
    'Unemployment':'Unemp',
    'LaborForceParticipation':'LFP',
    
    'Estimate':'',
    'EducationalAttainment':'Edu',
    'Population':'',
    'RaceAndHispanicOrLatinoOrigin':'',
    'Years':'',
    'Sex':'',
    'PovertyStatusInThePast12Months':'',
    'DisabilityStatus':'',
    'SummaryIndicators':'',
    'IncomeDollars':'',

    'BachelorSDegreeOrHigher':'Bachelors',
    'SomeCollegeOrAssociateSDegree':'SomeCollege',
    'HighSchoolGraduateIncludesEquivalency':'HS',
    'LessThanHighSchoolGraduate':'LessHS',
    
    'WithAnyDisability':'Disabled',
    
    'WhiteAlone':'White',  
    'BlackOrAfricanAmericanAlone':'Black',
    'AmericanIndianAndAlaskaNativeAlone':'Native',
    'AsianAlone':'Asian',
    'NativeHawaiianAndOtherPacificIslanderAlone':'PacificIslander',
    'SomeOtherRaceAlone':'OtherRace',
    'NotHispanicOrLatino':'NonHispanic',
    'HispanicOrLatino':'Hispanic',
    'Origin':'',
    'HispanicOfAnyRace':'Hispanic',
    
    'TwoOrMoreRaces':'2Plus',
    'TwoRacesExcludingSomeOtherRaceAndThreeOrMoreRaces':'ExcOther',
    'TwoRacesIncludingSomeOtherRace':'IncOther',
    
    'WithOwnChildrenUnder18W':'W',
    'WithOwnChildren':'Children',
    'Only':'',
    'Under6To17':'6To17',
    
    'BelowPovertyLevel':'BPL',
    'AtOrAboveThePovertyLevel':'APL',
    
    'RatioMalesPer100Females':'SexRatio',
    
    'MarriedCoupleFamilies':'MIncome',
    'NonfamilyHouseholds':'NfIncome',
    'Households':'HIncome',
    'Families':'FIncome',
    '00000':'00k',
    '0000':'0k',
    '000':'k',
    
 }
#BLOCK LEVEL
DROP = ['or_HD01_VD01']
# Start from an empty mapping. Without this the cell silently reused the RENAME
# dict left behind by the tract-employment cell (or failed with a NameError if
# that cell had not been run).
RENAME = {}

# Column code -> description, read from the 2013 description row.
desc = {c: labelsb.loc[2013, c] for c in labelsb.columns if 'HD' in c}

for k, v in desc.items():
    # Title-case the description and strip every non-alphanumeric character,
    # then shorten it with the replacement table.
    r = ''.join(re.split('[^a-zA-Z0-9]', v.title()))
    for w1, w2 in wordreplace.items():
        r = r.replace(w1, w2)
    #if it is total, it is a repeat
    if r == 'Total':
        DROP.append(k)
    else:
        RENAME[k] = r

df_b = (
    df_b
    .drop(columns=DROP, errors='ignore')
    .rename(columns=RENAME)
    .rename(columns={'Male':'TotalMale', 'Female':'TotalFemale'})
)

In [13]:
# Tract level: derive short column names from the column descriptions.
DROP = ['rt_HD01_VD01']
RENAME = {}

# Column code -> description, taken from the first description row.
desc = {c: labelst[c].iloc[0] for c in labelst.columns if 'HD' in c or 'HC' in c}

# Descriptions containing any of these phrases mark columns to discard.
DROP_MARKERS = ('SELECTED AGE CATEGORIES', 'Margin of Error',
                'PERCENT ALLOCATED', 'SUMMARY INDICATORS')
# Descriptions that get a fixed, hand-picked name.
FIXED_NAMES = {
    'Total; Estimate; Total population': 'TotalPopulation',
    'Households; Estimate; Total': 'HTotal',
    'Families; Estimate; Total': 'FTotal',
    'Married-couple families; Estimate; Total': 'MTotal',
    'Nonfamily households; Estimate; Total': 'NfTotal',
}
# Column-code prefixes whose names are generated from the description.
GENERATED_PREFIXES = ('et', 'at', 'rt', 'm')

for code, description in desc.items():
    # A float description is a missing value (NaN): nothing to name the column by.
    if type(description) is float or any(marker in description for marker in DROP_MARKERS):
        DROP.append(code)
    elif description in FIXED_NAMES:
        RENAME[code] = FIXED_NAMES[description]
    elif code.startswith(GENERATED_PREFIXES):
        # Title-case, strip non-alphanumerics, then shorten with wordreplace.
        short = ''.join(re.split('[^a-zA-Z0-9]', description.title()))
        for long_form, short_form in wordreplace.items():
            short = short.replace(long_form, short_form)
        RENAME[code] = short

df_t = df_t.drop(columns=DROP, errors='ignore').rename(columns=RENAME)



In [14]:
# Sanity check: (rows, columns) of the combined tract-level frame.
df_t.shape

(352, 273)

In [15]:
# Sanity check: (rows, columns) of the combined block-group-level frame.
df_b.shape

(560, 101)

In [23]:
# Write both combined frames to CSV files; the relative file names resolve
# against the current working directory.
for frame, filename in ((df_t, 'CensusTract.csv'), (df_b, 'CensusBlock.csv')):
    frame.to_csv(filename)

## Tract land area clean-up

In [None]:
# NOTE(review): df_land_area is not created anywhere in the visible part of
# this notebook, so this section fails on a fresh kernel — confirm where the
# land-area table is loaded.
df_land_area.head()

In [None]:
# Split the display label on ", " and keep the first piece as the county.
new2 = df_land_area["GEO.display-label"].str.split(", ", expand=True)
df_land_area["county"] = new2.loc[:, 0]

In [None]:
# The display label has been split into its parts, so the original column goes.
df_land_area = df_land_area.drop(columns='GEO.display-label')

In [None]:
# Copy the stub label into a 'tract' column, then remove the stub column.
df_land_area = (
    df_land_area
    .assign(tract=df_land_area['GCT_STUB.display-label.1'])
    .drop(columns='GCT_STUB.display-label.1')
)

In [None]:
# Preview the frame after the county / tract columns were added.
df_land_area.head()

In [None]:
# Inspect the Python type of the 'tract' entry at label 0 (exploratory check;
# nothing below uses the result).
type(df_land_area['tract'][0])

In [None]:
# Pull the tract number out of labels of the form 'Census Tract xx.xx'.
# The result goes into a copy (df_land_area1); df_land_area stays untouched.
l = len(df_land_area['tract'])
df_land_area1 = df_land_area.copy()
# Raw string so "\d" is a regex token; the dot is escaped so it only matches
# a literal decimal point.
re_string = r'Census Tract (\d+(?:\.\d+)?)'
# Rows 0 and 1 are left as they are; only rows from 2 onward are parsed.
for i in range(2,l):
    s = df_land_area.loc[i, 'tract']
    m = re.search(re_string, s)
    if m is None:
        raise ValueError('row %d: %r is not a census tract label' % (i, s))
    df_land_area1.loc[i, 'tract'] = m.group(1)

In [None]:
# Change the tract label to bring it in line with the other source:
# 'xxx.yy' -> 'xxxyy' (integer part of 1-3 digits) and a number of 1-4 digits
# without a decimal part gets '00' appended. Everything else is left unchanged.
# Values are written with .loc on the frame itself: assigning through a Series
# taken from the frame is chained assignment and does not update the frame
# under pandas copy-on-write.
l = len(df_land_area1['tract'])
for i in range(1,l):
    tract = df_land_area1.loc[i, 'tract']
    if '.' in tract:
        whole, fraction = tract.split('.')[:2]
        if 1 <= len(whole) <= 3:
            df_land_area1.loc[i, 'tract'] = whole + fraction
    elif 1 <= len(tract) <= 4:
        df_land_area1.loc[i, 'tract'] = tract + '00'
tract_series = df_land_area1['tract']

In [None]:
# Preview the land-area copy after the tract labels were rewritten.
df_land_area1.head()

## New column for merge

In [None]:
# Merge key "<block>_<tract>".
# NOTE(review): df_inc1 is not defined in the visible part of this notebook.
block_part = df_inc1['block']
tract_part = df_inc1['tract']
df_inc1['block_tract'] = block_part + '_' + tract_part
df_inc1.head()

In [None]:
# Merge key "<block>_<tract>".
# NOTE(review): df_pop1 is not defined in the visible part of this notebook.
block_part = df_pop1['block']
tract_part = df_pop1['tract']
df_pop1['block_tract'] = block_part + '_' + tract_part
df_pop1.head()

In [None]:
# Move 'tract' to list position 14 (list.insert appends when the list is
# shorter), then drop the row labelled 1.
cols = list(df_land_area1.columns)
cols.remove('tract')
cols.insert(14, 'tract')

df_land_area1 = df_land_area1.loc[:, cols].drop(index=1)
df_land_area1.head()