In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

Load EPA data (EJSCREEN 2023 tract-level CSV files stored locally under ./data/epa_data):

In [2]:
# Read the four county-level EJSCREEN tract files in one pass; the names on the
# left are bound in the same order as the paths listed below.
epa_suffolk, epa_LA, epa_harris, epa_dallas = (
    pd.read_csv(csv_path, encoding='latin-1', low_memory=False)
    for csv_path in (
        './data/epa_data/EJSCREEN_2023_Tracts_Suffolk.csv',
        './data/epa_data/EJSCREEN_2023_Tracts_LA.csv',
        './data/epa_data/EJSCREEN_2023_Tracts_Harris.csv',
        './data/epa_data/EJSCREEN_2023_Tracts_Dallas.csv',
    )
)

In [4]:
#epa_suffolk.info()

Load ACS data (2021 5-year DP05 tables stored locally under ./data/census_data):

In [129]:
# acs_suffolk = pd.read_csv('./data/census_data/Suffolk_County_Cleaned_Data.csv', 
#                        encoding='latin-1', low_memory=False)
# acs_LA = pd.read_csv('./data/census_data/LA_County_Cleaned_Data_With-Header.csv', 
#                        encoding='latin-1', low_memory=False)
# acs_harris = pd.read_csv('./data/census_data/Harris_County_Cleaned_Data.csv', 
#                        encoding='latin-1', low_memory=False)

In [3]:
# Read the four county-level ACS DP05 files in one pass; the names on the left
# are bound in the same order as the paths listed below.
acs_suffolk, acs_LA, acs_harris, acs_dallas = (
    pd.read_csv(csv_path, encoding='latin-1', low_memory=False)
    for csv_path in (
        './data/census_data/ACSDP5Y2021.DP05-Data_Suffolk.csv',
        './data/census_data/ACSDP5Y2021.DP05-Data_LA.csv',
        './data/census_data/ACSDP5Y2021.DP05-Data_Harris.csv',
        './data/census_data/ACSDP5Y2021.DP05-Data_Dallas.csv',
    )
)

In [4]:
# ACS columns to keep: the two identifier columns, then an estimate (E) and
# margin-of-error (M) column for each selected DP05 row code.
# ADDED: "White Only Pop" - DP05_0037
acs_columns = ['GEO_ID', 'NAME'] + [
    f'DP05_{row_code}{kind}'
    for row_code in ('0063', '0037', '0064', '0065', '0067', '0071')
    for kind in ('E', 'M')
]

In [5]:
# Row 0 of the raw ACS table carries the human-readable description of every
# column; keep it as a one-row frame so the descriptions can be looked up later.
acs_col_info = acs_suffolk.loc[acs_suffolk.index.isin([0])]

# Show each selected column code next to its description.
for col in acs_columns:
    print(col, ':', acs_col_info.loc[0, col])

GEO_ID : Geography
NAME : Geographic Area Name
DP05_0063E : Estimate!!Race alone or in combination with one or more other races!!Total population
DP05_0063M : Margin of Error!!Race alone or in combination with one or more other races!!Total population
DP05_0037E : Estimate!!RACE!!Total population!!One race!!White
DP05_0037M : Margin of Error!!RACE!!Total population!!One race!!White
DP05_0064E : Estimate!!Race alone or in combination with one or more other races!!Total population!!White
DP05_0064M : Margin of Error!!Race alone or in combination with one or more other races!!Total population!!White
DP05_0065E : Estimate!!Race alone or in combination with one or more other races!!Total population!!Black or African American
DP05_0065M : Margin of Error!!Race alone or in combination with one or more other races!!Total population!!Black or African American
DP05_0067E : Estimate!!Race alone or in combination with one or more other races!!Total population!!Asian
DP05_0067M : Margin of Error!!R

In [6]:
# Keep only the selected ACS columns and discard the description row
# (index label 0), leaving the data rows only.
acs_suffolk_clean = acs_suffolk.loc[:, acs_columns].drop(index=0)
acs_LA_clean = acs_LA.loc[:, acs_columns].drop(index=0)
acs_harris_clean = acs_harris.loc[:, acs_columns].drop(index=0)
acs_dallas_clean = acs_dallas.loc[:, acs_columns].drop(index=0)

acs_suffolk_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 1 to 235
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   GEO_ID      235 non-null    object
 1   NAME        235 non-null    object
 2   DP05_0063E  235 non-null    object
 3   DP05_0063M  235 non-null    object
 4   DP05_0037E  235 non-null    object
 5   DP05_0037M  235 non-null    object
 6   DP05_0064E  235 non-null    object
 7   DP05_0064M  235 non-null    object
 8   DP05_0065E  235 non-null    object
 9   DP05_0065M  235 non-null    object
 10  DP05_0067E  235 non-null    object
 11  DP05_0067M  235 non-null    object
 12  DP05_0071E  235 non-null    object
 13  DP05_0071M  235 non-null    object
dtypes: object(14)
memory usage: 25.8+ KB


In [8]:
# Every selected column after GEO_ID and NAME holds a count stored as text.
int_values = np.array(acs_columns[2:])

# Convert those columns from str to int, in place, for each county frame.
for acs_frame in (acs_suffolk_clean, acs_LA_clean, acs_harris_clean, acs_dallas_clean):
    acs_frame[int_values] = acs_frame[int_values].astype(int)

acs_suffolk_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 1 to 235
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   GEO_ID      235 non-null    object
 1   NAME        235 non-null    object
 2   DP05_0063E  235 non-null    int64 
 3   DP05_0063M  235 non-null    int64 
 4   DP05_0037E  235 non-null    int64 
 5   DP05_0037M  235 non-null    int64 
 6   DP05_0064E  235 non-null    int64 
 7   DP05_0064M  235 non-null    int64 
 8   DP05_0065E  235 non-null    int64 
 9   DP05_0065M  235 non-null    int64 
 10  DP05_0067E  235 non-null    int64 
 11  DP05_0067M  235 non-null    int64 
 12  DP05_0071E  235 non-null    int64 
 13  DP05_0071M  235 non-null    int64 
dtypes: int64(12), object(2)
memory usage: 25.8+ KB


In [9]:
def clean_column(text):
    """Convert a Census GEO_ID string into an integer tract ID.

    The first nine characters (the ``1400000US`` prefix in this data) are
    dropped and the remainder is parsed as a base-10 integer, e.g.
    ``'1400000US48201100001'`` -> ``48201100001``. Because the result is an
    ``int``, any leading zeros in the remaining digits are not preserved.

    Parameters
    ----------
    text : str
        GEO_ID value such as ``'1400000US25025000101'``.

    Returns
    -------
    int
        The numeric part of the identifier.

    Raises
    ------
    ValueError
        If the characters after the ninth do not form a valid integer.
    """
    geoid_prefix_length = 9
    tract_digits = text[geoid_prefix_length:]
    return int(tract_digits)

# Smoke test: convert the GEO_ID of the row labelled 1 in the Harris frame.
id_test = acs_harris_clean.loc[1, 'GEO_ID']
clean_column(id_test)

48201100001

In [10]:
# add ID column
# Insert an integer `ID` column (derived from GEO_ID via clean_column) at
# position 1 of each cleaned ACS frame. DataFrame.insert works in place and
# raises ValueError when the column already exists, so the insert is guarded
# to keep this cell safe to re-run.
for acs_frame in (acs_suffolk_clean, acs_LA_clean, acs_harris_clean, acs_dallas_clean):
    if 'ID' not in acs_frame.columns:
        acs_frame.insert(loc=1,
                         column='ID',
                         value=acs_frame['GEO_ID'].apply(clean_column))

acs_suffolk_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 1 to 235
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   GEO_ID      235 non-null    object
 1   ID          235 non-null    int64 
 2   NAME        235 non-null    object
 3   DP05_0063E  235 non-null    int64 
 4   DP05_0063M  235 non-null    int64 
 5   DP05_0037E  235 non-null    int64 
 6   DP05_0037M  235 non-null    int64 
 7   DP05_0064E  235 non-null    int64 
 8   DP05_0064M  235 non-null    int64 
 9   DP05_0065E  235 non-null    int64 
 10  DP05_0065M  235 non-null    int64 
 11  DP05_0067E  235 non-null    int64 
 12  DP05_0067M  235 non-null    int64 
 13  DP05_0071E  235 non-null    int64 
 14  DP05_0071M  235 non-null    int64 
dtypes: int64(13), object(2)
memory usage: 27.7+ KB


In [11]:
# Preview the first rows of the cleaned Suffolk ACS frame (now including `ID`).
acs_suffolk_clean.head()

Unnamed: 0,GEO_ID,ID,NAME,DP05_0063E,DP05_0063M,DP05_0037E,DP05_0037M,DP05_0064E,DP05_0064M,DP05_0065E,DP05_0065M,DP05_0067E,DP05_0067M,DP05_0071E,DP05_0071M
1,1400000US25025000101,25025000101,"Census Tract 1.01, Suffolk County, Massachusetts",1824,331,1348,326,1377,326,127,91,329,151,135,87
2,1400000US25025000102,25025000102,"Census Tract 1.02, Suffolk County, Massachusetts",3706,750,2109,788,2118,791,423,240,771,271,531,309
3,1400000US25025000201,25025000201,"Census Tract 2.01, Suffolk County, Massachusetts",4309,684,2914,441,3313,652,418,249,511,377,408,177
4,1400000US25025000202,25025000202,"Census Tract 2.02, Suffolk County, Massachusetts",4094,475,2479,379,2853,469,431,305,495,197,1104,339
5,1400000US25025000301,25025000301,"Census Tract 3.01, Suffolk County, Massachusetts",2827,269,2026,337,2095,341,228,124,451,207,309,145


In [12]:
# Preview the first rows of the Suffolk EPA frame; its `ID` column is the merge key.
epa_suffolk.head()

Unnamed: 0,ID,STATE_NAME,ST_ABBREV,CNTY_NAME,REGION,PM25,OZONE,DSLPM,CANCER,RESP,RSEI_AIR,PTRAF,PRE1960,PRE1960PCT,PNPL,PRMP,PTSDF,UST,PWDIS
0,25025000101,Massachusetts,MA,Suffolk,1,7.231061,57.52787,0.421661,30.0,0.3,4327.941599,1072.806706,397.0,0.686851,0.08276,0.155476,26.33804,4.707373,0.002001
1,25025000102,Massachusetts,MA,Suffolk,1,7.231061,57.52787,0.421661,30.0,0.3,4332.727526,1812.831306,487.0,0.326846,0.083028,0.141833,23.256403,3.936035,0.003599
2,25025000201,Massachusetts,MA,Suffolk,1,7.237915,57.31368,0.342112,20.0,0.3,4356.252038,863.891322,1308.0,0.709712,0.085904,0.120828,13.3851,1.433462,0.00231
3,25025000202,Massachusetts,MA,Suffolk,1,7.241805,57.40047,0.345565,20.0,0.3,4422.413875,736.756283,1179.0,0.763107,0.087418,0.130583,19.255717,2.800577,0.001759
4,25025000301,Massachusetts,MA,Suffolk,1,7.238449,57.24225,0.339133,20.0,0.3,4339.000103,1137.412735,523.0,0.422797,0.084361,0.112525,9.918938,1.540744,0.001451


## Merge ACS and EPA

In [13]:
# Inner-join each county's cleaned ACS frame (left) with its EPA frame (right)
# on the shared integer `ID` column.
acs_epa_merged_suffolk = acs_suffolk_clean.merge(epa_suffolk, on='ID', how='inner')
acs_epa_merged_LA = acs_LA_clean.merge(epa_LA, on='ID', how='inner')
acs_epa_merged_harris = acs_harris_clean.merge(epa_harris, on='ID', how='inner')
acs_epa_merged_dallas = acs_dallas_clean.merge(epa_dallas, on='ID', how='inner')

In [15]:
def _promote_location_columns(frame):
    """Move the four location columns to positions 3-6 of ``frame``, in place.

    Each of STATE_NAME, ST_ABBREV, CNTY_NAME and REGION is popped from its
    current position and re-inserted at index 3, 4, 5 and 6 respectively.
    The same (mutated) DataFrame object is returned.
    """
    location_columns = ('STATE_NAME', 'ST_ABBREV', 'CNTY_NAME', 'REGION')
    for position, column in enumerate(location_columns, start=3):
        frame.insert(position, column, frame.pop(column))
    return frame


# re-order columns
# Each `*_reordered_*` name refers to the same object as the corresponding
# merged frame, which is modified in place.
acs_epa_merged_reordered_suffolk = _promote_location_columns(acs_epa_merged_suffolk)
acs_epa_merged_reordered_LA = _promote_location_columns(acs_epa_merged_LA)
acs_epa_merged_reordered_harris = _promote_location_columns(acs_epa_merged_harris)
acs_epa_merged_reordered_dallas = _promote_location_columns(acs_epa_merged_dallas)

# Keep the module-level alias `df` pointing at the Dallas frame for any later
# cell that refers to it.
df = acs_epa_merged_reordered_dallas

In [16]:
# Inspect the merged Suffolk frame after re-ordering: column order, non-null counts and dtypes.
acs_epa_merged_reordered_suffolk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   GEO_ID      235 non-null    object 
 1   ID          235 non-null    int64  
 2   NAME        235 non-null    object 
 3   STATE_NAME  235 non-null    object 
 4   ST_ABBREV   235 non-null    object 
 5   CNTY_NAME   235 non-null    object 
 6   REGION      235 non-null    int64  
 7   DP05_0063E  235 non-null    int64  
 8   DP05_0063M  235 non-null    int64  
 9   DP05_0037E  235 non-null    int64  
 10  DP05_0037M  235 non-null    int64  
 11  DP05_0064E  235 non-null    int64  
 12  DP05_0064M  235 non-null    int64  
 13  DP05_0065E  235 non-null    int64  
 14  DP05_0065M  235 non-null    int64  
 15  DP05_0067E  235 non-null    int64  
 16  DP05_0067M  235 non-null    int64  
 17  DP05_0071E  235 non-null    int64  
 18  DP05_0071M  235 non-null    int64  
 19  PM25        235 non-null    f

In [17]:
# Show each selected ACS column code next to its description from row 0.
for col in acs_columns:
    print(col, ':', acs_col_info.loc[0, col])

GEO_ID : Geography
NAME : Geographic Area Name
DP05_0063E : Estimate!!Race alone or in combination with one or more other races!!Total population
DP05_0063M : Margin of Error!!Race alone or in combination with one or more other races!!Total population
DP05_0037E : Estimate!!RACE!!Total population!!One race!!White
DP05_0037M : Margin of Error!!RACE!!Total population!!One race!!White
DP05_0064E : Estimate!!Race alone or in combination with one or more other races!!Total population!!White
DP05_0064M : Margin of Error!!Race alone or in combination with one or more other races!!Total population!!White
DP05_0065E : Estimate!!Race alone or in combination with one or more other races!!Total population!!Black or African American
DP05_0065M : Margin of Error!!Race alone or in combination with one or more other races!!Total population!!Black or African American
DP05_0067E : Estimate!!Race alone or in combination with one or more other races!!Total population!!Asian
DP05_0067M : Margin of Error!!R

In [18]:
# define new acs col names
# Each DP05 row code maps to a readable label: the estimate column (suffix E)
# gets the label itself, the margin-of-error column (suffix M) gets
# label + '_Error'.
new_col_names = {
    f'DP05_{row_code}{kind}': label + label_suffix
    for row_code, label in (
        ('0063', 'Total_Pop'),
        ('0037', 'White_Only_Pop'),
        ('0064', 'White_Pop'),
        ('0065', 'Black_or_African_American_Pop'),
        ('0067', 'Asian_Pop'),
        ('0071', 'Hispanic_or_Latino_Pop'),
    )
    for kind, label_suffix in (('E', ''), ('M', '_Error'))
}

In [19]:
# Replace the DP05 codes with the readable labels from `new_col_names`;
# rename returns a new frame, so each name is rebound to the renamed copy.
acs_epa_merged_reordered_suffolk = acs_epa_merged_reordered_suffolk.rename(new_col_names, axis='columns')
acs_epa_merged_reordered_LA = acs_epa_merged_reordered_LA.rename(new_col_names, axis='columns')
acs_epa_merged_reordered_harris = acs_epa_merged_reordered_harris.rename(new_col_names, axis='columns')
acs_epa_merged_reordered_dallas = acs_epa_merged_reordered_dallas.rename(new_col_names, axis='columns')

In [20]:
# Inspect the Suffolk frame after renaming: the DP05 codes should now show as readable labels.
acs_epa_merged_reordered_suffolk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 33 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   GEO_ID                               235 non-null    object 
 1   ID                                   235 non-null    int64  
 2   NAME                                 235 non-null    object 
 3   STATE_NAME                           235 non-null    object 
 4   ST_ABBREV                            235 non-null    object 
 5   CNTY_NAME                            235 non-null    object 
 6   REGION                               235 non-null    int64  
 7   Total_Pop                            235 non-null    int64  
 8   Total_Pop_Error                      235 non-null    int64  
 9   White_Only_Pop                       235 non-null    int64  
 10  White_Only_Pop_Error                 235 non-null    int64  
 11  White_Pop                       

In [23]:
# Inspect the renamed LA frame: row count, column names and dtypes.
acs_epa_merged_reordered_LA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2498 entries, 0 to 2497
Data columns (total 33 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   GEO_ID                               2498 non-null   object 
 1   ID                                   2498 non-null   int64  
 2   NAME                                 2498 non-null   object 
 3   STATE_NAME                           2498 non-null   object 
 4   ST_ABBREV                            2498 non-null   object 
 5   CNTY_NAME                            2498 non-null   object 
 6   REGION                               2498 non-null   int64  
 7   Total_Pop                            2498 non-null   int64  
 8   Total_Pop_Error                      2498 non-null   int64  
 9   White_Only_Pop                       2498 non-null   int64  
 10  White_Only_Pop_Error                 2498 non-null   int64  
 11  White_Pop                     

In [21]:
# Inspect the renamed Harris frame: row count, column names and dtypes.
acs_epa_merged_reordered_harris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 33 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   GEO_ID                               1115 non-null   object 
 1   ID                                   1115 non-null   int64  
 2   NAME                                 1115 non-null   object 
 3   STATE_NAME                           1115 non-null   object 
 4   ST_ABBREV                            1115 non-null   object 
 5   CNTY_NAME                            1115 non-null   object 
 6   REGION                               1115 non-null   int64  
 7   Total_Pop                            1115 non-null   int64  
 8   Total_Pop_Error                      1115 non-null   int64  
 9   White_Only_Pop                       1115 non-null   int64  
 10  White_Only_Pop_Error                 1115 non-null   int64  
 11  White_Pop                     

In [22]:
# Inspect the renamed Dallas frame: row count, column names and dtypes.
acs_epa_merged_reordered_dallas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 645 entries, 0 to 644
Data columns (total 33 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   GEO_ID                               645 non-null    object 
 1   ID                                   645 non-null    int64  
 2   NAME                                 645 non-null    object 
 3   STATE_NAME                           645 non-null    object 
 4   ST_ABBREV                            645 non-null    object 
 5   CNTY_NAME                            645 non-null    object 
 6   REGION                               645 non-null    int64  
 7   Total_Pop                            645 non-null    int64  
 8   Total_Pop_Error                      645 non-null    int64  
 9   White_Only_Pop                       645 non-null    int64  
 10  White_Only_Pop_Error                 645 non-null    int64  
 11  White_Pop                       

In [24]:
# Write each merged, renamed county frame to its own CSV file, without the
# row index.
for output_path, merged_frame in (
    ("./data/census_data/suffolk_merged_acs+epa_tracts_rename.csv", acs_epa_merged_reordered_suffolk),
    ("./data/census_data/LA_merged_acs+epa_tracts_rename.csv", acs_epa_merged_reordered_LA),
    ("./data/census_data/harris_merged_acs+epa_tracts_rename.csv", acs_epa_merged_reordered_harris),
    ("./data/census_data/dallas_merged_acs+epa_tracts_rename.csv", acs_epa_merged_reordered_dallas),
):
    merged_frame.to_csv(output_path, index=False)

In [38]:
# test
# Round-trip check: reload the merged CSV files. DataFrame.to_csv writes UTF-8
# by default, so the files are read back with pandas' default (UTF-8) decoding
# rather than 'latin-1', which would garble any non-ASCII character.
suffolk_test = pd.read_csv('./data/census_data/suffolk_merged_acs+epa_tracts_rename.csv', low_memory=False)
LA_test = pd.read_csv('./data/census_data/LA_merged_acs+epa_tracts_rename.csv', low_memory=False)
harris_test = pd.read_csv('./data/census_data/harris_merged_acs+epa_tracts_rename.csv', low_memory=False)
dallas_test = pd.read_csv('./data/census_data/dallas_merged_acs+epa_tracts_rename.csv', low_memory=False)

# Each reloaded table must have the same dimensions as the in-memory frame it
# was saved from.
assert suffolk_test.shape == acs_epa_merged_reordered_suffolk.shape
assert LA_test.shape == acs_epa_merged_reordered_LA.shape
assert harris_test.shape == acs_epa_merged_reordered_harris.shape
assert dallas_test.shape == acs_epa_merged_reordered_dallas.shape

suffolk_test
#LA_test
#harris_test
#dallas_test

Unnamed: 0,GEO_ID,ID,NAME,STATE_NAME,ST_ABBREV,CNTY_NAME,REGION,Total_Pop,Total_Pop_Error,White_Only_Pop,...,RESP,RSEI_AIR,PTRAF,PRE1960,PRE1960PCT,PNPL,PRMP,PTSDF,UST,PWDIS
0,1400000US25025000101,25025000101,"Census Tract 1.01, Suffolk County, Massachusetts",Massachusetts,MA,Suffolk,1,1824,331,1348,...,0.3,4327.941599,1072.806706,397.0,0.686851,0.082760,0.155476,26.338040,4.707373,0.002001
1,1400000US25025000102,25025000102,"Census Tract 1.02, Suffolk County, Massachusetts",Massachusetts,MA,Suffolk,1,3706,750,2109,...,0.3,4332.727526,1812.831306,487.0,0.326846,0.083028,0.141833,23.256403,3.936035,0.003599
2,1400000US25025000201,25025000201,"Census Tract 2.01, Suffolk County, Massachusetts",Massachusetts,MA,Suffolk,1,4309,684,2914,...,0.3,4356.252038,863.891322,1308.0,0.709712,0.085904,0.120828,13.385100,1.433462,0.002310
3,1400000US25025000202,25025000202,"Census Tract 2.02, Suffolk County, Massachusetts",Massachusetts,MA,Suffolk,1,4094,475,2479,...,0.3,4422.413875,736.756283,1179.0,0.763107,0.087418,0.130583,19.255717,2.800577,0.001759
4,1400000US25025000301,25025000301,"Census Tract 3.01, Suffolk County, Massachusetts",Massachusetts,MA,Suffolk,1,2827,269,2026,...,0.3,4339.000103,1137.412735,523.0,0.422797,0.084361,0.112525,9.918938,1.540744,0.001451
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,1400000US25025981600,25025981600,"Census Tract 9816, Suffolk County, Massachusetts",Massachusetts,MA,Suffolk,1,0,13,0,...,,3044.930911,56.346821,0.0,0.000000,0.061235,0.244218,7.628301,1.825356,0.245649
231,1400000US25025981700,25025981700,"Census Tract 9817, Suffolk County, Massachusetts",Massachusetts,MA,Suffolk,1,0,13,0,...,,3628.924545,2765.808744,0.0,0.000000,0.080886,0.603712,52.473968,5.237528,0.005129
232,1400000US25025981800,25025981800,"Census Tract 9818, Suffolk County, Massachusetts",Massachusetts,MA,Suffolk,1,47,44,10,...,0.3,4853.382243,1418.707751,0.0,0.000000,0.112089,0.303752,35.614694,12.912027,0.002831
233,1400000US25025981900,25025981900,"Census Tract 9819, Suffolk County, Massachusetts",Massachusetts,MA,Suffolk,1,0,13,0,...,0.3,0.000000,,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,


## Check for tracts missing from either dataset

In [27]:
# Outer-join ACS (left) and EPA (right) on `ID`; the `_merge` indicator column
# records which side each row came from.
merged = acs_suffolk_clean.merge(epa_suffolk, on='ID', how='outer', indicator=True)
# 'right_only' rows exist only in EPA, 'left_only' rows only in ACS.
missing_from_acs_suffolk = merged.loc[merged['_merge'] == 'right_only']
missing_from_epa_suffolk = merged.loc[merged['_merge'] == 'left_only']
print('suffolk: missing from acs = ', missing_from_acs_suffolk.shape[0])
print('suffolk: missing from epa = ', missing_from_epa_suffolk.shape[0])

suffolk: missing from acs =  0
suffolk: missing from epa =  0


In [28]:
# Outer-join ACS (left) and EPA (right) on `ID`; the `_merge` indicator column
# records which side each row came from.
merged = acs_LA_clean.merge(epa_LA, on='ID', how='outer', indicator=True)
# 'right_only' rows exist only in EPA, 'left_only' rows only in ACS.
missing_from_acs_LA = merged.loc[merged['_merge'] == 'right_only']
missing_from_epa_LA = merged.loc[merged['_merge'] == 'left_only']
print('LA: missing from acs = ', missing_from_acs_LA.shape[0])
print('LA: missing from epa = ', missing_from_epa_LA.shape[0])

LA: missing from acs =  0
LA: missing from epa =  0


In [29]:
# Outer-join ACS (left) and EPA (right) on `ID`; the `_merge` indicator column
# records which side each row came from.
merged = acs_harris_clean.merge(epa_harris, on='ID', how='outer', indicator=True)
# 'right_only' rows exist only in EPA, 'left_only' rows only in ACS.
missing_from_acs_harris = merged.loc[merged['_merge'] == 'right_only']
missing_from_epa_harris = merged.loc[merged['_merge'] == 'left_only']
print('harris: missing from acs = ', missing_from_acs_harris.shape[0])
print('harris: missing from epa = ', missing_from_epa_harris.shape[0])

harris: missing from acs =  0
harris: missing from epa =  0


In [30]:
# Outer-join the Dallas ACS (left) and EPA (right) frames on `ID`; the `_merge`
# indicator column records which side each row came from.
merged = pd.merge(acs_dallas_clean, epa_dallas, how='outer', on='ID', indicator=True)
# 'left_only' rows exist only in ACS, 'right_only' rows only in EPA.
missing_from_epa_dallas = merged[merged['_merge'] == 'left_only']
missing_from_acs_dallas = merged[merged['_merge'] == 'right_only']
# Label the counts with the county actually checked (they previously said 'harris').
print('dallas: missing from acs = ', len(missing_from_acs_dallas))
print('dallas: missing from epa = ', len(missing_from_epa_dallas))

harris: missing from acs =  0
harris: missing from epa =  0


In [None]:
# Scratch template for renaming columns: 'oldName1'/'oldName2' are placeholder labels.
# NOTE(review): no frame built in this notebook appears to have such columns, so this
# presumably leaves the contents of `df` unchanged — confirm, then fill in real names or drop the cell.
df = df.rename(columns={'oldName1': 'newName1', 'oldName2': 'newName2'})