### Data Merge

##### Declarations

In [2]:
#Importing utilities file with library imports and helper functions
%run "utils.ipynb"

In [3]:
# Setting pandas option to show all columns, upto 100 columns
pd.set_option('display.max_columns', 100)

In [5]:
#Loading input pickle files
df_income_1901_edited=pd.read_pickle(input_path+"\\df_income_1901_edited.pkl")
df_health_outcomes_edited=pd.read_pickle(input_path+"\\df_health_outcomes_edited.pkl")
df_sdoh_edited=pd.read_pickle(input_path+"\\df_sdoh_edited.pkl")
df_state_politics_edited=pd.read_pickle(input_path+"\\df_state_politics_edited.pkl")
df_state_public_spend_edited=pd.read_pickle(input_path+"\\df_state_public_spend_edited.pkl")
df_acs_edited=pd.read_pickle(input_path+"\\df_acs_edited.pkl")
df_zip2st_edited=pd.read_pickle(input_path+"\\df_zip2st_edited.pkl")

In [6]:
#Printing shapes to check which file needs most attention for data compression
for df in [df_income_1901_edited, df_health_outcomes_edited, df_sdoh_edited, df_state_politics_edited, df_state_public_spend_edited, df_acs_edited]:
    print(df.shape)

(33774, 15)
(1169992, 10)
(291024, 4)
(57, 3)
(255, 3)
(33120, 14)


##### ZIP level data

In [7]:
df_income_1901_edited.ZIP.nunique(), df_health_outcomes_edited.ZIP.nunique(), df_sdoh_edited.ZIP.nunique(), df_acs_edited.ZIP.nunique()

(33774, 32409, 32336, 33120)

In [8]:
df_income_1901_essential=df_income_1901_edited[['ZIP', 'Estimate Households Total', 'Percent Households lt 10k', 
                                                'Estimate Households Median income (dollars)', 'Estimate Families Total',
                                                'Estimate Families Median income (dollars)', 'Estimate Nonfamily households Total',
                                                'Estimate Nonfamily households Median income (dollars)', 'Percent Households that are Families']]
df_income_1901_essential.shape

(33774, 9)

In [9]:
df_health_outcomes_essential=df_health_outcomes_edited[['ZIP', 'TotalPopulation', 'Short_Question_Text', 'Data_Value']]
df_health_outcomes_essential_wide=df_health_outcomes_essential.pivot(index=['ZIP', 'TotalPopulation'], columns='Short_Question_Text', 
                                                                     values='Data_Value').reset_index().rename_axis(None, axis=1)
#display(df_health_outcomes_essential_wide.head())
df_health_outcomes_essential_wide.shape

(32409, 39)

In [10]:
df_sdoh_essential_wide=df_sdoh_edited.pivot(index=['ZIP', 'TotalPopulation'], columns='Measure', 
                                                                     values='Data_Value').reset_index().rename_axis(None, axis=1)
#display(df_sdoh_essential_wide.head())
df_sdoh_essential_wide.shape

(32336, 11)

##### State level data

In [11]:
#Checking if any of the state level data needs to be pivoted
print("df_state_politics_edited", df_state_politics_edited.shape, df_state_politics_edited.State.nunique())
print("df_zip2st_edited", df_zip2st_edited.shape, df_zip2st_edited.ZIP.nunique())
print("df_state_public_spend_edited", df_state_public_spend_edited.shape, df_state_public_spend_edited['State Name'].nunique())

df_state_politics_edited (57, 3) 57
df_zip2st_edited (39368, 2) 39368
df_state_public_spend_edited (255, 3) 51


In [12]:
#Turns out state_public_spend_edited will need to be pivoted as it has multiple values of year for each state
df_state_public_spend_essential_wide=df_state_public_spend_edited.pivot(
    index=['State Name'], columns='TimeFrame', values='Data').reset_index().rename_axis(
        None, axis=1).add_prefix('spend_').rename(columns={'spend_State Name':'State Name'})
df_state_public_spend_essential_wide.head()

Unnamed: 0,State Name,spend_2017,spend_2018,spend_2019,spend_2020,spend_2021
0,Alabama,56.43214,57.23,54.25,46.81,52.35
1,Alaska,114.43666,96.52,98.25,215.15,90.17
2,Arizona,8.83289,9.66,15.25,24.34,15.32
3,Arkansas,52.29329,50.57,48.23,45.9,42.94
4,California,61.69589,65.71,71.89,69.87,78.47


In [13]:
#Geographic information (Lat-Long)
geocoords=df_health_outcomes_edited[['ZIP', 'longitude', 'latitude']].drop_duplicates()
geocoords.shape, geocoords.ZIP.nunique()

((32409, 3), 32409)

##### Creating combined dataset

In [14]:
#Combined_dataset_zip contain ZIP level features. Combined_dataset_state contains State level features. Combined_dataset contains all features
combined_dataset_zip=pd.merge(pd.merge(pd.merge(df_income_1901_essential, df_health_outcomes_essential_wide, on="ZIP", how="outer"), 
         df_sdoh_essential_wide, on="ZIP", how="outer"), df_acs_edited, on="ZIP", how="outer")
combined_dataset_state=pd.merge(pd.merge(df_state_politics_edited, df_state_public_spend_essential_wide, on="State Name", how="outer"),
         df_zip2st_edited, on="State", how="outer")
combined_dataset=pd.merge(pd.merge(combined_dataset_zip, combined_dataset_state, on="ZIP", how="outer"), geocoords, on="ZIP", how="left")
combined_dataset.shape

(39503, 80)

In [15]:
combined_dataset.sample(5)

Unnamed: 0,ZIP,Estimate Households Total,Percent Households lt 10k,Estimate Households Median income (dollars),Estimate Families Total,Estimate Families Median income (dollars),Estimate Nonfamily households Total,Estimate Nonfamily households Median income (dollars),Percent Households that are Families,TotalPopulation_x,...,State Name,State,Political Affiliation (2008-2020 presidential elections),spend_2017,spend_2018,spend_2019,spend_2020,spend_2021,longitude,latitude
30510,89311,77.0,0.0,49659.0,60.0,77727.0,17.0,,0.779221,235.0,...,Nevada,NV,Deep Blue,6.75379,8.49,10.76,12.19,13.52,-114.200026,39.000789
31512,93675,1368.0,2.0,67262.0,1133.0,90234.0,235.0,29779.0,0.828216,3721.0,...,California,CA,Deep Blue,61.69589,65.71,71.89,69.87,78.47,-119.193493,36.722848
19939,57537,34.0,0.0,69079.0,29.0,69408.0,5.0,,0.852941,83.0,...,South Dakota,SD,Deep Red,36.83437,35.06,35.48,36.04,37.06,-101.092386,44.555364
5523,17551,3871.0,6.9,70196.0,2006.0,93971.0,1865.0,26931.0,0.518212,10857.0,...,Pennsylvania,PA,Light Blue,12.63415,14.5,15.06,14.99,15.75,-76.373732,39.978497
830,3261,1770.0,2.3,122294.0,1390.0,131136.0,380.0,41648.0,0.785311,4246.0,...,New Hampshire,NH,Deep Blue,22.45401,22.85,21.4,23.61,24.72,-71.209562,43.213049


In [16]:
combined_dataset['longitude'].isnull().sum()

7094

In [17]:
#Saving combined dataset
combined_dataset.to_pickle(output_path+"\\combined_dataset_anuvrat.pkl")
combined_dataset.to_csv(output_path+"\\combined_dataset_anuvrat.csv")

##### Checking if combined_dataset has values for all ZIPs

In [18]:
# % Missing values for each column. Function defined in helper functions notebook
perc_missing_num(combined_dataset)

Number of records in dataset: 39503
Missing records summary:


Unnamed: 0,column,number_missing,percent_missing
0,ZIP,3,0.007594
70,State Name,132,0.334152
71,State,132,0.334152
72,Political Affiliation (2008-2020 presidential ...,319,0.807534
73,spend_2017,319,0.807534
75,spend_2019,319,0.807534
74,spend_2018,319,0.807534
76,spend_2020,319,0.807534
77,spend_2021,2399,6.072956
4,Estimate Families Total,5729,14.502696


In [72]:
combined_dataset_nona_strict=combined_dataset.dropna()

In [73]:
# % Missing values for each column. Function defined in helper functions notebook
#perc_missing_num(combined_dataset_nona_strict)

In [74]:
combined_dataset_nona_strict.shape

(24089, 80)

In [75]:
states_in_original=combined_dataset['State Name'].unique()
states_in_nona_strict=combined_dataset_nona_strict['State Name'].unique()

In [76]:
print("States not present in nona but present in original dataset")
for state in states_in_original:
    if state not in states_in_nona_strict: print(state)

States not present in nona but present in original dataset
Puerto Rico
nan
Rhode Island
Delaware
West Virginia
Florida
Kansas
Utah
American Samoa
Northern Mariana Islands
Trust Territories
Guam
Virgin Islands


In [77]:
#combined_dataset[combined_dataset['State Name'].isin(['Kansas', 'Utah', 'Florida'])].sample(20)
print("# unique values for spend 2021 for Utah and Kansas:", 
      combined_dataset[combined_dataset['State Name'].isin(['Utah', 'Kansas'])]['spend_2021'].unique())
print("# unique values for Annual Checkup for Florida:", 
      combined_dataset[combined_dataset['State Name'].isin(['Florida'])]['Annual Checkup'].unique())

# Note: Kansas and Utah does not have Spend 2021 variable populated. 
## Note2: Florida doesn't have majority of health outcomes populated.


# unique values for spend 2021 for Utah and Kansas: [nan]
# unique values for Annual Checkup for Florida: [nan]


In [68]:
# It seems resonable to drop 'spend_2021' to get Kansas and Utah back in the data. 
# However, I can't see a path to include Florida. 
# We can probably take it as a separate case and verify our findings from rest of the data.
combined_dataset_nona=combined_dataset.drop(columns=['spend_2021']).dropna()

In [69]:
# % Missing values for each column. Function defined in helper functions notebook
#perc_missing_num(combined_dataset_nona)

In [70]:
states_in_nona=combined_dataset_nona['State Name'].unique()
print("States not present in nona but present in original dataset")
for state in states_in_original:
    if state not in states_in_nona: print(state)

States not present in nona but present in original dataset
Puerto Rico
nan
Florida
American Samoa
Northern Mariana Islands
Trust Territories
Guam
Virgin Islands


In [71]:
#Summary of #ZIPs by State to see national coverage. Unfortunately Florida is not included
combined_dataset_nona[['State Name', 'ZIP']].groupby(by='State Name').count()

Unnamed: 0_level_0,ZIP
State Name,Unnamed: 1_level_1
Alabama,514
Alaska,138
Arizona,335
Arkansas,410
California,1408
Colorado,390
Connecticut,250
Delaware,54
District of Columbia,21
Georgia,601


In [94]:
combined_dataset_nona_florida=combined_dataset[combined_dataset['State Name'].isin(['Florida'])].dropna(axis=1, how='all').dropna()
combined_dataset_nona_florida.shape

(893, 51)

Insights:
<ol>
<li>29 health outcomes features don't have data for 8,068 or 20% ZIPs. Most of these ZIPs should be 0 or low population as per Census procedure documented in the notes for this dataset.</li>
<li>Another 21 features don't have data for at least 7,000 or more ZIPs.</li>
<li>Another 18 features don't have data for at least 5,704 or more ZIPs.</li>
<li>Removing all ZIPs with missing data leaves us with 24,089 ZIPs (61%) out of 39,503 ZIPs in the original combined dataset. But it will exclude Kansas, Utah and Florida all together.</li>
<li>Kansas and Utah do not have Spend 2021 variable populated. Florida doesn't have majority of health outcomes populated.</li>
<li>Recommend to  drop spend 2021 to get Kansas and Utah back in the data. </li>
<li>However, I can't see a path to include Florida. We can probably take it as a separate case and verify our findings from rest of the data.</li>
<li>This leaves us with 25,321 ZIPs with 80 features in the primary dataset and 893 ZIPs with 51 features in Florida dataset. </li>
<li>This seems like a reasonable sample size with national coverage. Dropping ZIPs with missing data will remove the noise.</li>
<li>Recommend to move ahead with 25K ZIPs for further analysis.</li>
</ol>

##### Saving dataset without any missing information for any ZIP

In [91]:
#Saving dataset without any missing information
combined_dataset_nona.to_pickle(output_path+"\\combined_dataset_nona.pkl")
combined_dataset_nona.to_csv(output_path+"\\combined_dataset_nona.csv")

#Saving separate dataset for Florida
combined_dataset_nona_florida.to_pickle(output_path+"\\combined_dataset_nona_florida.pkl")
combined_dataset_nona_florida.to_csv(output_path+"\\combined_dataset_nona_florida.csv")