### Data Merge

##### Declarations

In [1]:
#Importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
#Importing helper functions
%run "helper_functions.ipynb"

In [94]:
#Raising the display cap so that up to 10,000 rows are shown when a DataFrame is printed.
#NOTE(review): an earlier comment described this as disabling scientific notation, but
#'display.max_rows' only controls how many rows are rendered - confirm which setting was intended.
pd.set_option('display.max_rows', 10000)

In [3]:
#Folder used both to load the input pickle files and to save the pickle produced by this notebook.
#NOTE(review): input_path and output_path point at the same "inputs" folder - confirm this is intended.
input_path=output_path="C:\\Users\\Anuvrat\\OneDrive\\Documents\\milestone1\\data\\inputs"

In [108]:
#Reading every pre-processed frame from its pickle file under input_path
def _read_input_pickle(frame_name):
    """Load <frame_name>.pkl from the input_path folder and return the unpickled object."""
    return pd.read_pickle(input_path+"\\"+frame_name+".pkl")

df_income_1901_edited=_read_input_pickle("df_income_1901_edited")
df_health_outcomes_edited=_read_input_pickle("df_health_outcomes_edited")
df_sdoh_edited=_read_input_pickle("df_sdoh_edited")
df_state_politics_edited=_read_input_pickle("df_state_politics_edited")
df_state_public_spend_edited=_read_input_pickle("df_state_public_spend_edited")
df_acs_edited=_read_input_pickle("df_acs_edited")
df_zip2st_edited=_read_input_pickle("df_zip2st_edited")

In [36]:
#One shape per line, to see which frame is largest and most in need of data compression
print(*(frame.shape for frame in (
    df_income_1901_edited,
    df_health_outcomes_edited,
    df_sdoh_edited,
    df_state_politics_edited,
    df_state_public_spend_edited,
    df_acs_edited,
)), sep="\n")

(33774, 15)
(1169992, 10)
(291024, 5)
(57, 3)
(515, 6)
(33120, 14)


##### ZIP level data

In [71]:
#Distinct ZIP counts, in the order: income, health outcomes, SDOH, ACS
tuple(frame.ZIP.nunique() for frame in (
    df_income_1901_edited,
    df_health_outcomes_edited,
    df_sdoh_edited,
    df_acs_edited,
))

(33774, 32409, 32336, 33120)

In [79]:
#Keeping ZIP plus the household, family and non-family income columns used in the combined dataset
df_income_1901_essential=df_income_1901_edited.loc[:, [
    'ZIP',
    'Estimate Households Total',
    'Percent Households lt 10k',
    'Estimate Households Median income (dollars)',
    'Estimate Families Total',
    'Estimate Families Median income (dollars)',
    'Estimate Nonfamily households Total',
    'Estimate Nonfamily households Median income (dollars)',
    'Percent Households that are Families',
]]
df_income_1901_essential.shape

(33774, 9)

In [68]:
#Long to wide: one row per (ZIP, TotalPopulation), one column per Short_Question_Text holding its Data_Value
df_health_outcomes_essential=df_health_outcomes_edited.loc[
    :, ['ZIP', 'TotalPopulation', 'Short_Question_Text', 'Data_Value']
]
df_health_outcomes_essential_wide=(
    df_health_outcomes_essential
    .pivot(index=['ZIP', 'TotalPopulation'], columns='Short_Question_Text', values='Data_Value')
    .reset_index()              #ZIP and TotalPopulation become ordinary columns again
    .rename_axis(None, axis=1)  #dropping the leftover 'Short_Question_Text' label on the column axis
)
df_health_outcomes_essential_wide.shape

(32409, 39)

In [77]:
#Long to wide: one row per (ZIP, TotalPopulation), one column per Measure holding its Data_Value
df_sdoh_essential_wide=(
    df_sdoh_edited
    .pivot(index=['ZIP', 'TotalPopulation'], columns='Measure', values='Data_Value')
    .reset_index()              #ZIP and TotalPopulation become ordinary columns again
    .rename_axis(None, axis=1)  #dropping the leftover 'Measure' label on the column axis
)
df_sdoh_essential_wide.shape

(32336, 11)

##### State level data

In [126]:
#Comparing row counts with distinct key counts: a frame with more rows than keys has to be pivoted
print(f"df_state_politics_edited {df_state_politics_edited.shape} {df_state_politics_edited.State.nunique()}")
print(f"df_zip2st_edited {df_zip2st_edited.shape} {df_zip2st_edited.ZIP.nunique()}")
print(f"df_state_public_spend_edited {df_state_public_spend_edited.shape} {df_state_public_spend_edited['State Name'].nunique()}")

df_state_politics_edited (57, 3) 57
df_zip2st_edited (39368, 2) 39368
df_state_public_spend_edited (255, 3) 51


In [118]:
#state_public_spend_edited holds several TimeFrame rows per state, so it is pivoted to one row per state
#with one 'spend_<TimeFrame>' column per period
df_state_public_spend_essential_wide=(
    df_state_public_spend_edited
    .pivot(index=['State Name'], columns='TimeFrame', values='Data')
    .add_prefix('spend_')       #prefixing while 'State Name' is still the index, so only the TimeFrame columns are renamed
    .reset_index()
    .rename_axis(None, axis=1)  #dropping the leftover 'TimeFrame' label on the column axis
)
df_state_public_spend_essential_wide.head()

Unnamed: 0,State Name,spend_2017,spend_2018,spend_2019,spend_2020,spend_2021
0,Alabama,56.43214,57.23,54.25,46.81,52.35
1,Alaska,114.43666,96.52,98.25,215.15,90.17
2,Arizona,8.83289,9.66,15.25,24.34,15.32
3,Arkansas,52.29329,50.57,48.23,45.9,42.94
4,California,61.69589,65.71,71.89,69.87,78.47


##### Creating combined dataset

In [127]:
#combined_dataset_zip: ZIP level features (income, health outcomes, SDOH, ACS), outer-joined on ZIP.
#TotalPopulation is present in both pivoted frames, so pandas suffixes it
#(TotalPopulation_x from health outcomes, TotalPopulation_y from SDOH).
combined_dataset_zip=(
    df_income_1901_essential
    .merge(df_health_outcomes_essential_wide, on="ZIP", how="outer")
    .merge(df_sdoh_essential_wide, on="ZIP", how="outer")
    .merge(df_acs_edited, on="ZIP", how="outer")
)
#combined_dataset_state: State level features (politics and public spend joined on State Name),
#then attached to ZIPs through the ZIP-to-State lookup
combined_dataset_state=(
    df_state_politics_edited
    .merge(df_state_public_spend_essential_wide, on="State Name", how="outer")
    .merge(df_zip2st_edited, on="State", how="outer")
)
#combined_dataset: all features, ZIP level and State level, outer-joined on ZIP
combined_dataset=combined_dataset_zip.merge(combined_dataset_state, on="ZIP", how="outer")
combined_dataset.shape

(39503, 78)

In [128]:
#Saving the combined dataset as a pickle file under output_path
combined_dataset.to_pickle(f"{output_path}\\combined_dataset_anuvrat.pkl")

In [131]:
#Peeking at 5 random rows of the combined dataset; the fixed random_state keeps the displayed
#rows identical across re-runs of the notebook (42 is an arbitrary seed)
combined_dataset.sample(5, random_state=42)

Unnamed: 0,ZIP,Estimate Households Total,Percent Households lt 10k,Estimate Households Median income (dollars),Estimate Families Total,Estimate Families Median income (dollars),Estimate Nonfamily households Total,Estimate Nonfamily households Median income (dollars),Percent Households that are Families,TotalPopulation_x,...,Percent Margin of Error!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race),Estimate!!Total housing units,State Name,State,Political Affiliation (2008-2020 presidential elections),spend_2017,spend_2018,spend_2019,spend_2020,spend_2021
8138,25846,42.0,0.0,,30.0,,12.0,,0.714286,363.0,...,13.5,149.0,West Virginia,WV,Deep Red,57.28298,59.61,58.74,62.83,
10131,30728,7227.0,6.1,46702.0,4739.0,56608.0,2488.0,22759.0,0.655735,19612.0,...,1.2,8834.0,Georgia,GA,Light Red,21.27243,22.42,27.35,27.78,30.82
22770,64622,223.0,1.8,76349.0,158.0,83690.0,65.0,46488.0,0.70852,600.0,...,5.4,232.0,Missouri,MO,Deep Red,5.74266,7.07,7.13,7.06,6.54
7024,22713,694.0,2.6,99861.0,512.0,123875.0,182.0,,0.737752,1406.0,...,0.3,780.0,Virginia,VA,Deep Blue,38.12053,38.08,39.6,36.26,41.98
2958,11560,2436.0,4.1,162143.0,1994.0,216389.0,442.0,63077.0,0.818555,6464.0,...,4.6,2551.0,New York,NY,Deep Blue,86.81281,83.99,83.21,84.87,92.44
