### Data Merge

##### Declarations

In [1]:
#Importing utilities file with library imports and helper functions
%run "utils.ipynb"

In [2]:
# Loading input pickle files.
# Paths are built with os.path.join rather than a hard-coded "\\" separator so
# the cell also runs on non-Windows machines. input_path is not defined in this
# notebook (presumably it comes from the utils notebook run above).
# NOTE(review): read_pickle can execute code embedded in the file; only load
# pickles produced by this project's own pipeline.
import os

df_income_1901_edited = pd.read_pickle(os.path.join(input_path, "df_income_1901_edited.pkl"))
df_health_outcomes_edited = pd.read_pickle(os.path.join(input_path, "df_health_outcomes_edited.pkl"))
df_sdoh_edited = pd.read_pickle(os.path.join(input_path, "df_sdoh_edited.pkl"))
df_state_politics_edited = pd.read_pickle(os.path.join(input_path, "df_state_politics_edited.pkl"))
df_state_public_spend_edited = pd.read_pickle(os.path.join(input_path, "df_state_public_spend_edited.pkl"))
df_acs_edited = pd.read_pickle(os.path.join(input_path, "df_acs_edited.pkl"))
df_zip2st_edited = pd.read_pickle(os.path.join(input_path, "df_zip2st_edited.pkl"))
df_urban_rural_edited = pd.read_pickle(os.path.join(input_path, "df_urban_rural_edited.pkl"))

In [3]:
# Row/column counts of the inputs, to see which file most needs data compression
frames_to_size = (
    df_income_1901_edited,
    df_health_outcomes_edited,
    df_sdoh_edited,
    df_state_politics_edited,
    df_state_public_spend_edited,
    df_acs_edited,
    df_urban_rural_edited,
)
for frame in frames_to_size:
    print(frame.shape)

(33774, 15)
(1169992, 10)
(291024, 4)
(57, 3)
(255, 3)
(33120, 14)
(33178, 2)


##### ZIP level data

In [4]:
# Distinct ZIP count for each ZIP-level input, in the order listed
[
    zip_frame.ZIP.nunique()
    for zip_frame in (
        df_income_1901_edited,
        df_health_outcomes_edited,
        df_sdoh_edited,
        df_acs_edited,
        df_urban_rural_edited,
    )
]

[33774, 32409, 32336, 33120, 33178]

In [5]:
# Keep the ZIP key plus the selected household / family / nonfamily income columns
income_columns_to_keep = [
    'ZIP',
    'Estimate Households Total',
    'Percent Households lt 10k',
    'Estimate Households Median income (dollars)',
    'Estimate Families Total',
    'Estimate Families Median income (dollars)',
    'Estimate Nonfamily households Total',
    'Estimate Nonfamily households Median income (dollars)',
    'Percent Households that are Families',
]
df_income_1901_essential = df_income_1901_edited[income_columns_to_keep]
df_income_1901_essential.shape

(33774, 9)

In [6]:
# Long -> wide: one row per (ZIP, TotalPopulation) and one column per
# Short_Question_Text value, filled from Data_Value.
df_health_outcomes_essential = df_health_outcomes_edited[
    ['ZIP', 'TotalPopulation', 'Short_Question_Text', 'Data_Value']
]
df_health_outcomes_essential_wide = (
    df_health_outcomes_essential
    .pivot(
        index=['ZIP', 'TotalPopulation'],
        columns='Short_Question_Text',
        values='Data_Value',
    )
    .reset_index()
    .rename_axis(None, axis='columns')  # drop the leftover 'Short_Question_Text' axis label
)
df_health_outcomes_essential_wide.shape

(32409, 39)

In [7]:
# Long -> wide: one row per (ZIP, TotalPopulation) and one column per Measure,
# filled from Data_Value.
df_sdoh_essential_wide = (
    df_sdoh_edited
    .pivot(
        index=['ZIP', 'TotalPopulation'],
        columns='Measure',
        values='Data_Value',
    )
    .reset_index()
    .rename_axis(None, axis='columns')  # drop the leftover 'Measure' axis label
)
df_sdoh_essential_wide.shape

(32336, 11)

##### State level data

In [8]:
# Checking if any of the state level data needs to be pivoted:
# a frame with more rows than distinct keys has several rows per key.
state_level_inputs = (
    ("df_state_politics_edited", df_state_politics_edited, 'State'),
    ("df_zip2st_edited", df_zip2st_edited, 'ZIP'),
    ("df_state_public_spend_edited", df_state_public_spend_edited, 'State Name'),
)
for label, frame, key_column in state_level_inputs:
    print(label, frame.shape, frame[key_column].nunique())

df_state_politics_edited (57, 3) 57
df_zip2st_edited (39368, 2) 39368
df_state_public_spend_edited (255, 3) 51


In [9]:
# state_public_spend_edited holds several TimeFrame rows per state, so it is
# pivoted to one row per state with one spend_<TimeFrame> column per year.
df_state_public_spend_essential_wide = (
    df_state_public_spend_edited
    .pivot(index=['State Name'], columns='TimeFrame', values='Data')
    .add_prefix('spend_')               # prefix only the year columns; 'State Name' is still the index
    .rename_axis(None, axis='columns')  # drop the leftover 'TimeFrame' axis label
    .reset_index()
)
df_state_public_spend_essential_wide.head()

Unnamed: 0,State Name,spend_2017,spend_2018,spend_2019,spend_2020,spend_2021
0,Alabama,56.43214,57.23,54.25,46.81,52.35
1,Alaska,114.43666,96.52,98.25,215.15,90.17
2,Arizona,8.83289,9.66,15.25,24.34,15.32
3,Arkansas,52.29329,50.57,48.23,45.9,42.94
4,California,61.69589,65.71,71.89,69.87,78.47


In [10]:
# Geographic information (Lat-Long): de-duplicated ZIP / longitude / latitude
# rows taken from the health outcomes file.
geocoords = (
    df_health_outcomes_edited
    .loc[:, ['ZIP', 'longitude', 'latitude']]
    .drop_duplicates()
)
(geocoords.shape, geocoords.ZIP.nunique())

((32409, 3), 32409)

##### Creating combined dataset

In [54]:
# Three frames are built in this cell:
#   combined_dataset_zip   -> ZIP level features
#   combined_dataset_state -> State level features
#   combined_dataset       -> all features

# ZIP level features: outer joins keep every ZIP seen in any of the first four
# sources; the urban/rural table is left-joined and so never adds ZIPs.
# Both the health outcomes and SDOH wide frames carry a TotalPopulation column,
# so pandas suffixes them TotalPopulation_x / TotalPopulation_y.
combined_dataset_zip = (
    df_income_1901_essential
    .merge(df_health_outcomes_essential_wide, on="ZIP", how="outer")
    .merge(df_sdoh_essential_wide, on="ZIP", how="outer")
    .merge(df_acs_edited, on="ZIP", how="outer")
    .merge(df_urban_rural_edited, on="ZIP", how="left")
)

# State level features: politics + public spend joined on the state name, then
# joined to the ZIP-to-state table on the state code.
combined_dataset_state = (
    df_state_politics_edited
    .merge(df_state_public_spend_essential_wide, on="State Name", how="outer")
    .merge(df_zip2st_edited, on="State", how="outer")
)

# ZIP and State level features together; coordinates are left-joined last.
combined_dataset = (
    combined_dataset_zip
    .merge(combined_dataset_state, on="ZIP", how="outer")
    .merge(geocoords, on="ZIP", how="left")
)

# Shape of the combined dataset
combined_dataset.shape

(39503, 81)

In [55]:
# Eyeball five random rows. The seed is fixed so that Restart & Run All
# displays the same rows every time instead of a different draw per run.
combined_dataset.sample(5, random_state=42)

Unnamed: 0,ZIP,Estimate Households Total,Percent Households lt 10k,Estimate Households Median income (dollars),Estimate Families Total,Estimate Families Median income (dollars),Estimate Nonfamily households Total,Estimate Nonfamily households Median income (dollars),Percent Households that are Families,TotalPopulation_x,All Teeth Lost,Annual Checkup,Any Disability,Arthritis,Binge Drinking,COPD,Cancer (except skin),Cervical Cancer Screening,Cholesterol Screening,Chronic Kidney Disease,Cognitive Disability,Colorectal Cancer Screening,Core preventive services for older men,Core preventive services for older women,Coronary Heart Disease,Current Asthma,Current Smoking,Dental Visit,Depression,Diabetes,General Health,Health Insurance,Hearing Disability,High Blood Pressure,High Cholesterol,Independent Living Disability,Mammography,Mental Health,Mobility Disability,Obesity,Physical Health,Physical Inactivity,Self-care Disability,Sleep <7 hours,Stroke,Taking BP Medication,Vision Disability,TotalPopulation_y,Crowding among housing units,Housing cost burden among households,No broadband internet subscription among households,No high school diploma among adults aged 25 years or older,Persons aged 65 years or older,Persons living below 150% of the poverty level,Persons of racial or ethnic minority status,Single-parent households,Unemployment among people 16 years and older in the labor force,Estimate!!SEX AND AGE!!Total population,Estimate!!SEX AND AGE!!Total population!!Sex ratio (males per 100 females),Percent!!RACE!!Total population,Percent!!RACE!!Total population!!One race,Percent!!RACE!!Total population!!One race!!Black or African American,Percent Margin of Error!!RACE!!Total population!!One race!!Black or African American,Percent!!RACE!!Total population!!One race!!American Indian and Alaska Native,Percent Margin of Error!!RACE!!Total population!!One race!!American Indian and Alaska Native,Percent!!RACE!!Total population!!One race!!Native Hawaiian and Other Pacific 
Islander,Percent Margin of Error!!RACE!!Total population!!One race!!Native Hawaiian and Other Pacific Islander,Percent!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race),Percent Margin of Error!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race),Estimate!!Total housing units,Percent_Urban,State Name,State,Political Affiliation (2008-2020 presidential elections),spend_2017,spend_2018,spend_2019,spend_2020,spend_2021,longitude,latitude
18146,53190,6962.0,12.7,56923.0,3342.0,83022.0,3620.0,32059.0,0.480034,19200.0,13.0,70.4,26.1,18.2,25.0,4.8,4.6,74.3,65.6,2.1,16.7,59.2,38.7,30.5,4.0,11.7,13.4,63.7,26.5,6.2,13.4,8.7,5.2,21.3,28.7,8.5,76.4,20.1,8.4,33.4,9.4,20.8,3.0,30.4,2.1,67.5,4.8,19104.0,2.0,35.8,20.2,8.2,11.2,33.1,19.5,3.6,4.1,19484.0,103.7,19484.0,95.0,2.8,0.7,0.2,0.2,0.4,0.4,11.8,2.5,7789.0,0.653205,Wisconsin,WI,Light Blue,14.53865,17.43,17.38,17.67,17.47,-88.733373,42.805973
7809,24894,125.0,15.2,,74.0,21563.0,51.0,,0.592,388.0,37.3,78.1,52.5,40.5,11.1,18.8,7.1,76.1,81.0,4.2,28.2,59.1,36.7,26.8,11.4,14.2,35.7,35.5,33.2,18.6,36.7,16.5,11.6,46.2,44.3,19.1,73.4,25.4,29.9,44.9,23.3,45.2,9.1,44.3,5.2,80.8,11.2,406.0,0.0,16.0,38.7,26.3,21.7,63.5,3.2,0.0,17.3,172.0,149.3,172.0,100.0,0.0,18.3,0.0,18.3,0.0,18.3,0.0,18.3,92.0,0.0,West Virginia,WV,Deep Red,57.28298,59.61,58.74,62.83,,-81.703573,37.282804
36344,35270,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Alabama,AL,Deep Red,56.43214,57.23,54.25,46.81,52.35,,
28170,79372,546.0,3.7,61563.0,430.0,74688.0,116.0,23182.0,0.787546,1434.0,16.8,69.1,32.8,23.9,17.9,7.6,6.0,78.3,82.2,3.1,15.7,61.0,40.2,33.2,6.0,10.0,17.8,51.5,23.6,12.0,21.1,24.7,7.2,33.2,36.7,9.0,72.9,17.5,15.2,37.4,13.2,29.3,4.6,34.8,3.1,74.9,6.3,1400.0,3.2,19.0,19.6,20.2,10.8,30.1,53.4,8.3,10.3,1488.0,105.0,1488.0,87.4,0.6,1.0,1.2,1.5,0.0,3.0,50.2,8.4,578.0,0.0,Texas,TX,Deep Red,21.57633,16.92,17.16,20.41,17.91,-102.488403,33.434247
8297,26287,1148.0,5.9,53021.0,748.0,68667.0,400.0,28676.0,0.651568,3131.0,20.8,80.0,38.5,35.9,12.4,11.6,8.2,82.9,87.8,3.6,17.1,69.3,48.5,37.4,9.2,11.4,21.8,54.1,26.1,14.2,22.6,8.9,9.7,41.9,41.6,10.4,75.8,18.4,20.0,38.2,15.7,30.7,4.5,38.1,3.9,81.6,5.6,2923.0,1.8,15.8,20.0,11.8,19.8,22.7,4.7,5.5,9.0,3186.0,100.6,3186.0,97.6,1.4,1.2,0.0,1.1,0.0,1.1,1.0,1.3,1749.0,0.0,West Virginia,WV,Deep Red,57.28298,59.61,58.74,62.83,,-79.690372,39.165829


In [56]:
# Number of rows in the combined dataset that have no longitude value
combined_dataset.longitude.isna().sum()

7094

In [57]:
# Saving combined dataset (pickle for downstream notebooks, CSV for inspection).
# Paths are built with os.path.join rather than a hard-coded "\\" separator so
# the files land inside output_path on non-Windows machines as well.
import os

combined_dataset.to_pickle(os.path.join(output_path, "combined_dataset_anuvrat.pkl"))
combined_dataset.to_csv(os.path.join(output_path, "combined_dataset_anuvrat.csv"))

##### Checking if combined_dataset has values for all ZIPs

In [58]:
# Percentage of missing values for each column of the combined dataset.
# perc_missing_num is not defined in this notebook; it is defined in the
# helper functions notebook.
perc_missing_num(combined_dataset)

Number of records in dataset: 39503
Missing records summary:


Unnamed: 0,column,number_missing,percent_missing
0,ZIP,3,0.007594
71,State Name,132,0.334152
72,State,132,0.334152
73,Political Affiliation (2008-2020 presidential ...,319,0.807534
74,spend_2017,319,0.807534
76,spend_2019,319,0.807534
75,spend_2018,319,0.807534
77,spend_2020,319,0.807534
78,spend_2021,2399,6.072956
4,Estimate Families Total,5729,14.502696


##### Dataset with strict removal of any missing value (except % Urban)

In [59]:
# Giving -99 to missing Percent_Urban values, as we don't want to drop ZIPs
# just because of this column.
# The sentinel is written through an explicit assignment that returns a new
# frame. The previous form, df['Percent_Urban'].where(..., inplace=True), is a
# chained in-place update on a temporary Series; under pandas Copy-on-Write it
# leaves the DataFrame unchanged, so the later dropna would still drop these ZIPs.
combined_dataset_per_urban_99 = combined_dataset.assign(
    Percent_Urban=combined_dataset['Percent_Urban'].fillna(-99)
)

In [60]:
# Strict dataset: discard every row that still has a missing value in any column
combined_dataset_nona_strict = combined_dataset_per_urban_99.dropna(axis=0, how='any')

In [61]:
# % Missing values for each column. Function defined in helper functions notebook
#perc_missing_num(combined_dataset_nona_strict)

In [62]:
# Rows and columns left after dropping every row with a missing value
combined_dataset_nona_strict.shape

(24089, 81)

#### Checking which states are completely absent in the new dataset

In [66]:
# Distinct 'State Name' values in the original dataset and in the dataset
# without any missing information, in that order
states_in_original, states_in_nona_strict = (
    frame['State Name'].unique()
    for frame in (combined_dataset, combined_dataset_nona_strict)
)

In [67]:
print("States not present in nona but present in original dataset")
# Walk the original state list in order and print each one absent from the strict dataset
for absent_state in (s for s in states_in_original if s not in states_in_nona_strict):
    print(absent_state)

States not present in nona but present in original dataset
Puerto Rico
nan
Rhode Island
Delaware
West Virginia
Florida
Kansas
Utah
American Samoa
Northern Mariana Islands
Trust Territories
Guam
Virgin Islands


In [69]:
# Look at the columns suspected of removing whole states from the strict dataset
state_names = combined_dataset['State Name']
spend_gap_rows = combined_dataset[state_names.isin(['Utah', 'Kansas', 'Delaware', 'West Virginia'])]
florida_rows = combined_dataset[state_names.isin(['Florida'])]

print("# unique values for spend 2021 for Utah, Kansas, Delaware, and West Virginia:",
      spend_gap_rows['spend_2021'].unique())
print("# unique values for Annual Checkup for Florida:",
      florida_rows['Annual Checkup'].unique())

# Note: Kansas, Utah, Delaware, and West Virginia do not have the Spend 2021 variable populated.
# Note 2: Florida doesn't have the majority of health outcomes populated.


# unique values for spend 2021 for Utah, Kansas, Delaware, and West Virginia: [nan]
# unique values for Annual Checkup for Florida: [nan]


In [70]:
# It seems reasonable to drop 'spend_2021' to get Kansas, Utah, Delaware, and
# West Virginia back in the data. There is no obvious path to include Florida;
# it can be treated as a separate case used to verify findings from the rest
# of the data.
# NOTE(review): this starts from combined_dataset, not
# combined_dataset_per_urban_99, so rows with a missing Percent_Urban are
# dropped here as well. Confirm that this is intended.
combined_dataset_nona = combined_dataset.drop('spend_2021', axis='columns').dropna(how='any')

In [71]:
# % Missing values for each column. Function defined in helper functions notebook
#perc_missing_num(combined_dataset_nona)

In [72]:
# Distinct states that survive once 'spend_2021' is dropped before the dropna
states_in_nona = combined_dataset_nona['State Name'].unique()
print("States not present in nona but present in original dataset")
# Walk the original state list in order and print each one absent from the nona dataset
for absent_state in (s for s in states_in_original if s not in states_in_nona):
    print(absent_state)

States not present in nona but present in original dataset
Puerto Rico
nan
Florida
American Samoa
Northern Mariana Islands
Trust Territories
Guam
Virgin Islands


In [73]:
# Number of ZIPs per state in the nona dataset, to see national coverage.
# Florida is not included.
combined_dataset_nona.groupby('State Name')[['ZIP']].count()

Unnamed: 0_level_0,ZIP
State Name,Unnamed: 1_level_1
Alabama,511
Alaska,138
Arizona,289
Arkansas,408
California,1365
Colorado,368
Connecticut,244
Delaware,53
District of Columbia,21
Georgia,569


In [75]:
# Florida-only dataset: first discard columns that are empty for every Florida
# row, then discard any row that still has a missing value.
is_florida = combined_dataset['State Name'] == 'Florida'
combined_dataset_nona_florida = (
    combined_dataset
    .loc[is_florida]
    .dropna(axis='columns', how='all')
    .dropna(axis='index', how='any')
)
combined_dataset_nona_florida.shape

(839, 52)

Insights:
<ol>
<li>29 health outcomes features don't have data for 8,068 or 20% ZIPs. Most of these ZIPs should be 0 or low population as per Census procedure documented in the notes for this dataset.</li>
<li>Another 21 features don't have data for at least 7,000 or more ZIPs.</li>
<li>Another 18 features don't have data for at least 5,704 or more ZIPs.</li>
<li>Removing all ZIPs with missing data leaves us with 24,089 ZIPs (61%) out of 39,503 ZIPs in the original combined dataset. However, it excludes Rhode Island, Delaware, West Virginia, Florida, Kansas and Utah altogether (along with Puerto Rico and the other territories).</li>
<li>Kansas, Utah, Delaware and West Virginia do not have Spend 2021 variable populated. Florida doesn't have majority of health outcomes populated.</li>
<li>Recommend dropping Spend 2021 to get Kansas, Utah, Delaware and West Virginia back in the data.</li>
<li>However, I can't see a path to include Florida. We can probably take it as a separate case and verify our findings from rest of the data.</li>
<li>This leaves us with 25,321 ZIPs with 80 features in the primary dataset and 839 ZIPs with 52 features in the Florida dataset.</li>
<li>This seems like a reasonable sample size with national coverage. Dropping ZIPs with missing data will remove the noise.</li>
<li>Recommend to move ahead with 25K ZIPs for further analysis.</li>
</ol>

##### Saving dataset without any missing information for any ZIP

In [76]:
# Paths are built with os.path.join rather than a hard-coded "\\" separator so
# the files land inside output_path on non-Windows machines as well.
import os

# Saving dataset without any missing information
combined_dataset_nona.to_pickle(os.path.join(output_path, "combined_dataset_nona.pkl"))
combined_dataset_nona.to_csv(os.path.join(output_path, "combined_dataset_nona.csv"))

# Saving separate dataset for Florida
combined_dataset_nona_florida.to_pickle(os.path.join(output_path, "combined_dataset_nona_florida.pkl"))
combined_dataset_nona_florida.to_csv(os.path.join(output_path, "combined_dataset_nona_florida.csv"))