### Exploring inputs

#### Importing libraries

In [1]:
#Defining libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re

In [2]:
#Importing helper functions
%run "helper_functions.ipynb"

In [110]:
# Raise the pandas display limit so that up to 10,000 rows are shown without truncation
pd.set_option('display.max_rows', 10000)

#### Importing datasets to be used

In [3]:
# Input and output folders for all CSV, Excel, etc. files; used as prefixes when loading and saving files.
# Alternatively, the json files can be read through the census API where relevant.
# NOTE(review): both paths are absolute and specific to one machine - adjust before running elsewhere.
# NOTE(review): output_path points at the same "inputs" folder as input_path - confirm this is intended.
input_path="C:\\Users\\Anuvrat\\OneDrive\\Documents\\milestone1\\data\\inputs"
output_path="C:\\Users\\Anuvrat\\OneDrive\\Documents\\milestone1\\data\\inputs"

In [4]:
# Census income levels
# Income in the Past 12 Months (in 2022 Inflation-Adjusted Dollars) : ACSST5Y2022.S1901
# The path separator is written as an escaped backslash ("\\"); a lone "\A" is an
# invalid escape sequence that newer Python versions warn about.
df_income_1901 = pd.read_csv(
    input_path + "\\ACSST5Y2022-S1901-Data.csv",
    dtype={"Geographic Area Name": "string"},
)
# df_income_1901.head(5)

In [5]:
# Health Outcomes (PLACES: Local Data for Better Health, ZCTA data, 2023 release)
# Column dtypes are fixed up front; low_memory=False makes pandas read the file in one pass.
# The path separator is written as an escaped backslash ("\\"); a lone "\P" is an
# invalid escape sequence that newer Python versions warn about.
df_health_outcomes = pd.read_csv(
    input_path
    + "\\PLACES__Local_Data_for_Better_Health__ZCTA_Data_2023_release_20240127.csv",
    low_memory=False,
    dtype={
        "Year": "string",
        "LocationID": "string",
        "LocationName": "string",
        "Category": "string",
        "Measure": "string",
        "Data_Value_Unit": "string",
        "Data_Value": "float",
        "TotalPopulation": "int64",
        "Short_Question_Text": "string",
        "Geolocation": "string",
    },
)
# df_health_outcomes.head(5)

In [6]:
# Social Determinants of Health (SDOH measures for ZCTA, ACS 2017-2021)
# LocationName is read as a string column.
# The path separator is written as an escaped backslash ("\\"); a lone "\S" is an
# invalid escape sequence that newer Python versions warn about.
df_sdoh = pd.read_csv(
    input_path + "\\SDOH_Measures_for_ZCTA__ACS_2017-2021_20240121.csv",
    dtype={"LocationName": "string"},
)
# df_sdoh.head(5)

In [7]:
# Political affiliation by state
# Note: openpyxl must be installed for read_excel to work on .xlsx files.
# The path separator is written as an escaped backslash ("\\"); a lone "\S" is an
# invalid escape sequence that newer Python versions warn about.
df_state_politics = pd.read_excel(
    input_path + "\\States Political Affiliations Jan-2024.xlsx", sheet_name="Data"
)
# df_state_politics.head(5)

In [116]:
# Public health spending by state
# The first 6 lines of the file are skipped before the header row; Fips is read as a string.
# The path separator is written as an escaped backslash ("\\"); a lone "\P" is an
# invalid escape sequence that newer Python versions warn about.
df_state_public_spend = pd.read_csv(
    input_path + "\\Per person state public health funding.csv",
    skiprows=6,
    dtype={"Fips": "string"},
)
# df_state_public_spend.head()

(255, 6)

In [9]:
# American Community Survey : ACSDP5Y2020.DP05
# The first line of the file is skipped so that the following line supplies the column names.
# The path separator is written as an escaped backslash ("\\"); a lone "\A" is an
# invalid escape sequence that newer Python versions warn about.
df_acs = pd.read_csv(
    input_path + "\\ACSDP5Y2020.DP05-Data.csv", skiprows=1, low_memory=False
)
# df_acs.head(5)

In [87]:
# State-ZIP mapping
# ZIP is read as a string column.
# The path separator is written as an escaped backslash ("\\"); a lone "\Z" is an
# invalid escape sequence that newer Python versions warn about.
df_zip2st = pd.read_excel(
    input_path + "\\ZIP_TRACT_122023.xlsx",
    sheet_name="Export Worksheet",
    dtype={"ZIP": "string"},
)
# df_zip2st.head(5)

#### Keeping only selected fields

In [10]:
# Census income levels: columns carried forward from the S1901 table
# (geography label plus the household, family and nonfamily-household fields listed below).
income_1901_columns = [
    "Geographic Area Name",
    "Estimate Households Total",
    "Estimate Households Total Less than $10,000",
    "Estimate Households Median income (dollars)",
    "Margin of Error Households Median income (dollars)",
    "Estimate Families Total",
    "Estimate Families Total Less than $10,000",
    "Estimate Families Median income (dollars)",
    "Margin of Error Families Median income (dollars)",
    "Estimate Nonfamily households Total",
    "Estimate Nonfamily households Total Less than $10,000",
    "Estimate Nonfamily households Median income (dollars)",
]
df_income_1901_keep = df_income_1901.loc[:, income_1901_columns]
df_income_1901_keep.head()

Unnamed: 0,Geographic Area Name,Estimate Households Total,"Estimate Households Total Less than $10,000",Estimate Households Median income (dollars),Margin of Error Households Median income (dollars),Estimate Families Total,"Estimate Families Total Less than $10,000",Estimate Families Median income (dollars),Margin of Error Families Median income (dollars),Estimate Nonfamily households Total,"Estimate Nonfamily households Total Less than $10,000",Estimate Nonfamily households Median income (dollars)
0,ZCTA5 00601,5341,27.1,17526,1697,3527,21.0,19628,2030,1814,45.6,10740
1,ZCTA5 00602,12777,24.3,20260,1494,8652,16.9,24126,1980,4125,42.1,13060
2,ZCTA5 00603,19624,31.0,17703,1223,12751,25.1,24062,1934,6873,45.2,11301
3,ZCTA5 00606,1948,25.9,19603,3497,1316,25.6,21439,4857,632,29.4,16716
4,ZCTA5 00610,8781,20.3,22796,1701,5872,17.1,26914,2712,2909,33.1,13634


In [63]:
# Health Outcomes
# Sanity checks that justify which columns are dropped in the next cell:
#   1. number of distinct values in Data_Value_Unit and Data_Value_Type;
#   2. every row whose LocationID differs from its LocationName is printed.
print(
    "Number of unique values: ",
    df_health_outcomes["Data_Value_Unit"].nunique(),
    df_health_outcomes["Data_Value_Type"].nunique(),
)

# Compare the two columns in a single vectorised pass rather than one .loc lookup per row.
# A missing value on either side is reported as a mismatch instead of raising on an
# ambiguous NA comparison.
location_mismatch = (
    df_health_outcomes["LocationID"]
    .ne(df_health_outcomes["LocationName"])
    .fillna(True)
    .astype(bool)
)
for location_id, location_name in df_health_outcomes.loc[
    location_mismatch, ["LocationID", "LocationName"]
].itertuples(index=False, name=None):
    print("Error:", location_id, location_name)

Number of unique values:  1 1


In [64]:
# Health outcomes: columns carried forward.
# Note: 'Data_Value_Unit' and 'Data_Value_Type' have only one value each - '%' & 'Crude prevalence'
# respectively - so they are left out.
# Note: LocationID and LocationName are exactly the same, so the redundant LocationID is left out.
health_outcomes_columns = [
    "Year",
    "LocationName",
    "Category",
    "Measure",
    "Data_Value",
    "TotalPopulation",
    "Short_Question_Text",
    "Geolocation",
]
df_health_outcomes_keep = df_health_outcomes.loc[:, health_outcomes_columns]
df_health_outcomes_keep.head()

Unnamed: 0,Year,LocationName,Category,Measure,Data_Value,TotalPopulation,Short_Question_Text,Geolocation
0,2021,1001,Prevention,Current lack of health insurance among adults ...,4.1,16769,Health Insurance,POINT (-72.62581515 42.06255509)
1,2021,1001,Health Outcomes,Arthritis among adults aged >=18 years,33.7,16769,Arthritis,POINT (-72.62581515 42.06255509)
2,2021,1001,Health Risk Behaviors,Binge drinking among adults aged >=18 years,15.6,16769,Binge Drinking,POINT (-72.62581515 42.06255509)
3,2021,1001,Health Outcomes,High blood pressure among adults aged >=18 years,33.8,16769,High Blood Pressure,POINT (-72.62581515 42.06255509)
4,2021,1001,Prevention,Taking medicine for high blood pressure contro...,78.9,16769,Taking BP Medication,POINT (-72.62581515 42.06255509)


In [79]:
# Social Determinants of Health: number of distinct values in Data_Value_Type
df_sdoh["Data_Value_Type"].nunique()

1

In [80]:
# Note: 'Year' and 'Data_Value_Type' have only one value each, so they are not carried forward.
sdoh_columns = ["LocationName", "Measure", "Data_Value", "TotalPopulation"]
df_sdoh_keep = df_sdoh.loc[:, sdoh_columns]
df_sdoh_keep.head()

Unnamed: 0,LocationName,Measure,Data_Value,TotalPopulation
0,93420,Crowding among housing units,2.3,31864
1,6120,No broadband internet subscription among house...,29.3,13399
2,1810,Housing cost burden among households,15.8,35600
3,4259,Housing cost burden among households,28.6,3279
4,1128,Persons of racial or ethnic minority status,34.8,3232


In [165]:
# Political affiliation by state and public health spending by state
# Note: 'Data Type' is left out of the public spend frame because it has only one value
# - Dollars - and is thus redundant (see the unique() output below).
df_state_politics_keep = df_state_politics.copy()
df_state_public_spend_keep = df_state_public_spend.loc[
    :, ["Location", "TimeFrame", "Data"]
]

display(df_state_politics_keep.head(5))
display(df_state_public_spend["Data Type"].unique())
display(df_state_public_spend_keep.head(5))

Unnamed: 0,STATE(TERRITORY),Abbreviation,Political Affiliation (2008-2020 presidential elections)
0,California,CA,Deep Blue
1,Colorado,CO,Deep Blue
2,Connecticut,CT,Deep Blue
3,Delaware,DE,Deep Blue
4,District of Columbia,DC,Deep Blue


array(['Dollars'], dtype=object)

Unnamed: 0,Location,TimeFrame,Data
0,Alabama,2017,56.43214
1,Alabama,2018,57.23
2,Alabama,2019,54.25
3,Alabama,2020,46.81
4,Alabama,2021,52.35


In [28]:
# American Community Survey : ACSDP5Y2020.DP05
# Number of distinct values in the 'Percent!!Total housing units' column
housing_units_percent = df_acs["Percent!!Total housing units"]
display(housing_units_percent.nunique())

1

In [155]:
# American Community Survey: columns carried forward.
# Note: There are some duplicative/redundant columns beyond the ones kept.
# Note 2: 'Percent!!Total housing units' has only one value - X - so it is left out.
acs_columns = [
    "Geographic Area Name",
    "Estimate!!SEX AND AGE!!Total population",
    "Estimate!!SEX AND AGE!!Total population!!Sex ratio (males per 100 females)",
    "Percent!!RACE!!Total population",
    "Percent!!RACE!!Total population!!One race",
    "Percent!!RACE!!Total population!!One race!!Black or African American",
    "Percent Margin of Error!!RACE!!Total population!!One race!!Black or African American",
    "Percent!!RACE!!Total population!!One race!!American Indian and Alaska Native",
    "Percent Margin of Error!!RACE!!Total population!!One race!!American Indian and Alaska Native",
    "Percent!!RACE!!Total population!!One race!!Native Hawaiian and Other Pacific Islander",
    "Percent Margin of Error!!RACE!!Total population!!One race!!Native Hawaiian and Other Pacific Islander",
    "Percent!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)",
    "Percent Margin of Error!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)",
    "Estimate!!Total housing units",
]
df_acs_keep = df_acs.loc[:, acs_columns]

# df_acs_keep.head(5)

In [154]:
# State-ZIP mapping: one row per distinct (ZIP, USPS_ZIP_PREF_STATE) pair
zip_state_pairs = df_zip2st.loc[:, ["ZIP", "USPS_ZIP_PREF_STATE"]]
df_zip2st_keep = zip_state_pairs.drop_duplicates()
df_zip2st_keep.shape

(39368, 2)

##### Datasets carried forward:
<ol>
<li>df_income_1901_keep</li>
<li>df_health_outcomes_keep</li>
<li>df_sdoh_keep</li>
<li>df_state_politics_keep</li>
<li>df_state_public_spend_keep</li>
<li>df_acs_keep</li>
<li>df_zip2st_keep</li>
</ol>

#### Basic data manipulations for downstream processes

In [15]:
# Census income levels
# Build the edited frame in one chain:
#   - rename the "Less than $10,000" columns for readability,
#   - take ZIP from the last five characters of Geographic Area Name,
#   - drop the original Geographic Area Name column.
df_income_1901_edited = (
    df_income_1901_keep.rename(
        columns={
            "Estimate Households Total Less than $10,000": "Percent Households lt 10k",
            "Estimate Families Total Less than $10,000": "Percent Families lt 10k",
            "Estimate Nonfamily households Total Less than $10,000": "Percent Nonfamily households lt 10k",
        }
    )
    .assign(ZIP=lambda frame: frame["Geographic Area Name"].str[-5:])
    .drop(columns="Geographic Area Name")
)

# Converting column types for numeric operations.
# obt_to_float is expected to come from the helper_functions notebook loaded at the top.
# Each column is converted in place, so the column order of the frame is unchanged.
income_float_columns = [
    "Estimate Households Median income (dollars)",
    "Estimate Families Median income (dollars)",
    "Estimate Nonfamily households Median income (dollars)",
    "Percent Households lt 10k",
    "Percent Families lt 10k",
    "Percent Nonfamily households lt 10k",
    "Margin of Error Households Median income (dollars)",
    "Margin of Error Families Median income (dollars)",
]
for income_column in income_float_columns:
    df_income_1901_edited[income_column] = obt_to_float(
        df_income_1901_edited[income_column]
    )

# Defining additional derived variables that can be useful for data exploration.
# NOTE(review): these three columns are plain ratios (not multiplied by 100) even though
# their names say "Percent"/"(percent)" - confirm downstream code expects fractions.
df_income_1901_edited["Percent Households that are Families"] = (
    df_income_1901_keep["Estimate Families Total"]
    / df_income_1901_keep["Estimate Households Total"]
)
df_income_1901_edited["Margin of Error Households Median income (percent)"] = (
    df_income_1901_edited["Margin of Error Households Median income (dollars)"]
    / df_income_1901_edited["Estimate Households Median income (dollars)"]
)
df_income_1901_edited["Margin of Error Families Median income (percent)"] = (
    df_income_1901_edited["Margin of Error Families Median income (dollars)"]
    / df_income_1901_edited["Estimate Families Median income (dollars)"]
)

In [16]:
# Checking if all datatypes are now usable for downstream processes
# (bare last expression, so the per-column dtypes are shown as the cell output)
df_income_1901_edited.dtypes

Estimate Households Total                                         int64
Percent Households lt 10k                                       float64
Estimate Households Median income (dollars)                     float64
Margin of Error Households Median income (dollars)              float64
Estimate Families Total                                           int64
Percent Families lt 10k                                         float64
Estimate Families Median income (dollars)                       float64
Margin of Error Families Median income (dollars)                float64
Estimate Nonfamily households Total                               int64
Percent Nonfamily households lt 10k                             float64
Estimate Nonfamily households Median income (dollars)           float64
ZIP                                                      string[python]
Percent Households that are Families                            float64
Margin of Error Households Median income (percent)              

In [17]:
# QC: confirm that the special-case handling in the object-to-float conversion took place correctly.
# qc_obj_to_float is expected to come from helper_functions; it receives the column before
# conversion (from df_income_1901_keep) and after conversion (from df_income_1901_edited).
for qc_column in (
    "Estimate Households Median income (dollars)",
    "Margin of Error Households Median income (dollars)",
):
    print("Checking: " + qc_column)
    display(
        qc_obj_to_float(
            df_income_1901_keep[qc_column],
            df_income_1901_edited[qc_column],
        )
    )

Checking: Estimate Households Median income (dollars)
Index: 441, 	 Old value: 250,000+, 	                 New value: 250000.0
Index: 565, 	 Old value: 250,000+, 	                 New value: 250000.0
Index: 570, 	 Old value: 250,000+, 	                 New value: 250000.0
Index: 573, 	 Old value: 250,000+, 	                 New value: 250000.0
Index: 1014, 	 Old value: 250,000+, 	                 New value: 250000.0
Index: 1352, 	 Old value: 250,000+, 	                 New value: 250000.0
Index: 1775, 	 Old value: 250,000+, 	                 New value: 250000.0
Index: 1954, 	 Old value: 250,000+, 	                 New value: 250000.0
Index: 1959, 	 Old value: 250,000+, 	                 New value: 250000.0
Index: 1962, 	 Old value: 250,000+, 	                 New value: 250000.0
Index: 1967, 	 Old value: 250,000+, 	                 New value: 250000.0
Index: 2018, 	 Old value: 250,000+, 	                 New value: 250000.0
Index: 2044, 	 Old value: 250,000+, 	                 New valu

None

Checking: Margin of Error Households Median income (dollars)
Index: 14, 	 Old value: **, 	                 New value: nan
Index: 44, 	 Old value: **, 	                 New value: nan
Index: 88, 	 Old value: **, 	                 New value: nan
Index: 93, 	 Old value: **, 	                 New value: nan
Index: 109, 	 Old value: **, 	                 New value: nan
Index: 111, 	 Old value: **, 	                 New value: nan
Index: 118, 	 Old value: **, 	                 New value: nan
Index: 134, 	 Old value: **, 	                 New value: nan
Index: 167, 	 Old value: **, 	                 New value: nan
Index: 178, 	 Old value: **, 	                 New value: nan
Index: 192, 	 Old value: **, 	                 New value: nan
Index: 205, 	 Old value: **, 	                 New value: nan
Index: 211, 	 Old value: **, 	                 New value: nan
Index: 214, 	 Old value: **, 	                 New value: nan
Index: 221, 	 Old value: **, 	                 New value: nan
Index: 233, 	

None

In [68]:
# Health Outcomes
# Renaming columns for readability
df_health_outcomes_edited = df_health_outcomes_keep.rename(
    columns={"LocationName": "ZIP"}
)

# Creating latitude and longitude from Geolocation for downstream processes.
# Values look like "POINT (-72.62581515 42.06255509)", so a whitespace split yields
# "POINT", "(<longitude>" and "<latitude>)".
geo_split = df_health_outcomes_keep["Geolocation"].str.split(expand=True)

# str.strip("()") removes the surrounding parentheses as literal characters, so the result
# does not depend on the default of the `regex` argument of str.replace, which differs
# between pandas versions.
df_health_outcomes_edited["longitude"] = geo_split[1].str.strip("()").astype("float")
df_health_outcomes_edited["latitude"] = geo_split[2].str.strip("()").astype("float")

In [81]:
# Social Determinants of Health
# Rename LocationName to ZIP for readability
sdoh_column_renames = {"LocationName": "ZIP"}
df_sdoh_edited = df_sdoh_keep.rename(columns=sdoh_column_renames)

In [166]:
# Political affiliation by state and public health spending by state
# List the state names from the politics file, then print every name in the public spend
# file that has no exact match there (e.g. because of a spelling difference).
politics_state_names = df_state_politics_keep["STATE(TERRITORY)"].unique()
display(politics_state_names)

# Build the lookup once instead of recomputing .unique() on every loop iteration.
politics_state_lookup = set(politics_state_names)
for state_name in df_state_public_spend_keep["Location"].unique():
    if state_name not in politics_state_lookup:
        print(f"Missing in state politics: {state_name}")

array(['California', 'Colorado', 'Connecticut', 'Delaware',
       'District of Columbia', 'Hawaii', 'Illinois', 'Maine', 'Maryland',
       'Massachusetts', 'Minnesota', 'Nevada', 'New Hampshire',
       'New Jersey', 'New Mexico', 'New York', 'Oregon', 'Rhode Island',
       'Vermont', 'Virginia', 'Washington', 'Alabama', 'Alaska',
       'Arkansas', 'Idaho', 'Kansas', 'Kentucky', 'Louisiana',
       'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'North Dakota',
       'Oklahoma', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas',
       'Utah', 'West Virginia', 'Wyoming', 'Michigan', 'Pennsylvania',
       'Wisconsin', 'Arizona', 'Georgia', 'Indiana', 'North Carolina',
       'Florida', 'Iowa', 'Ohio', 'American Samoa', 'Guam',
       'Northern Mariana Islands', 'Puerto Rico', 'Trust Territories',
       'Virgin Islands'], dtype=object)

Missing in state politics: Dist. of Columbia


In [167]:
# Give both state-level frames a shared "State Name" column (and "State" for the abbreviation)
politics_column_renames = {"Abbreviation": "State", "STATE(TERRITORY)": "State Name"}
df_state_politics_edited = df_state_politics_keep.rename(columns=politics_column_renames)
df_state_public_spend_edited = df_state_public_spend_keep.rename(
    columns={"Location": "State Name"}
)

# Align Washington DC's spelling in the public spend frame with the politics frame
is_dc_row = df_state_public_spend_edited["State Name"].eq("Dist. of Columbia")
df_state_public_spend_edited.loc[is_dc_row, "State Name"] = "District of Columbia"

In [73]:
# American Community Survey
# New frame with ZIP taken from the last five characters of Geographic Area Name;
# the original label column is then removed.
df_acs_edited = df_acs_keep.assign(
    ZIP=df_acs_keep["Geographic Area Name"].str[-5:]
).drop(columns="Geographic Area Name")

In [156]:
# State-ZIP mapping
# Rename USPS_ZIP_PREF_STATE to State for readability
zip2st_column_renames = {"USPS_ZIP_PREF_STATE": "State"}
df_zip2st_edited = df_zip2st_keep.rename(columns=zip2st_column_renames)

#### Saving edited pandas dataframes as pickle files for later use

In [168]:
# Naming dataframes: attach each frame's variable name as a `.name` attribute.
# The SDOH label is set on df_sdoh_edited (the frame that is saved below), not on df_sdoh_keep.
# NOTE(review): `.name` here is an ad-hoc Python attribute on the DataFrame object - confirm
# that downstream code can still read it after the frames are pickled and reloaded.
df_income_1901_edited.name = "df_income_1901_edited"
df_health_outcomes_edited.name = "df_health_outcomes_edited"
df_sdoh_edited.name = "df_sdoh_edited"
df_state_politics_edited.name = "df_state_politics_edited"
df_state_public_spend_edited.name = "df_state_public_spend_edited"
df_acs_edited.name = "df_acs_edited"
df_zip2st_edited.name = "df_zip2st_edited"

In [169]:
# Write every edited frame to "<output_path>\<frame name>.pkl", in the order listed.
frames_to_pickle = {
    "df_income_1901_edited": df_income_1901_edited,
    "df_health_outcomes_edited": df_health_outcomes_edited,
    "df_sdoh_edited": df_sdoh_edited,
    "df_state_politics_edited": df_state_politics_edited,
    "df_state_public_spend_edited": df_state_public_spend_edited,
    "df_acs_edited": df_acs_edited,
    "df_zip2st_edited": df_zip2st_edited,
}
for pickle_stem, edited_frame in frames_to_pickle.items():
    edited_frame.to_pickle(output_path + "\\" + pickle_stem + ".pkl")