# <span style="font-family:Courier New; color:#CCCCCC">**Health Insurance Preprocessing**</span>

In [61]:
import pandas as pd

## <span style="font-family:Courier New; color:#336666">**Reading and Selecting Data**</span>

In [62]:
# Read the raw ACS 2018 health-insurance coverage estimates into a DataFrame
raw_csv_path = "../data/raw/exogenous_data/acs_2018_health_insurance_coverage_estimates.csv"
df = pd.read_csv(raw_csv_path)

# Preview the first rows to check the columns that were loaded
df.head()

Unnamed: 0,geo_id,state,state_name,acs_variable,estimate,margin_of_error,label,concept,estimate_type,coverage_type,age_group,labor_force,employed
0,1,AL,Alabama,DP03_0096,4307566.0,8603.0,Estimate!!HEALTH INSURANCE COVERAGE!!Civilian ...,Selected Economic Characteristics,population_estimate,With health insurance coverage,overall,,
1,1,AL,Alabama,DP03_0096P,90.0,0.2,Percent Estimate!!HEALTH INSURANCE COVERAGE!!C...,Selected Economic Characteristics,percent_estimate,With health insurance coverage,overall,,
2,1,AL,Alabama,DP03_0097,3221648.0,15028.0,Estimate!!HEALTH INSURANCE COVERAGE!!Civilian ...,Selected Economic Characteristics,population_estimate,With private health insurance,overall,,
3,1,AL,Alabama,DP03_0097P,67.3,0.3,Percent Estimate!!HEALTH INSURANCE COVERAGE!!C...,Selected Economic Characteristics,percent_estimate,With private health insurance,overall,,
4,1,AL,Alabama,DP03_0098,1745707.0,10498.0,Estimate!!HEALTH INSURANCE COVERAGE!!Civilian ...,Selected Economic Characteristics,population_estimate,With public coverage,overall,,


### <span style="font-family:Courier New; color:#336633">**Data Selection**</span>

<span style="font-family:Courier New">Once again, we drop the variables `geo_id` and `state`. We also drop `label`, since it can be inferred from the other variables (it’s redundant). We remove the `concept` variable, which has only a single value that merely provides context. Finally, the `employed` variable has 52.94% missing values—more than half—so we drop it as well.</span>

In [63]:
# Discard the columns we do not need, then let the full state name take over
# the `state` column name (the abbreviated `state` column is dropped first).
df = (
    df
    .drop(columns=['geo_id', 'state', 'label', 'concept', 'employed'])
    .rename(columns={'state_name': 'state'})
)

In [64]:
# Percentage of missing values in each column
NAs = df.isna().mean().mul(100)
NAs

state               0.000000
acs_variable        0.000000
estimate            0.000000
margin_of_error     0.000000
estimate_type       0.000000
coverage_type       0.000000
age_group           0.000000
labor_force        29.411765
dtype: float64

<span style="font-family:Courier New">To impute the missing values in `labor_force`, we create an Unknown category to avoid biasing the data.</span>

In [65]:
# Replace missing labor-force labels with an explicit "Unknown" category;
# the per-column dict leaves every other column untouched.
df = df.fillna(value={'labor_force': "Unknown"})

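<span style="font-family:Courier New">We keep only the percent-estimate variables and discard the raw population estimates, because the magnitude of a raw count depends on the size of each state's population and is therefore not comparable across states.</span>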

In [66]:
# Percent-estimate variables are the ones whose ACS code ends in 'P'.
# Missing codes count as non-matching (na=False), so they are filtered out.
acs_codes = df['acs_variable']
mask = acs_codes.str.endswith('P', na=False)

# Keep only the rows flagged by the mask
df = df.loc[mask]

In [67]:
# Preview the first 17 remaining rows
# NOTE(review): 17 presumably matches the number of percent variables per state — confirm
df.head(n=17)

Unnamed: 0,state,acs_variable,estimate,margin_of_error,estimate_type,coverage_type,age_group,labor_force
1,Alabama,DP03_0096P,90.0,0.2,percent_estimate,With health insurance coverage,overall,Unknown
3,Alabama,DP03_0097P,67.3,0.3,percent_estimate,With private health insurance,overall,Unknown
5,Alabama,DP03_0098P,36.5,0.2,percent_estimate,With public coverage,overall,Unknown
7,Alabama,DP03_0099P,10.0,0.2,percent_estimate,No health insurance coverage,overall,Unknown
9,Alabama,DP03_0101P,3.3,0.2,percent_estimate,No health insurance coverage,under 19 years,Unknown
11,Alabama,DP03_0105P,87.6,0.3,percent_estimate,With health insurance coverage,19 to 64 years,In labor force
13,Alabama,DP03_0106P,84.3,0.3,percent_estimate,With private health insurance,19 to 64 years,In labor force
15,Alabama,DP03_0107P,6.2,0.2,percent_estimate,With public coverage,19 to 64 years,In labor force
17,Alabama,DP03_0108P,12.4,0.3,percent_estimate,No health insurance coverage,19 to 64 years,In labor force
19,Alabama,DP03_0110P,57.1,1.2,percent_estimate,With health insurance coverage,19 to 64 years,In labor force


In [68]:
# Percent-estimate ACS variables kept as overall coverage features, mapped to
# the column names they receive in the output table.
mapping = {
    "DP03_0097P": "private_coverage",
    "DP03_0098P": "public_coverage",
    "DP03_0099P": "no_coverage"
}

# Pair of variables whose difference gives the coverage gap by labor-force status,
# ordered as (minuend, subtrahend).
# DP03_0105P is "With health insurance coverage" for ages 19 to 64 with
# labor_force == "In labor force".
# NOTE(review): DP03_0115P is presumably the same rate for people not in the
# labor force — confirm against the ACS DP03 variable list.
labor_vars = ("DP03_0105P", "DP03_0115P")
lf_first_var, lf_second_var = labor_vars

# One row per state with one column per coverage type (private, public, none).
# `pivot` raises if a (state, variable) pair occurs more than once, so
# duplicated input rows cannot be silently aggregated.
df_cov = (
    df[df["acs_variable"].isin(mapping.keys())]
    .assign(variable=lambda x: x["acs_variable"].map(mapping))
    .pivot(index="state", columns="variable", values="estimate")
    .reset_index()
)

# One row per state with the two labor-force coverage rates side by side
df_lf = (
    df[df["acs_variable"].isin(list(labor_vars))]
    .pivot(index="state", columns="acs_variable", values="estimate")
    .reset_index()
)

# Coverage gap in percentage points; Series.round is the vectorized equivalent
# of the builtin round() and trims floating-point noise from the subtraction.
df_lf["labor_cov_diff"] = (df_lf[lf_first_var] - df_lf[lf_second_var]).round(3)

# Combine both tables on the state name. The join type is spelled out (states
# missing from either table are dropped) and `validate` makes pandas raise a
# MergeError if `state` is ever not unique on both sides, instead of silently
# multiplying rows.
df_final = df_cov.merge(
    df_lf[["state", "labor_cov_diff"]],
    on="state",
    how="inner",
    validate="one_to_one",
)

In [69]:
# Display the merged table: `state`, the pivoted coverage columns, and `labor_cov_diff`
df_final

Unnamed: 0,state,no_coverage,private_coverage,public_coverage,labor_cov_diff
0,Alabama,10.0,67.3,36.5,5.1
1,Alaska,14.4,65.5,31.0,5.6
2,Arizona,10.9,62.8,38.3,2.4
3,Arkansas,9.0,61.5,42.5,1.4
4,California,8.5,63.4,37.2,2.2
5,Colorado,8.1,70.5,32.0,1.3
6,Connecticut,5.6,71.8,34.4,1.5
7,Delaware,6.0,72.1,37.1,1.6
8,District of Columbia,4.0,69.6,36.0,1.9
9,Florida,13.5,61.9,36.9,4.1


In [70]:
# Write the merged table to the preprocessed data folder;
# index=False keeps the DataFrame index out of the CSV.
output_csv_path = "../data/preprocessed/acs_2018_health_insurance_coverage_estimates_cleaned.csv"
df_final.to_csv(output_csv_path, index=False)