## CDC data on race and ethnicity

### load packages

In [1]:
%load_ext lab_black

In [2]:
import numpy as np
import pandas as pd
import altair as alt
import altair_grid as altgrid

# Register the theme callable shipped with altair_grid under the name "grid",
# then make it the active Altair theme for every chart rendered below.
alt.themes.register("grid", altgrid.theme)
alt.themes.enable("grid")

ThemeRegistry.enable('grid')

In [3]:
# Raise the pandas display limits so frames of up to 1000 columns/rows are
# rendered in full instead of being truncated with an ellipsis.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

#### read in data grouped by year and cause of death

In [4]:
# Load the deaths-by-year/cause/race export in a single chain:
#   * "Year" is read as text so it can be compared against string years later,
#   * the "Year Code" column is discarded,
#   * rows without a death count are removed,
#   * surrounding whitespace is stripped from "Year".
df = (
    pd.read_table("data/raw/byyear_cause_race.txt", dtype={"Year": str})
    .drop(columns=["Year Code"])
    .dropna(subset=["Deaths"])
    .assign(Year=lambda frame: frame["Year"].str.strip())
)

In [5]:
# Snake-case the column headers: lower-case them and turn spaces into underscores.
df.columns = [header.lower().replace(" ", "_") for header in df.columns]

#### Causes of death that occur 42 days or more post-birth and are therefore not counted in CDC surveillance

In [6]:
# Cause-of-death codes O96 and O97, each followed by its .0 / .1 / .9 subcodes.
# Rows carrying one of these codes are excluded wherever `post42` is used as a filter.
post42 = [
    f"{parent_code}{subcode}"
    for parent_code in ("O96", "O97")
    for subcode in ("", ".0", ".1", ".9")
]

#### make combined race/hispanic origin

In [7]:
# Build a single combined race / Hispanic-origin label for every deaths row.
#
# NOTE: the names are historical and read the wrong way round -- `values` holds
# the boolean masks and `conditions` holds the labels assigned where each mask
# is True. They are kept unchanged so any cell that refers to them still works.
non_hispanic = df["hispanic_origin"] == "Not Hispanic or Latino"
not_total = df["notes"] != "Total"

values = [
    (df["race"] == "White") & non_hispanic,
    (df["race"] == "Black or African American") & non_hispanic,
    (df["race"] == "Asian or Pacific Islander") & non_hispanic,
    (df["race"] == "American Indian or Alaska Native") & non_hispanic,
    # Hispanic rows of any recorded race, skipping "Total" rows and rows with no race.
    (df["hispanic_origin"] == "Hispanic or Latino") & not_total & df["race"].notna(),
    # Rows with neither race nor Hispanic origin filled in (and not a "Total" row).
    df["race"].isna() & df["hispanic_origin"].isna() & not_total,
]

conditions = [
    "White",
    "Black",
    "AAPI",
    "American Indian or Alaska Native",
    "Hispanic or Latino",
    "Overall",
]

# np.select's implicit default is the integer 0. With string labels, older NumPy
# silently coerced that to the string "0", while NumPy 2 raises a TypeError
# because int and str have no common dtype. Passing the string explicitly keeps
# the "0" label for unmatched rows (later cells filter on it) on every version.
df["race_whispanicorigin"] = np.select(values, conditions, default="0")

#### summarize deaths by race/ethnicity to compare to 2020 [report](https://www.cdc.gov/nchs/data/hestat/maternal-mortality/2020/maternal-mortality-rates-2020.htm) and make sure your counts match

In [8]:
# Total 2019 deaths per race / Hispanic-origin label, leaving out the post42
# cause codes. The string "sum" is used instead of the builtin `sum`: pandas
# >= 2.1 warns that builtin callables will stop being mapped to the NaN-skipping
# groupby sum, and the string pins that behaviour.
(
    df[(~df["cause_of_death_code"].isin(post42)) & (df["year"] == "2019")]
    .groupby("race_whispanicorigin")
    .agg(deaths=pd.NamedAgg(column="deaths", aggfunc="sum"))
    .reset_index()
)

Unnamed: 0,race_whispanicorigin,deaths
0,0,3.0
1,AAPI,39.0
2,American Indian or Alaska Native,15.0
3,Black,242.0
4,Hispanic or Latino,112.0
5,White,343.0


#### Numbers are not quite matching
I'm seeing 3 extra deaths for Black mothers and 6 for white mothers compared to CDC surveillance.<br>
I also checked the figures for 2018 and 2019. In 2018, my counts match for Hispanic/Latino mothers but are higher by four (in my calculations) for Black and white mothers. In 2019, they match for white and Hispanic/Latino mothers but are up by one for Black mothers.<br>
The CDC doesn't present numbers for AAPI or Native mothers, so I can't fact-check those numbers and am relying entirely on my own calculations. I want to be confident they are correct if I ever publish a visualization with this data.

#### Checking to see if I erroneously included any causes of death that weren't in the cdc list of causes

In [9]:
# List the 2020 cause codes (post42 codes excluded) with three or fewer deaths
# recorded for Black mothers, largest counts first, with a fresh 0..n-1 index.
small_counts_2020_black = (
    (df["year"] == "2020")
    & (df["deaths"] <= 3)
    & (df["race_whispanicorigin"] == "Black")
    & ~df["cause_of_death_code"].isin(post42)
)
(
    df.loc[
        small_counts_2020_black,
        ["race_whispanicorigin", "cause_of_death_code", "deaths"],
    ]
    .sort_values("deaths", ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,race_whispanicorigin,cause_of_death_code,deaths
0,Black,O88.1,3.0
1,Black,O75.9,3.0
2,Black,O14.2,3.0
3,Black,O15.9,3.0
4,Black,O75.4,3.0
5,Black,O22.3,3.0
6,Black,O00.9,3.0
7,Black,O00.1,2.0
8,Black,O24.0,2.0
9,Black,O98.7,2.0


#### Natality data

In [10]:
# Load the births-by-race/year export, reading "Year" as text, then discard the
# code columns that are not used in this notebook.
natality_df = pd.read_table("data/raw/natality_byrace_year.txt", dtype={"Year": str})
natality_df = natality_df.drop(
    columns=[
        "Year Code",
        "Mother's Hispanic Origin Code",
        "Mother's Single Race Code",
    ]
)

In [11]:
# Keep only the rows that actually carry a birth count.
natality_df = natality_df.loc[natality_df["Births"].notna()]

In [12]:
# Snake-case the headers: lower-case, spaces to underscores, apostrophes removed
# (e.g. "Mother's Single Race" becomes "mothers_single_race").
natality_df.columns = [
    header.lower().replace(" ", "_").replace("'", "")
    for header in natality_df.columns
]

In [13]:
# Build the combined race / Hispanic-origin label for every births row, using
# the same label names as the deaths table so the two can be merged later.

# The natality file splits this group into two single-race categories.
aapi = ["Native Hawaiian or Other Pacific Islander", "Asian"]

mother_race = natality_df["mothers_single_race"]
mother_not_hispanic = (
    natality_df["mothers_hispanic_origin"] == "Not Hispanic or Latino"
)
not_total_row = natality_df["notes"] != "Total"

# One boolean mask per label, in the same order as `birth_values`.
birth_conditions = [
    (mother_race == "White") & mother_not_hispanic & not_total_row,
    (mother_race == "Black or African American")
    & mother_not_hispanic
    & not_total_row,
    mother_race.isin(aapi) & mother_not_hispanic & not_total_row,
    (mother_race == "American Indian or Alaska Native")
    & mother_not_hispanic
    & not_total_row,
    # Hispanic mothers of any race.
    (natality_df["mothers_hispanic_origin"] == "Hispanic or Latino") & not_total_row,
]

birth_values = [
    "White",
    "Black",
    "AAPI",
    "American Indian or Alaska Native",
    "Hispanic or Latino",
]

# np.select's implicit default is the integer 0. With string labels, older NumPy
# silently coerced that to the string "0", while NumPy 2 raises a TypeError
# because int and str have no common dtype. Passing the string explicitly keeps
# the "0" label for unmatched rows on every NumPy version.
natality_df["race_whispanicorigin"] = np.select(
    birth_conditions, birth_values, default="0"
)

#### QC against 2020 maternal mortality live birth numbers


In [14]:
# Total births per race / Hispanic-origin label and year.
# The string "sum" is used instead of the builtin `sum`: pandas >= 2.1 warns that
# builtin callables will stop being mapped to the NaN-skipping groupby sum, and
# the string pins that behaviour.
births_byyear = (
    natality_df.groupby(["race_whispanicorigin", "year"])
    .agg(births=pd.NamedAgg(column="births", aggfunc="sum"))
    .reset_index()
)

#### The number of live births matches up, so I'm going to sketch out the maternal mortality rates over time
Even though there are some small discrepancies in the death counts, the graph will still give a sense of the rates over time, until I can reconcile those differences

In [15]:
# Total deaths per race / Hispanic-origin label and year, leaving out the post42
# cause codes. The string "sum" is used instead of the builtin `sum`: pandas
# >= 2.1 warns that builtin callables will stop being mapped to the NaN-skipping
# groupby sum, and the string pins that behaviour.
deaths_byyear = (
    df[~df["cause_of_death_code"].isin(post42)]
    .groupby(["race_whispanicorigin", "year"])
    .agg(deaths=pd.NamedAgg(column="deaths", aggfunc="sum"))
    .reset_index()
)

In [16]:
# Years to put on the chart.
years = ["2016", "2017", "2018", "2019", "2020"]

# Keep the labelled groups (drop the "0" catch-all label) for those years, then
# attach the matching birth totals. The merge is an inner join, so only
# label/year pairs present in both tables survive; it already returns a new frame.
labelled_in_range = (deaths_byyear["race_whispanicorigin"] != "0") & (
    deaths_byyear["year"].isin(years)
)
rates = deaths_byyear[labelled_in_range].merge(
    births_byyear, on=["year", "race_whispanicorigin"]
)

In [17]:
# Deaths per 100,000 births for each label/year row.
deaths_per_birth = rates["deaths"] / rates["births"]
rates["rate"] = 100000 * deaths_per_birth

In [18]:
# Line chart of the death rate over time, one line per race / Hispanic-origin
# label. Titles are set explicitly so the figure is readable on its own rather
# than showing raw column names; encoding types are spelled out instead of
# being inferred from the frame's dtypes.
alt.Chart(rates).mark_line().encode(
    x=alt.X("year:O", title="Year"),
    y=alt.Y("rate:Q", title="Deaths per 100,000 live births"),
    color=alt.Color("race_whispanicorigin:N", title="Race / Hispanic origin"),
).properties(title="Maternal mortality rate by race and Hispanic origin")