#### setup

In [1]:
%load_ext lab_black

In [2]:
import requests
import urllib.request
import numpy as np
import pandas as pd

In [3]:
import altair as alt
import altair_grid as altgrid

# Register the altair_grid theme under one name and make it the active theme.
_theme_name = "grid"
alt.themes.register(_theme_name, altgrid.theme)
alt.themes.enable(_theme_name)

ThemeRegistry.enable('grid')

In [4]:
import os

from datawrapper import Datawrapper

# Never commit API tokens to a notebook: read the Datawrapper token from the
# environment instead. A missing variable fails loudly with a KeyError.
# NOTE(review): the token that used to be hardcoded here should be revoked.
dw = Datawrapper(access_token=os.environ["DATAWRAPPER_ACCESS_TOKEN"])

In [5]:
# scope = ['https://spreadsheets.google.com/feeds']
# credentials = ServiceAccountCredentials.from_json_keyfile_name('jupyter-integration-349314-25735d35924b.json', scope)
# gc = gspread.authorize(credentials)

In [6]:
# spreadsheet_key = "1sCb1YbQ3-1oiL-cnK0yCkwFcQpvpf0efIngAUeC1ixo"
# book = gc.open_by_key(spreadsheet_key)

In [7]:
# Show up to 1000 columns and 1000 rows when a DataFrame is displayed.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

### maternal death data

In [8]:
# Load the deaths-by-year/cause/race export. "Year" is read as text so it can
# be compared against string literals such as "2019" in later cells.
df = pd.read_table("data/raw/byyear_cause_race.txt", dtype={"Year": str}).drop(
    columns=["Year Code"]
)
# Keep only rows that carry a death count. Take an explicit copy so the column
# assignment below writes to an independent frame rather than to a filtered
# slice of the raw table (which raises SettingWithCopyWarning).
df = df[df["Deaths"].notna()].copy()
df["Year"] = df["Year"].str.strip()

In [9]:
# Normalise column names: lower-case, with spaces replaced by underscores.
df.columns = [column.lower().replace(" ", "_") for column in df.columns]

##### group of causes that are 42 days or more post-birth and therefore not counted in CDC surveillance

In [10]:
# Cause-of-death codes O96 and O97, each with its .0/.1/.9 subcodes; later cells
# exclude these codes from the death counts.
post42 = [
    base + suffix for base in ("O96", "O97") for suffix in ("", ".0", ".1", ".9")
]

#### make combined race/hispanic origin

In [11]:
# Build a single race/Hispanic-origin label per row of the deaths table.
#
# NOTE: the two list names are swapped relative to what they hold -- `values`
# contains the boolean masks and `conditions` contains the labels assigned
# where the mask at the same position is True. The names are kept as they are.
non_hispanic = df["hispanic_origin"] == "Not Hispanic or Latino"
not_total = df["notes"] != "Total"

values = [
    (df["race"] == "White") & non_hispanic,
    (df["race"] == "Black or African American") & non_hispanic,
    (df["race"] == "Asian or Pacific Islander") & non_hispanic,
    (df["race"] == "American Indian or Alaska Native") & non_hispanic,
    # Hispanic or Latino with a recorded race, excluding "Total" rows.
    (df["hispanic_origin"] == "Hispanic or Latino") & not_total & df["race"].notna(),
    # Neither race nor Hispanic origin recorded, excluding "Total" rows.
    df["race"].isna() & df["hispanic_origin"].isna() & not_total,
]

conditions = [
    "White",
    "Black",
    "AAPI",
    "American Indian or Alaska Native",
    "Hispanic or Latino",
    "Overall",
]

# Rows that match no mask are labelled "0"; later cells filter on that exact
# string. The default is passed explicitly as a string because the implicit
# integer default (0) cannot be promoted to a common dtype with the string
# labels on newer NumPy versions.
df["race_whispanicorigin"] = np.select(values, conditions, default="0")

In [12]:
# 2019 deaths per race/Hispanic-origin group, excluding the post-42-day codes.
# The aggregation is named by string ("sum") because passing the builtin `sum`
# to a groupby aggregation is deprecated in pandas.
(
    df[(~df["cause_of_death_code"].isin(post42)) & (df["year"] == "2019")]
    .groupby("race_whispanicorigin")
    .agg(deaths=pd.NamedAgg(column="deaths", aggfunc="sum"))
    .reset_index()
)

Unnamed: 0,race_whispanicorigin,deaths
0,0,3.0
1,AAPI,39.0
2,American Indian or Alaska Native,15.0
3,Black,242.0
4,Hispanic or Latino,112.0
5,White,343.0


##### number is closely matching CDC figures and accurate for hispanic/latino
##### but I'm seeing 3 extra deaths for black mothers and 6 for white mothers compared to cdc surveillance
##### https://www.cdc.gov/nchs/data/hestat/maternal-mortality/2020/maternal-mortality-rates-2020.htm
##### I also checked the figures for 2018 and 2019. In 2018, my counts match for Hispanic/Latino mothers but are higher by four (in my calculations) for Black and white mothers. In 2019, they match for white and Hispanic/Latino mothers but are higher by one.
##### this is an important qc because the CDC doesn't present numbers for AAPI or Native mothers, so I can't fact-check those numbers. I have to be confident I'm counting correctly

##### here, you can see all the cause of death codes match those included by the CDC (or any with few enough deaths to explain the discrepancy)

In [13]:
# 2020 rows for Black mothers with at most 3 deaths per cause code (post-42-day
# codes excluded), largest counts first, with a fresh 0..n-1 index.
(
    df.loc[
        (df["year"] == "2020")
        & (df["deaths"] <= 3)
        & (df["race_whispanicorigin"] == "Black")
        & (~df["cause_of_death_code"].isin(post42)),
        ["race_whispanicorigin", "cause_of_death_code", "deaths"],
    ]
    .sort_values("deaths", ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,race_whispanicorigin,cause_of_death_code,deaths
0,Black,O88.1,3.0
1,Black,O75.9,3.0
2,Black,O14.2,3.0
3,Black,O15.9,3.0
4,Black,O75.4,3.0
5,Black,O22.3,3.0
6,Black,O00.9,3.0
7,Black,O00.1,2.0
8,Black,O24.0,2.0
9,Black,O98.7,2.0


##### so moving on, that's something to resolve if I ever publish with this data

### Natality data

In [14]:
# Load the births-by-race/year export with "Year" kept as text, then discard
# the code columns that are not used further on.
natality_df = pd.read_table("data/raw/natality_byrace_year.txt", dtype={"Year": str})
natality_df = natality_df.drop(
    columns=["Year Code", "Mother's Hispanic Origin Code", "Mother's Single Race Code"]
)

In [15]:
# Keep only rows that carry a birth count. The explicit copy makes the result
# an independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
natality_df = natality_df[natality_df["Births"].notna()].copy()

In [16]:
# Normalise column names: lower-case, spaces to underscores, apostrophes removed.
natality_df.columns = [
    column.lower().replace(" ", "_").replace("'", "")
    for column in natality_df.columns
]

In [17]:
# Build the same combined race/Hispanic-origin label for the births table.
aapi = ["Native Hawaiian or Other Pacific Islander", "Asian"]

# Masks shared by several groups below.
births_not_total = natality_df["notes"] != "Total"
births_non_hispanic = (
    natality_df["mothers_hispanic_origin"] == "Not Hispanic or Latino"
) & births_not_total

# One boolean mask per label in `birth_values`, in the same order.
birth_conditions = [
    (natality_df["mothers_single_race"] == "White") & births_non_hispanic,
    (natality_df["mothers_single_race"] == "Black or African American")
    & births_non_hispanic,
    natality_df["mothers_single_race"].isin(aapi) & births_non_hispanic,
    (natality_df["mothers_single_race"] == "American Indian or Alaska Native")
    & births_non_hispanic,
    (natality_df["mothers_hispanic_origin"] == "Hispanic or Latino")
    & births_not_total,
]

birth_values = [
    "White",
    "Black",
    "AAPI",
    "American Indian or Alaska Native",
    "Hispanic or Latino",
]

# Rows that match no mask (including the "Total" rows) are labelled "0". The
# default is passed explicitly as a string because the implicit integer
# default (0) cannot be promoted to a common dtype with the string labels on
# newer NumPy versions.
natality_df["race_whispanicorigin"] = np.select(
    birth_conditions, birth_values, default="0"
)

#### QC against 2020 maternal mortality work
Results show that the natality data is a precise match; the only differences are the small discrepancies in the death counts.

In [18]:
# Total births per race/Hispanic-origin group and year. The aggregation is
# named by string ("sum") because passing the builtin `sum` to a groupby
# aggregation is deprecated in pandas.
births_byyear = (
    natality_df.groupby(["race_whispanicorigin", "year"])
    .agg(births=pd.NamedAgg(column="births", aggfunc="sum"))
    .reset_index()
)

In [19]:
# Total deaths per race/Hispanic-origin group and year, excluding the
# post-42-day codes. The aggregation is named by string ("sum") because passing
# the builtin `sum` to a groupby aggregation is deprecated in pandas.
deaths_byyear = (
    df[~df["cause_of_death_code"].isin(post42)]
    .groupby(["race_whispanicorigin", "year"])
    .agg(deaths=pd.NamedAgg(column="deaths", aggfunc="sum"))
    .reset_index()
)

In [20]:
# QC table: deaths joined to births for the three years being checked.
years = ["2020", "2019", "2018"]
test = pd.merge(
    deaths_byyear[deaths_byyear["year"].isin(years)],
    births_byyear[births_byyear["year"].isin(years)],
    on=["year", "race_whispanicorigin"],
)

In [21]:
# Deaths per 100,000 births.
test["rate"] = test["deaths"].div(test["births"]).mul(100000)

In [22]:
# Scratch check of a five-number total.
# NOTE(review): 36 and 13 match the 2020 AAPI and American Indian or Alaska
# Native death counts in the QC table; the other three are presumably the
# remaining groups' 2020 counts -- confirm.
sum((358, 158, 296, 13, 36))

861

In [23]:
# Display the QC table: deaths, births and rate per group and year.
test

Unnamed: 0,race_whispanicorigin,year,deaths,births,rate
0,0,2018,1.0,7701118.0,0.012985
1,0,2019,3.0,7615177.0,0.039395
2,AAPI,2018,37.0,250274.0,14.783797
3,AAPI,2019,39.0,248539.0,15.691702
4,AAPI,2020,36.0,228694.0,15.741559
5,American Indian or Alaska Native,2018,10.0,29092.0,34.373711
6,American Indian or Alaska Native,2019,15.0,28450.0,52.724077
7,American Indian or Alaska Native,2020,13.0,26813.0,48.483944
8,Black,2018,210.0,552029.0,38.04148
9,Black,2019,242.0,548075.0,44.154541


### Here's the causes and codes I'm treating as possibly prevented by abortion
Ectopic pregnancy: O00.9 <br>
Placenta praevia with haemorrhage: O40.1 <br>
Spontaneous abortion (miscarriage): O03.x (several subcodes) <br>
Haemorrhage early in pregnancy: O20.9


In [26]:
# Cause-of-death codes treated as possibly prevented by abortion.
# "O3.8" was a typo for "O03.8" (the other miscarriage codes are all O03.x);
# the malformed code would never match a row.
# NOTE(review): "O40.1" is listed for placenta praevia with haemorrhage, but
# ICD-10 appears to file that under O44.1 -- confirm before relying on it.
abortion_causes = [
    "O00.9",
    "O40.1",
    "O03.1",
    "O03.3",
    "O03.4",
    "O03.8",
    "O20.9",
]

### for now, focus on visualizing different racial groups

In [27]:
# Deaths joined to births for 2016-2020, leaving out the rows labelled "0".
years = ["2016", "2017", "2018", "2019", "2020"]
rates = pd.merge(
    deaths_byyear[
        deaths_byyear["year"].isin(years)
        & (deaths_byyear["race_whispanicorigin"] != "0")
    ],
    births_byyear,
    on=["year", "race_whispanicorigin"],
)

In [28]:
# Deaths per 100,000 births.
rates["rate"] = rates["deaths"].div(rates["births"]).mul(100000)

In [29]:
# Line chart of the death rate per year, one line per race/Hispanic-origin
# group. Titles are set explicitly so the figure can be read on its own.
alt.Chart(rates).mark_line().encode(
    x=alt.X("year:O", title="Year"),
    y=alt.Y("rate", title="Deaths per 100,000 births"),
    color=alt.Color("race_whispanicorigin", title="Race / Hispanic origin"),
).properties(title="Maternal death rate by race and Hispanic origin")