Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
benhammondmusic committed Apr 23, 2024
2 parents e966414 + 498dbcb commit ed7fd14
Show file tree
Hide file tree
Showing 11 changed files with 258 additions and 60 deletions.
13 changes: 13 additions & 0 deletions data/maternal_mortality/Table.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
year_id,race_group,location_name,maternal_deaths_estimated_total,live_births_estimated_total
1999,Non-Hispanic American Indian and Alaska Native,National,7,37800
2019,Non-Hispanic American Indian and Alaska Native,National,23,33300
1999,"Non-Hispanic Asian, Native Hawaiian, or Other Pacific Islander",National,17,177900
2019,"Non-Hispanic Asian, Native Hawaiian, or Other Pacific Islander",National,56,268600
1999,Non-Hispanic Black,National,186,593200
2019,Non-Hispanic Black,National,393,581400
1999,Hispanic and any race,National,74,764100
2019,Hispanic and any race,National,184,886900
1999,Non-Hispanic White,National,220,2392200
2019,Non-Hispanic White,National,551,1979000
1999,All racial and ethnic groups,National,505,3965200
2019,All racial and ethnic groups,National,1210,3749100
53 changes: 53 additions & 0 deletions data/maternal_mortality/manual_count_table_national_regional.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
Census region,Maternal death No. (95% UI) estimates,Live birth estimates in thousands,year_id,race_group
US,7 (5-11),37.8 (32.0-44.5),1999,American Indian and Alaska Native
Midwest,1 (1-3),7 (5.2-9.4),1999,American Indian and Alaska Native
Northeast,0 (0-1),1.6 (1.2-2.1),1999,American Indian and Alaska Native
South,1 (1-3),9.6 (7.8-12.0),1999,American Indian and Alaska Native
West,4 (2-8),19.6 (14.6-25.8),1999,American Indian and Alaska Native
US,23 (16-35),33.3 (29.9-37.4),2019,American Indian and Alaska Native
Midwest,6 (3-12),6.5 (4.8-8.8),2019,American Indian and Alaska Native
Northeast,0 (0-2),1.1 (0.9-1.3),2019,American Indian and Alaska Native
South,3 (2-6),9.8 (7.4-12.8),2019,American Indian and Alaska Native
West,13 (7-24),15.9 (14.1-18.0),2019,American Indian and Alaska Native
US,17 (12-23),177.9 (161.7-198.3),1999,"Asian, Native Hawaiian, or Other Pacific Islander"
Midwest,2 (1-3),23.2 (20.3-27),1999,"Asian, Native Hawaiian, or Other Pacific Islander"
Northeast,3 (1-6),36.4 (32.0-40.9),1999,"Asian, Native Hawaiian, or Other Pacific Islander"
South,4 (2-6),33.6 (29.2-38.5),1999,"Asian, Native Hawaiian, or Other Pacific Islander"
West,9 (5-14),84.8 (68.6-103.9),1999,"Asian, Native Hawaiian, or Other Pacific Islander"
US,56 (42-76),268.6 (257.8-280),2019,"Asian, Native Hawaiian, or Other Pacific Islander"
Midwest,5 (3-10),36.1 (32.6-40.3),2019,"Asian, Native Hawaiian, or Other Pacific Islander"
Northeast,6 (4-11),53 (48.8-57.7),2019,"Asian, Native Hawaiian, or Other Pacific Islander"
South,17 (10-25),64.9 (60.1-69.8),2019,"Asian, Native Hawaiian, or Other Pacific Islander"
West,28 (17-44),114.6 (107.6-122.2),2019,"Asian, Native Hawaiian, or Other Pacific Islander"
US,186 (152-225),593.2 (543.2-648.3),1999,Black
Midwest,30 (20-42),116.4 (103.3-130.5),1999,Black
Northeast,29 (18-45),100.3 (88.7-112.7),1999,Black
South,113 (87-143),328.5 (291.8-371.1),1999,Black
West,14 (9-20),47.9 (39.1-58.8),1999,Black
US,393 (334-463),581.4 (556.4-609.4),2019,Black
Midwest,50 (32-73),118 (106.6-131.8),2019,Black
Northeast,73 (51-100),81.2 (74.3-88.6),2019,Black
South,245 (196-298),332.1 (312.3-353.1),2019,Black
West,26 (15-42),50 (43.1-57.4),2019,Black
US,74 (59-93),764.1 (695.1-844.2),1999,Hispanic
Midwest,6 (3-10),69.2 (60.7-79.1),1999,Hispanic
Northeast,12 (7-19),99 (87.9-111.2),1999,Hispanic
South,25 (18-34),246.4 (210.2-293.3),1999,Hispanic
West,31 (21-44),349.5 (286.0-424.4),1999,Hispanic
US,184 (151-230),886.9 (847.1-930.3),2019,Hispanic
Midwest,13 (8-22),84.8 (75.2-95.4),2019,Hispanic
Northeast,37 (24-55),121.4 (111.9-132.6),2019,Hispanic
South,76 (54-99),357.7 (326.9-388.9),2019,Hispanic
West,59 (39-91),323 (299.8-345.3),2019,Hispanic
US,220 (181-264),2392.2 (2225.1-2579.6),1999,White
Midwest,62 (45-84),667.6 (597.2-744.6),1999,White
Northeast,36 (22-55),449.7 (399.2-502.5),1999,White
South,86 (64-110),822 (732.2-920.6),1999,White
West,36 (25-52),452.9 (369.3-552.2),1999,White
US,551 (465-635),1979 (1918.7-2041.5),2019,White
Midwest,147 (109-190),539.6 (511.1-572.1),2019,White
Northeast,80 (56-111),341.3 (319.3-364.1),2019,White
South,237 (189-293),715.2 (676.9-753.9),2019,White
West,87 (60-122),382.9 (359.3-409.0),2019,White
US,505 (424-593),3965.2 (3709.0-4267.2),1999,All racial and ethnic groups
US,1210 (1060-1370),3749.1 (3646.8-3854.9),2019,All racial and ethnic groups
13 changes: 0 additions & 13 deletions data/maternal_mortality/manual_national_table.csv

This file was deleted.

40 changes: 40 additions & 0 deletions frontend/src/data/config/MetricConfigPDOH.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,16 @@ export type PDOHMetricId =
| 'ahr_population_pct'
| 'cawp_population_pct'
| 'incarceration_population_pct'
| 'incarceration_population_estimated_total'
| 'jail_estimated_total'
| 'jail_pct_relative_inequity'
| 'jail_pct_share'
| 'jail_per_100k'
| 'pct_share_of_state_leg'
| 'pct_share_of_us_congress'
| 'pct_share_of_women_state_leg'
| 'pct_share_of_women_us_congress'
| 'prison_estimated_total'
| 'prison_pct_relative_inequity'
| 'prison_pct_share'
| 'prison_per_100k'
Expand Down Expand Up @@ -234,13 +237,31 @@ export const INCARCERATION_METRICS: DataTypeConfig[] = [
},
dataTableTitle: 'Breakdown summary for people in prison',
metrics: {
sub_population_count: {
chartTitle: '',
metricId: 'incarceration_population_estimated_total',
shortLabel: 'Total Population (Ages 15-64)',
type: 'count',
},
per100k: {
metricId: 'prison_per_100k',
chartTitle: 'Prison incarceration',
trendsCardTitleName: 'Rates of prison incarceration over time',
columnTitleHeader: 'People in prison per 100k',
shortLabel: 'prison per 100k',
type: 'per100k',
rateNumeratorMetric: {
metricId: 'prison_estimated_total',
shortLabel: 'in prison',
chartTitle: '',
type: 'count',
},
rateDenominatorMetric: {
metricId: 'incarceration_population_estimated_total',
shortLabel: 'Total population (Ages 15-64)',
chartTitle: '',
type: 'count',
},
},
pct_share: {
chartTitle: 'Percent share of total prison population',
Expand Down Expand Up @@ -289,13 +310,32 @@ export const INCARCERATION_METRICS: DataTypeConfig[] = [
},
dataTableTitle: 'Breakdown summary for people in jail',
metrics: {

sub_population_count: {
chartTitle: '',
metricId: 'incarceration_population_estimated_total',
shortLabel: 'Total Population (Ages 15-64)',
type: 'count',
},
per100k: {
metricId: 'jail_per_100k',
chartTitle: 'Jail incarceration',
trendsCardTitleName: 'Rates of jail incarceration over time',
columnTitleHeader: 'People in jail per 100k',
shortLabel: 'jail per 100k',
type: 'per100k',
rateNumeratorMetric: {
metricId: 'jail_estimated_total',
shortLabel: 'in jail',
chartTitle: '',
type: 'count',
},
rateDenominatorMetric: {
metricId: 'incarceration_population_estimated_total',
shortLabel: 'Total population (Ages 15-64)',
chartTitle: '',
type: 'count',
},
},
pct_share: {
chartTitle: 'Percent share of total jail population',
Expand Down
3 changes: 3 additions & 0 deletions frontend/src/data/providers/IncarcerationProvider.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,14 @@ export const INCARCERATION_IDS: DataTypeId[] = ['prison', 'jail']

// Metric ids for the jail data type: percent share, estimated total,
// per-100k rate, and percent relative inequity.
export const JAIL_METRIC_IDS: MetricId[] = [
'jail_pct_share',
'jail_estimated_total',
'jail_per_100k',
'jail_pct_relative_inequity',
]

// Metric ids for the prison data type: percent share, estimated total,
// per-100k rate, and percent relative inequity.
export const PRISON_METRIC_IDS: MetricId[] = [
'prison_pct_share',
'prison_estimated_total',
'prison_per_100k',
'prison_pct_relative_inequity',
]
Expand All @@ -40,6 +42,7 @@ const INCARCERATION_METRIC_IDS: MetricId[] = [
...PRISON_METRIC_IDS,
'total_confined_children',
'incarceration_population_pct',
'incarceration_population_estimated_total',
]

class IncarcerationProvider extends VariableProvider {
Expand Down
4 changes: 2 additions & 2 deletions frontend/src/data/utils/DatasetTimeUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ export function makeA11yTableData(
if (hasUnknowns && unknownMetric) {
a11yRow[`${unknownMetric.shortLabel} with unknown ${demographicType}`] =
unknownsData.find((row) => row[TIME_PERIOD] === timePeriod)?.[
unknownMetric.metricId
unknownMetric.metricId
]
}

Expand Down Expand Up @@ -289,7 +289,7 @@ export function getMostRecentYearAsString(
if (!df.getColumnNames().includes(TIME_PERIOD)) return

const filteredRows = df
.where((row) => row?.[metricId] !== undefined)
.where((row) => row?.[metricId] != null)
.select((row) => ({
time_period: row.time_period,
metricId: row?.[metricId],
Expand Down
128 changes: 107 additions & 21 deletions python/datasources/maternal_mortality.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@
from datasources.data_source import DataSource
import ingestion.standardized_columns as std_col
from ingestion.merge_utils import merge_state_ids, merge_pop_numbers
from ingestion.constants import NATIONAL_LEVEL, STATE_LEVEL, US_NAME
from ingestion.constants import NATIONAL_LEVEL, STATE_LEVEL, US_NAME, CURRENT, HISTORICAL
from ingestion import dataset_utils
import pandas as pd
from typing import List

NATIONAL = "National"

RACE_GROUPS_TO_STANDARD = {
'Non-Hispanic American Indian and Alaska Native': std_col.Race.AIAN_NH.value,
Expand All @@ -17,13 +21,17 @@
# ZIP FILE CONTAINING STATE-LEVEL CSV FOR /data
# https://ghdx.healthdata.org/record/ihme-data/united-states-maternal-mortality-by-state-race-ethnicity-1999-2019

# NATIONAL AND REGIONAL COUNTS WERE MANUALLY TRANSCRIBED INTO /data FROM THE
# IMAGE LABELED "Table" IN THE ORIGINAL STUDY

COLS_TO_STANDARD = {
'val': std_col.MM_PER_100K,
'race_group': std_col.RACE_CATEGORY_ID_COL,
'location_name': std_col.STATE_NAME_COL,
'year_id': std_col.TIME_PERIOD_COL,
}

RATE_COLS_TO_STANDARD = {'val': std_col.MM_PER_100K, **COLS_TO_STANDARD}


class MaternalMortalityData(DataSource):
@staticmethod
Expand All @@ -40,7 +48,7 @@ def upload_to_gcs(self, _, **attrs):
def write_to_bq(self, dataset, gcs_bucket, **attrs):

# load source data once
source_df = preprocess_source_data()
source_df = preprocess_source_rates()

for geo_level in [STATE_LEVEL, NATIONAL_LEVEL]:

Expand All @@ -63,36 +71,114 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs):
std_col.RACE_OR_HISPANIC_COL,
]

keep_number_cols = [
std_col.MM_PER_100K,
std_col.POPULATION_PCT_COL,
]
if geo_level == NATIONAL_LEVEL:
df = merge_counts(df)
df = dataset_utils.generate_pct_share_col_without_unknowns(
df,
{std_col.MATERNAL_DEATHS_RAW: std_col.MM_PCT_SHARE},
std_col.RACE_OR_HISPANIC_COL,
std_col.ALL_VALUE,
)
df = dataset_utils.generate_pct_rel_inequity_col(
df, std_col.MM_PCT_SHARE, std_col.POPULATION_PCT_COL, std_col.MM_PCT_REL_INEQUITY
)

for time_type in [HISTORICAL, CURRENT]:
table_name = f'by_race_{geo_level}_{time_type}'

float_cols = get_float_cols(time_type, geo_level)

df_for_bq = df.copy()[keep_string_cols + float_cols]

df = df[keep_string_cols + keep_number_cols]
# get list of all columns expected to contain numbers
col_types = gcs_to_bq_util.get_bq_column_types(df, keep_number_cols)
table_name = f'by_race_{geo_level}_historical'
gcs_to_bq_util.add_df_to_bq(df, dataset, table_name, column_types=col_types)
if time_type == CURRENT:
df_for_bq = dataset_utils.preserve_only_current_time_period_rows(df_for_bq, std_col.TIME_PERIOD_COL)

col_types = gcs_to_bq_util.get_bq_column_types(df_for_bq, float_cols)

def preprocess_source_data() -> pd.DataFrame:
gcs_to_bq_util.add_df_to_bq(df_for_bq, dataset, table_name, column_types=col_types)


def preprocess_source_rates() -> pd.DataFrame:
    """Load and preprocess the source maternal mortality rate data.

    Reads the IHME estimates CSV from the `maternal_mortality` data dir,
    renames the source columns per RATE_COLS_TO_STANDARD, replaces the
    NATIONAL location label with US_NAME, maps the source race group
    labels via RACE_GROUPS_TO_STANDARD, adds the race columns derived
    from the category id, and rounds the per-100k rate to whole numbers.

    Returns:
        pandas.DataFrame: preprocessed source data including state and national rows
    """
    source_rates_df = gcs_to_bq_util.load_csv_as_df_from_data_dir(
        'maternal_mortality',
        'IHME_USA_MMR_STATE_RACE_ETHN_1999_2019_ESTIMATES_Y2023M07D03.CSV',
        # keep years as strings so they are not parsed as numbers
        dtype={'year_id': str},
        usecols=RATE_COLS_TO_STANDARD.keys(),
    )

    source_rates_df = source_rates_df.rename(columns=RATE_COLS_TO_STANDARD)
    source_rates_df = source_rates_df.replace({NATIONAL: US_NAME})
    source_rates_df = source_rates_df.replace(RACE_GROUPS_TO_STANDARD)
    # called for its effect on source_rates_df; the return value is not used
    std_col.add_race_columns_from_category_id(source_rates_df)

    # round rate to whole numbers
    source_rates_df[std_col.MM_PER_100K] = source_rates_df[std_col.MM_PER_100K].round(0)

    return source_rates_df


def merge_counts(df: pd.DataFrame) -> pd.DataFrame:
    """Left-join the live birth and maternal death counts onto `df`.

    The counts are read from `Table.csv`, which was manually transcribed
    from a png image titled 'Table' within the original study.

    TODO: Regional counts are also available (the South, etc.) and could be
    considered as a stand-in for the missing state level counts.

    Args:
        df (pd.DataFrame): rates df to receive the count columns

    Returns:
        pd.DataFrame: `df` with the count columns merged on; rows with no
        matching time period / place / race keep their original values
    """
    count_cols = [std_col.MATERNAL_DEATHS_RAW, std_col.LIVE_BIRTHS_RAW]

    counts_df = gcs_to_bq_util.load_csv_as_df_from_data_dir(
        'maternal_mortality',
        'Table.csv',
        dtype={'year_id': str},
        usecols=['race_group', 'location_name', 'year_id', *count_cols],
    )

    # standardize column names, the national place name, and race labels
    counts_df = (
        counts_df.rename(columns=COLS_TO_STANDARD)
        .replace({NATIONAL: US_NAME})
        .replace(RACE_GROUPS_TO_STANDARD)
    )
    std_col.add_race_columns_from_category_id(counts_df)

    merge_keys = [
        std_col.TIME_PERIOD_COL,
        std_col.STATE_NAME_COL,
        std_col.RACE_OR_HISPANIC_COL,
        std_col.RACE_CATEGORY_ID_COL,
    ]

    return df.merge(counts_df, on=merge_keys, how="left")


def get_float_cols(time_type: str, geo_level: str) -> List[str]:
    """Get the float columns for the given time type and geo level.

    The per-100k rate is always included. Until we can load regional counts
    from Table as state, the remaining metrics are only national.

    Args:
        time_type (str): time type, compared against HISTORICAL and CURRENT
        geo_level (str): geo level, compared against NATIONAL_LEVEL

    Returns:
        List[str]: list of numerical columns
    """
    cols = [std_col.MM_PER_100K]

    # every additional metric is only available nationally
    if geo_level != NATIONAL_LEVEL:
        return cols

    if time_type == HISTORICAL:
        cols.append(std_col.MM_PCT_REL_INEQUITY)
    if time_type == CURRENT:
        cols.extend(
            [
                std_col.POPULATION_PCT_COL,
                std_col.MM_PCT_SHARE,
                std_col.MATERNAL_DEATHS_RAW,
                std_col.LIVE_BIRTHS_RAW,
            ]
        )

    return cols
4 changes: 4 additions & 0 deletions python/ingestion/standardized_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,10 @@

# MATERNAL MORTALITY
MM_PER_100K = "maternal_mortality_per_100k"
MATERNAL_DEATHS_RAW = "maternal_deaths_estimated_total"
LIVE_BIRTHS_RAW = "live_births_estimated_total"
MM_PCT_SHARE = "maternal_mortality_pct_share"
MM_PCT_REL_INEQUITY = "maternal_mortality_pct_relative_inequity"


RaceTuple = namedtuple("RaceTuple", ["race_category_id", "race_and_ethnicity"])
Expand Down
Loading

0 comments on commit ed7fd14

Please sign in to comment.