Merge branch 'main' of https://github.com/SatcherInstitute/health-equity-tracker
benhammondmusic · Apr 23, 2024 · ed7fd14 · ed7fd14
2 parents e966414 + 498dbcb
commit ed7fd14
Show file tree

Hide file tree

Showing 11 changed files with 258 additions and 60 deletions.
diff --git a/data/maternal_mortality/Table.csv b/data/maternal_mortality/Table.csv
@@ -0,0 +1,13 @@
+year_id,race_group,location_name,maternal_deaths_estimated_total,live_births_estimated_total
+1999,Non-Hispanic American Indian and Alaska Native,National,7,37800
+2019,Non-Hispanic American Indian and Alaska Native,National,23,33300
+1999,"Non-Hispanic Asian, Native Hawaiian, or Other Pacific Islander",National,17,177900
+2019,"Non-Hispanic Asian, Native Hawaiian, or Other Pacific Islander",National,56,268600
+1999,Non-Hispanic Black,National,186,593200
+2019,Non-Hispanic Black,National,393,581400
+1999,Hispanic and any race,National,74,764100
+2019,Hispanic and any race,National,184,886900
+1999,Non-Hispanic White,National,220,2392200
+2019,Non-Hispanic White,National,551,1979000
+1999,All racial and ethnic groups,National,505,3965200
+2019,All racial and ethnic groups,National,1210,3749100
diff --git a/data/maternal_mortality/manual_count_table_national_regional.csv b/data/maternal_mortality/manual_count_table_national_regional.csv
@@ -0,0 +1,53 @@
+Census region,Maternal death No. (95% UI) estimates,Live birth estimates in thousands,year_id,race_group
+US,7 (5-11),37.8 (32.0-44.5),1999,American Indian and Alaska Native
+Midwest,1 (1-3),7 (5.2-9.4),1999,American Indian and Alaska Native
+Northeast,0 (0-1),1.6 (1.2-2.1),1999,American Indian and Alaska Native
+South,1 (1-3),9.6 (7.8-12.0),1999,American Indian and Alaska Native
+West,4 (2-8),19.6 (14.6-25.8),1999,American Indian and Alaska Native
+US,23 (16-35),33.3 (29.9-37.4),2019,American Indian and Alaska Native
+Midwest,6 (3-12),6.5 (4.8-8.8),2019,American Indian and Alaska Native
+Northeast,0 (0-2),1.1 (0.9-1.3),2019,American Indian and Alaska Native
+South,3 (2-6),9.8 (7.4-12.8),2019,American Indian and Alaska Native
+West,13 (7-24),15.9 (14.1-18.0),2019,American Indian and Alaska Native
+US,17 (12-23),177.9 (161.7-198.3),1999,"Asian, Native Hawaiian, or Other Pacific Islander"
+Midwest,2 (1-3),23.2 (20.3-27),1999,"Asian, Native Hawaiian, or Other Pacific Islander"
+Northeast,3 (1-6),36.4 (32.0-40.9),1999,"Asian, Native Hawaiian, or Other Pacific Islander"
+South,4 (2-6),33.6 (29.2-38.5),1999,"Asian, Native Hawaiian, or Other Pacific Islander"
+West,9 (5-14),84.8 (68.6-103.9),1999,"Asian, Native Hawaiian, or Other Pacific Islander"
+US,56 (42-76),268.6 (257.8-280),2019,"Asian, Native Hawaiian, or Other Pacific Islander"
+Midwest,5 (3-10),36.1 (32.6-40.3),2019,"Asian, Native Hawaiian, or Other Pacific Islander"
+Northeast,6 (4-11),53 (48.8-57.7),2019,"Asian, Native Hawaiian, or Other Pacific Islander"
+South,17 (10-25),64.9 (60.1-69.8),2019,"Asian, Native Hawaiian, or Other Pacific Islander"
+West,28 (17-44),114.6 (107.6-122.2),2019,"Asian, Native Hawaiian, or Other Pacific Islander"
+US,186 (152-225),593.2 (543.2-648.3),1999,Black
+Midwest,30 (20-42),116.4 (103.3-130.5),1999,Black
+Northeast,29 (18-45),100.3 (88.7-112.7),1999,Black
+South,113 (87-143),328.5 (291.8-371.1),1999,Black
+West,14 (9-20),47.9 (39.1-58.8),1999,Black
+US,393 (334-463),581.4 (556.4-609.4),2019,Black
+Midwest,50 (32-73),118 (106.6-131.8),2019,Black
+Northeast,73 (51-100),81.2 (74.3-88.6),2019,Black
+South,245 (196-298),332.1 (312.3-353.1),2019,Black
+West,26 (15-42),50 (43.1-57.4),2019,Black
+US,74 (59-93),764.1 (695.1-844.2),1999,Hispanic
+Midwest,6 (3-10),69.2 (60.7-79.1),1999,Hispanic
+Northeast,12 (7-19),99 (87.9-111.2),1999,Hispanic
+South,25 (18-34),246.4 (210.2-293.3),1999,Hispanic
+West,31 (21-44),349.5 (286.0-424.4),1999,Hispanic
+US,184 (151-230),886.9 (847.1-930.3),2019,Hispanic
+Midwest,13 (8-22),84.8 (75.2-95.4),2019,Hispanic
+Northeast,37 (24-55),121.4 (111.9-132.6),2019,Hispanic
+South,76 (54-99),357.7 (326.9-388.9),2019,Hispanic
+West,59 (39-91),323 (299.8-345.3),2019,Hispanic
+US,220 (181-264),2392.2 (2225.1-2579.6),1999,White
+Midwest,62 (45-84),667.6 (597.2-744.6),1999,White
+Northeast,36 (22-55),449.7 (399.2-502.5),1999,White
+South,86 (64-110),822 (732.2-920.6),1999,White
+West,36 (25-52),452.9 (369.3-552.2),1999,White
+US,551 (465-635),1979 (1918.7-2041.5),2019,White
+Midwest,147 (109-190),539.6 (511.1-572.1),2019,White
+Northeast,80 (56-111),341.3 (319.3-364.1),2019,White
+South,237 (189-293),715.2 (676.9-753.9),2019,White
+West,87 (60-122),382.9 (359.3-409.0),2019,White
+US,505 (424-593),3965.2 (3709.0-4267.2),1999,All racial and ethnic groups
+US,1210 (1060-1370),3749.1 (3646.8-3854.9),2019,All racial and ethnic groups
diff --git a/data/maternal_mortality/manual_national_table.csv b/data/maternal_mortality/manual_national_table.csv
diff --git a/frontend/src/data/config/MetricConfigPDOH.ts b/frontend/src/data/config/MetricConfigPDOH.ts
@@ -26,13 +26,16 @@ export type PDOHMetricId =
   | 'ahr_population_pct'
   | 'cawp_population_pct'
   | 'incarceration_population_pct'
+  | 'incarceration_population_estimated_total'
+  | 'jail_estimated_total'
   | 'jail_pct_relative_inequity'
   | 'jail_pct_share'
   | 'jail_per_100k'
   | 'pct_share_of_state_leg'
   | 'pct_share_of_us_congress'
   | 'pct_share_of_women_state_leg'
   | 'pct_share_of_women_us_congress'
+  | 'prison_estimated_total'
   | 'prison_pct_relative_inequity'
   | 'prison_pct_share'
   | 'prison_per_100k'
@@ -234,13 +237,31 @@ export const INCARCERATION_METRICS: DataTypeConfig[] = [
     },
     dataTableTitle: 'Breakdown summary for people in prison',
     metrics: {
+      sub_population_count: {
+        chartTitle: '',
+        metricId: 'incarceration_population_estimated_total',
+        shortLabel: 'Total Population (Ages 15-64)',
+        type: 'count',
+      },
       per100k: {
         metricId: 'prison_per_100k',
         chartTitle: 'Prison incarceration',
         trendsCardTitleName: 'Rates of prison incarceration over time',
         columnTitleHeader: 'People in prison per 100k',
         shortLabel: 'prison per 100k',
         type: 'per100k',
+        rateNumeratorMetric: {
+          metricId: 'prison_estimated_total',
+          shortLabel: 'in prison',
+          chartTitle: '',
+          type: 'count',
+        },
+        rateDenominatorMetric: {
+          metricId: 'incarceration_population_estimated_total',
+          shortLabel: 'Total population (Ages 15-64)',
+          chartTitle: '',
+          type: 'count',
+        },
       },
       pct_share: {
         chartTitle: 'Percent share of total prison population',
@@ -289,13 +310,32 @@ export const INCARCERATION_METRICS: DataTypeConfig[] = [
     },
     dataTableTitle: 'Breakdown summary for people in jail',
     metrics: {
+
+      sub_population_count: {
+        chartTitle: '',
+        metricId: 'incarceration_population_estimated_total',
+        shortLabel: 'Total Population (Ages 15-64)',
+        type: 'count',
+      },
       per100k: {
         metricId: 'jail_per_100k',
         chartTitle: 'Jail incarceration',
         trendsCardTitleName: 'Rates of jail incarceration over time',
         columnTitleHeader: 'People in jail per 100k',
         shortLabel: 'jail per 100k',
         type: 'per100k',
+        rateNumeratorMetric: {
+          metricId: 'jail_estimated_total',
+          shortLabel: 'in jail',
+          chartTitle: '',
+          type: 'count',
+        },
+        rateDenominatorMetric: {
+          metricId: 'incarceration_population_estimated_total',
+          shortLabel: 'Total population (Ages 15-64)',
+          chartTitle: '',
+          type: 'count',
+        },
       },
       pct_share: {
         chartTitle: 'Percent share of total jail population',

diff --git a/frontend/src/data/providers/IncarcerationProvider.tsx b/frontend/src/data/providers/IncarcerationProvider.tsx
@@ -25,12 +25,14 @@ export const INCARCERATION_IDS: DataTypeId[] = ['prison', 'jail']
 
 export const JAIL_METRIC_IDS: MetricId[] = [
   'jail_pct_share',
+  'jail_estimated_total',
   'jail_per_100k',
   'jail_pct_relative_inequity',
 ]
 
 export const PRISON_METRIC_IDS: MetricId[] = [
   'prison_pct_share',
+  'prison_estimated_total',
   'prison_per_100k',
   'prison_pct_relative_inequity',
 ]
@@ -40,6 +42,7 @@ const INCARCERATION_METRIC_IDS: MetricId[] = [
   ...PRISON_METRIC_IDS,
   'total_confined_children',
   'incarceration_population_pct',
+  'incarceration_population_estimated_total',
 ]
 
 class IncarcerationProvider extends VariableProvider {

diff --git a/frontend/src/data/utils/DatasetTimeUtils.ts b/frontend/src/data/utils/DatasetTimeUtils.ts
@@ -207,7 +207,7 @@ export function makeA11yTableData(
     if (hasUnknowns && unknownMetric) {
       a11yRow[`${unknownMetric.shortLabel} with unknown ${demographicType}`] =
         unknownsData.find((row) => row[TIME_PERIOD] === timePeriod)?.[
-          unknownMetric.metricId
+        unknownMetric.metricId
         ]
     }
 
@@ -289,7 +289,7 @@ export function getMostRecentYearAsString(
   if (!df.getColumnNames().includes(TIME_PERIOD)) return
 
   const filteredRows = df
-    .where((row) => row?.[metricId] !== undefined)
+    .where((row) => row?.[metricId] != null)
     .select((row) => ({
       time_period: row.time_period,
       metricId: row?.[metricId],

diff --git a/python/datasources/maternal_mortality.py b/python/datasources/maternal_mortality.py
@@ -2,8 +2,12 @@
 from datasources.data_source import DataSource
 import ingestion.standardized_columns as std_col
 from ingestion.merge_utils import merge_state_ids, merge_pop_numbers
-from ingestion.constants import NATIONAL_LEVEL, STATE_LEVEL, US_NAME
+from ingestion.constants import NATIONAL_LEVEL, STATE_LEVEL, US_NAME, CURRENT, HISTORICAL
+from ingestion import dataset_utils
 import pandas as pd
+from typing import List
+
+NATIONAL = "National"
 
 RACE_GROUPS_TO_STANDARD = {
     'Non-Hispanic American Indian and Alaska Native': std_col.Race.AIAN_NH.value,
@@ -17,13 +21,17 @@
 # ZIP FILE CONTAINING STATE-LEVEL CSV FOR /data
 # https://ghdx.healthdata.org/record/ihme-data/united-states-maternal-mortality-by-state-race-ethnicity-1999-2019
 
+# DATA FOR NATIONAL AND REGIONAL COUNTS ARE FROM THE IMAGE IN THE
+# ORIGINAL STUDY LABELED "Table" AND MANUALLY INPUTTED TO /data
+
 COLS_TO_STANDARD = {
-    'val': std_col.MM_PER_100K,
     'race_group': std_col.RACE_CATEGORY_ID_COL,
     'location_name': std_col.STATE_NAME_COL,
     'year_id': std_col.TIME_PERIOD_COL,
 }
 
+RATE_COLS_TO_STANDARD = {'val': std_col.MM_PER_100K, **COLS_TO_STANDARD}
+
 
 class MaternalMortalityData(DataSource):
     @staticmethod
@@ -40,7 +48,7 @@ def upload_to_gcs(self, _, **attrs):
     def write_to_bq(self, dataset, gcs_bucket, **attrs):
 
         # load source data once
-        source_df = preprocess_source_data()
+        source_df = preprocess_source_rates()
 
         for geo_level in [STATE_LEVEL, NATIONAL_LEVEL]:
 
@@ -63,36 +71,114 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs):
                 std_col.RACE_OR_HISPANIC_COL,
             ]
 
-            keep_number_cols = [
-                std_col.MM_PER_100K,
-                std_col.POPULATION_PCT_COL,
-            ]
+            if geo_level == NATIONAL_LEVEL:
+                df = merge_counts(df)
+                df = dataset_utils.generate_pct_share_col_without_unknowns(
+                    df,
+                    {std_col.MATERNAL_DEATHS_RAW: std_col.MM_PCT_SHARE},
+                    std_col.RACE_OR_HISPANIC_COL,
+                    std_col.ALL_VALUE,
+                )
+                df = dataset_utils.generate_pct_rel_inequity_col(
+                    df, std_col.MM_PCT_SHARE, std_col.POPULATION_PCT_COL, std_col.MM_PCT_REL_INEQUITY
+                )
+
+            for time_type in [HISTORICAL, CURRENT]:
+                table_name = f'by_race_{geo_level}_{time_type}'
+
+                float_cols = get_float_cols(time_type, geo_level)
+
+                df_for_bq = df.copy()[keep_string_cols + float_cols]
 
-            df = df[keep_string_cols + keep_number_cols]
-            # get list of all columns expected to contain numbers
-            col_types = gcs_to_bq_util.get_bq_column_types(df, keep_number_cols)
-            table_name = f'by_race_{geo_level}_historical'
-            gcs_to_bq_util.add_df_to_bq(df, dataset, table_name, column_types=col_types)
+                if time_type == CURRENT:
+                    df_for_bq = dataset_utils.preserve_only_current_time_period_rows(df_for_bq, std_col.TIME_PERIOD_COL)
 
+                col_types = gcs_to_bq_util.get_bq_column_types(df_for_bq, float_cols)
 
-def preprocess_source_data() -> pd.DataFrame:
+                gcs_to_bq_util.add_df_to_bq(df_for_bq, dataset, table_name, column_types=col_types)
+
+
+def preprocess_source_rates() -> pd.DataFrame:
     """Load and preprocess source data.
     Returns:
         pandas.DataFrame: preprocessed source data including state and national rows
     """
-    source_df = gcs_to_bq_util.load_csv_as_df_from_data_dir(
+    source_rates_df = gcs_to_bq_util.load_csv_as_df_from_data_dir(
         'maternal_mortality',
         'IHME_USA_MMR_STATE_RACE_ETHN_1999_2019_ESTIMATES_Y2023M07D03.CSV',
         dtype={'year_id': str},
-        usecols=COLS_TO_STANDARD.keys(),
+        usecols=RATE_COLS_TO_STANDARD.keys(),
     )
 
-    source_df = source_df.rename(columns=COLS_TO_STANDARD)
-    source_df = source_df.replace({'National': US_NAME})
-    source_df = source_df.replace(RACE_GROUPS_TO_STANDARD)
-    std_col.add_race_columns_from_category_id(source_df)
+    source_rates_df = source_rates_df.rename(columns=RATE_COLS_TO_STANDARD)
+    source_rates_df = source_rates_df.replace({NATIONAL: US_NAME})
+    source_rates_df = source_rates_df.replace(RACE_GROUPS_TO_STANDARD)
+    std_col.add_race_columns_from_category_id(source_rates_df)
 
     # round rate to whole numbers
-    source_df[std_col.MM_PER_100K] = source_df[std_col.MM_PER_100K].round(0)
+    source_rates_df[std_col.MM_PER_100K] = source_rates_df[std_col.MM_PER_100K].round(0)
+
+    return source_rates_df
+
+
+def merge_counts(df: pd.DataFrame) -> pd.DataFrame:
+    """Merges columns for live births and maternal deaths onto the df.
+    These are manually input from a png image titled 'Table' within the original study
+
+    TODO: There are also Census-region counts available (Midwest, Northeast, South, West)
+    which we could consider using in place of missing state level counts."""
+
+    source_counts_df = gcs_to_bq_util.load_csv_as_df_from_data_dir(
+        'maternal_mortality',
+        'Table.csv',
+        dtype={'year_id': str},
+        usecols=[
+            'race_group',
+            'location_name',
+            'year_id',
+            std_col.MATERNAL_DEATHS_RAW,
+            std_col.LIVE_BIRTHS_RAW,
+        ],
+    )
+
+    source_counts_df = source_counts_df.rename(columns=COLS_TO_STANDARD)
+    source_counts_df = source_counts_df.replace({NATIONAL: US_NAME})
+    source_counts_df = source_counts_df.replace(RACE_GROUPS_TO_STANDARD)
+    std_col.add_race_columns_from_category_id(source_counts_df)
+
+    df = df.merge(
+        source_counts_df,
+        on=[
+            std_col.TIME_PERIOD_COL,
+            std_col.STATE_NAME_COL,
+            std_col.RACE_OR_HISPANIC_COL,
+            std_col.RACE_CATEGORY_ID_COL,
+        ],
+        how="left",
+    )
+
+    return df
+
+
+def get_float_cols(time_type: str, geo_level: str) -> List[str]:
+    """Get the float columns for the given time type and geo level
+    Until regional counts from the study's "Table" can fill in for states, most metrics are national only
+    Args:
+        time_type (str): time type
+        geo_level (str): geo level
+    Returns:
+        List[str]: list of numerical columns
+    """
+
+    cols = [std_col.MM_PER_100K]
+
+    if time_type == HISTORICAL:
+        if geo_level == NATIONAL_LEVEL:
+            cols.extend([std_col.MM_PCT_REL_INEQUITY])
+    if time_type == CURRENT:
+        if geo_level == NATIONAL_LEVEL:
+            cols.extend(
+                [std_col.POPULATION_PCT_COL, std_col.MM_PCT_SHARE, std_col.MATERNAL_DEATHS_RAW, std_col.LIVE_BIRTHS_RAW]
+            )
 
-    return source_df
+    return cols
diff --git a/python/ingestion/standardized_columns.py b/python/ingestion/standardized_columns.py
@@ -212,6 +212,10 @@
 
 # MATERNAL MORTALITY
 MM_PER_100K = "maternal_mortality_per_100k"
+MATERNAL_DEATHS_RAW = "maternal_deaths_estimated_total"
+LIVE_BIRTHS_RAW = "live_births_estimated_total"
+MM_PCT_SHARE = "maternal_mortality_pct_share"
+MM_PCT_REL_INEQUITY = "maternal_mortality_pct_relative_inequity"
 
 
 RaceTuple = namedtuple("RaceTuple", ["race_category_id", "race_and_ethnicity"])