adds all ages pct share cols

benhammondmusic · Jun 13, 2024 · 9938b35 · 9938b35
1 parent 7616129
commit 9938b35
Show file tree

Hide file tree

Showing 7 changed files with 382 additions and 359 deletions.
diff --git a/python/datasources/graphql_ahr.py b/python/datasources/graphql_ahr.py
@@ -7,7 +7,11 @@
 
 from ingestion import standardized_columns as std_col
 from ingestion.constants import US_ABBR, NATIONAL_LEVEL, CURRENT, Sex
-from ingestion.dataset_utils import generate_time_df_with_cols_and_types, generate_estimated_total_col
+from ingestion.dataset_utils import (
+    generate_time_df_with_cols_and_types,
+    generate_estimated_total_col,
+    generate_pct_share_col_of_summed_alls,
+)
 from ingestion.graphql_ahr_utils import (
     generate_cols_map,
     fetch_ahr_data_from_graphql,
@@ -16,7 +20,7 @@
     AHR_MEASURES_TO_RATES_MAP_ALL_AGES,
     PCT_RATE_TO_PER_100K_TOPICS,
 )  # type: ignore
-from ingestion.types import DEMOGRAPHIC_TYPE, GEO_TYPE, SEX_RACE_AGE_TYPE
+from ingestion.types import DEMOGRAPHIC_TYPE, GEO_TYPE, SEX_RACE_AGE_TYPE, SEX_RACE_ETH_AGE_TYPE
 
 # pylint: disable=no-name-in-module
 from ingestion.merge_utils import (
@@ -90,6 +94,11 @@
     for rate_col in AHR_MEASURES_TO_RATES_MAP_ALL_AGES.values()
 }
 
+RAW_TO_SHARE_ALL_AGES_MAP = {
+    rate_col: f'{std_col.extract_prefix(rate_col)}_{std_col.PCT_SHARE_SUFFIX}'
+    for rate_col in AHR_MEASURES_TO_RATES_MAP_ALL_AGES.values()
+}
+
 
 class GraphQlAHRData(DataSource):
     def __init__(self) -> None:
@@ -116,6 +125,7 @@ def write_to_bq(self, dataset, gcs_bucket, write_local_instead_of_bq=False, **at
         for table_type in [CURRENT]:
             table_name = f"{demographic}_{geo_level}_{table_type}"
             float_cols = get_float_cols(table_type, demographic, self.intersectional_pop_cols)
+            # print("cols before generate_time_df_with_cols_and_types", df.columns)
             df_for_bq, col_types = generate_time_df_with_cols_and_types(df, float_cols, table_type, demographic)
 
         gcs_to_bq_util.add_df_to_bq(df_for_bq, dataset, table_name, column_types=col_types)
@@ -129,13 +139,13 @@ def generate_breakdown_df(self, breakdown: DEMOGRAPHIC_TYPE, geo_level: GEO_TYPE
 
         return breakdown_df
 
-    def post_process(self, df: pd.DataFrame, breakdown: DEMOGRAPHIC_TYPE, geo_level: GEO_TYPE):
+    def post_process(self, df: pd.DataFrame, demographic: DEMOGRAPHIC_TYPE, geo_level: GEO_TYPE):
         """
         Post-processes a DataFrame containing demographic data.
 
         Args:
         - df (pd.DataFrame): The DataFrame containing the raw demographic data.
-        - breakdown_col (DEMOGRAPHIC_TYPE): The type of demographic breakdown to be standardized.
+        - demographic (DEMOGRAPHIC_TYPE): The type of demographic to be standardized.
         - geo_level (GEO_TYPE): The geographic level of the data.
 
         Returns:
@@ -144,24 +154,25 @@ def post_process(self, df: pd.DataFrame, breakdown: DEMOGRAPHIC_TYPE, geo_level:
         This function performs the following steps:
         - Standardizes demographic breakdowns based on the specified demographic type.
         - Merges state IDs with the DataFrame.
-        - Merges yearly population numbers based on the demographic breakdown and geographic level.
+        - Merges yearly population numbers based on the demographic and geographic level.
         - Merges the intersectional 18+ population column for race and sex breakdowns.
         - Adds estimated total columns based on specified mappings.
-        - TODO: Generates percentage share columns without unknowns based on specified mappings.
-        - TODO: Drops the 'Population' column from the DataFrame.
+        - Generates percentage share (of summed groups) columns.
         - Sorts the DataFrame by state FIPS code and time period in descending order.
         - Converts the 'Time Period' column to datetime and filters data up to the year 2021.
         """
 
         breakdown_df = df.copy()
 
-        if breakdown == std_col.AGE_COL:
+        if demographic == std_col.AGE_COL:
             breakdown_df = breakdown_df.replace(to_replace=AGE_GROUPS_TO_STANDARD)
-        if breakdown == std_col.RACE_OR_HISPANIC_COL:
+        if demographic == std_col.RACE_OR_HISPANIC_COL:
             breakdown_df = breakdown_df.rename(columns={std_col.RACE_OR_HISPANIC_COL: std_col.RACE_CATEGORY_ID_COL})
             breakdown_df = breakdown_df.replace(to_replace=RACE_GROUPS_TO_STANDARD)
 
-        pop_breakdown = std_col.RACE_COL if breakdown == std_col.RACE_OR_HISPANIC_COL else breakdown
+        pop_breakdown = std_col.RACE_COL if demographic == std_col.RACE_OR_HISPANIC_COL else demographic
+        share_demo = std_col.RACE_OR_HISPANIC_COL if demographic == std_col.RACE_OR_HISPANIC_COL else demographic
+
         breakdown_df = merge_state_ids(breakdown_df)
 
         # merge general population by primary demographic
@@ -170,12 +181,24 @@ def post_process(self, df: pd.DataFrame, breakdown: DEMOGRAPHIC_TYPE, geo_level:
         # suicide is all ages
         breakdown_df = generate_estimated_total_col(breakdown_df, std_col.POPULATION_COL, RATE_TO_RAW_ALL_AGES_MAP)
 
+        if demographic in [std_col.RACE_OR_HISPANIC_COL, std_col.RACE_COL]:
+            std_col.add_race_columns_from_category_id(breakdown_df)
+
+        breakdown_df = generate_pct_share_col_of_summed_alls(
+            breakdown_df, RAW_TO_SHARE_ALL_AGES_MAP, cast(SEX_RACE_ETH_AGE_TYPE, share_demo)
+        )
+
+        # print("cols", breakdown_df.columns)
+
         # merge another col with 18+ population if by race or by sex
-        if breakdown != std_col.AGE_COL:
+        if demographic != std_col.AGE_COL:
+
             breakdown_df, pop_18plus_col = merge_intersectional_pop(
-                breakdown_df, geo_level, breakdown, age_specific_group='18+'
+                breakdown_df, geo_level, demographic, age_specific_group='18+'
             )
 
+            # print("cols", breakdown_df.columns)
+
             breakdown_df = generate_estimated_total_col(
                 breakdown_df,
                 pop_18plus_col,
@@ -186,9 +209,6 @@ def post_process(self, df: pd.DataFrame, breakdown: DEMOGRAPHIC_TYPE, geo_level:
             # save the generated intersectional population column for later use writing to bq
             self.intersectional_pop_cols.append(pop_18plus_col)
 
-        if breakdown == std_col.RACE_OR_HISPANIC_COL:
-            std_col.add_race_columns_from_category_id(breakdown_df)
-
         breakdown_df = breakdown_df.sort_values(
             by=[std_col.STATE_FIPS_COL, std_col.TIME_PERIOD_COL], ascending=[True, False]
         )
@@ -353,13 +373,14 @@ def get_float_cols(
             ]
         )
 
-        # all breakdowns get all ages counts
         float_cols.extend(list(RATE_TO_RAW_ALL_AGES_MAP.values()))
+        float_cols.extend(list(RAW_TO_SHARE_ALL_AGES_MAP.values()))
 
         # race/sex get age 18+ topic counts
         if demo_col != std_col.AGE_COL:
             float_cols.extend(intersectional_pop_cols)
             float_cols.extend(list(RATE_TO_RAW_18PLUS_MAP.values()))
+            # TODO: add pct_share cols for 18+ age topics
 
     # TODO: historical tables will get pct_relative_inequity cols
 

diff --git a/python/ingestion/dataset_utils.py b/python/ingestion/dataset_utils.py
@@ -246,7 +246,7 @@ def calc_pct_share(record, raw_count_col):
 
 def generate_pct_share_col_of_summed_alls(
     df: pd.DataFrame, raw_count_to_pct_share: dict[str, str], demo_col: Literal['age', 'sex', 'race_and_ethnicity']
-):
+) -> pd.DataFrame:
     """
     Adds a `pct_share` column for each raw_count_to_pct_share item. Rather than using the "All" row's
     estimated_total values, this recalculates the "All" rows' values as the sum of the groups' rows.

diff --git a/python/ingestion/merge_utils.py b/python/ingestion/merge_utils.py
@@ -425,6 +425,8 @@ def merge_intersectional_pop(
 
     if primary_demo_col == std_col.RACE_OR_HISPANIC_COL:
         merge_cols.append(std_col.RACE_CATEGORY_ID_COL)
+        if std_col.RACE_OR_HISPANIC_COL in df.columns:
+            merge_cols.append(std_col.RACE_OR_HISPANIC_COL)
     else:
         merge_cols.append(primary_demo_col)
 

diff --git a/python/tests/data/graphql_ahr/golden_data/age_national_current.csv b/python/tests/data/graphql_ahr/golden_data/age_national_current.csv
@@ -1,13 +1,13 @@
-state_name,state_fips,age,voter_participation_pct_rate,asthma_per_100k,avoided_care_pct_rate,cardiovascular_diseases_per_100k,chronic_kidney_disease_per_100k,copd_per_100k,depression_per_100k,diabetes_per_100k,excessive_drinking_per_100k,frequent_mental_distress_per_100k,preventable_hospitalizations_per_100k,non_medical_drug_use_per_100k,suicide_per_100k,population,population_pct,suicide_estimated_total
-United States,00,45-64,,10100.0,9.6,9100.0,3200.0,7700.0,19100.0,15100.0,15100.0,13500.0,,,,85333288.0,25.6,
-United States,00,15-24,,,,,,,,,,,,,15.2,43666116.0,13.1,6637.0
-United States,00,18-44,,9900.0,13.1,1900.0,1100.0,2700.0,22200.0,3400.0,23200.0,18400.0,,,,119279569.0,35.8,
-United States,00,24-34,,,,,,,,,,,,,19.5,,,
-United States,00,35-44,,,,,,,,,,,,,18.1,42845441.0,12.9,7755.0
-United States,00,45-54,,,,,,,,,,,,,18.2,42062109.0,12.6,7655.0
-United States,00,55-64,,,,,,,,,,,,,17.0,43271179.0,13.0,7356.0
-United States,00,65+,,8600.0,3.4,20600.0,7100.0,12300.0,14600.0,22600.0,7000.0,8500.0,,,16.9,53592546.0,16.1,9057.0
-United States,00,65-74,,,,,,,,,,,,,15.3,31968856.0,9.6,4891.0
-United States,00,75-84,,,,,,,,,,,,,19.6,15235011.0,4.6,2986.0
-United States,00,85+,,,,,,,,,,,,,22.4,6388679.0,1.9,1431.0
-United States,00,All,,9800.0,8.8,8000.0,3000.0,6200.0,20500.0,10900.0,17300.0,14700.0,2681.0,12.0,14.5,333036755.0,100.0,48290.0
+state_name,state_fips,age,asthma_per_100k,avoided_care_pct_rate,cardiovascular_diseases_per_100k,chronic_kidney_disease_per_100k,copd_per_100k,depression_per_100k,diabetes_per_100k,excessive_drinking_per_100k,frequent_mental_distress_per_100k,non_medical_drug_use_per_100k,voter_participation_pct_rate,preventable_hospitalizations_per_100k,suicide_per_100k,population,population_pct,suicide_estimated_total,suicide_pct_share
+United States,00,45-64,10100.0,9.6,9100.0,3200.0,7700.0,19100.0,15100.0,15100.0,13500.0,,,,,85333288.0,25.6,,
+United States,00,15-24,,,,,,,,,,,,,15.2,43666116.0,13.1,6637.0,0.8
+United States,00,18-44,9900.0,13.1,1900.0,1100.0,2700.0,22200.0,3400.0,23200.0,18400.0,,,,,119279569.0,35.8,,
+United States,00,24-34,,,,,,,,,,,,,19.5,,,,1.0
+United States,00,35-44,,,,,,,,,,,,,18.1,42845441.0,12.9,7755.0,0.9
+United States,00,45-54,,,,,,,,,,,,,18.2,42062109.0,12.6,7655.0,0.9
+United States,00,55-64,,,,,,,,,,,,,17.0,43271179.0,13.0,7356.0,0.9
+United States,00,65+,8600.0,3.4,20600.0,7100.0,12300.0,14600.0,22600.0,7000.0,8500.0,,,,16.9,53592546.0,16.1,9057.0,0.9
+United States,00,65-74,,,,,,,,,,,,,15.3,31968856.0,9.6,4891.0,0.8
+United States,00,75-84,,,,,,,,,,,,,19.6,15235011.0,4.6,2986.0,1.0
+United States,00,85+,,,,,,,,,,,,,22.4,6388679.0,1.9,1431.0,1.2
+United States,00,All,9800.0,8.8,8000.0,3000.0,6200.0,20500.0,10900.0,17300.0,14700.0,12.0,,2681.0,1917.6,333036755.0,100.0,48290.0,100.0