Skip to content

Commit

Permalink
adds all ages pct share cols
Browse files Browse the repository at this point in the history
  • Loading branch information
benhammondmusic committed Jun 13, 2024
1 parent 7616129 commit 9938b35
Show file tree
Hide file tree
Showing 7 changed files with 382 additions and 359 deletions.
53 changes: 37 additions & 16 deletions python/datasources/graphql_ahr.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@

from ingestion import standardized_columns as std_col
from ingestion.constants import US_ABBR, NATIONAL_LEVEL, CURRENT, Sex
from ingestion.dataset_utils import generate_time_df_with_cols_and_types, generate_estimated_total_col
from ingestion.dataset_utils import (
generate_time_df_with_cols_and_types,
generate_estimated_total_col,
generate_pct_share_col_of_summed_alls,
)
from ingestion.graphql_ahr_utils import (
generate_cols_map,
fetch_ahr_data_from_graphql,
Expand All @@ -16,7 +20,7 @@
AHR_MEASURES_TO_RATES_MAP_ALL_AGES,
PCT_RATE_TO_PER_100K_TOPICS,
) # type: ignore
from ingestion.types import DEMOGRAPHIC_TYPE, GEO_TYPE, SEX_RACE_AGE_TYPE
from ingestion.types import DEMOGRAPHIC_TYPE, GEO_TYPE, SEX_RACE_AGE_TYPE, SEX_RACE_ETH_AGE_TYPE

# pylint: disable=no-name-in-module
from ingestion.merge_utils import (
Expand Down Expand Up @@ -90,6 +94,11 @@
for rate_col in AHR_MEASURES_TO_RATES_MAP_ALL_AGES.values()
}

# Maps each all-ages raw count column (e.g. `suicide_estimated_total`) to the name of its
# pct_share column (e.g. `suicide_pct_share`).
# The keys must be the estimated-total count columns, NOT the per-100k rate columns:
# `generate_pct_share_col_of_summed_alls` recalculates the "All" rows as the sum of the
# groups' rows for each key column, so keying by a rate column overwrites the "All" rate
# with a sum of rates and produces shares of that meaningless sum.
# The share column name is still derived from the rate column's prefix, so the emitted
# column names are unchanged.
RAW_TO_SHARE_ALL_AGES_MAP = {
    raw_col: f'{std_col.extract_prefix(rate_col)}_{std_col.PCT_SHARE_SUFFIX}'
    for rate_col, raw_col in RATE_TO_RAW_ALL_AGES_MAP.items()
}


class GraphQlAHRData(DataSource):
def __init__(self) -> None:
Expand All @@ -116,6 +125,7 @@ def write_to_bq(self, dataset, gcs_bucket, write_local_instead_of_bq=False, **at
for table_type in [CURRENT]:
table_name = f"{demographic}_{geo_level}_{table_type}"
float_cols = get_float_cols(table_type, demographic, self.intersectional_pop_cols)
# print("cols before generate_time_df_with_cols_and_types", df.columns)
df_for_bq, col_types = generate_time_df_with_cols_and_types(df, float_cols, table_type, demographic)

gcs_to_bq_util.add_df_to_bq(df_for_bq, dataset, table_name, column_types=col_types)
Expand All @@ -129,13 +139,13 @@ def generate_breakdown_df(self, breakdown: DEMOGRAPHIC_TYPE, geo_level: GEO_TYPE

return breakdown_df

def post_process(self, df: pd.DataFrame, breakdown: DEMOGRAPHIC_TYPE, geo_level: GEO_TYPE):
def post_process(self, df: pd.DataFrame, demographic: DEMOGRAPHIC_TYPE, geo_level: GEO_TYPE):
"""
Post-processes a DataFrame containing demographic data.
Args:
- df (pd.DataFrame): The DataFrame containing the raw demographic data.
- breakdown_col (DEMOGRAPHIC_TYPE): The type of demographic breakdown to be standardized.
- demographic (DEMOGRAPHIC_TYPE): The type of demographic to be standardized.
- geo_level (GEO_TYPE): The geographic level of the data.
Returns:
Expand All @@ -144,24 +154,25 @@ def post_process(self, df: pd.DataFrame, breakdown: DEMOGRAPHIC_TYPE, geo_level:
This function performs the following steps:
- Standardizes demographic breakdowns based on the specified demographic type.
- Merges state IDs with the DataFrame.
- Merges yearly population numbers based on the demographic breakdown and geographic level.
- Merges yearly population numbers based on the demographic and geographic level.
- Merges intersection population col for adult populations for race and sex breakdowns.
- Adds estimated total columns based on specified mappings.
- TODO: Generates percentage share columns without unknowns based on specified mappings.
- TODO: Drops the 'Population' column from the DataFrame.
- Generates percentage share (of summed groups) columns.
- Sorts the DataFrame by state FIPS code and time period in descending order.
- Converts the 'Time Period' column to datetime and filters data up to the year 2021.
"""

breakdown_df = df.copy()

if breakdown == std_col.AGE_COL:
if demographic == std_col.AGE_COL:
breakdown_df = breakdown_df.replace(to_replace=AGE_GROUPS_TO_STANDARD)
if breakdown == std_col.RACE_OR_HISPANIC_COL:
if demographic == std_col.RACE_OR_HISPANIC_COL:
breakdown_df = breakdown_df.rename(columns={std_col.RACE_OR_HISPANIC_COL: std_col.RACE_CATEGORY_ID_COL})
breakdown_df = breakdown_df.replace(to_replace=RACE_GROUPS_TO_STANDARD)

pop_breakdown = std_col.RACE_COL if breakdown == std_col.RACE_OR_HISPANIC_COL else breakdown
pop_breakdown = std_col.RACE_COL if demographic == std_col.RACE_OR_HISPANIC_COL else demographic
share_demo = std_col.RACE_OR_HISPANIC_COL if demographic == std_col.RACE_OR_HISPANIC_COL else demographic

breakdown_df = merge_state_ids(breakdown_df)

# merge general population by primary demographic
Expand All @@ -170,12 +181,24 @@ def post_process(self, df: pd.DataFrame, breakdown: DEMOGRAPHIC_TYPE, geo_level:
# suicide is all ages
breakdown_df = generate_estimated_total_col(breakdown_df, std_col.POPULATION_COL, RATE_TO_RAW_ALL_AGES_MAP)

if demographic in [std_col.RACE_OR_HISPANIC_COL, std_col.RACE_COL]:
std_col.add_race_columns_from_category_id(breakdown_df)

breakdown_df = generate_pct_share_col_of_summed_alls(
breakdown_df, RAW_TO_SHARE_ALL_AGES_MAP, cast(SEX_RACE_ETH_AGE_TYPE, share_demo)
)

# print("cols", breakdown_df.columns)

# merge another col with 18+ population if by race or by sex
if breakdown != std_col.AGE_COL:
if demographic != std_col.AGE_COL:

breakdown_df, pop_18plus_col = merge_intersectional_pop(
breakdown_df, geo_level, breakdown, age_specific_group='18+'
breakdown_df, geo_level, demographic, age_specific_group='18+'
)

# print("cols", breakdown_df.columns)

breakdown_df = generate_estimated_total_col(
breakdown_df,
pop_18plus_col,
Expand All @@ -186,9 +209,6 @@ def post_process(self, df: pd.DataFrame, breakdown: DEMOGRAPHIC_TYPE, geo_level:
# save the generated intersectional population column for later use writing to bq
self.intersectional_pop_cols.append(pop_18plus_col)

if breakdown == std_col.RACE_OR_HISPANIC_COL:
std_col.add_race_columns_from_category_id(breakdown_df)

breakdown_df = breakdown_df.sort_values(
by=[std_col.STATE_FIPS_COL, std_col.TIME_PERIOD_COL], ascending=[True, False]
)
Expand Down Expand Up @@ -353,13 +373,14 @@ def get_float_cols(
]
)

# all breakdowns get all ages counts
float_cols.extend(list(RATE_TO_RAW_ALL_AGES_MAP.values()))
float_cols.extend(list(RAW_TO_SHARE_ALL_AGES_MAP.values()))

# race/sex get age 18+ topic counts
if demo_col != std_col.AGE_COL:
float_cols.extend(intersectional_pop_cols)
float_cols.extend(list(RATE_TO_RAW_18PLUS_MAP.values()))
# TODO: add pct_share cols for 18+ age topics

# TODO: historical tables will get pct_relative_inequity cols

Expand Down
2 changes: 1 addition & 1 deletion python/ingestion/dataset_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ def calc_pct_share(record, raw_count_col):

def generate_pct_share_col_of_summed_alls(
df: pd.DataFrame, raw_count_to_pct_share: dict[str, str], demo_col: Literal['age', 'sex', 'race_and_ethnicity']
):
) -> pd.DataFrame:
"""
Adds a `pct_share` column for each raw_count_to_pct_share item. Rather than using the "All" row's
estimate_total values, this recalculates the "All" rows' values as the sum of the groups' rows.
Expand Down
2 changes: 2 additions & 0 deletions python/ingestion/merge_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,8 @@ def merge_intersectional_pop(

if primary_demo_col == std_col.RACE_OR_HISPANIC_COL:
merge_cols.append(std_col.RACE_CATEGORY_ID_COL)
if std_col.RACE_OR_HISPANIC_COL in df.columns:
merge_cols.append(std_col.RACE_OR_HISPANIC_COL)
else:
merge_cols.append(primary_demo_col)

Expand Down
26 changes: 13 additions & 13 deletions python/tests/data/graphql_ahr/golden_data/age_national_current.csv
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
state_name,state_fips,age,voter_participation_pct_rate,asthma_per_100k,avoided_care_pct_rate,cardiovascular_diseases_per_100k,chronic_kidney_disease_per_100k,copd_per_100k,depression_per_100k,diabetes_per_100k,excessive_drinking_per_100k,frequent_mental_distress_per_100k,preventable_hospitalizations_per_100k,non_medical_drug_use_per_100k,suicide_per_100k,population,population_pct,suicide_estimated_total
United States,00,45-64,,10100.0,9.6,9100.0,3200.0,7700.0,19100.0,15100.0,15100.0,13500.0,,,,85333288.0,25.6,
United States,00,15-24,,,,,,,,,,,,,15.2,43666116.0,13.1,6637.0
United States,00,18-44,,9900.0,13.1,1900.0,1100.0,2700.0,22200.0,3400.0,23200.0,18400.0,,,,119279569.0,35.8,
United States,00,24-34,,,,,,,,,,,,,19.5,,,
United States,00,35-44,,,,,,,,,,,,,18.1,42845441.0,12.9,7755.0
United States,00,45-54,,,,,,,,,,,,,18.2,42062109.0,12.6,7655.0
United States,00,55-64,,,,,,,,,,,,,17.0,43271179.0,13.0,7356.0
United States,00,65+,,8600.0,3.4,20600.0,7100.0,12300.0,14600.0,22600.0,7000.0,8500.0,,,16.9,53592546.0,16.1,9057.0
United States,00,65-74,,,,,,,,,,,,,15.3,31968856.0,9.6,4891.0
United States,00,75-84,,,,,,,,,,,,,19.6,15235011.0,4.6,2986.0
United States,00,85+,,,,,,,,,,,,,22.4,6388679.0,1.9,1431.0
United States,00,All,,9800.0,8.8,8000.0,3000.0,6200.0,20500.0,10900.0,17300.0,14700.0,2681.0,12.0,14.5,333036755.0,100.0,48290.0
state_name,state_fips,age,asthma_per_100k,avoided_care_pct_rate,cardiovascular_diseases_per_100k,chronic_kidney_disease_per_100k,copd_per_100k,depression_per_100k,diabetes_per_100k,excessive_drinking_per_100k,frequent_mental_distress_per_100k,non_medical_drug_use_per_100k,voter_participation_pct_rate,preventable_hospitalizations_per_100k,suicide_per_100k,population,population_pct,suicide_estimated_total,suicide_pct_share
United States,00,45-64,10100.0,9.6,9100.0,3200.0,7700.0,19100.0,15100.0,15100.0,13500.0,,,,,85333288.0,25.6,,
United States,00,15-24,,,,,,,,,,,,,15.2,43666116.0,13.1,6637.0,0.8
United States,00,18-44,9900.0,13.1,1900.0,1100.0,2700.0,22200.0,3400.0,23200.0,18400.0,,,,,119279569.0,35.8,,
United States,00,24-34,,,,,,,,,,,,,19.5,,,,1.0
United States,00,35-44,,,,,,,,,,,,,18.1,42845441.0,12.9,7755.0,0.9
United States,00,45-54,,,,,,,,,,,,,18.2,42062109.0,12.6,7655.0,0.9
United States,00,55-64,,,,,,,,,,,,,17.0,43271179.0,13.0,7356.0,0.9
United States,00,65+,8600.0,3.4,20600.0,7100.0,12300.0,14600.0,22600.0,7000.0,8500.0,,,,16.9,53592546.0,16.1,9057.0,0.9
United States,00,65-74,,,,,,,,,,,,,15.3,31968856.0,9.6,4891.0,0.8
United States,00,75-84,,,,,,,,,,,,,19.6,15235011.0,4.6,2986.0,1.0
United States,00,85+,,,,,,,,,,,,,22.4,6388679.0,1.9,1431.0,1.2
United States,00,All,9800.0,8.8,8000.0,3000.0,6200.0,20500.0,10900.0,17300.0,14700.0,12.0,,2681.0,1917.6,333036755.0,100.0,48290.0,100.0
Loading

0 comments on commit 9938b35

Please sign in to comment.