Commit

benhammondmusic committed May 3, 2024
2 parents 00bf9fc + bd0d1bd commit 4fc06be
Showing 11 changed files with 96 additions and 90 deletions.
9 changes: 4 additions & 5 deletions python/datasources/cdc_vaccination_county.py
@@ -2,7 +2,7 @@
from ingestion import gcs_to_bq_util
from ingestion.merge_utils import merge_county_names, merge_pop_numbers
from ingestion.constants import COUNTY_LEVEL, RACE
from ingestion.dataset_utils import generate_per_100k_col
from ingestion.dataset_utils import generate_pct_rate_col
from ingestion.standardized_columns import Race
import ingestion.standardized_columns as std_col

@@ -49,7 +49,7 @@ def write_to_bq(self, dataset, gcs_bucket, write_local_instead_of_bq=False, **at
df = generate_breakdown(df)

col_types = gcs_to_bq_util.get_bq_column_types(
df, float_cols=[std_col.VACCINATED_PER_100K, std_col.VACCINATED_RAW]
df, float_cols=[std_col.VACCINATED_PCT_RATE, std_col.VACCINATED_RAW]
)

gcs_to_bq_util.add_df_to_bq(df, dataset, 'alls_county', column_types=col_types)
@@ -67,16 +67,15 @@ def generate_breakdown(df):
df = merge_county_names(df)
df = merge_pop_numbers(df, RACE, COUNTY_LEVEL)

df = generate_per_100k_col(df, CDC_ONE_DOSE, std_col.POPULATION_COL, std_col.VACCINATED_PER_100K)

df = generate_pct_rate_col(df, CDC_ONE_DOSE, std_col.POPULATION_COL, std_col.VACCINATED_PCT_RATE)
df = df.rename(columns={CDC_ONE_DOSE: std_col.VACCINATED_RAW})

df = df[
[
std_col.COUNTY_FIPS_COL,
std_col.COUNTY_NAME_COL,
std_col.RACE_CATEGORY_ID_COL,
std_col.VACCINATED_PER_100K,
std_col.VACCINATED_PCT_RATE,
std_col.SEX_COL,
std_col.AGE_COL,
std_col.VACCINATED_RAW,
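For reference, a minimal sketch of what the updated county breakdown now computes, assuming a toy row: 'one_dose_count' stands in for the source's CDC_ONE_DOSE column and the population figure below is invented for illustration; only the output column name matches the pipeline.

import pandas as pd

# Toy input; count and population are illustrative, not real CDC data.
toy = pd.DataFrame({
    'county_fips': ['99999'],
    'county_name': ['Example County'],
    'one_dose_count': [2071.0],
    'population': [5768.0],
})

# Mirrors generate_pct_rate_col: percent of the population vaccinated, rounded.
toy['vaccinated_pct_rate'] = (toy['one_dose_count'] / toy['population']).mul(100).round()
print(toy['vaccinated_pct_rate'].iloc[0])  # 36.0 (the old per-100k encoding was ~35905)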
7 changes: 3 additions & 4 deletions python/datasources/cdc_vaccination_national.py
@@ -95,7 +95,7 @@ def write_to_bq(self, dataset, gcs_bucket, write_local_instead_of_bq=False, **at
breakdown_df, f'{self.get_table_name()}-{breakdown}_processed'
)
else:
float_cols = [std_col.VACCINATED_PER_100K, std_col.VACCINATED_PCT_SHARE, std_col.VACCINATED_POP_PCT]
float_cols = [std_col.VACCINATED_PCT_RATE, std_col.VACCINATED_PCT_SHARE, std_col.VACCINATED_POP_PCT]
col_types = gcs_to_bq_util.get_bq_column_types(breakdown_df, float_cols)
gcs_to_bq_util.add_df_to_bq(breakdown_df, dataset, f'{breakdown}_processed', column_types=col_types)

@@ -116,8 +116,7 @@ def generate_breakdown(self, breakdown, df):
unknown_df = unknown_df.rename(columns={'administered_dose1_pct_us': std_col.VACCINATED_PCT_SHARE})
df = pd.concat([known_df, unknown_df])

# convert source pct_rate to per_100k
df[std_col.VACCINATED_PER_100K] = df['administered_dose1_pct'].mul(1000)
df[std_col.VACCINATED_PCT_RATE] = df['administered_dose1_pct']

df.loc[df[demo_col].isin(ALLS), std_col.VACCINATED_PCT_SHARE] = 100.0

@@ -138,7 +137,7 @@ def generate_breakdown(self, breakdown, df):
demo_col,
std_col.VACCINATED_PCT_SHARE,
std_col.VACCINATED_POP_PCT,
std_col.VACCINATED_PER_100K,
std_col.VACCINATED_PCT_RATE,
std_col.VACCINATED_RAW,
]
]
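The dropped `.mul(1000)` above is purely a units change: the source field `administered_dose1_pct` is already a percentage, and a percentage times 1,000 is the per-100k figure the old code stored. A quick sanity check using the national "All" value from the golden data:

pct_rate = 70.6             # administered_dose1_pct for the national "All" row
per_100k = pct_rate * 1000  # the value the old code would have written
assert round(per_100k) == 70600  # same information, different scale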
23 changes: 12 additions & 11 deletions python/datasources/kff_vaccination.py
@@ -7,7 +7,7 @@
from datasources.data_source import DataSource
from ingestion import gcs_to_bq_util, github_util

from ingestion.dataset_utils import generate_per_100k_col
from ingestion.dataset_utils import generate_pct_rate_col

from ingestion.merge_utils import merge_state_ids, merge_pop_numbers

@@ -153,7 +153,7 @@ def generate_output_row(state_row_pct_share, state_row_pct_total, state_row_pct_
output_row[std_col.VACCINATED_PCT_SHARE] = str(state_row_pct_share[generate_pct_share_key(race)].values[0])

if race in KFF_RACES_PCT_TOTAL:
output_row[std_col.VACCINATED_PER_100K] = str(state_row_pct_total[generate_total_pct_key(race)].values[0])
output_row[std_col.VACCINATED_PCT_RATE] = str(state_row_pct_total[generate_total_pct_key(race)].values[0])
output_row[std_col.VACCINATED_POP_PCT] = str(
state_row_pct_population[generate_pct_of_population_key(race)].values[0]
)
@@ -215,7 +215,7 @@ def parse_data(self):
std_col.STATE_NAME_COL,
std_col.RACE_CATEGORY_ID_COL,
std_col.VACCINATED_PCT_SHARE,
std_col.VACCINATED_PER_100K,
std_col.VACCINATED_PCT_RATE,
VACCINATED_FIRST_DOSE,
std_col.VACCINATED_POP_PCT,
]
@@ -264,13 +264,13 @@ def post_process(self, df):
df = clean_row(df, std_col.VACCINATED_POP_PCT)
df[std_col.VACCINATED_POP_PCT] = df[std_col.VACCINATED_POP_PCT] * 100

df = clean_row(df, std_col.VACCINATED_PER_100K)
df[std_col.VACCINATED_PER_100K] = df[std_col.VACCINATED_PER_100K] * 1000 * 100
df = clean_row(df, std_col.VACCINATED_PCT_RATE)
df[std_col.VACCINATED_PCT_RATE] = df[std_col.VACCINATED_PCT_RATE] * 100

total_df = df.loc[~df[VACCINATED_FIRST_DOSE].isnull()].reset_index(drop=True)
total_df = merge_pop_numbers(total_df, RACE, STATE_LEVEL)
total_df = generate_per_100k_col(
total_df, VACCINATED_FIRST_DOSE, std_col.POPULATION_COL, std_col.VACCINATED_PER_100K
total_df = generate_pct_rate_col(
total_df, VACCINATED_FIRST_DOSE, std_col.POPULATION_COL, std_col.VACCINATED_PCT_RATE
)

df = df.loc[df[VACCINATED_FIRST_DOSE].isnull()].reset_index(drop=True)
@@ -294,7 +294,7 @@ def post_process(self, df):
std_col.STATE_FIPS_COL,
std_col.RACE_CATEGORY_ID_COL,
std_col.VACCINATED_PCT_SHARE,
std_col.VACCINATED_PER_100K,
std_col.VACCINATED_PCT_RATE,
std_col.VACCINATED_POP_PCT,
std_col.ACS_VACCINATED_POP_PCT,
std_col.VACCINATED_RAW,
@@ -309,7 +309,7 @@ def write_to_bq(self, dataset, gcs_bucket, write_local_instead_of_bq=False, **at

float_cols = [
std_col.VACCINATED_PCT_SHARE,
std_col.VACCINATED_PER_100K,
std_col.VACCINATED_PCT_RATE,
std_col.VACCINATED_POP_PCT,
std_col.ACS_VACCINATED_POP_PCT,
std_col.VACCINATED_RAW,
@@ -321,9 +321,10 @@ def write_to_bq(self, dataset, gcs_bucket, write_local_instead_of_bq=False, **at
gcs_to_bq_util.add_df_to_bq(df, dataset, f'{std_col.RACE_OR_HISPANIC_COL}_state', column_types=col_types)

# WRITE ALLS TABLE FOR SEX/AGE (get just the All rows from the race table and add needed cols)
df = df.copy()
df = df[df[std_col.RACE_CATEGORY_ID_COL] == std_col.Race.ALL.value]
df[std_col.SEX_COL] = std_col.ALL_VALUE
df[std_col.AGE_COL] = std_col.ALL_VALUE
df.loc[:, std_col.SEX_COL] = std_col.ALL_VALUE
df.loc[:, std_col.AGE_COL] = std_col.ALL_VALUE
col_types = gcs_to_bq_util.get_bq_column_types(df, float_cols)
gcs_to_bq_util.add_df_to_bq(df, dataset, 'alls_state', column_types=col_types)

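One non-obvious piece of the change above: assigning new sex/age columns directly onto a boolean-filtered slice of df can trigger pandas' SettingWithCopyWarning, and the added df.copy() plus .loc assignments follow the pattern pandas recommends. A minimal sketch with invented data, mirroring the updated write_to_bq logic:

import pandas as pd

df = pd.DataFrame({
    'race_category_id': ['ALL', 'BLACK_NH'],
    'vaccinated_pct_rate': [47.0, 37.0],
})

alls = df.copy()
alls = alls[alls['race_category_id'] == 'ALL']
alls.loc[:, 'sex'] = 'All'  # .loc assignment on the copied-then-filtered frame
alls.loc[:, 'age'] = 'All'
print(alls)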
52 changes: 29 additions & 23 deletions python/ingestion/dataset_utils.py
@@ -127,7 +127,7 @@ def generate_pct_share_col_without_unknowns(
all_demo_values = set(df[breakdown_col].to_list())
if Race.UNKNOWN.value in all_demo_values or 'Unknown' in all_demo_values:
raise ValueError(
('This dataset contains unknowns, use the' 'generate_pct_share_col_with_unknowns function instead')
('This dataset contains unknowns, use the `generate_pct_share_col_with_unknowns` function instead')
)

return _generate_pct_share_col(df, raw_count_to_pct_share, breakdown_col, all_val)
@@ -237,26 +237,6 @@ def calc_pct_share(record, raw_count_col):
return df.reset_index(drop=True)


def generate_pct_rate_col(df, raw_count_col, pop_col, pct_rate_col):
"""Returns a df with a `_pct_rate` col
df: incoming df that will get the new column
raw_count_col: str col name that contains the raw count
of individuals with the condition. this will be the numerator
pop_col: str col name with the total population number.
this will be the denominator
pct_rate_col: str col name to place the generated
pct_rate data in"""

df[pct_rate_col] = df[raw_count_col] / df[pop_col]
df[pct_rate_col] = df[pct_rate_col].mul(100).round()

# div by zero results in inf, cast these as nan
df.replace([np.inf, -np.inf], np.nan, inplace=True)

return df


def generate_per_100k_col(df, raw_count_col, pop_col, per_100k_col):
"""Returns a dataframe with a `per_100k` column
@@ -276,6 +256,31 @@ def calc_per_100k(record):
return df


def generate_pct_rate_col(df, raw_count_col, pop_col, pct_rate_col):
"""Returns a df with a `_pct_rate` col
df: incoming df that will get the new column
raw_count_col: str col name that contains the raw count
of individuals with the condition. this will be the numerator
pop_col: str col name with the total population number.
this will be the denominator
pct_rate_col: str col name to place the generated
pct_rate data in.
In general, we prefer using PCT_RATE for more frequently occurring,
non-medical conditions like voting, poverty, etc. and use PER_100K
for diagnoses of diseases that occur less frequently like COPD."""

df[pct_rate_col] = df[raw_count_col].astype(float) / df[pop_col]
df[pct_rate_col] = df[pct_rate_col].mul(100).round()

# div by zero results in inf, cast these as nan
df.replace([np.inf, -np.inf], np.nan, inplace=True)

return df
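A small usage sketch of the relocated helper as defined above; the 'voted' and 'population' column names and values are toy data chosen to match the docstring's voting example, and a zero-population row is included to show the inf-to-NaN handling:

import pandas as pd
from ingestion.dataset_utils import generate_pct_rate_col

df = pd.DataFrame({
    'voted': [180.0, 55.0, 10.0],
    'population': [400.0, 100.0, 0.0],
})
df = generate_pct_rate_col(df, 'voted', 'population', 'voted_pct_rate')
print(df['voted_pct_rate'].tolist())  # [45.0, 55.0, nan] -- the 10 / 0 row becomes NaN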


def percent_avoid_rounding_to_zero(numerator, denominator, default_decimals=1, max_decimals=2):
"""Calculates percentage to `default_decimals` number of decimal places. If
the percentage would round to 0, calculates with more decimal places until
@@ -460,7 +465,8 @@ def zero_out_pct_rel_inequity(

# optionally preserve null pct_inequity for race rows that have no population info
if pop_pct_col:
df.loc[df[pop_pct_col].isnull(), pct_inequity_col] = np.nan
for rate_col, pct_inequity_col in rate_to_inequity_col_map.items():
df.loc[df[pop_pct_col].isnull(), pct_inequity_col] = np.nan

return df
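The change above appears to fix a stale-variable bug: the old line referenced `pct_inequity_col` after an earlier loop had finished, so only the last mapped column was nulled, whereas iterating over `rate_to_inequity_col_map.items()` nulls every mapped pct_relative_inequity column for rows that lack population share. A hedged sketch of the intended behavior, with invented column names:

import numpy as np
import pandas as pd

rate_to_inequity_col_map = {
    'condition_a_per_100k': 'condition_a_pct_relative_inequity',
    'condition_b_per_100k': 'condition_b_pct_relative_inequity',
}

df = pd.DataFrame({
    'population_pct': [10.0, np.nan],
    'condition_a_pct_relative_inequity': [1.2, 0.0],
    'condition_b_pct_relative_inequity': [-0.5, 0.0],
})

# Every mapped inequity column is nulled where population share is missing.
for rate_col, pct_inequity_col in rate_to_inequity_col_map.items():
    df.loc[df['population_pct'].isnull(), pct_inequity_col] = np.nan

print(df.loc[1].tolist())  # [nan, nan, nan]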

@@ -536,7 +542,7 @@ def combine_race_ethnicity(

def generate_time_df_with_cols_and_types(
df: pd.DataFrame,
numerical_cols_to_keep: list[str],
numerical_cols_to_keep: List[str],
table_type: Literal['current', 'historical'],
dem_col: Literal['age', 'race', 'race_and_ethnicity', 'sex'],
):
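The `list[str]` → `List[str]` tweak in the signature above is presumably a compatibility fix: built-in generics such as `list[str]` only evaluate at runtime on Python 3.9+, while `typing.List[str]` works on earlier interpreters (this assumes the module imports `List` alongside the `Literal` it already uses):

from typing import List, Literal

def example(cols: List[str], table_type: Literal['current', 'historical']) -> None:
    ...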
2 changes: 1 addition & 1 deletion python/ingestion/standardized_columns.py
@@ -102,7 +102,7 @@

# Vaccination columns
VACCINATED_RAW = "vaccinated_estimated_total"
VACCINATED_PER_100K = "vaccinated_per_100k"
VACCINATED_PCT_RATE = "vaccinated_pct_rate"
VACCINATED_PCT_SHARE = "vaccinated_pct_share"
VACCINATED_POP_PCT = 'vaccinated_pop_pct'
ACS_VACCINATED_POP_PCT = 'acs_vaccinated_pop_pct'
@@ -1,3 +1,3 @@
county_fips,county_name,race_category_id,vaccinated_per_100k,sex,age,race_and_ethnicity,vaccinated_estimated_total
20143,Ottawa County,ALL,35905.0,All,All,All,2071.0
42023,Cameron County,ALL,62081.0,All,All,All,2816.0
county_fips,county_name,race_category_id,vaccinated_pct_rate,sex,age,race_and_ethnicity,vaccinated_estimated_total
20143,Ottawa County,ALL,36.0,All,All,All,2071.0
42023,Cameron County,ALL,62.0,All,All,All,2816.0
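The new golden values are consistent with the old per-100k figures scaled down by 1,000 and rounded (the pipeline itself recomputes them from counts and population rather than rescaling):

print(round(35905.0 / 1000))  # 36 -> 36.0 for Ottawa County
print(round(62081.0 / 1000))  # 62 -> 62.0 for Cameron County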
@@ -1,11 +1,11 @@
state_name,state_fips,age,vaccinated_pct_share,vaccinated_pop_pct,vaccinated_per_100k,vaccinated_estimated_total
United States,00,0-1,0.1,2.3,3400.0,259174.0
United States,00,2-4,0.3,3.6,5700.0,682581.0
United States,00,5-11,1.9,8.7,15500.0,4461075.0
United States,00,18-24,8.9,9.2,68500.0,20955927.0
United States,00,25-49,21.1,32.9,72300.0,49452236.0
United States,00,65+,14.0,16.5,99900.0,32893412.0
United States,00,50-64,23.8,19.2,87700.0,55833589.0
United States,00,All,100.0,100,70600.0,234269053.0
United States,00,12-17,6.6,7.6,61300.0,15514175.0
state_name,state_fips,age,vaccinated_pct_share,vaccinated_pop_pct,vaccinated_pct_rate,vaccinated_estimated_total
United States,00,0-1,0.1,2.3,3.4,259174.0
United States,00,2-4,0.3,3.6,5.7,682581.0
United States,00,5-11,1.9,8.7,15.5,4461075.0
United States,00,18-24,8.9,9.2,68.5,20955927.0
United States,00,25-49,21.1,32.9,72.3,49452236.0
United States,00,65+,14.0,16.5,99.9,32893412.0
United States,00,50-64,23.8,19.2,87.7,55833589.0
United States,00,All,100.0,100,70.6,234269053.0
United States,00,12-17,6.6,7.6,61.3,15514175.0
United States,00,Unknown,0.0,,,24085.0
@@ -1,10 +1,10 @@
state_name,state_fips,race_category_id,vaccinated_pct_share,vaccinated_pop_pct,vaccinated_per_100k,vaccinated_estimated_total,race_and_ethnicity
United States,00,NHPI_NH,0.3,0.2,52200.0,484651.0,Native Hawaiian and Pacific Islander (NH)
United States,00,MULTI_OR_OTHER_STANDARD_NH,5.5,3.9,99900.0,9088540.0,Two or more races & Unrepresented race (NH)
United States,00,WHITE_NH,57.8,58.3,48000.0,94732860.0,White (NH)
United States,00,AIAN_NH,0.9,0.5,61800.0,1506108.0,American Indian and Alaska Native (NH)
United States,00,HISP,19.3,19.4,49600.0,31640980.0,Hispanic or Latino
United States,00,ALL,100.0,100.0,70600.0,234269053.0,All
United States,00,ASIAN_NH,6.0,5.6,51600.0,9798756.0,Asian (NH)
United States,00,BLACK_NH,10.2,12.0,40600.0,16717933.0,Black or African American (NH)
state_name,state_fips,race_category_id,vaccinated_pct_share,vaccinated_pop_pct,vaccinated_pct_rate,vaccinated_estimated_total,race_and_ethnicity
United States,00,NHPI_NH,0.3,0.2,52.2,484651.0,Native Hawaiian and Pacific Islander (NH)
United States,00,MULTI_OR_OTHER_STANDARD_NH,5.5,3.9,99.9,9088540.0,Two or more races & Unrepresented race (NH)
United States,00,WHITE_NH,57.8,58.3,48.0,94732860.0,White (NH)
United States,00,AIAN_NH,0.9,0.5,61.8,1506108.0,American Indian and Alaska Native (NH)
United States,00,HISP,19.3,19.4,49.6,31640980.0,Hispanic or Latino
United States,00,ALL,100.0,100.0,70.6,234269053.0,All
United States,00,ASIAN_NH,6.0,5.6,51.6,9798756.0,Asian (NH)
United States,00,BLACK_NH,10.2,12.0,40.6,16717933.0,Black or African American (NH)
United States,00,UNKNOWN,30.0,,,70299220.0,Unknown race
@@ -1,5 +1,5 @@
state_name,state_fips,sex,vaccinated_pct_share,vaccinated_pop_pct,vaccinated_per_100k,vaccinated_estimated_total
United States,00,Male,47.6,49.6,67600.0,110475379.0
United States,00,All,100.0,100.0,70600.0,234269053.0
United States,00,Female,52.4,50.4,72200.0,121625508.0
state_name,state_fips,sex,vaccinated_pct_share,vaccinated_pop_pct,vaccinated_pct_rate,vaccinated_estimated_total
United States,00,Male,47.6,49.6,67.6,110475379.0
United States,00,All,100.0,100.0,70.6,234269053.0
United States,00,Female,52.4,50.4,72.2,121625508.0
United States,00,Unknown,0.9,,,2168166.0
@@ -1,34 +1,34 @@
state_name,state_fips,race_category_id,vaccinated_pct_share,vaccinated_per_100k,vaccinated_pop_pct,acs_vaccinated_pop_pct,race_and_ethnicity,vaccinated_estimated_total
state_name,state_fips,race_category_id,vaccinated_pct_share,vaccinated_pct_rate,vaccinated_pop_pct,acs_vaccinated_pop_pct,race_and_ethnicity,vaccinated_estimated_total
Alabama,01,UNKNOWN,13.0,,,,Unknown race,
Alabama,01,ETHNICITY_UNKNOWN,40.0,,,,Unknown ethnicity,
Alabama,01,WHITE,65.0,37000.0,68.10000000000001,66.2,White,
Alabama,01,BLACK,25.0,37000.0,26.6,26.4,Black or African American,
Alabama,01,HISP,6.0,34000.0,4.3999999999999995,4.6,Hispanic or Latino,
Alabama,01,ASIAN,2.0,68000.0,1.4000000000000001,1.4,Asian,
Alabama,01,WHITE,65.0,37.0,68.10000000000001,66.2,White,
Alabama,01,BLACK,25.0,37.0,26.6,26.4,Black or African American,
Alabama,01,HISP,6.0,34.0,4.3999999999999995,4.6,Hispanic or Latino,
Alabama,01,ASIAN,2.0,68.0,1.4000000000000001,1.4,Asian,
Alabama,01,AIAN,,,,0.4,American Indian and Alaska Native,
Alabama,01,NHPI,,,,0.04,Native Hawaiian and Pacific Islander,
Alabama,01,OTHER_NONSTANDARD,7.000000000000001,,,,Unrepresented race,
Arizona,04,UNKNOWN,11.0,,,,Unknown race,
Arizona,04,ETHNICITY_UNKNOWN,11.0,,,,Unknown ethnicity,
Arizona,04,WHITE_NH,52.0,47000.0,54.2,53.0,White (NH),
Arizona,04,BLACK_NH,3.0,34000.0,4.3,4.3,Black or African American (NH),
Arizona,04,HISP,19.0,29000.0,31.8,32.0,Hispanic or Latino,
Arizona,04,API_NH,4.0,61000.0,3.3000000000000003,3.4,"Asian, Native Hawaiian, and Pacific Islander (NH)",
Arizona,04,WHITE_NH,52.0,47.0,54.2,53.0,White (NH),
Arizona,04,BLACK_NH,3.0,34.0,4.3,4.3,Black or African American (NH),
Arizona,04,HISP,19.0,29.0,31.8,32.0,Hispanic or Latino,
Arizona,04,API_NH,4.0,61.0,3.3000000000000003,3.4,"Asian, Native Hawaiian, and Pacific Islander (NH)",
Arizona,04,AIAN_NH,3.0,,,3.5,American Indian and Alaska Native (NH),
Arizona,04,NHPI_NH,,,,0.2,Native Hawaiian and Pacific Islander (NH),
Arizona,04,OTHER_NONSTANDARD_NH,19.0,,,,Unrepresented race (NH),
Indiana,18,UNKNOWN,5.0,,,,Unknown race,
Indiana,18,ETHNICITY_UNKNOWN,7.000000000000001,,,,Unknown ethnicity,
Indiana,18,WHITE,82.0,47000.0,83.0,80.0,White,
Indiana,18,BLACK,7.000000000000001,37000.0,9.4,9.4,Black or African American,
Indiana,18,HISP,6.0,41000.0,7.199999999999999,7.5,Hispanic or Latino,
Indiana,18,ASIAN,3.0,67000.0,2.5,2.5,Asian,
Indiana,18,WHITE,82.0,47.0,83.0,80.0,White,
Indiana,18,BLACK,7.000000000000001,37.0,9.4,9.4,Black or African American,
Indiana,18,HISP,6.0,41.0,7.199999999999999,7.5,Hispanic or Latino,
Indiana,18,ASIAN,3.0,67.0,2.5,2.5,Asian,
Indiana,18,AIAN,,,,0.2,American Indian and Alaska Native,
Indiana,18,NHPI,,,,0.04,Native Hawaiian and Pacific Islander,
Indiana,18,OTHER_NONSTANDARD,7.000000000000001,,,,Unrepresented race,
Alabama,01,ALL,,47351.0,100.0,100.0,All,2380869.0
Arizona,04,ALL,,56963.0,100.0,100.0,All,4085580.0
Indiana,18,ALL,,49146.0,100.0,100.0,All,3334275.0
Guam,66,ALL,,72924.0,100.0,100.0,All,112184.0
Puerto Rico,72,ALL,,31.0,100.0,100.0,All,1000.0
Northern Mariana Islands,69,ALL,,71909.0,100.0,100.0,All,34034.0
Alabama,01,ALL,,47.0,100.0,100.0,All,2380869.0
Arizona,04,ALL,,57.0,100.0,100.0,All,4085580.0
Indiana,18,ALL,,49.0,100.0,100.0,All,3334275.0
Guam,66,ALL,,73.0,100.0,100.0,All,112184.0
Puerto Rico,72,ALL,,0.0,100.0,100.0,All,1000.0
Northern Mariana Islands,69,ALL,,72.0,100.0,100.0,All,34034.0
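The same consistency check holds for the KFF ALL rows: each new pct_rate matches the old per-100k figure divided by 1,000 and rounded, which is also why Puerto Rico's tiny 31 per 100k now appears as 0.0:

old_per_100k = {'Alabama': 47351.0, 'Arizona': 56963.0, 'Indiana': 49146.0,
                'Guam': 72924.0, 'Northern Mariana Islands': 71909.0, 'Puerto Rico': 31.0}
for place, rate in old_per_100k.items():
    print(place, round(rate / 1000))  # 47, 57, 49, 73, 72, 0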
3 changes: 2 additions & 1 deletion python/tests/datasources/test_cdc_vaccination_county.py
@@ -37,14 +37,15 @@ def testWriteToBq(
}

cdcVaccinationCounty.write_to_bq('dataset', 'gcs_bucket', **kwargs)
assert mock_csv.call_count == 1
assert mock_bq.call_count == 1
assert mock_bq.call_args_list[0].args[2] == 'alls_county'

expected_df = pd.read_csv(
GOLDEN_DATA,
dtype={
'county_fips': str,
'vaccinated_per_100k': float,
'vaccinated_pct_rate': float,
},
)

