# Loading women dataset into fact tables

In [1]:
import pandas as pd
import numpy as np
import re
import traceback
import sys
from variables import * 

In [2]:
df_women = pd.read_csv('dataset/Vaccination_Coverage_among_Pregnant_Women.csv')
df_women.head()

Unnamed: 0,Vaccine,Geography Type,Geography,Survey Year/Influenza Season,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size
0,Influenza,States,Oklahoma,2021,Race and Ethnicity,"White, Non-Hispanic",45.2,38.5 to 52.0,232.0
1,Influenza,States,Oklahoma,2021,Race and Ethnicity,"Black, Non-Hispanic",34.1,28.7 to 39.8,299.0
2,Influenza,States,Oklahoma,2021,Race and Ethnicity,Hispanic,53.4,48.2 to 58.5,378.0
3,Influenza,States,Oklahoma,2021,Race and Ethnicity,"Other or Multiple Races, Non-Hispanic",51.0,45.2 to 56.8,501.0
4,Tdap,States,Oklahoma,2021,Race and Ethnicity,"White, Non-Hispanic",64.9,57.9 to 71.4,220.0


In [3]:
df_women['Vaccine'].unique()

array(['Influenza', 'Tdap'], dtype=object)

Dropping the tables with empty values for critical measures. These don't provide useful insights in the analysis.

In [4]:
length_na_rows = len(df_women[df_women['Sample Size'].isna()])
percentage_rows_dropped = (length_na_rows/df_women.shape[0]) * 100
print(f"{percentage_rows_dropped}% contained NA fields in measures. Dropping {len(df_women[df_women['Sample Size'].isna()])} rows.")

4.835348061692372% contained NA fields in measures. Dropping 232 rows.


In [5]:
len_df_women = df_women.shape[0]
len_df_women

4798

In [6]:
critical_measures = ['Estimate (%)', 'Sample Size', '95% CI (%)']
df_women = df_women.dropna(subset=critical_measures)
df_women[df_women['Sample Size'].isna()]

Unnamed: 0,Vaccine,Geography Type,Geography,Survey Year/Influenza Season,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size


In [7]:
filled_df_nrow = df_women.shape[0]
filled_df_nrow

4566

In [8]:
# there is a NR* field in the estimate
# this indicates that the estimate is unreliable
# for now, we can set then to None
df_women['95% CI (%)'] = df_women['95% CI (%)'].replace('NR*', None)
df_women['Estimate (%)'] = df_women['Estimate (%)'].replace('NR*', None)

In [9]:
len_df_women = df_women.shape[0]
len_df_women

4566

We see that the dataset only contains individual years. This means that the dataset are all transactions. We can load them into a transaction snapshot fact table.

In [10]:
# they are all transactions
df_women['Survey Year/Influenza Season'].unique()

array([2021, 2014, 2015, 2013, 2018, 2016, 2017, 2022, 2012, 2019, 2020])

Convert the `Sample Size` column to int64. This makes sense as Sample Size would refer to individual humans. 

In [11]:
df_women['Sample Size'] = pd.to_numeric(df_women['Sample Size'], errors='coerce').astype('Int64')
df_women.head()

Unnamed: 0,Vaccine,Geography Type,Geography,Survey Year/Influenza Season,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size
0,Influenza,States,Oklahoma,2021,Race and Ethnicity,"White, Non-Hispanic",45.2,38.5 to 52.0,232
1,Influenza,States,Oklahoma,2021,Race and Ethnicity,"Black, Non-Hispanic",34.1,28.7 to 39.8,299
2,Influenza,States,Oklahoma,2021,Race and Ethnicity,Hispanic,53.4,48.2 to 58.5,378
3,Influenza,States,Oklahoma,2021,Race and Ethnicity,"Other or Multiple Races, Non-Hispanic",51.0,45.2 to 56.8,501
4,Tdap,States,Oklahoma,2021,Race and Ethnicity,"White, Non-Hispanic",64.9,57.9 to 71.4,220


In [12]:
# fill in missing dimensions
df_women['Dose'] = '1 Dose Only'

df_women.head()

Unnamed: 0,Vaccine,Geography Type,Geography,Survey Year/Influenza Season,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,Dose
0,Influenza,States,Oklahoma,2021,Race and Ethnicity,"White, Non-Hispanic",45.2,38.5 to 52.0,232,1 Dose Only
1,Influenza,States,Oklahoma,2021,Race and Ethnicity,"Black, Non-Hispanic",34.1,28.7 to 39.8,299,1 Dose Only
2,Influenza,States,Oklahoma,2021,Race and Ethnicity,Hispanic,53.4,48.2 to 58.5,378,1 Dose Only
3,Influenza,States,Oklahoma,2021,Race and Ethnicity,"Other or Multiple Races, Non-Hispanic",51.0,45.2 to 56.8,501,1 Dose Only
4,Tdap,States,Oklahoma,2021,Race and Ethnicity,"White, Non-Hispanic",64.9,57.9 to 71.4,220,1 Dose Only


In [13]:
# checking the survey year
df_women['Survey Year/Influenza Season'].unique()

array([2021, 2014, 2015, 2013, 2018, 2016, 2017, 2022, 2012, 2019, 2020])

In [14]:
# splitting the confidence intervals
ci = df_women['95% CI (%)'].astype(str)
bounds = ci.str.split(r'\s+to\s+', expand=True)
df_women['ci_lower'] = pd.to_numeric(bounds[0], errors='coerce')
df_women['ci_upper'] = pd.to_numeric(bounds[1], errors='coerce')
df_women['ci_lower'] = df_women['ci_lower'].where(df_women['ci_lower'].notnull(), None)
df_women['ci_upper'] = df_women['ci_upper'].where(df_women['ci_upper'].notnull(), None)

In [15]:
# getting the vaccine dimension table from postgres
import psycopg2 as pg
from creds import POSTGRES_USERNAME, POSTGRES_PW
conn = pg.connect(
    dbname='cs689_project',  
    user=POSTGRES_USERNAME,
    password=POSTGRES_PW,
    host='localhost',
    port='5432'
)
cursor = conn.cursor()


Getting the vaccine dimension table from DB

In [16]:
cursor.execute("SELECT vaccine_id, cleaned_vaccine, start_date FROM vaccine_dim")
mapping_data = cursor.fetchall()

vaccine_mapping = pd.DataFrame(mapping_data, columns=["vaccine_id", "cleaned_vaccine", "start_date"])
vaccine_mapping

Unnamed: 0,vaccine_id,cleaned_vaccine,start_date
0,1,DTaP,2011-09-01
1,2,Polio,2011-09-01
2,3,Hep B,2011-09-01
3,4,PCV,2011-09-01
4,7,Hib,2011-09-01
5,10,Combined 7 Series,2011-09-01
6,11,Rotavirus,2011-09-01
7,9,Influenza,2011-09-01
8,12,Influenza,2012-09-01
9,13,Influenza,2013-09-01


In [17]:
vaccine_mapping['start_date'] = pd.to_datetime(vaccine_mapping['start_date'])
vaccine_mapping['year'] = vaccine_mapping['start_date'].dt.year
vaccine_mapping


Unnamed: 0,vaccine_id,cleaned_vaccine,start_date,year
0,1,DTaP,2011-09-01,2011
1,2,Polio,2011-09-01,2011
2,3,Hep B,2011-09-01,2011
3,4,PCV,2011-09-01,2011
4,7,Hib,2011-09-01,2011
5,10,Combined 7 Series,2011-09-01,2011
6,11,Rotavirus,2011-09-01,2011
7,9,Influenza,2011-09-01,2011
8,12,Influenza,2012-09-01,2012
9,13,Influenza,2013-09-01,2013


For the flu vaccine, we have to map it to the year as the flu vaccine gets updated every year, meaning that we have to map the correct vaccine version to the correct year.

In [18]:
# for influenza, we need to match it to the year
years = df_women['Survey Year/Influenza Season'].astype(str).str[:4]
df_women['vaccine_year'] = np.where(df_women['Vaccine'] == 'Influenza',years,'2011')

In [19]:
df_women.head()

Unnamed: 0,Vaccine,Geography Type,Geography,Survey Year/Influenza Season,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,Dose,ci_lower,ci_upper,vaccine_year
0,Influenza,States,Oklahoma,2021,Race and Ethnicity,"White, Non-Hispanic",45.2,38.5 to 52.0,232,1 Dose Only,38.5,52.0,2021
1,Influenza,States,Oklahoma,2021,Race and Ethnicity,"Black, Non-Hispanic",34.1,28.7 to 39.8,299,1 Dose Only,28.7,39.8,2021
2,Influenza,States,Oklahoma,2021,Race and Ethnicity,Hispanic,53.4,48.2 to 58.5,378,1 Dose Only,48.2,58.5,2021
3,Influenza,States,Oklahoma,2021,Race and Ethnicity,"Other or Multiple Races, Non-Hispanic",51.0,45.2 to 56.8,501,1 Dose Only,45.2,56.8,2021
4,Tdap,States,Oklahoma,2021,Race and Ethnicity,"White, Non-Hispanic",64.9,57.9 to 71.4,220,1 Dose Only,57.9,71.4,2011


In [20]:
df_women[df_women['vaccine_year'].isna()]
df_women['vaccine_year'] = (pd.to_numeric(df_women['vaccine_year'], errors='coerce').astype('Int64'))

In [21]:
# we merge df_women with the vaccine_dim
df_women = pd.merge(df_women, vaccine_mapping, left_on = ['Vaccine', 'vaccine_year'], right_on=['cleaned_vaccine', 'year'], how='left')
df_women.head()

Unnamed: 0,Vaccine,Geography Type,Geography,Survey Year/Influenza Season,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,Dose,ci_lower,ci_upper,vaccine_year,vaccine_id,cleaned_vaccine,start_date,year
0,Influenza,States,Oklahoma,2021,Race and Ethnicity,"White, Non-Hispanic",45.2,38.5 to 52.0,232,1 Dose Only,38.5,52.0,2021,21,Influenza,2021-09-01,2021
1,Influenza,States,Oklahoma,2021,Race and Ethnicity,"Black, Non-Hispanic",34.1,28.7 to 39.8,299,1 Dose Only,28.7,39.8,2021,21,Influenza,2021-09-01,2021
2,Influenza,States,Oklahoma,2021,Race and Ethnicity,Hispanic,53.4,48.2 to 58.5,378,1 Dose Only,48.2,58.5,2021,21,Influenza,2021-09-01,2021
3,Influenza,States,Oklahoma,2021,Race and Ethnicity,"Other or Multiple Races, Non-Hispanic",51.0,45.2 to 56.8,501,1 Dose Only,45.2,56.8,2021,21,Influenza,2021-09-01,2021
4,Tdap,States,Oklahoma,2021,Race and Ethnicity,"White, Non-Hispanic",64.9,57.9 to 71.4,220,1 Dose Only,57.9,71.4,2011,26,Tdap,2011-09-01,2011


In [22]:
df_women['vaccine_id'].unique()

array([21, 26, 14, 15, 13, 18, 16, 17, 22, 12, 20, 19])

In [23]:
df_women[df_women['vaccine_id'].isna()]

Unnamed: 0,Vaccine,Geography Type,Geography,Survey Year/Influenza Season,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,Dose,ci_lower,ci_upper,vaccine_year,vaccine_id,cleaned_vaccine,start_date,year


Data Quality check: Verify that the number of rows after the merge matches the original number of rows in the loaded dataset (after dropping empty rows)

This ensures that we are not missing out any rows when performing the merges

In [24]:
def check_rows(df, len_df_women = filled_df_nrow):
    if df.shape[0] != len_df_women:
        print("Rows don't match!")
    else:
        print("Rows match!")

check_rows(df_women, len_df_women)

Rows match!


Mapping `Dose` to the `dose_id` in the dose dimension table

In [25]:
cursor.execute("SELECT dose, dose_id FROM dose_dim")
mapping_data = cursor.fetchall()

dose_mapping = pd.DataFrame(mapping_data, columns=["dose", "dose_id"])
dose_mapping

Unnamed: 0,dose,dose_id
0,≥3 Doses,1
1,≥2 Doses,2
2,≥1 Dose,3
3,"≥1 Dose, 2 Day",4
4,Full Series,5
5,1 Dose Only,6
6,Primary Series,7
7,≥4 Doses,8
8,7 Dose,9
9,"≥1 Dose, 1 Day",10


In [26]:
# we merge df_women with the dose_dim
df_women = pd.merge(df_women, dose_mapping, left_on = ['Dose'], right_on=['dose'], how='left')
df_women.head()

Unnamed: 0,Vaccine,Geography Type,Geography,Survey Year/Influenza Season,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,Dose,ci_lower,ci_upper,vaccine_year,vaccine_id,cleaned_vaccine,start_date,year,dose,dose_id
0,Influenza,States,Oklahoma,2021,Race and Ethnicity,"White, Non-Hispanic",45.2,38.5 to 52.0,232,1 Dose Only,38.5,52.0,2021,21,Influenza,2021-09-01,2021,1 Dose Only,6
1,Influenza,States,Oklahoma,2021,Race and Ethnicity,"Black, Non-Hispanic",34.1,28.7 to 39.8,299,1 Dose Only,28.7,39.8,2021,21,Influenza,2021-09-01,2021,1 Dose Only,6
2,Influenza,States,Oklahoma,2021,Race and Ethnicity,Hispanic,53.4,48.2 to 58.5,378,1 Dose Only,48.2,58.5,2021,21,Influenza,2021-09-01,2021,1 Dose Only,6
3,Influenza,States,Oklahoma,2021,Race and Ethnicity,"Other or Multiple Races, Non-Hispanic",51.0,45.2 to 56.8,501,1 Dose Only,45.2,56.8,2021,21,Influenza,2021-09-01,2021,1 Dose Only,6
4,Tdap,States,Oklahoma,2021,Race and Ethnicity,"White, Non-Hispanic",64.9,57.9 to 71.4,220,1 Dose Only,57.9,71.4,2011,26,Tdap,2011-09-01,2011,1 Dose Only,6


In [27]:
# drop original columns
df_women.drop(columns=['Vaccine', 'vaccine_year', 'cleaned_vaccine', 'Dose', 'dose', 'start_date', 'year'], inplace=True)
check_rows(df_women)
df_women.head()

Rows match!


Unnamed: 0,Geography Type,Geography,Survey Year/Influenza Season,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id
0,States,Oklahoma,2021,Race and Ethnicity,"White, Non-Hispanic",45.2,38.5 to 52.0,232,38.5,52.0,21,6
1,States,Oklahoma,2021,Race and Ethnicity,"Black, Non-Hispanic",34.1,28.7 to 39.8,299,28.7,39.8,21,6
2,States,Oklahoma,2021,Race and Ethnicity,Hispanic,53.4,48.2 to 58.5,378,48.2,58.5,21,6
3,States,Oklahoma,2021,Race and Ethnicity,"Other or Multiple Races, Non-Hispanic",51.0,45.2 to 56.8,501,45.2,56.8,21,6
4,States,Oklahoma,2021,Race and Ethnicity,"White, Non-Hispanic",64.9,57.9 to 71.4,220,57.9,71.4,26,6


Mapping `geography` to the geography dimension table

In [28]:
df_women[df_women['Geography Type'] == 'National']

Unnamed: 0,Geography Type,Geography,Survey Year/Influenza Season,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id
1051,National,United States,2022,Age,18-24 Years,45.1,43.0 to 47.2,5769,43.0,47.2,22,6
1052,National,United States,2022,Age,25-34 Years,56.8,55.7 to 57.9,17853,55.7,57.9,22,6
1053,National,United States,2022,Age,>=35 Years,62.5,60.7 to 64.3,6733,60.7,64.3,22,6
1054,National,United States,2022,Race and Ethnicity,"White, Non-Hispanic",56.1,54.9 to 57.3,12992,54.9,57.3,22,6
1055,National,United States,2022,Race and Ethnicity,"Black, Non-Hispanic",42.2,39.6 to 44.8,4336,39.6,44.8,22,6
...,...,...,...,...,...,...,...,...,...,...,...,...
3745,National,United States,2015,Age,25-34 Years,54.5,52.7 to 56.3,7654,52.7,56.3,26,6
3746,National,United States,2012,Age,>=35 Years,15.7,12.7 to 19.1,1197,12.7,19.1,26,6
3747,National,United States,2018,Race and Ethnicity,"White, Non-Hispanic",77.6,76.5 to 78.7,10959,76.5,78.7,26,6
3748,National,United States,2020,Race and Ethnicity,"White, Non-Hispanic",80.4,79.3 to 81.4,9757,79.3,81.4,26,6


In [29]:
# Since the Geography Type is the same as the children and adolescents, we can rename them to make it uniform
df_women.loc[df_women['Geography Type'] == 'National', 'Geography Type'] = 'HHS Regions/National'
df_women.loc[df_women['Geography Type'] == 'States', 'Geography Type'] = 'States/Local Areas'
df_women.head()

Unnamed: 0,Geography Type,Geography,Survey Year/Influenza Season,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id
0,States/Local Areas,Oklahoma,2021,Race and Ethnicity,"White, Non-Hispanic",45.2,38.5 to 52.0,232,38.5,52.0,21,6
1,States/Local Areas,Oklahoma,2021,Race and Ethnicity,"Black, Non-Hispanic",34.1,28.7 to 39.8,299,28.7,39.8,21,6
2,States/Local Areas,Oklahoma,2021,Race and Ethnicity,Hispanic,53.4,48.2 to 58.5,378,48.2,58.5,21,6
3,States/Local Areas,Oklahoma,2021,Race and Ethnicity,"Other or Multiple Races, Non-Hispanic",51.0,45.2 to 56.8,501,45.2,56.8,21,6
4,States/Local Areas,Oklahoma,2021,Race and Ethnicity,"White, Non-Hispanic",64.9,57.9 to 71.4,220,57.9,71.4,26,6


In [30]:
regions = df_women['Geography'][df_women['Geography Type'] == 'HHS Regions/National'].unique()
regions

array(['United States'], dtype=object)

There is an additional terriroty included: Commonwealth of the Northern Mariana Islands
We need to add that into the dimension table

In [31]:
df_women['Geography'].unique()

array(['Oklahoma', 'Indiana', 'Nebraska', 'Missouri', 'Illinois',
       'Minnesota', 'Oregon', 'New York', 'Vermont', 'Colorado',
       'Maryland', 'Wyoming', 'West Virginia', 'Maine', 'Alabama',
       'Tennessee', 'United States', 'Puerto Rico', 'Montana',
       'Wisconsin', 'Texas', 'New Mexico', 'Alaska', 'Virginia',
       'NY-Rest of state', 'Kentucky', 'District of Columbia',
       'Massachusetts', 'Michigan', 'Pennsylvania', 'North Dakota',
       'Florida', 'New Hampshire', 'Ohio', 'Arizona', 'Rhode Island',
       'North Carolina', 'Hawaii', 'Iowa', 'Kansas', 'Connecticut',
       'Georgia', 'Delaware', 'Louisiana',
       'Commonwealth of the Northern Mariana Islands', 'South Dakota',
       'Washington', 'Utah', 'New Jersey', 'Mississippi',
       'NY-City of New York', 'Arkansas'], dtype=object)

In [32]:
# we fill in the regions into geography_dim
geography_dim = pd.DataFrame(columns = ['state_territory', 'municipality', 'county', 'region'])
for region in regions[1:]:
    new_row = pd.DataFrame([{'state_territory': None, 'municipality': None, 'county': None, 'region': region, 'Geography Type': 'HHS Regions/National', 'Geography':region}])
    geography_dim = pd.concat([geography_dim, new_row], ignore_index=True)

geography_dim


Unnamed: 0,state_territory,municipality,county,region


In [33]:
# we fill in the states/local areas into geography_dim
for i in df_women['Geography'][df_women['Geography Type'] == 'States/Local Areas'].unique():
    matched = False
    if i in state_territory:
        region = state_to_region.get(i)
        new_row = pd.DataFrame([{'state_territory': i, 'municipality': None, 'county': None, 'region':'Region ' + str(region), 'Geography Type': 'States/Local Areas', 'Geography':i}])
        geography_dim = pd.concat([geography_dim, new_row], ignore_index=True)
        matched = True
    if i[0:2] in state_territory.values():
        state = str([k for k, v in state_territory.items() if v == i[0:2]][0])
        if 'County' in i:
            # i is a county
            county = str(i[3:])
            new_row = pd.DataFrame([{'state_territory': state, 'municipality': None, 'county': county, 'region':'Region ' + str(region), 'Geography Type': 'States/Local Areas', 'Geography':i}])
            geography_dim = pd.concat([geography_dim, new_row], ignore_index=True)
            matched = True
        else:
            # we assume it is a municipality
            municipality = i[3:]
            new_row = pd.DataFrame([{'state_territory': state, 'municipality': municipality, 'county': None, 'region':'Region ' + str(region), 'Geography Type': 'States/Local Areas', 'Geography':i}])
            geography_dim = pd.concat([geography_dim, new_row], ignore_index=True)
            matched = True
    if not matched:
        print(f"Unknown state or local area: {i}")

In [34]:
geography_dim

Unnamed: 0,state_territory,municipality,county,region,Geography Type,Geography
0,Oklahoma,,,Region 6,States/Local Areas,Oklahoma
1,Indiana,,,Region 5,States/Local Areas,Indiana
2,Nebraska,,,Region 7,States/Local Areas,Nebraska
3,Missouri,,,Region 7,States/Local Areas,Missouri
4,Illinois,,,Region 5,States/Local Areas,Illinois
5,Minnesota,,,Region 5,States/Local Areas,Minnesota
6,Oregon,,,Region 10,States/Local Areas,Oregon
7,New York,,,Region 2,States/Local Areas,New York
8,Vermont,,,Region 1,States/Local Areas,Vermont
9,Colorado,,,Region 8,States/Local Areas,Colorado


In [66]:
try:
    insert_query = '''
    INSERT INTO geography_dim (
        state_territory, 
        municipality, 
        county, 
        region, 
        original_geography_type, 
        original_geography
    )
    VALUES (%s, %s, %s, %s, %s, %s)
    ON CONFLICT (original_geography) 
    DO UPDATE SET
        state_territory = EXCLUDED.state_territory,
        municipality = EXCLUDED.municipality,
        county = EXCLUDED.county,
        region = EXCLUDED.region,
        original_geography_type = EXCLUDED.original_geography_type;
    '''
    for index, row in geography_dim.iterrows():
        print(row['Geography'])
        cursor.execute(insert_query, (row['state_territory'], row['municipality'], row['county'], row['region'], row['Geography Type'], row['Geography'],))
    conn.commit()
    print(f"{len(geography_dim)} records inserted into geography_dim.")

except Exception as e:
    print(f"Error occurred: {e}")
    conn.rollback()

Oklahoma
Indiana
Nebraska
Missouri
Illinois
Minnesota
Oregon
New York
Vermont
Colorado
Maryland
Wyoming
West Virginia
Maine
Alabama
Tennessee
Puerto Rico
Montana
Wisconsin
Texas
New Mexico
Alaska
Virginia
NY-Rest of state
Kentucky
District of Columbia
Massachusetts
Michigan
Pennsylvania
North Dakota
Florida
New Hampshire
Ohio
Arizona
Rhode Island
North Carolina
Hawaii
Iowa
Kansas
Connecticut
Georgia
Delaware
Louisiana
Commonwealth of the Northern Mariana Islands
South Dakota
Washington
Utah
New Jersey
Mississippi
NY-City of New York
Arkansas
51 records inserted into geography_dim.


In [35]:
cursor.execute("SELECT geography_id, original_geography, original_geography_type FROM geography_dim")
mapping_data = cursor.fetchall()

geography_mapping = pd.DataFrame(mapping_data, columns=["geography_id", "original_geography", "original_geography_type"])
geography_mapping

Unnamed: 0,geography_id,original_geography,original_geography_type
0,4,Region 7,HHS Regions/National
1,7,Region 1,HHS Regions/National
2,8,Region 8,HHS Regions/National
3,3,Region 10,HHS Regions/National
4,1,Region 6,HHS Regions/National
...,...,...,...
75,55,Maryland,States/Local Areas
76,58,NY-Rest of state,States/Local Areas
77,12,North Dakota,States/Local Areas
78,77,TX-Dallas County,States/Local Areas


In [36]:
# we merge df_women with the geography_mapping
df_women = pd.merge(df_women, geography_mapping, left_on = ['Geography', 'Geography Type'], right_on=['original_geography', 'original_geography_type'], how='left')
df_women.head()

Unnamed: 0,Geography Type,Geography,Survey Year/Influenza Season,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,original_geography,original_geography_type
0,States/Local Areas,Oklahoma,2021,Race and Ethnicity,"White, Non-Hispanic",45.2,38.5 to 52.0,232,38.5,52.0,21,6,30,Oklahoma,States/Local Areas
1,States/Local Areas,Oklahoma,2021,Race and Ethnicity,"Black, Non-Hispanic",34.1,28.7 to 39.8,299,28.7,39.8,21,6,30,Oklahoma,States/Local Areas
2,States/Local Areas,Oklahoma,2021,Race and Ethnicity,Hispanic,53.4,48.2 to 58.5,378,48.2,58.5,21,6,30,Oklahoma,States/Local Areas
3,States/Local Areas,Oklahoma,2021,Race and Ethnicity,"Other or Multiple Races, Non-Hispanic",51.0,45.2 to 56.8,501,45.2,56.8,21,6,30,Oklahoma,States/Local Areas
4,States/Local Areas,Oklahoma,2021,Race and Ethnicity,"White, Non-Hispanic",64.9,57.9 to 71.4,220,57.9,71.4,26,6,30,Oklahoma,States/Local Areas


In [37]:
df_women[df_women['Geography'] == 'Commonwealth of the Northern Mariana Islands']

Unnamed: 0,Geography Type,Geography,Survey Year/Influenza Season,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,original_geography,original_geography_type
2958,States/Local Areas,Commonwealth of the Northern Mariana Islands,2022,Age,18-24 Years,83.5,70.1 to 92.6,49,70.1,92.6,22,6,123,Commonwealth of the Northern Mariana Islands,States/Local Areas
2959,States/Local Areas,Commonwealth of the Northern Mariana Islands,2022,Age,25-34 Years,85.0,76.8 to 91.2,107,76.8,91.2,22,6,123,Commonwealth of the Northern Mariana Islands,States/Local Areas
2960,States/Local Areas,Commonwealth of the Northern Mariana Islands,2022,Age,>=18 Years,84.6,79.0 to 89.2,211,79.0,89.2,22,6,123,Commonwealth of the Northern Mariana Islands,States/Local Areas
2961,States/Local Areas,Commonwealth of the Northern Mariana Islands,2022,Age,>=35 Years,85.1,72.9 to 93.3,55,72.9,93.3,22,6,123,Commonwealth of the Northern Mariana Islands,States/Local Areas
2962,States/Local Areas,Commonwealth of the Northern Mariana Islands,2022,Race and Ethnicity,"Other or Multiple Races, Non-Hispanic",85.6,80.1 to 90.1,207,80.1,90.1,22,6,123,Commonwealth of the Northern Mariana Islands,States/Local Areas


In [38]:
# drop original columns
df_women.drop(columns=['Geography', 'Geography Type', 'original_geography', 'original_geography_type'], inplace=True)
df_women.head()

Unnamed: 0,Survey Year/Influenza Season,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id
0,2021,Race and Ethnicity,"White, Non-Hispanic",45.2,38.5 to 52.0,232,38.5,52.0,21,6,30
1,2021,Race and Ethnicity,"Black, Non-Hispanic",34.1,28.7 to 39.8,299,28.7,39.8,21,6,30
2,2021,Race and Ethnicity,Hispanic,53.4,48.2 to 58.5,378,48.2,58.5,21,6,30
3,2021,Race and Ethnicity,"Other or Multiple Races, Non-Hispanic",51.0,45.2 to 56.8,501,45.2,56.8,21,6,30
4,2021,Race and Ethnicity,"White, Non-Hispanic",64.9,57.9 to 71.4,220,57.9,71.4,26,6,30


In [39]:
check_rows(df_women)

Rows match!


Since the are two dimension types: `Age` and `Race and Ethnicity`, we can separate out the dimensions. That way, we can map the `Race and Ethnicity` dimension to `race_ethnicity` dimension table.

In [40]:
df_age = df_women[df_women['Dimension Type'] == 'Age']
df_race = df_women[df_women['Dimension Type'] == 'Race and Ethnicity']

print(df_age.shape[0] + df_race.shape[0])

4566


In [41]:
df_age.head()

Unnamed: 0,Survey Year/Influenza Season,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id
8,2014,Age,>=18 Years,51.1,46.9 to 55.3,1798,46.9,55.3,14,6,30
9,2014,Age,18-24 Years,45.4,38.0 to 53.0,516,38.0,53.0,14,6,30
10,2014,Age,25-34 Years,54.9,49.4 to 60.3,1091,49.4,60.3,14,6,30
11,2014,Age,>=35 Years,47.6,33.3 to 62.2,191,33.3,62.2,14,6,30
20,2013,Age,25-34 Years,56.5,51.1 to 61.8,1024,51.1,61.8,13,6,30


In [42]:
df_age['Dimension'].unique()

array(['>=18 Years', '18-24 Years', '25-34 Years', '>=35 Years'],
      dtype=object)

Treat age as a varchar

In [43]:
df_age.rename(columns={'Dimension': 'Age'}, inplace=True)
df_age.drop(columns=['Dimension Type'], inplace=True)
df_age.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_age.rename(columns={'Dimension': 'Age'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_age.drop(columns=['Dimension Type'], inplace=True)


Unnamed: 0,Survey Year/Influenza Season,Age,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id
8,2014,>=18 Years,51.1,46.9 to 55.3,1798,46.9,55.3,14,6,30
9,2014,18-24 Years,45.4,38.0 to 53.0,516,38.0,53.0,14,6,30
10,2014,25-34 Years,54.9,49.4 to 60.3,1091,49.4,60.3,14,6,30
11,2014,>=35 Years,47.6,33.3 to 62.2,191,33.3,62.2,14,6,30
20,2013,25-34 Years,56.5,51.1 to 61.8,1024,51.1,61.8,13,6,30


In [44]:
# making sure that we don't drop any rows
print(df_age.shape[0] + df_race.shape[0])

4566


In [45]:
cursor.execute("SELECT race_ethnicity, race_ethnicity_id FROM race_ethnicity_dim")
mapping_data = cursor.fetchall()

race_ethnicity_mapping = pd.DataFrame(mapping_data, columns=["race_ethnicity", "race_ethnicity_id"])
race_ethnicity_mapping

Unnamed: 0,race_ethnicity,race_ethnicity_id
0,"American Indian or Alaska Native, Non-Hispanic",5
1,"Asian, Non-Hispanic",6
2,"White, Non-Hispanic",2
3,Hispanic,3
4,"Other or Multiple Races, Non-Hispanic",1
5,"Black, Non-Hispanic",4


In [46]:
df_race['Dimension'].unique()

array(['White, Non-Hispanic', 'Black, Non-Hispanic', 'Hispanic',
       'Other or Multiple Races, Non-Hispanic'], dtype=object)

In [47]:
df_race = pd.merge(df_race, race_ethnicity_mapping, left_on = ['Dimension'], right_on=['race_ethnicity'], how='left')
df_race.head()

Unnamed: 0,Survey Year/Influenza Season,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,race_ethnicity,race_ethnicity_id
0,2021,Race and Ethnicity,"White, Non-Hispanic",45.2,38.5 to 52.0,232,38.5,52.0,21,6,30,"White, Non-Hispanic",2
1,2021,Race and Ethnicity,"Black, Non-Hispanic",34.1,28.7 to 39.8,299,28.7,39.8,21,6,30,"Black, Non-Hispanic",4
2,2021,Race and Ethnicity,Hispanic,53.4,48.2 to 58.5,378,48.2,58.5,21,6,30,Hispanic,3
3,2021,Race and Ethnicity,"Other or Multiple Races, Non-Hispanic",51.0,45.2 to 56.8,501,45.2,56.8,21,6,30,"Other or Multiple Races, Non-Hispanic",1
4,2021,Race and Ethnicity,"White, Non-Hispanic",64.9,57.9 to 71.4,220,57.9,71.4,26,6,30,"White, Non-Hispanic",2


In [48]:
df_race.drop(columns=['Dimension Type', 'Dimension', 'race_ethnicity'], inplace=True)
print(df_race.shape[0])
df_race.head()

2146


Unnamed: 0,Survey Year/Influenza Season,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,race_ethnicity_id
0,2021,45.2,38.5 to 52.0,232,38.5,52.0,21,6,30,2
1,2021,34.1,28.7 to 39.8,299,28.7,39.8,21,6,30,4
2,2021,53.4,48.2 to 58.5,378,48.2,58.5,21,6,30,3
3,2021,51.0,45.2 to 56.8,501,45.2,56.8,21,6,30,1
4,2021,64.9,57.9 to 71.4,220,57.9,71.4,26,6,30,2


In [49]:
df_age.rename(columns={'Survey Year/Influenza Season':'Survey Year'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_age.rename(columns={'Survey Year/Influenza Season':'Survey Year'}, inplace=True)


In [50]:
#since all women, we assign the gender to 4 (Females)
df_age['gender_id'] = 4 
df_age.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_age['gender_id'] = 4


Unnamed: 0,Survey Year,Age,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id
8,2014,>=18 Years,51.1,46.9 to 55.3,1798,46.9,55.3,14,6,30,4
9,2014,18-24 Years,45.4,38.0 to 53.0,516,38.0,53.0,14,6,30,4
10,2014,25-34 Years,54.9,49.4 to 60.3,1091,49.4,60.3,14,6,30,4
11,2014,>=35 Years,47.6,33.3 to 62.2,191,33.3,62.2,14,6,30,4
20,2013,25-34 Years,56.5,51.1 to 61.8,1024,51.1,61.8,13,6,30,4


In [51]:
df_age.shape[0]

2420

In [52]:
df_race.shape[0]

2146

In [53]:
df_age.columns

Index(['Survey Year', 'Age', 'Estimate (%)', '95% CI (%)', 'Sample Size',
       'ci_lower', 'ci_upper', 'vaccine_id', 'dose_id', 'geography_id',
       'gender_id'],
      dtype='object')

In [54]:
df_age[df_age['Estimate (%)'] == 'NR*']

Unnamed: 0,Survey Year,Age,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id


### Load dataset into fact tables

In [55]:
try:
    insert_query = '''
    INSERT INTO vaccine_transaction_fact (
      vaccine_transaction_survey_year,
      vaccine_transaction_age,
      vaccine_transaction_estimate_pct,
      vaccine_transaction_sample_size,
      vaccine_transaction_ci_lower,
      vaccine_transaction_ci_upper,
      vaccine_transaction_vaccine_id,
      vaccine_transaction_dose_id,
      vaccine_transaction_gender_id,
      vaccine_transaction_geography_id
    )
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
    '''
    for index, row in df_age.iterrows():

        vaccine_id = int(row['vaccine_id']) if pd.notnull(row['vaccine_id']) else None
        dose_id = int(row['dose_id']) if pd.notnull(row['dose_id']) else None
        geography_id = int(row['geography_id']) if pd.notnull(row['geography_id']) else None
        gender_id = int(row['gender_id']) if pd.notnull(row['gender_id']) else None
    
        cursor.execute(insert_query, (
            row['Survey Year'],
            row['Age'],
            row['Estimate (%)'],
            row['Sample Size'],
            row['ci_lower'],
            row['ci_upper'],
            vaccine_id,
            dose_id,
            gender_id,
            geography_id
        ))

    conn.commit()
    print(f"{len(df_age)} records inserted into vaccine_transaction_fact.")

except Exception as e:
    print(f"Error occurred: {e}")
    traceback.print_exc(file=sys.stdout)
    conn.rollback()

2420 records inserted into vaccine_transaction_fact.


In [56]:
df_race.columns

Index(['Survey Year/Influenza Season', 'Estimate (%)', '95% CI (%)',
       'Sample Size', 'ci_lower', 'ci_upper', 'vaccine_id', 'dose_id',
       'geography_id', 'race_ethnicity_id'],
      dtype='object')

In [57]:
df_race.head()

Unnamed: 0,Survey Year/Influenza Season,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,race_ethnicity_id
0,2021,45.2,38.5 to 52.0,232,38.5,52.0,21,6,30,2
1,2021,34.1,28.7 to 39.8,299,28.7,39.8,21,6,30,4
2,2021,53.4,48.2 to 58.5,378,48.2,58.5,21,6,30,3
3,2021,51.0,45.2 to 56.8,501,45.2,56.8,21,6,30,1
4,2021,64.9,57.9 to 71.4,220,57.9,71.4,26,6,30,2


In [58]:
try:
    insert_query = '''
    INSERT INTO vaccine_transaction_fact (
      vaccine_transaction_survey_year,
      vaccine_transaction_estimate_pct,
      vaccine_transaction_sample_size,
      vaccine_transaction_ci_lower,
      vaccine_transaction_ci_upper,
      vaccine_transaction_vaccine_id,
      vaccine_transaction_dose_id,
      vaccine_transaction_geography_id,
      vaccine_transaction_race_ethnicity_id
    )
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);
    '''
    for index, row in df_race.iterrows():

        vaccine_id = int(row['vaccine_id']) if pd.notnull(row['vaccine_id']) else None
        dose_id = int(row['dose_id']) if pd.notnull(row['dose_id']) else None
        geography_id = int(row['geography_id']) if pd.notnull(row['geography_id']) else None
        race_ethnicity_id = int(row['race_ethnicity_id']) if pd.notnull(row['race_ethnicity_id']) else None
    
        cursor.execute(insert_query, (
            row['Survey Year/Influenza Season'],
            row['Estimate (%)'],
            row['Sample Size'],
            row['ci_lower'],
            row['ci_upper'],
            vaccine_id,
            dose_id,
            geography_id,
            race_ethnicity_id
        ))

    conn.commit()
    print(f"{len(df_race)} records inserted into vaccine_transaction_fact.")

except Exception as e:
    print(f"Error occurred: {e}")
    traceback.print_exc(file=sys.stdout)
    conn.rollback()

2146 records inserted into vaccine_transaction_fact.
