# Loading Adolescent dataset into fact tables

In [2]:
import pandas as pd
import numpy as np
import re
import traceback
import sys

In [3]:
df_adolescents = pd.read_csv('dataset/Vaccination_Coverage_among_Adolescents.csv')
df_adolescents.head()

Unnamed: 0,Vaccine/Sample,Dose,Geography Type,Geography,Survey Year,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size
0,HPV,"≥1 Dose, Males",States/Local Areas,New York,2023,Age,13-17 Years,81.5,75.2 to 86.5,289.0
1,Tetanus,≥1 Dose Tdap,States/Local Areas,New York,2023,Age,13-17 Years,90.2,86.8 to 92.8,559.0
2,Tetanus,≥1 Dose Td or Tdap,States/Local Areas,New York,2023,Age,13-17 Years,93.6,90.9 to 95.5,559.0
3,≥1 Dose MenACWY,,States/Local Areas,New York,2023,Age,13-17 Years,95.3,92.7 to 97.0,559.0
4,HPV,"≥1 Dose, Males and Females",States/Local Areas,New York,2023,Age,13-17 Years,79.4,74.8 to 83.3,559.0


In [4]:
original_nrow = df_adolescents.shape[0]
original_nrow


27565

In [5]:
df_adolescents['Vaccine/Sample'].unique()

array(['HPV', 'Tetanus', '≥1 Dose MenACWY', '≥3 Doses HepB',
       '≥2 Doses MMR', 'Varicella', '≥2 Doses Hep A'], dtype=object)

In [6]:
critical_measures = ['Estimate (%)', 'Sample Size', '95% CI (%)']
length_na_rows = len(df_adolescents[df_adolescents[critical_measures].isna().all(axis=1)])
percentage_rows_dropped = (length_na_rows/original_nrow) * 100
print(f"{percentage_rows_dropped}% contained NA fields in measures. Dropping {length_na_rows} rows.")

0.0435334663522583% contained NA fields in measures. Dropping 12 rows.


In [7]:
df_adolescents[df_adolescents[critical_measures].isna().all(axis=1)].head()

Unnamed: 0,Vaccine/Sample,Dose,Geography Type,Geography,Survey Year,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size
2571,HPV,"Up-to-Date, Females",States/Local Areas,U.S. Virgin Islands,2018,Age,13-15 Years,,,
2655,HPV,"Up-to-Date, Males",States/Local Areas,U.S. Virgin Islands,2018,Age,13-15 Years,,,
16628,HPV,"≥1 Dose, Males",HHS Regions/National,United States,2009,Age,13-17 Years,,,
16735,HPV,"≥2 Doses, Males",HHS Regions/National,United States,2009,Age,13-17 Years,,,
16993,HPV,"Up-to-Date, Males",States/Local Areas,Puerto Rico,2018,Age,13-15 Years,,,


In [8]:
critical_measures = ['Estimate (%)', 'Sample Size', '95% CI (%)']
df_adolescents = df_adolescents.dropna(subset=critical_measures, how = 'all')
df_adolescents[df_adolescents['Sample Size'].isna()]

Unnamed: 0,Vaccine/Sample,Dose,Geography Type,Geography,Survey Year,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size


In [9]:
filled_df_nrow = df_adolescents.shape[0]
filled_df_nrow

27553

In [10]:
# data qauality check
# checking that rows dropped + current rows match original rows
if (filled_df_nrow + length_na_rows == original_nrow):
    print("Dropped rows correctly")
else:
    print("Rows dropped incorrectly! Check the code above!")

Dropped rows correctly


In [11]:
def check_rows(df, len_df_adolescent = filled_df_nrow):
    if df.shape[0] != len_df_adolescent:
        print("Rows don't match!")
    else:
        print("Rows match!")

check_rows(df_adolescents, filled_df_nrow)

Rows match!


In [12]:
# splitting the confidence intervals
ci = df_adolescents['95% CI (%)'].astype(str)
df_adolescents[['ci_lower', 'ci_upper']] = ci.str.split(r'\s+to\s+', expand=True)
df_adolescents['ci_lower'] = df_adolescents['ci_lower'].astype(float)
df_adolescents['ci_upper'] = df_adolescents['ci_upper'].astype(float)

In [13]:
df_adolescents['Sample Size'] = pd.to_numeric(df_adolescents['Sample Size'], errors='coerce').astype('Int64')
df_adolescents.head()

Unnamed: 0,Vaccine/Sample,Dose,Geography Type,Geography,Survey Year,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper
0,HPV,"≥1 Dose, Males",States/Local Areas,New York,2023,Age,13-17 Years,81.5,75.2 to 86.5,289,75.2,86.5
1,Tetanus,≥1 Dose Tdap,States/Local Areas,New York,2023,Age,13-17 Years,90.2,86.8 to 92.8,559,86.8,92.8
2,Tetanus,≥1 Dose Td or Tdap,States/Local Areas,New York,2023,Age,13-17 Years,93.6,90.9 to 95.5,559,90.9,95.5
3,≥1 Dose MenACWY,,States/Local Areas,New York,2023,Age,13-17 Years,95.3,92.7 to 97.0,559,92.7,97.0
4,HPV,"≥1 Dose, Males and Females",States/Local Areas,New York,2023,Age,13-17 Years,79.4,74.8 to 83.3,559,74.8,83.3


In [14]:
# fill in missing dimensions

for i, row in df_adolescents.iterrows():
    match = re.search(r'(≥\d+|\d+)', row['Vaccine/Sample'])
    if match:
        df_adolescents.at[i, 'Dose'] = str(match.group()) + ' Dose'
    else:
        pass  # Do nothing if no match

df_adolescents[df_adolescents['Dose'] == '≥1 Dose'].head()

Unnamed: 0,Vaccine/Sample,Dose,Geography Type,Geography,Survey Year,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper
3,≥1 Dose MenACWY,≥1 Dose,States/Local Areas,New York,2023,Age,13-17 Years,95.3,92.7 to 97.0,559,92.7,97.0
16,≥1 Dose MenACWY,≥1 Dose,States/Local Areas,New York,2023,Age,13-15 Years,96.6,94.1 to 98.1,350,94.1,98.1
31,≥1 Dose MenACWY,≥1 Dose,States/Local Areas,New York,2018-2022,Insurance Coverage,Uninsured,82.9,53.5 to 95.4,36,53.5,95.4
34,≥1 Dose MenACWY,≥1 Dose,States/Local Areas,New York,2018-2022,Insurance Coverage,Other,95.7,93.0 to 97.4,365,93.0,97.4
37,≥1 Dose MenACWY,≥1 Dose,States/Local Areas,New York,2018-2022,Insurance Coverage,Any Medicaid,93.9,91.7 to 95.6,897,91.7,95.6


In [15]:
# since the HPV dose has additional male and female dimensions, we can denormalie it further into a new gender dimension
gender = ['Males', 'Females', 'Males and Females']
def extract_gender(dose):
    # Check if there is a comma in the dose string
    if ',' in dose:
        # Extract the portion after the last comma and remove any extra whitespace.
        gender_part = dose.split(',')[-1].strip()
    else:
        gender_part = dose.strip()
        
    # Perform an exact match against our gender list
    if gender_part in gender:
        return gender_part
    else:
        return None
df_adolescents['Gender'] = df_adolescents['Dose'].apply(extract_gender)

In [16]:
df_adolescents.head()

Unnamed: 0,Vaccine/Sample,Dose,Geography Type,Geography,Survey Year,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,Gender
0,HPV,"≥1 Dose, Males",States/Local Areas,New York,2023,Age,13-17 Years,81.5,75.2 to 86.5,289,75.2,86.5,Males
1,Tetanus,≥1 Dose Tdap,States/Local Areas,New York,2023,Age,13-17 Years,90.2,86.8 to 92.8,559,86.8,92.8,
2,Tetanus,≥1 Dose Td or Tdap,States/Local Areas,New York,2023,Age,13-17 Years,93.6,90.9 to 95.5,559,90.9,95.5,
3,≥1 Dose MenACWY,≥1 Dose,States/Local Areas,New York,2023,Age,13-17 Years,95.3,92.7 to 97.0,559,92.7,97.0,
4,HPV,"≥1 Dose, Males and Females",States/Local Areas,New York,2023,Age,13-17 Years,79.4,74.8 to 83.3,559,74.8,83.3,Males and Females


In [17]:
if (filled_df_nrow + length_na_rows == original_nrow):
    print("DF processed correctly!")
else:
    print("Rows dropped incorrectly! Check the code above!")

DF processed correctly!


In [18]:
# now that we have denormalised and added a new gender column, we can clean up the Dose column
df_adolescents['Dose'] = df_adolescents['Dose'].apply(
    lambda dose: dose.split(',')[0].strip() if ',' in dose else dose.strip()
)
df_adolescents['Dose'].unique()

array(['≥1 Dose', '≥1 Dose Tdap', '≥1 Dose Td or Tdap', 'Up-to-Date',
       '≥3 Dose', '≥2 Dose', '≥2 Doses or history of disease',
       '≥2 Doses with no disease history',
       '≥1 Dose with no disease history', 'History of disease',
       '≥2 Doses', '≥3 Doses',
       'Series Completion (3 Dose) Among HPV Vaccination Initiators'],
      dtype=object)

In [19]:
# getting the vaccine dimension table from postgres
import psycopg2 as pg
from psycopg2 import sql
from creds import POSTGRES_USERNAME, POSTGRES_PW
conn = pg.connect(
    dbname='cs689_project',  
    user=POSTGRES_USERNAME,
    password=POSTGRES_PW,
    host='localhost',
    port='5432'
)
cursor = conn.cursor()


In [20]:
cursor.execute("SELECT vaccine_id, cleaned_vaccine, start_date FROM vaccine_dim")
mapping_data = cursor.fetchall()

vaccine_mapping = pd.DataFrame(mapping_data, columns=["vaccine_id", "cleaned_vaccine", "start_date"])
vaccine_mapping

Unnamed: 0,vaccine_id,cleaned_vaccine,start_date
0,1,DTaP,2011-09-01
1,2,Polio,2011-09-01
2,3,Hep B,2011-09-01
3,4,PCV,2011-09-01
4,7,Hib,2011-09-01
5,10,Combined 7 Series,2011-09-01
6,11,Rotavirus,2011-09-01
7,9,Influenza,2011-09-01
8,12,Influenza,2012-09-01
9,13,Influenza,2013-09-01


In [21]:
vaccine_mapping['start_date'] = pd.to_datetime(vaccine_mapping['start_date'])
vaccine_mapping['year'] = vaccine_mapping['start_date'].dt.year
vaccine_mapping

Unnamed: 0,vaccine_id,cleaned_vaccine,start_date,year
0,1,DTaP,2011-09-01,2011
1,2,Polio,2011-09-01,2011
2,3,Hep B,2011-09-01,2011
3,4,PCV,2011-09-01,2011
4,7,Hib,2011-09-01,2011
5,10,Combined 7 Series,2011-09-01,2011
6,11,Rotavirus,2011-09-01,2011
7,9,Influenza,2011-09-01,2011
8,12,Influenza,2012-09-01,2012
9,13,Influenza,2013-09-01,2013


In [22]:
# clean the vaccines first
df_adolescents['cleaned_vaccine'] = df_adolescents['Vaccine/Sample'].str.replace(r'(≥\d+\s*Doses?\s*)', '', regex=True).str.strip()

In [23]:
df_adolescents.loc[df_adolescents['cleaned_vaccine'] == 'Tetanus', 'cleaned_vaccine'] = 'Tdap'


In [24]:
# for influenza, we need to match it to the year
# but there is no influenza vaccine here so match all to 2011
df_adolescents['vaccine_year'] = 2011
df_adolescents.head()

Unnamed: 0,Vaccine/Sample,Dose,Geography Type,Geography,Survey Year,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,Gender,cleaned_vaccine,vaccine_year
0,HPV,≥1 Dose,States/Local Areas,New York,2023,Age,13-17 Years,81.5,75.2 to 86.5,289,75.2,86.5,Males,HPV,2011
1,Tetanus,≥1 Dose Tdap,States/Local Areas,New York,2023,Age,13-17 Years,90.2,86.8 to 92.8,559,86.8,92.8,,Tdap,2011
2,Tetanus,≥1 Dose Td or Tdap,States/Local Areas,New York,2023,Age,13-17 Years,93.6,90.9 to 95.5,559,90.9,95.5,,Tdap,2011
3,≥1 Dose MenACWY,≥1 Dose,States/Local Areas,New York,2023,Age,13-17 Years,95.3,92.7 to 97.0,559,92.7,97.0,,MenACWY,2011
4,HPV,≥1 Dose,States/Local Areas,New York,2023,Age,13-17 Years,79.4,74.8 to 83.3,559,74.8,83.3,Males and Females,HPV,2011


In [25]:
df_adolescents[df_adolescents['vaccine_year'].isna()]
df_adolescents['vaccine_year'] = (pd.to_numeric(df_adolescents['vaccine_year'], errors='coerce').astype('Int64'))

In [26]:
# we merge df_children with the vaccine_dim
df_adolescents = pd.merge(df_adolescents, vaccine_mapping, left_on = ['cleaned_vaccine', 'vaccine_year'], right_on=['cleaned_vaccine', 'year'], how='left')
df_adolescents.head()

Unnamed: 0,Vaccine/Sample,Dose,Geography Type,Geography,Survey Year,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,Gender,cleaned_vaccine,vaccine_year,vaccine_id,start_date,year
0,HPV,≥1 Dose,States/Local Areas,New York,2023,Age,13-17 Years,81.5,75.2 to 86.5,289,75.2,86.5,Males,HPV,2011,25,2011-09-01,2011
1,Tetanus,≥1 Dose Tdap,States/Local Areas,New York,2023,Age,13-17 Years,90.2,86.8 to 92.8,559,86.8,92.8,,Tdap,2011,26,2011-09-01,2011
2,Tetanus,≥1 Dose Td or Tdap,States/Local Areas,New York,2023,Age,13-17 Years,93.6,90.9 to 95.5,559,90.9,95.5,,Tdap,2011,26,2011-09-01,2011
3,≥1 Dose MenACWY,≥1 Dose,States/Local Areas,New York,2023,Age,13-17 Years,95.3,92.7 to 97.0,559,92.7,97.0,,MenACWY,2011,27,2011-09-01,2011
4,HPV,≥1 Dose,States/Local Areas,New York,2023,Age,13-17 Years,79.4,74.8 to 83.3,559,74.8,83.3,Males and Females,HPV,2011,25,2011-09-01,2011


In [27]:
df_adolescents[df_adolescents['vaccine_id'].isna()]

Unnamed: 0,Vaccine/Sample,Dose,Geography Type,Geography,Survey Year,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,Gender,cleaned_vaccine,vaccine_year,vaccine_id,start_date,year


In [28]:
check_rows(df_adolescents, filled_df_nrow)

Rows match!


In [29]:
cursor.execute("SELECT dose, dose_id FROM dose_dim")
mapping_data = cursor.fetchall()

dose_mapping = pd.DataFrame(mapping_data, columns=["dose", "dose_id"])
dose_mapping

Unnamed: 0,dose,dose_id
0,≥3 Doses,1
1,≥2 Doses,2
2,≥1 Dose,3
3,"≥1 Dose, 2 Day",4
4,Full Series,5
5,1 Dose Only,6
6,Primary Series,7
7,≥4 Doses,8
8,7 Dose,9
9,"≥1 Dose, 1 Day",10


In [30]:
# we merge df_children with the dose_dim
df_adolescents = pd.merge(df_adolescents, dose_mapping, left_on = ['Dose'], right_on=['dose'], how='left')
df_adolescents.head()

Unnamed: 0,Vaccine/Sample,Dose,Geography Type,Geography,Survey Year,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,Gender,cleaned_vaccine,vaccine_year,vaccine_id,start_date,year,dose,dose_id
0,HPV,≥1 Dose,States/Local Areas,New York,2023,Age,13-17 Years,81.5,75.2 to 86.5,289,75.2,86.5,Males,HPV,2011,25,2011-09-01,2011,≥1 Dose,3
1,Tetanus,≥1 Dose Tdap,States/Local Areas,New York,2023,Age,13-17 Years,90.2,86.8 to 92.8,559,86.8,92.8,,Tdap,2011,26,2011-09-01,2011,≥1 Dose Tdap,13
2,Tetanus,≥1 Dose Td or Tdap,States/Local Areas,New York,2023,Age,13-17 Years,93.6,90.9 to 95.5,559,90.9,95.5,,Tdap,2011,26,2011-09-01,2011,≥1 Dose Td or Tdap,14
3,≥1 Dose MenACWY,≥1 Dose,States/Local Areas,New York,2023,Age,13-17 Years,95.3,92.7 to 97.0,559,92.7,97.0,,MenACWY,2011,27,2011-09-01,2011,≥1 Dose,3
4,HPV,≥1 Dose,States/Local Areas,New York,2023,Age,13-17 Years,79.4,74.8 to 83.3,559,74.8,83.3,Males and Females,HPV,2011,25,2011-09-01,2011,≥1 Dose,3


In [31]:
# drop original columns
df_adolescents.drop(columns=['Vaccine/Sample', 'cleaned_vaccine', 'vaccine_year', 'Dose', 'dose', 'start_date', 'year'], inplace=True)
check_rows(df_adolescents)
df_adolescents.head()

Rows match!


Unnamed: 0,Geography Type,Geography,Survey Year,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,Gender,vaccine_id,dose_id
0,States/Local Areas,New York,2023,Age,13-17 Years,81.5,75.2 to 86.5,289,75.2,86.5,Males,25,3
1,States/Local Areas,New York,2023,Age,13-17 Years,90.2,86.8 to 92.8,559,86.8,92.8,,26,13
2,States/Local Areas,New York,2023,Age,13-17 Years,93.6,90.9 to 95.5,559,90.9,95.5,,26,14
3,States/Local Areas,New York,2023,Age,13-17 Years,95.3,92.7 to 97.0,559,92.7,97.0,,27,3
4,States/Local Areas,New York,2023,Age,13-17 Years,79.4,74.8 to 83.3,559,74.8,83.3,Males and Females,25,3


In [32]:
cursor.execute("SELECT geography_id, original_geography, original_geography_type FROM geography_dim")
mapping_data = cursor.fetchall()

geography_mapping = pd.DataFrame(mapping_data, columns=["geography_id", "original_geography", "original_geography_type"])
geography_mapping

Unnamed: 0,geography_id,original_geography,original_geography_type
0,66,New York,States/Local Areas
1,46,Maine,States/Local Areas
2,40,Kentucky,States/Local Areas
3,13,North Carolina,States/Local Areas
4,49,Hawaii,States/Local Areas
...,...,...,...
75,39,Florida,States/Local Areas
76,61,New Hampshire,States/Local Areas
77,34,Ohio,States/Local Areas
78,43,Arizona,States/Local Areas


In [33]:
# we merge df_children with the geography_mapping
df_adolescents = pd.merge(df_adolescents, geography_mapping, left_on = ['Geography', 'Geography Type'], right_on=['original_geography', 'original_geography_type'], how='left')
df_adolescents.head()

Unnamed: 0,Geography Type,Geography,Survey Year,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,Gender,vaccine_id,dose_id,geography_id,original_geography,original_geography_type
0,States/Local Areas,New York,2023,Age,13-17 Years,81.5,75.2 to 86.5,289,75.2,86.5,Males,25,3,66,New York,States/Local Areas
1,States/Local Areas,New York,2023,Age,13-17 Years,90.2,86.8 to 92.8,559,86.8,92.8,,26,13,66,New York,States/Local Areas
2,States/Local Areas,New York,2023,Age,13-17 Years,93.6,90.9 to 95.5,559,90.9,95.5,,26,14,66,New York,States/Local Areas
3,States/Local Areas,New York,2023,Age,13-17 Years,95.3,92.7 to 97.0,559,92.7,97.0,,27,3,66,New York,States/Local Areas
4,States/Local Areas,New York,2023,Age,13-17 Years,79.4,74.8 to 83.3,559,74.8,83.3,Males and Females,25,3,66,New York,States/Local Areas


In [34]:
# drop original columns
df_adolescents.drop(columns=['Geography', 'Geography Type', 'original_geography', 'original_geography_type'], inplace=True)
check_rows(df_adolescents)
df_adolescents.head()

Rows match!


Unnamed: 0,Survey Year,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,Gender,vaccine_id,dose_id,geography_id
0,2023,Age,13-17 Years,81.5,75.2 to 86.5,289,75.2,86.5,Males,25,3,66
1,2023,Age,13-17 Years,90.2,86.8 to 92.8,559,86.8,92.8,,26,13,66
2,2023,Age,13-17 Years,93.6,90.9 to 95.5,559,90.9,95.5,,26,14,66
3,2023,Age,13-17 Years,95.3,92.7 to 97.0,559,92.7,97.0,,27,3,66
4,2023,Age,13-17 Years,79.4,74.8 to 83.3,559,74.8,83.3,Males and Females,25,3,66


In [35]:
df_adolescents.head()

Unnamed: 0,Survey Year,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,Gender,vaccine_id,dose_id,geography_id
0,2023,Age,13-17 Years,81.5,75.2 to 86.5,289,75.2,86.5,Males,25,3,66
1,2023,Age,13-17 Years,90.2,86.8 to 92.8,559,86.8,92.8,,26,13,66
2,2023,Age,13-17 Years,93.6,90.9 to 95.5,559,90.9,95.5,,26,14,66
3,2023,Age,13-17 Years,95.3,92.7 to 97.0,559,92.7,97.0,,27,3,66
4,2023,Age,13-17 Years,79.4,74.8 to 83.3,559,74.8,83.3,Males and Females,25,3,66


In [36]:
cursor.execute("SELECT gender, gender_id FROM gender_dim")
mapping_data = cursor.fetchall()

gender_mapping = pd.DataFrame(mapping_data, columns=["gender", "gender_id"])
gender_mapping

Unnamed: 0,gender,gender_id
0,Males,1
1,,2
2,Males and Females,3
3,Females,4


In [37]:
df_adolescents = pd.merge(df_adolescents, gender_mapping, left_on = ['Gender'], right_on=['gender'], how='left')
df_adolescents.head()

Unnamed: 0,Survey Year,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,Gender,vaccine_id,dose_id,geography_id,gender,gender_id
0,2023,Age,13-17 Years,81.5,75.2 to 86.5,289,75.2,86.5,Males,25,3,66,Males,1
1,2023,Age,13-17 Years,90.2,86.8 to 92.8,559,86.8,92.8,,26,13,66,,2
2,2023,Age,13-17 Years,93.6,90.9 to 95.5,559,90.9,95.5,,26,14,66,,2
3,2023,Age,13-17 Years,95.3,92.7 to 97.0,559,92.7,97.0,,27,3,66,,2
4,2023,Age,13-17 Years,79.4,74.8 to 83.3,559,74.8,83.3,Males and Females,25,3,66,Males and Females,3


In [38]:
df_adolescents.drop(columns=['Gender', 'gender'], inplace=True)
check_rows(df_adolescents)
df_adolescents.head()

Rows match!


Unnamed: 0,Survey Year,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id
0,2023,Age,13-17 Years,81.5,75.2 to 86.5,289,75.2,86.5,25,3,66,1
1,2023,Age,13-17 Years,90.2,86.8 to 92.8,559,86.8,92.8,26,13,66,2
2,2023,Age,13-17 Years,93.6,90.9 to 95.5,559,90.9,95.5,26,14,66,2
3,2023,Age,13-17 Years,95.3,92.7 to 97.0,559,92.7,97.0,27,3,66,2
4,2023,Age,13-17 Years,79.4,74.8 to 83.3,559,74.8,83.3,25,3,66,3


In [39]:
# creating transactional snaphot
df_transactional = df_adolescents[df_adolescents['Survey Year'].str.len() == 4]

df_cumulative = df_adolescents[df_adolescents['Survey Year'].str.len() != 4]
print(df_transactional.shape[0] + df_cumulative.shape[0])

27553


In [40]:
df_transactional.head()

Unnamed: 0,Survey Year,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id
0,2023,Age,13-17 Years,81.5,75.2 to 86.5,289,75.2,86.5,25,3,66,1
1,2023,Age,13-17 Years,90.2,86.8 to 92.8,559,86.8,92.8,26,13,66,2
2,2023,Age,13-17 Years,93.6,90.9 to 95.5,559,90.9,95.5,26,14,66,2
3,2023,Age,13-17 Years,95.3,92.7 to 97.0,559,92.7,97.0,27,3,66,2
4,2023,Age,13-17 Years,79.4,74.8 to 83.3,559,74.8,83.3,25,3,66,3


In [41]:
# there is only the Age dimension in the transactional dataset
df_transactional['Dimension Type'].unique()

array(['Age'], dtype=object)

In [42]:
df_transactional['Dimension'].unique()

array(['13-17 Years', '13-15 Years'], dtype=object)

In [43]:
df_transactional.drop(columns=['Dimension Type'], inplace=True)
df_transactional.rename(columns={'Dimension': 'Age'}, inplace=True)
df_transactional.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_transactional.drop(columns=['Dimension Type'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_transactional.rename(columns={'Dimension': 'Age'}, inplace=True)


Unnamed: 0,Survey Year,Age,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id
0,2023,13-17 Years,81.5,75.2 to 86.5,289,75.2,86.5,25,3,66,1
1,2023,13-17 Years,90.2,86.8 to 92.8,559,86.8,92.8,26,13,66,2
2,2023,13-17 Years,93.6,90.9 to 95.5,559,90.9,95.5,26,14,66,2
3,2023,13-17 Years,95.3,92.7 to 97.0,559,92.7,97.0,27,3,66,2
4,2023,13-17 Years,79.4,74.8 to 83.3,559,74.8,83.3,25,3,66,3


In [44]:
# making sure that we don't drop any rows
df_transactional.shape[0] + df_cumulative.shape[0]

27553

In [45]:
df_cumulative.head()

Unnamed: 0,Survey Year,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id
23,2018-2022,Insurance Coverage,Uninsured,88.3,68.1 to 96.4,36,68.1,96.4,26,13,66,2
24,2018-2022,Insurance Coverage,Other,91.0,87.0 to 93.8,365,87.0,93.8,26,13,66,2
25,2018-2022,Insurance Coverage,Any Medicaid,89.2,86.4 to 91.4,897,86.4,91.4,26,13,66,2
26,2018-2022,Insurance Coverage,Private Insurance Only,92.6,90.8 to 94.0,1581,90.8,94.0,26,13,66,2
27,2018-2022,Insurance Coverage,Uninsured,40.1,23.3 to 59.6,36,23.3,59.6,25,15,66,3


In [46]:
# denormalise the birth year/cohort as it may be useful for future analysis
for _, row in df_cumulative.iterrows():
    start_year = row['Survey Year'][:4]
    end_year = row['Survey Year'][-4:]
    df_cumulative.loc[_, 'start_cohort_year'] = start_year
    df_cumulative.loc[_, 'end_cohort_year'] = end_year
df_cumulative

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cumulative.loc[_, 'start_cohort_year'] = start_year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cumulative.loc[_, 'end_cohort_year'] = end_year


Unnamed: 0,Survey Year,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id,start_cohort_year,end_cohort_year
23,2018-2022,Insurance Coverage,Uninsured,88.3,68.1 to 96.4,36,68.1,96.4,26,13,66,2,2018,2022
24,2018-2022,Insurance Coverage,Other,91.0,87.0 to 93.8,365,87.0,93.8,26,13,66,2,2018,2022
25,2018-2022,Insurance Coverage,Any Medicaid,89.2,86.4 to 91.4,897,86.4,91.4,26,13,66,2,2018,2022
26,2018-2022,Insurance Coverage,Private Insurance Only,92.6,90.8 to 94.0,1581,90.8,94.0,26,13,66,2,2018,2022
27,2018-2022,Insurance Coverage,Uninsured,40.1,23.3 to 59.6,36,23.3,59.6,25,15,66,3,2018,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27497,2018-2022,Insurance Coverage,Any Medicaid,88.0,84.1 to 91.1,487,84.1,91.1,26,13,78,2,2018,2022
27498,2018-2022,Insurance Coverage,Uninsured,83.7,71.4 to 91.3,78,71.4,91.3,26,13,78,2,2018,2022
27499,2018-2022,Insurance Coverage,Other,84.2,76.1 to 89.9,211,76.1,89.9,26,13,78,2,2018,2022
27548,2018-2022,Overall,Overall,85.9,83.5 to 88.1,1454,83.5,88.1,26,13,78,2,2018,2022


In [47]:
df_cumulative.drop(columns=['Survey Year'], inplace = True)
df_cumulative = df_cumulative.reset_index(drop=True)
df_cumulative['cumulative_id'] = df_cumulative.index + 1
print(df_cumulative.shape[0])
display(df_cumulative)

3238


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cumulative.drop(columns=['Survey Year'], inplace = True)


Unnamed: 0,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id,start_cohort_year,end_cohort_year,cumulative_id
0,Insurance Coverage,Uninsured,88.3,68.1 to 96.4,36,68.1,96.4,26,13,66,2,2018,2022,1
1,Insurance Coverage,Other,91.0,87.0 to 93.8,365,87.0,93.8,26,13,66,2,2018,2022,2
2,Insurance Coverage,Any Medicaid,89.2,86.4 to 91.4,897,86.4,91.4,26,13,66,2,2018,2022,3
3,Insurance Coverage,Private Insurance Only,92.6,90.8 to 94.0,1581,90.8,94.0,26,13,66,2,2018,2022,4
4,Insurance Coverage,Uninsured,40.1,23.3 to 59.6,36,23.3,59.6,25,15,66,3,2018,2022,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3233,Insurance Coverage,Any Medicaid,88.0,84.1 to 91.1,487,84.1,91.1,26,13,78,2,2018,2022,3234
3234,Insurance Coverage,Uninsured,83.7,71.4 to 91.3,78,71.4,91.3,26,13,78,2,2018,2022,3235
3235,Insurance Coverage,Other,84.2,76.1 to 89.9,211,76.1,89.9,26,13,78,2,2018,2022,3236
3236,Overall,Overall,85.9,83.5 to 88.1,1454,83.5,88.1,26,13,78,2,2018,2022,3237


In [48]:
# sanity check: ensure that the bug was fixed correctly
# since the survey year is only from 2018-2022, its fine here
df_cumulative['start_cohort_year'].unique()

array(['2018'], dtype=object)

In [49]:
# making sure that we don't drop any rows
df_transactional.shape[0] + df_cumulative.shape[0]

27553

In [50]:
df_cumulative['Dimension Type'].unique()

array(['Insurance Coverage', 'Poverty', 'Race and Ethnicity',
       'Urbanicity', 'Overall'], dtype=object)

In [51]:
def check_dimension_rows(df, nrow):
    if df.shape[0] == nrow:
        print('Rows matched correctly')
    else:
        print("Rows don't match! Check code above!")

In [52]:
# handle insurance
df_cumulative_insurance = df_cumulative[df_cumulative['Dimension Type'] == 'Insurance Coverage']
df_cumulative_insurance.head()

Unnamed: 0,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id,start_cohort_year,end_cohort_year,cumulative_id
0,Insurance Coverage,Uninsured,88.3,68.1 to 96.4,36,68.1,96.4,26,13,66,2,2018,2022,1
1,Insurance Coverage,Other,91.0,87.0 to 93.8,365,87.0,93.8,26,13,66,2,2018,2022,2
2,Insurance Coverage,Any Medicaid,89.2,86.4 to 91.4,897,86.4,91.4,26,13,66,2,2018,2022,3
3,Insurance Coverage,Private Insurance Only,92.6,90.8 to 94.0,1581,90.8,94.0,26,13,66,2,2018,2022,4
4,Insurance Coverage,Uninsured,40.1,23.3 to 59.6,36,23.3,59.6,25,15,66,3,2018,2022,5


In [53]:
nrow_insurance = df_cumulative_insurance.shape[0]

In [54]:
cursor.execute("SELECT insurance_coverage, insurance_id FROM insurance_dim")
mapping_data = cursor.fetchall()

insurance_mapping = pd.DataFrame(mapping_data, columns=["insurance_coverage", "insurance_id"])
insurance_mapping

Unnamed: 0,insurance_coverage,insurance_id
0,Uninsured,4
1,Other,2
2,Any Medicaid,3
3,Private Insurance Only,1


In [55]:
# we merge df_children with the insurance_mapping
df_cumulative_insurance = pd.merge(df_cumulative_insurance, insurance_mapping, left_on = ['Dimension'], right_on=['insurance_coverage'], how='left')
df_cumulative_insurance.head()

Unnamed: 0,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id,start_cohort_year,end_cohort_year,cumulative_id,insurance_coverage,insurance_id
0,Insurance Coverage,Uninsured,88.3,68.1 to 96.4,36,68.1,96.4,26,13,66,2,2018,2022,1,Uninsured,4
1,Insurance Coverage,Other,91.0,87.0 to 93.8,365,87.0,93.8,26,13,66,2,2018,2022,2,Other,2
2,Insurance Coverage,Any Medicaid,89.2,86.4 to 91.4,897,86.4,91.4,26,13,66,2,2018,2022,3,Any Medicaid,3
3,Insurance Coverage,Private Insurance Only,92.6,90.8 to 94.0,1581,90.8,94.0,26,13,66,2,2018,2022,4,Private Insurance Only,1
4,Insurance Coverage,Uninsured,40.1,23.3 to 59.6,36,23.3,59.6,25,15,66,3,2018,2022,5,Uninsured,4


In [56]:
df_cumulative_insurance.drop(columns=['Dimension Type', 'Dimension', 'insurance_coverage'], inplace=True)
print(df_cumulative_insurance.shape[0])
df_cumulative_insurance.head()

906


Unnamed: 0,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id,start_cohort_year,end_cohort_year,cumulative_id,insurance_id
0,88.3,68.1 to 96.4,36,68.1,96.4,26,13,66,2,2018,2022,1,4
1,91.0,87.0 to 93.8,365,87.0,93.8,26,13,66,2,2018,2022,2,2
2,89.2,86.4 to 91.4,897,86.4,91.4,26,13,66,2,2018,2022,3,3
3,92.6,90.8 to 94.0,1581,90.8,94.0,26,13,66,2,2018,2022,4,1
4,40.1,23.3 to 59.6,36,23.3,59.6,25,15,66,3,2018,2022,5,4


In [57]:
check_dimension_rows(df_cumulative_insurance, nrow_insurance)

Rows matched correctly


In [58]:
# handle poverty next
df_cumulative_poverty = df_cumulative[df_cumulative['Dimension Type'] == 'Poverty']
print(df_cumulative_poverty.shape[0])
nrow_poverty = df_cumulative_poverty.shape[0]
df_cumulative_poverty.head()

488


Unnamed: 0,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id,start_cohort_year,end_cohort_year,cumulative_id
16,Poverty,Below Poverty Level,88.8,85.3 to 91.6,533,85.3,91.6,26,13,66,2,2018,2022,17
17,Poverty,Living At or Above Poverty Level,91.9,90.4 to 93.2,2180,90.4,93.2,26,13,66,2,2018,2022,18
18,Poverty,Living At or Above Poverty Level,63.0,60.3 to 65.6,2180,60.3,65.6,25,15,66,3,2018,2022,19
19,Poverty,Below Poverty Level,67.8,62.4 to 72.7,533,62.4,72.7,25,15,66,3,2018,2022,20
20,Poverty,Living At or Above Poverty Level,95.4,94.1 to 96.3,2180,94.1,96.3,27,3,66,2,2018,2022,21


In [59]:
cursor.execute("SELECT poverty_status, poverty_id FROM poverty_dim")
mapping_data = cursor.fetchall()

poverty_mapping = pd.DataFrame(mapping_data, columns=["poverty_status", "poverty_id"])
poverty_mapping

Unnamed: 0,poverty_status,poverty_id
0,<133% FPL,1
1,133% to <400% FPL,2
2,>400% FPL,3
3,Below Poverty Level,4
4,Living At or Above Poverty Level,5


In [60]:
# we merge df_children with the poverty_mapping
df_cumulative_poverty = pd.merge(df_cumulative_poverty, poverty_mapping, left_on = ['Dimension'], right_on=['poverty_status'], how='left')
df_cumulative_poverty.head()

Unnamed: 0,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id,start_cohort_year,end_cohort_year,cumulative_id,poverty_status,poverty_id
0,Poverty,Below Poverty Level,88.8,85.3 to 91.6,533,85.3,91.6,26,13,66,2,2018,2022,17,Below Poverty Level,4
1,Poverty,Living At or Above Poverty Level,91.9,90.4 to 93.2,2180,90.4,93.2,26,13,66,2,2018,2022,18,Living At or Above Poverty Level,5
2,Poverty,Living At or Above Poverty Level,63.0,60.3 to 65.6,2180,60.3,65.6,25,15,66,3,2018,2022,19,Living At or Above Poverty Level,5
3,Poverty,Below Poverty Level,67.8,62.4 to 72.7,533,62.4,72.7,25,15,66,3,2018,2022,20,Below Poverty Level,4
4,Poverty,Living At or Above Poverty Level,95.4,94.1 to 96.3,2180,94.1,96.3,27,3,66,2,2018,2022,21,Living At or Above Poverty Level,5


In [61]:
df_cumulative_poverty.drop(columns=['Dimension Type', 'Dimension', 'poverty_status'], inplace=True)
df_cumulative_poverty.head()

Unnamed: 0,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id,start_cohort_year,end_cohort_year,cumulative_id,poverty_id
0,88.8,85.3 to 91.6,533,85.3,91.6,26,13,66,2,2018,2022,17,4
1,91.9,90.4 to 93.2,2180,90.4,93.2,26,13,66,2,2018,2022,18,5
2,63.0,60.3 to 65.6,2180,60.3,65.6,25,15,66,3,2018,2022,19,5
3,67.8,62.4 to 72.7,533,62.4,72.7,25,15,66,3,2018,2022,20,4
4,95.4,94.1 to 96.3,2180,94.1,96.3,27,3,66,2,2018,2022,21,5


In [62]:
check_dimension_rows(df_cumulative_poverty, nrow_poverty)

Rows matched correctly


In [63]:
# handle race and ethnicity
df_cumulative_race = df_cumulative[df_cumulative['Dimension Type'] == 'Race and Ethnicity']
nrow_race = df_cumulative_race.shape[0]
df_cumulative_race.head()

Unnamed: 0,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id,start_cohort_year,end_cohort_year,cumulative_id
24,Race and Ethnicity,"White, Non-Hispanic",70.7,67.4 to 73.8,1333,67.4,73.8,25,3,66,3,2018,2022,25
25,Race and Ethnicity,"Black, Non-Hispanic",70.4,63.7 to 76.4,322,63.7,76.4,25,3,66,3,2018,2022,26
26,Race and Ethnicity,"Other or Multiple Races, Non-Hispanic",89.6,85.0 to 92.9,435,85.0,92.9,26,13,66,2,2018,2022,27
27,Race and Ethnicity,Hispanic,89.7,86.8 to 92.0,789,86.8,92.0,26,13,66,2,2018,2022,28
28,Race and Ethnicity,"Black, Non-Hispanic",89.7,85.4 to 92.8,322,85.4,92.8,26,13,66,2,2018,2022,29


In [64]:
cursor.execute("SELECT race_ethnicity, race_ethnicity_id FROM race_ethnicity_dim")
mapping_data = cursor.fetchall()

race_ethnicity_mapping = pd.DataFrame(mapping_data, columns=["race_ethnicity", "race_ethnicity_id"])
race_ethnicity_mapping

Unnamed: 0,race_ethnicity,race_ethnicity_id
0,"American Indian or Alaska Native, Non-Hispanic",5
1,"Asian, Non-Hispanic",6
2,"White, Non-Hispanic",2
3,Hispanic,3
4,"Other or Multiple Races, Non-Hispanic",1
5,"Black, Non-Hispanic",4


In [65]:
df_cumulative_race = pd.merge(df_cumulative_race, race_ethnicity_mapping, left_on = ['Dimension'], right_on=['race_ethnicity'], how='left')
df_cumulative_race.head()

Unnamed: 0,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id,start_cohort_year,end_cohort_year,cumulative_id,race_ethnicity,race_ethnicity_id
0,Race and Ethnicity,"White, Non-Hispanic",70.7,67.4 to 73.8,1333,67.4,73.8,25,3,66,3,2018,2022,25,"White, Non-Hispanic",2
1,Race and Ethnicity,"Black, Non-Hispanic",70.4,63.7 to 76.4,322,63.7,76.4,25,3,66,3,2018,2022,26,"Black, Non-Hispanic",4
2,Race and Ethnicity,"Other or Multiple Races, Non-Hispanic",89.6,85.0 to 92.9,435,85.0,92.9,26,13,66,2,2018,2022,27,"Other or Multiple Races, Non-Hispanic",1
3,Race and Ethnicity,Hispanic,89.7,86.8 to 92.0,789,86.8,92.0,26,13,66,2,2018,2022,28,Hispanic,3
4,Race and Ethnicity,"Black, Non-Hispanic",89.7,85.4 to 92.8,322,85.4,92.8,26,13,66,2,2018,2022,29,"Black, Non-Hispanic",4


In [66]:
df_cumulative_race.drop(columns=['Dimension Type', 'Dimension', 'race_ethnicity'], inplace=True)
df_cumulative_race.head()

Unnamed: 0,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id,start_cohort_year,end_cohort_year,cumulative_id,race_ethnicity_id
0,70.7,67.4 to 73.8,1333,67.4,73.8,25,3,66,3,2018,2022,25,2
1,70.4,63.7 to 76.4,322,63.7,76.4,25,3,66,3,2018,2022,26,4
2,89.6,85.0 to 92.9,435,85.0,92.9,26,13,66,2,2018,2022,27,1
3,89.7,86.8 to 92.0,789,86.8,92.0,26,13,66,2,2018,2022,28,3
4,89.7,85.4 to 92.8,322,85.4,92.8,26,13,66,2,2018,2022,29,4


In [67]:
check_dimension_rows(df_cumulative_race, nrow_race)

Rows matched correctly


In [68]:
# handle urbanicity
df_cumulative_urban = df_cumulative[df_cumulative['Dimension Type'] == 'Urbanicity']
nrow_ubanicity = df_cumulative_urban.shape[0]
df_cumulative_urban.head()

Unnamed: 0,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id,start_cohort_year,end_cohort_year,cumulative_id
38,Urbanicity,Living In a MSA Principal City,89.9,88.0 to 91.5,1728,88.0,91.5,26,13,66,2,2018,2022,39
39,Urbanicity,Living In a MSA Non-Principal City,92.7,90.4 to 94.4,1005,90.4,94.4,26,13,66,2,2018,2022,40
42,Urbanicity,Living In a MSA Principal City,94.7,93.2 to 95.8,1728,93.2,95.8,27,3,66,2,2018,2022,43
43,Urbanicity,Living In a Non-MSA,87.1,77.7 to 92.9,146,77.7,92.9,27,3,66,2,2018,2022,44
44,Urbanicity,Living In a MSA Non-Principal City,96.0,94.2 to 97.2,1005,94.2,97.2,27,3,66,2,2018,2022,45


In [69]:
cursor.execute("SELECT urbanicity, urbanicity_id FROM urbanicity_dim")
mapping_data = cursor.fetchall()

urbanicity_mapping = pd.DataFrame(mapping_data, columns=["urbanicity", "urbanicity_id"])
urbanicity_mapping

Unnamed: 0,urbanicity,urbanicity_id
0,Living In a MSA Principal City,1
1,Living In a MSA Non-Principal City,2
2,Living In a Non-MSA,3


In [70]:
df_cumulative_urban = pd.merge(df_cumulative_urban, urbanicity_mapping, left_on = ['Dimension'], right_on=['urbanicity'], how='left')
df_cumulative_urban.head()

Unnamed: 0,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id,start_cohort_year,end_cohort_year,cumulative_id,urbanicity,urbanicity_id
0,Urbanicity,Living In a MSA Principal City,89.9,88.0 to 91.5,1728,88.0,91.5,26,13,66,2,2018,2022,39,Living In a MSA Principal City,1
1,Urbanicity,Living In a MSA Non-Principal City,92.7,90.4 to 94.4,1005,90.4,94.4,26,13,66,2,2018,2022,40,Living In a MSA Non-Principal City,2
2,Urbanicity,Living In a MSA Principal City,94.7,93.2 to 95.8,1728,93.2,95.8,27,3,66,2,2018,2022,43,Living In a MSA Principal City,1
3,Urbanicity,Living In a Non-MSA,87.1,77.7 to 92.9,146,77.7,92.9,27,3,66,2,2018,2022,44,Living In a Non-MSA,3
4,Urbanicity,Living In a MSA Non-Principal City,96.0,94.2 to 97.2,1005,94.2,97.2,27,3,66,2,2018,2022,45,Living In a MSA Non-Principal City,2


In [71]:
df_cumulative_urban.drop(columns=['Dimension Type', 'Dimension', 'urbanicity'], inplace=True)
df_cumulative_urban.head()

Unnamed: 0,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id,start_cohort_year,end_cohort_year,cumulative_id,urbanicity_id
0,89.9,88.0 to 91.5,1728,88.0,91.5,26,13,66,2,2018,2022,39,1
1,92.7,90.4 to 94.4,1005,90.4,94.4,26,13,66,2,2018,2022,40,2
2,94.7,93.2 to 95.8,1728,93.2,95.8,27,3,66,2,2018,2022,43,1
3,87.1,77.7 to 92.9,146,77.7,92.9,27,3,66,2,2018,2022,44,3
4,96.0,94.2 to 97.2,1005,94.2,97.2,27,3,66,2,2018,2022,45,2


In [72]:
check_dimension_rows(df_cumulative_urban, nrow_ubanicity)

Rows matched correctly


In [73]:
# handle overall
df_cumulative_overall = df_cumulative[df_cumulative['Dimension Type'] == 'Overall']
nrow_overall = df_cumulative_overall.shape[0]
df_cumulative_overall.head()

Unnamed: 0,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id,start_cohort_year,end_cohort_year,cumulative_id
52,Overall,Overall,63.5,61.2 to 65.8,2879,61.2,65.8,25,15,66,3,2018,2022,53
53,Overall,Overall,75.0,72.8 to 77.1,2879,72.8,77.1,25,3,66,3,2018,2022,54
54,Overall,Overall,94.8,93.7 to 95.7,2879,93.7,95.7,27,3,66,2,2018,2022,55
55,Overall,Overall,91.2,89.8 to 92.4,2879,89.8,92.4,26,13,66,2,2018,2022,56
89,Overall,Overall,33.3,30.2 to 36.5,1372,30.2,36.5,25,15,53,3,2018,2022,90


In [74]:
df_cumulative_overall.rename(columns={'Dimension': 'Overall'}, inplace=True)
df_cumulative_overall.drop(columns=['Dimension Type'], inplace=True)
print(df_cumulative_overall.shape[0])

244


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cumulative_overall.rename(columns={'Dimension': 'Overall'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cumulative_overall.drop(columns=['Dimension Type'], inplace=True)


In [75]:
check_dimension_rows(df_cumulative_overall, nrow_overall)

Rows matched correctly


In [76]:
df_cumulative_dimension = pd.concat([df_cumulative_insurance, df_cumulative_poverty, df_cumulative_race, df_cumulative_urban, df_cumulative_overall])
df_cumulative_dimension.head()

Unnamed: 0,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id,start_cohort_year,end_cohort_year,cumulative_id,insurance_id,poverty_id,race_ethnicity_id,urbanicity_id,Overall
0,88.3,68.1 to 96.4,36,68.1,96.4,26,13,66,2,2018,2022,1,4.0,,,,
1,91.0,87.0 to 93.8,365,87.0,93.8,26,13,66,2,2018,2022,2,2.0,,,,
2,89.2,86.4 to 91.4,897,86.4,91.4,26,13,66,2,2018,2022,3,3.0,,,,
3,92.6,90.8 to 94.0,1581,90.8,94.0,26,13,66,2,2018,2022,4,1.0,,,,
4,40.1,23.3 to 59.6,36,23.3,59.6,25,15,66,3,2018,2022,5,4.0,,,,


In [77]:
check_dimension_rows(df_cumulative_dimension, (nrow_insurance+nrow_overall+nrow_poverty+nrow_race+nrow_ubanicity))

Rows matched correctly


In [78]:
if df_cumulative_dimension.shape[0] + df_transactional.shape[0] == filled_df_nrow:
    print("Rows still match!")
else:
    print("Rows don't match! Check code!")

Rows still match!


In [79]:
df_transactional.head()

Unnamed: 0,Survey Year,Age,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id
0,2023,13-17 Years,81.5,75.2 to 86.5,289,75.2,86.5,25,3,66,1
1,2023,13-17 Years,90.2,86.8 to 92.8,559,86.8,92.8,26,13,66,2
2,2023,13-17 Years,93.6,90.9 to 95.5,559,90.9,95.5,26,14,66,2
3,2023,13-17 Years,95.3,92.7 to 97.0,559,92.7,97.0,27,3,66,2
4,2023,13-17 Years,79.4,74.8 to 83.3,559,74.8,83.3,25,3,66,3


### Loading datasets into fact tables

In [80]:
df_transactional.columns

Index(['Survey Year', 'Age', 'Estimate (%)', '95% CI (%)', 'Sample Size',
       'ci_lower', 'ci_upper', 'vaccine_id', 'dose_id', 'geography_id',
       'gender_id'],
      dtype='object')

In [82]:
df_transactional.head()

Unnamed: 0,Survey Year,Age,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id
0,2023,13-17 Years,81.5,75.2 to 86.5,289,75.2,86.5,25,3,66,1
1,2023,13-17 Years,90.2,86.8 to 92.8,559,86.8,92.8,26,13,66,2
2,2023,13-17 Years,93.6,90.9 to 95.5,559,90.9,95.5,26,14,66,2
3,2023,13-17 Years,95.3,92.7 to 97.0,559,92.7,97.0,27,3,66,2
4,2023,13-17 Years,79.4,74.8 to 83.3,559,74.8,83.3,25,3,66,3


In [135]:
# loading transaction fact table
try:
    insert_query='''
    INSERT INTO vaccine_transaction_fact(
        vaccine_transaction_survey_year,
        vaccine_transaction_age,
        vaccine_transaction_estimate_pct,
        vaccine_transaction_ci_lower,
        vaccine_transaction_ci_upper,
        vaccine_transaction_sample_size,
        vaccine_transaction_gender_id,
        vaccine_transaction_vaccine_id,
        vaccine_transaction_dose_id,
        vaccine_transaction_geography_id
    )
    VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    '''
    for index, row in df_transactional.iterrows():
        vaccine_id = int(row['vaccine_id']) if pd.notnull(row['vaccine_id']) else None
        dose_id = int(row['dose_id']) if pd.notnull(row['dose_id']) else None
        geography_id = int(row['geography_id']) if pd.notnull(row['geography_id']) else None
        gender_id = int(row['gender_id']) if pd.notnull(row['gender_id']) else None
        
        cursor.execute(insert_query,(
        row['Survey Year'],
        row['Age'],
        row['Estimate (%)'],
        row['ci_lower'],
        row['ci_upper'],
        row['Sample Size'],
        gender_id,
        vaccine_id,
        dose_id,
        geography_id))
    conn.commit()
    print(f"{len(df_transactional)} records inserted into vaccine_transaction_fact.")

except Exception as e:
    print(f"Error occurred: {e}")
    traceback.print_exc(file=sys.stdout)
    conn.rollback()



24315 records inserted into vaccine_transaction_fact.


In [81]:
df_cumulative_dimension.head()

Unnamed: 0,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id,start_cohort_year,end_cohort_year,cumulative_id,insurance_id,poverty_id,race_ethnicity_id,urbanicity_id,Overall
0,88.3,68.1 to 96.4,36,68.1,96.4,26,13,66,2,2018,2022,1,4.0,,,,
1,91.0,87.0 to 93.8,365,87.0,93.8,26,13,66,2,2018,2022,2,2.0,,,,
2,89.2,86.4 to 91.4,897,86.4,91.4,26,13,66,2,2018,2022,3,3.0,,,,
3,92.6,90.8 to 94.0,1581,90.8,94.0,26,13,66,2,2018,2022,4,1.0,,,,
4,40.1,23.3 to 59.6,36,23.3,59.6,25,15,66,3,2018,2022,5,4.0,,,,


In [130]:
df_cumulative_dimension.columns

Index(['Estimate (%)', '95% CI (%)', 'Sample Size', 'ci_lower', 'ci_upper',
       'vaccine_id', 'dose_id', 'geography_id', 'gender_id',
       'start_cohort_year', 'end_cohort_year', 'cumulative_id', 'insurance_id',
       'poverty_id', 'race_ethnicity_id', 'urbanicity_id', 'Overall'],
      dtype='object')

In [131]:
df_cumulative_overall

Unnamed: 0,Overall,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,gender_id,start_cohort_year,end_cohort_year,cumulative_id
52,Overall,63.5,61.2 to 65.8,2879,61.2,65.8,25,15,66,3,2018,2022,53
53,Overall,75.0,72.8 to 77.1,2879,72.8,77.1,25,3,66,3,2018,2022,54
54,Overall,94.8,93.7 to 95.7,2879,93.7,95.7,27,3,66,2,2018,2022,55
55,Overall,91.2,89.8 to 92.4,2879,89.8,92.4,26,13,66,2,2018,2022,56
89,Overall,33.3,30.2 to 36.5,1372,30.2,36.5,25,15,53,3,2018,2022,90
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3185,Overall,91.6,90.0 to 92.9,2207,90.0,92.9,26,13,21,2,2018,2022,3186
3186,Overall,72.4,69.4 to 75.2,1454,69.4,75.2,25,3,78,3,2018,2022,3187
3187,Overall,56.0,52.8 to 59.2,1454,52.8,59.2,25,15,78,3,2018,2022,3188
3236,Overall,85.9,83.5 to 88.1,1454,83.5,88.1,26,13,78,2,2018,2022,3237


In [132]:
df_cumulative_dimension.columns

Index(['Estimate (%)', '95% CI (%)', 'Sample Size', 'ci_lower', 'ci_upper',
       'vaccine_id', 'dose_id', 'geography_id', 'gender_id',
       'start_cohort_year', 'end_cohort_year', 'cumulative_id', 'insurance_id',
       'poverty_id', 'race_ethnicity_id', 'urbanicity_id', 'Overall'],
      dtype='object')

In [133]:
# loading transaction fact table
try:
    insert_query='''
    INSERT INTO vaccine_cumulative_fact(
        vaccine_cumulative_start_cohort_year,
        vaccine_cumulative_end_cohort_year,
        vaccine_cumulative_estimate_pct,
        vaccine_cumulative_sample_size,
        vaccine_cumulative_ci_lower,
        vaccine_cumulative_ci_upper,
        vaccine_cumulative_vaccine_id,
        vaccine_cumulative_dose_id,
        vaccine_cumulative_geography_id,
        vaccine_cumulative_insurance_id,
        vaccine_cumulative_poverty_id,
        vaccine_cumulative_race_ethnicity_id,
        vaccine_cumulative_urbanicity_id,
        vaccine_cumulative_gender_id,
        vaccine_cumulative_overall
    )
    VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    '''
    for index, row in df_cumulative_dimension.iterrows():
        start_cohort_year = int(row['start_cohort_year']) if pd.notnull(row['start_cohort_year']) else None
        end_cohort_year = int(row['end_cohort_year']) if pd.notnull(row['end_cohort_year']) else None
        vaccine_id = int(row['vaccine_id']) if pd.notnull(row['vaccine_id']) else None
        dose_id = int(row['dose_id']) if pd.notnull(row['dose_id']) else None
        geography_id = int(row['geography_id']) if pd.notnull(row['geography_id']) else None
        poverty_id = int(row['poverty_id']) if pd.notnull(row['poverty_id']) else None
        race_ethnicity_id = int(row['race_ethnicity_id']) if pd.notnull(row['race_ethnicity_id']) else None
        urbanicity_id = int(row['urbanicity_id']) if pd.notnull(row['urbanicity_id']) else None
        insurance_id = int(row['insurance_id']) if pd.notnull(row['insurance_id']) else None
        gender_id = int(row['gender_id']) if pd.notnull(row['gender_id']) else None
        sample_size = int(row['Sample Size']) if pd.notnull(row['Sample Size']) else None

        cursor.execute(insert_query,(
        start_cohort_year,
        end_cohort_year,
        row['Estimate (%)'],
        sample_size,
        row['ci_lower'],
        row['ci_upper'],
        vaccine_id,
        dose_id,
        geography_id,
        insurance_id,
        poverty_id,
        race_ethnicity_id,
        urbanicity_id,
        gender_id,
        row['Overall']))
    conn.commit()
    print(f"{len(df_cumulative_dimension)} records inserted into vaccine_cumulative_fact.")

except Exception as e:
    print(f"Error occurred: {e}")
    traceback.print_exc(file=sys.stdout)
    conn.rollback()


3238 records inserted into vaccine_cumulative_fact.
