# Loading Children dataset into DB

In [89]:
import pandas as pd
import numpy as np
import re
import traceback
import sys

In [90]:
df_children = df_children = pd.read_csv('dataset/Vaccination_Coverage_among_Young_Children.csv')
df_children.head()

Unnamed: 0,Vaccine,Dose,Geography Type,Geography,Birth Year/Birth Cohort,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size
0,DTaP,≥3 Doses,States/Local Areas,North Dakota,2019,Age,19 Months,93.5,88.0 to 96.6,263.0
1,DTaP,≥3 Doses,States/Local Areas,North Dakota,2018,Age,19 Months,95.2,91.0 to 97.5,293.0
2,DTaP,≥3 Doses,States/Local Areas,North Dakota,2018-2019,Age,19 Months,91.8,88.3 to 94.3,556.0
3,Polio,≥3 Doses,States/Local Areas,North Dakota,2021,Age,19 Months,89.4,81.9 to 94.1,143.0
4,Polio,≥2 Doses,States/Local Areas,North Dakota,2021,Age,5 Months,79.3,69.0 to 86.8,143.0


In [91]:
original_nrow = df_children.shape[0]
original_nrow

128188

Dropping the tables with empty values for critical measures. These don't provide useful insights in the analysis.

In [92]:
critical_measures = ['Estimate (%)', 'Sample Size', '95% CI (%)']
length_na_rows = len(df_children[df_children[critical_measures].isna().all(axis=1)])
percentage_rows_dropped = (length_na_rows/original_nrow) * 100
print(f"{percentage_rows_dropped}% contained NA fields in measures. Dropping {length_na_rows} rows.")

0.045246044871594844% contained NA fields in measures. Dropping 58 rows.


In [93]:
df_children[df_children[critical_measures].isna().all(axis=1)].head()

Unnamed: 0,Vaccine,Dose,Geography Type,Geography,Birth Year/Birth Cohort,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size
49614,Rotavirus,,States/Local Areas,Puerto Rico,2011,Age,8 Months,,,
49615,Rotavirus,,States/Local Areas,Puerto Rico,2012,Age,8 Months,,,
49616,Rotavirus,,States/Local Areas,Puerto Rico,2013,Age,8 Months,,,
49617,Rotavirus,,States/Local Areas,Puerto Rico,2013-2014,Age,8 Months,,,
49618,Rotavirus,,States/Local Areas,Puerto Rico,2014,Age,8 Months,,,


In [94]:
critical_measures = ['Estimate (%)', 'Sample Size', '95% CI (%)']
df_children = df_children.dropna(subset=critical_measures, how = 'all')
df_children[df_children['Sample Size'].isna()]

Unnamed: 0,Vaccine,Dose,Geography Type,Geography,Birth Year/Birth Cohort,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size


In [95]:
filled_df_nrow = df_children.shape[0]
filled_df_nrow

128130

Data Quality check: Check that the number of rows dropped and the number of remaining rows match the original dataset

In [96]:
# data qauality check
# checking that rows dropped + current rows match original rows
if (filled_df_nrow + length_na_rows == original_nrow):
    print("Dropped rows correctly")
else:
    print("Rows dropped incorrectly! Check the code above!")

Dropped rows correctly


In [97]:
df_children['Sample Size'] = pd.to_numeric(df_children['Sample Size'], errors='coerce').astype('Int64')
df_children.head()

Unnamed: 0,Vaccine,Dose,Geography Type,Geography,Birth Year/Birth Cohort,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size
0,DTaP,≥3 Doses,States/Local Areas,North Dakota,2019,Age,19 Months,93.5,88.0 to 96.6,263
1,DTaP,≥3 Doses,States/Local Areas,North Dakota,2018,Age,19 Months,95.2,91.0 to 97.5,293
2,DTaP,≥3 Doses,States/Local Areas,North Dakota,2018-2019,Age,19 Months,91.8,88.3 to 94.3,556
3,Polio,≥3 Doses,States/Local Areas,North Dakota,2021,Age,19 Months,89.4,81.9 to 94.1,143
4,Polio,≥2 Doses,States/Local Areas,North Dakota,2021,Age,5 Months,79.3,69.0 to 86.8,143


In [98]:
# splitting the confidence intervals
ci = df_children['95% CI (%)'].astype(str)
df_children[['ci_lower', 'ci_upper']] = ci.str.split(r'\s+to\s+', expand=True)
df_children['ci_lower'] = df_children['ci_lower'].astype(float)
df_children['ci_upper'] = df_children['ci_upper'].astype(float)

In [99]:
# fill in missing dimensions

for i, row in df_children.iterrows():
    match = re.search(r'(≥\d+|\d+)', row['Vaccine'])
    if match:
        df_children.at[i, 'Dose'] = str(match.group()) + ' Dose'
    else:
        pass  # Do nothing if no match

# fill in influenza and rotavirus
# to make things consistent, we can replace nan with '1 Dose only' 
df_children['Dose'] = np.where(pd.isnull(df_children['Dose']), '1 Dose Only', df_children['Dose'])

df_children.head()

Unnamed: 0,Vaccine,Dose,Geography Type,Geography,Birth Year/Birth Cohort,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper
0,DTaP,≥3 Doses,States/Local Areas,North Dakota,2019,Age,19 Months,93.5,88.0 to 96.6,263,88.0,96.6
1,DTaP,≥3 Doses,States/Local Areas,North Dakota,2018,Age,19 Months,95.2,91.0 to 97.5,293,91.0,97.5
2,DTaP,≥3 Doses,States/Local Areas,North Dakota,2018-2019,Age,19 Months,91.8,88.3 to 94.3,556,88.3,94.3
3,Polio,≥3 Doses,States/Local Areas,North Dakota,2021,Age,19 Months,89.4,81.9 to 94.1,143,81.9,94.1
4,Polio,≥2 Doses,States/Local Areas,North Dakota,2021,Age,5 Months,79.3,69.0 to 86.8,143,69.0,86.8


In [100]:
# getting the vaccine dimension table from postgres
import psycopg2 as pg
from creds import POSTGRES_USERNAME, POSTGRES_PW
conn = pg.connect(
    dbname='cs689_project',  
    user=POSTGRES_USERNAME,
    password=POSTGRES_PW,
    host='localhost',
    port='5432'
)
cursor = conn.cursor()

In [101]:
cursor.execute("SELECT vaccine_id, cleaned_vaccine, start_date FROM vaccine_dim")
mapping_data = cursor.fetchall()

vaccine_mapping = pd.DataFrame(mapping_data, columns=["vaccine_id", "cleaned_vaccine", "start_date"])
vaccine_mapping

Unnamed: 0,vaccine_id,cleaned_vaccine,start_date
0,1,DTaP,2011-09-01
1,2,Polio,2011-09-01
2,3,Hep B,2011-09-01
3,4,PCV,2011-09-01
4,7,Hib,2011-09-01
5,10,Combined 7 Series,2011-09-01
6,11,Rotavirus,2011-09-01
7,9,Influenza,2011-09-01
8,12,Influenza,2012-09-01
9,13,Influenza,2013-09-01


In [102]:
vaccine_mapping['start_date'] = pd.to_datetime(vaccine_mapping['start_date'])
vaccine_mapping['year'] = vaccine_mapping['start_date'].dt.year
vaccine_mapping


Unnamed: 0,vaccine_id,cleaned_vaccine,start_date,year
0,1,DTaP,2011-09-01,2011
1,2,Polio,2011-09-01,2011
2,3,Hep B,2011-09-01,2011
3,4,PCV,2011-09-01,2011
4,7,Hib,2011-09-01,2011
5,10,Combined 7 Series,2011-09-01,2011
6,11,Rotavirus,2011-09-01,2011
7,9,Influenza,2011-09-01,2011
8,12,Influenza,2012-09-01,2012
9,13,Influenza,2013-09-01,2013


In [103]:
df_children['Vaccine'].unique()

array(['DTaP', 'Polio', 'Hep B', 'PCV', '≥1 Dose Varicella',
       '≥1 Dose MMR', 'Hib', 'Hep A', 'Influenza', 'Combined 7 Series',
       'Rotavirus'], dtype=object)

In [104]:
# clean the vaccines first
df_children['cleaned_vaccine'] = df_children['Vaccine'].str.replace(r'(≥\d+\s*Dose\s*)', '', regex=True).str.strip()

In [105]:
# for influenza, we need to match it to the year
years = df_children['Birth Year/Birth Cohort'].astype(str).str[:4]
df_children['vaccine_year'] = np.where(df_children['Vaccine'] == 'Influenza',years,'2011')

In [106]:
df_children.head()

Unnamed: 0,Vaccine,Dose,Geography Type,Geography,Birth Year/Birth Cohort,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,cleaned_vaccine,vaccine_year
0,DTaP,≥3 Doses,States/Local Areas,North Dakota,2019,Age,19 Months,93.5,88.0 to 96.6,263,88.0,96.6,DTaP,2011
1,DTaP,≥3 Doses,States/Local Areas,North Dakota,2018,Age,19 Months,95.2,91.0 to 97.5,293,91.0,97.5,DTaP,2011
2,DTaP,≥3 Doses,States/Local Areas,North Dakota,2018-2019,Age,19 Months,91.8,88.3 to 94.3,556,88.3,94.3,DTaP,2011
3,Polio,≥3 Doses,States/Local Areas,North Dakota,2021,Age,19 Months,89.4,81.9 to 94.1,143,81.9,94.1,Polio,2011
4,Polio,≥2 Doses,States/Local Areas,North Dakota,2021,Age,5 Months,79.3,69.0 to 86.8,143,69.0,86.8,Polio,2011


In [107]:
df_children[df_children['vaccine_year'].isna()]
df_children['vaccine_year'] = (pd.to_numeric(df_children['vaccine_year'], errors='coerce').astype('Int64'))

In [108]:
# we merge df_children with the vaccine_dim
df_children = pd.merge(df_children, vaccine_mapping, left_on = ['cleaned_vaccine', 'vaccine_year'], right_on=['cleaned_vaccine', 'year'], how='left')
df_children.head()

Unnamed: 0,Vaccine,Dose,Geography Type,Geography,Birth Year/Birth Cohort,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,cleaned_vaccine,vaccine_year,vaccine_id,start_date,year
0,DTaP,≥3 Doses,States/Local Areas,North Dakota,2019,Age,19 Months,93.5,88.0 to 96.6,263,88.0,96.6,DTaP,2011,1,2011-09-01,2011
1,DTaP,≥3 Doses,States/Local Areas,North Dakota,2018,Age,19 Months,95.2,91.0 to 97.5,293,91.0,97.5,DTaP,2011,1,2011-09-01,2011
2,DTaP,≥3 Doses,States/Local Areas,North Dakota,2018-2019,Age,19 Months,91.8,88.3 to 94.3,556,88.3,94.3,DTaP,2011,1,2011-09-01,2011
3,Polio,≥3 Doses,States/Local Areas,North Dakota,2021,Age,19 Months,89.4,81.9 to 94.1,143,81.9,94.1,Polio,2011,2,2011-09-01,2011
4,Polio,≥2 Doses,States/Local Areas,North Dakota,2021,Age,5 Months,79.3,69.0 to 86.8,143,69.0,86.8,Polio,2011,2,2011-09-01,2011


In [109]:
df_children[df_children['vaccine_id'].isna()]

Unnamed: 0,Vaccine,Dose,Geography Type,Geography,Birth Year/Birth Cohort,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,cleaned_vaccine,vaccine_year,vaccine_id,start_date,year


In [110]:
def check_rows(df, len_df_children = filled_df_nrow):
    if df.shape[0] != len_df_children:
        print("Rows don't match!")
    else:
        print("Rows match!")

check_rows(df_children, filled_df_nrow)

Rows match!


In [111]:
cursor.execute("SELECT dose, dose_id FROM dose_dim")
mapping_data = cursor.fetchall()

dose_mapping = pd.DataFrame(mapping_data, columns=["dose", "dose_id"])
dose_mapping

Unnamed: 0,dose,dose_id
0,≥3 Doses,1
1,≥2 Doses,2
2,≥1 Dose,3
3,"≥1 Dose, 2 Day",4
4,Full Series,5
5,1 Dose Only,6
6,Primary Series,7
7,≥4 Doses,8
8,7 Dose,9
9,"≥1 Dose, 1 Day",10


In [112]:
# we merge df_children with the dose_dim
df_children = pd.merge(df_children, dose_mapping, left_on = ['Dose'], right_on=['dose'], how='left')
df_children.head()

Unnamed: 0,Vaccine,Dose,Geography Type,Geography,Birth Year/Birth Cohort,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,cleaned_vaccine,vaccine_year,vaccine_id,start_date,year,dose,dose_id
0,DTaP,≥3 Doses,States/Local Areas,North Dakota,2019,Age,19 Months,93.5,88.0 to 96.6,263,88.0,96.6,DTaP,2011,1,2011-09-01,2011,≥3 Doses,1
1,DTaP,≥3 Doses,States/Local Areas,North Dakota,2018,Age,19 Months,95.2,91.0 to 97.5,293,91.0,97.5,DTaP,2011,1,2011-09-01,2011,≥3 Doses,1
2,DTaP,≥3 Doses,States/Local Areas,North Dakota,2018-2019,Age,19 Months,91.8,88.3 to 94.3,556,88.3,94.3,DTaP,2011,1,2011-09-01,2011,≥3 Doses,1
3,Polio,≥3 Doses,States/Local Areas,North Dakota,2021,Age,19 Months,89.4,81.9 to 94.1,143,81.9,94.1,Polio,2011,2,2011-09-01,2011,≥3 Doses,1
4,Polio,≥2 Doses,States/Local Areas,North Dakota,2021,Age,5 Months,79.3,69.0 to 86.8,143,69.0,86.8,Polio,2011,2,2011-09-01,2011,≥2 Doses,2


In [113]:
df_children[df_children['vaccine_id'].isna()]

Unnamed: 0,Vaccine,Dose,Geography Type,Geography,Birth Year/Birth Cohort,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,cleaned_vaccine,vaccine_year,vaccine_id,start_date,year,dose,dose_id


In [114]:
# drop original columns
df_children.drop(columns=['Vaccine', 'cleaned_vaccine', 'vaccine_year', 'Dose', 'dose', 'start_date', 'year'], inplace=True)
check_rows(df_children)
df_children.head()

Rows match!


Unnamed: 0,Geography Type,Geography,Birth Year/Birth Cohort,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id
0,States/Local Areas,North Dakota,2019,Age,19 Months,93.5,88.0 to 96.6,263,88.0,96.6,1,1
1,States/Local Areas,North Dakota,2018,Age,19 Months,95.2,91.0 to 97.5,293,91.0,97.5,1,1
2,States/Local Areas,North Dakota,2018-2019,Age,19 Months,91.8,88.3 to 94.3,556,88.3,94.3,1,1
3,States/Local Areas,North Dakota,2021,Age,19 Months,89.4,81.9 to 94.1,143,81.9,94.1,2,1
4,States/Local Areas,North Dakota,2021,Age,5 Months,79.3,69.0 to 86.8,143,69.0,86.8,2,2


In [115]:
cursor.execute("SELECT geography_id, original_geography, original_geography_type FROM geography_dim")
mapping_data = cursor.fetchall()

geography_mapping = pd.DataFrame(mapping_data, columns=["geography_id", "original_geography", "original_geography_type"])
geography_mapping

Unnamed: 0,geography_id,original_geography,original_geography_type
0,66,New York,States/Local Areas
1,46,Maine,States/Local Areas
2,40,Kentucky,States/Local Areas
3,13,North Carolina,States/Local Areas
4,49,Hawaii,States/Local Areas
...,...,...,...
75,39,Florida,States/Local Areas
76,61,New Hampshire,States/Local Areas
77,34,Ohio,States/Local Areas
78,43,Arizona,States/Local Areas


In [116]:
# we merge df_children with the geography_mapping
df_children = pd.merge(df_children, geography_mapping, left_on = ['Geography', 'Geography Type'], right_on=['original_geography', 'original_geography_type'], how='left')
df_children.head()

Unnamed: 0,Geography Type,Geography,Birth Year/Birth Cohort,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,original_geography,original_geography_type
0,States/Local Areas,North Dakota,2019,Age,19 Months,93.5,88.0 to 96.6,263,88.0,96.6,1,1,12,North Dakota,States/Local Areas
1,States/Local Areas,North Dakota,2018,Age,19 Months,95.2,91.0 to 97.5,293,91.0,97.5,1,1,12,North Dakota,States/Local Areas
2,States/Local Areas,North Dakota,2018-2019,Age,19 Months,91.8,88.3 to 94.3,556,88.3,94.3,1,1,12,North Dakota,States/Local Areas
3,States/Local Areas,North Dakota,2021,Age,19 Months,89.4,81.9 to 94.1,143,81.9,94.1,2,1,12,North Dakota,States/Local Areas
4,States/Local Areas,North Dakota,2021,Age,5 Months,79.3,69.0 to 86.8,143,69.0,86.8,2,2,12,North Dakota,States/Local Areas


In [117]:
# drop original columns
df_children.drop(columns=['Geography', 'Geography Type', 'original_geography', 'original_geography_type'], inplace=True)
check_rows(df_children)
df_children.head()

Rows match!


Unnamed: 0,Birth Year/Birth Cohort,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id
0,2019,Age,19 Months,93.5,88.0 to 96.6,263,88.0,96.6,1,1,12
1,2018,Age,19 Months,95.2,91.0 to 97.5,293,91.0,97.5,1,1,12
2,2018-2019,Age,19 Months,91.8,88.3 to 94.3,556,88.3,94.3,1,1,12
3,2021,Age,19 Months,89.4,81.9 to 94.1,143,81.9,94.1,2,1,12
4,2021,Age,5 Months,79.3,69.0 to 86.8,143,69.0,86.8,2,2,12


We see that the Birth Year/Cohort contains both an individual year and a cohort year, where more than 2 years are grouped together. We can infer that rows containing one year in the Birth Year column is a transaction while rows containing cohort years are cumulative. We can divide the dataset into transaction and cumulative datasets using this information.

In [118]:
# creating transactional snaphot
df_transactional = df_children[df_children['Birth Year/Birth Cohort'].str.len() == 4]

df_cumulative = df_children[df_children['Birth Year/Birth Cohort'].str.len() != 4]
df_transactional.shape[0] + df_cumulative.shape[0]

128130

In [119]:
df_transactional.head()

Unnamed: 0,Birth Year/Birth Cohort,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id
0,2019,Age,19 Months,93.5,88.0 to 96.6,263,88.0,96.6,1,1,12
1,2018,Age,19 Months,95.2,91.0 to 97.5,293,91.0,97.5,1,1,12
3,2021,Age,19 Months,89.4,81.9 to 94.1,143,81.9,94.1,2,1,12
4,2021,Age,5 Months,79.3,69.0 to 86.8,143,69.0,86.8,2,2,12
6,2021,Age,5 Months,79.3,69.0 to 86.8,143,69.0,86.8,1,2,12


In [120]:
# there is only the Age dimension in the transactional dataset
df_transactional['Dimension Type'].unique()

array(['Age'], dtype=object)

In [121]:
df_transactional['Dimension'].unique()

array(['19 Months', '5 Months', '13 Months', '3 Months', '35 Months',
       '24 Months', '8 Months', '0-3 Days', '7 Months', '0-2 Days',
       '0-1 Days'], dtype=object)

In [122]:
df_transactional.drop(columns=['Dimension Type'], inplace=True)
df_transactional.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_transactional.drop(columns=['Dimension Type'], inplace=True)


Unnamed: 0,Birth Year/Birth Cohort,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id
0,2019,19 Months,93.5,88.0 to 96.6,263,88.0,96.6,1,1,12
1,2018,19 Months,95.2,91.0 to 97.5,293,91.0,97.5,1,1,12
3,2021,19 Months,89.4,81.9 to 94.1,143,81.9,94.1,2,1,12
4,2021,5 Months,79.3,69.0 to 86.8,143,69.0,86.8,2,2,12
6,2021,5 Months,79.3,69.0 to 86.8,143,69.0,86.8,1,2,12


In [123]:
df_transactional.rename(columns={'Dimension': 'Age'}, inplace=True)
df_transactional.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_transactional.rename(columns={'Dimension': 'Age'}, inplace=True)


Unnamed: 0,Birth Year/Birth Cohort,Age,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id
0,2019,19 Months,93.5,88.0 to 96.6,263,88.0,96.6,1,1,12
1,2018,19 Months,95.2,91.0 to 97.5,293,91.0,97.5,1,1,12
3,2021,19 Months,89.4,81.9 to 94.1,143,81.9,94.1,2,1,12
4,2021,5 Months,79.3,69.0 to 86.8,143,69.0,86.8,2,2,12
6,2021,5 Months,79.3,69.0 to 86.8,143,69.0,86.8,1,2,12


In [124]:
# making sure that we don't drop any rows
df_transactional.shape[0] + df_cumulative.shape[0]


128130

In [125]:
df_cumulative.head()

Unnamed: 0,Birth Year/Birth Cohort,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id
2,2018-2019,Age,19 Months,91.8,88.3 to 94.3,556,88.3,94.3,1,1,12
5,2020-2021,Age,13 Months,91.7,87.4 to 94.7,391,87.4,94.7,2,2,12
9,2020-2021,Age,19 Months,92.0,87.8 to 94.9,391,87.8,94.9,3,1,12
12,2020-2021,Age,0-2 Days,86.9,81.6 to 90.9,391,81.6,90.9,3,4,12
13,2020-2021,Age,19 Months,72.9,66.0 to 78.9,391,66.0,78.9,7,5,12


In [126]:
# denormalise the birth year/cohort as it may be useful for future analysis
for _, row in df_cumulative.iterrows():
    start_year = row['Birth Year/Birth Cohort'][:4]
    end_year = row['Birth Year/Birth Cohort'][-4:]
    df_cumulative.loc[_, 'start_cohort_year'] = start_year
    df_cumulative.loc[_, 'end_cohort_year'] = end_year
df_cumulative

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cumulative.loc[_, 'start_cohort_year'] = start_year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cumulative.loc[_, 'end_cohort_year'] = end_year


Unnamed: 0,Birth Year/Birth Cohort,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,start_cohort_year,end_cohort_year
2,2018-2019,Age,19 Months,91.8,88.3 to 94.3,556,88.3,94.3,1,1,12,2018,2019
5,2020-2021,Age,13 Months,91.7,87.4 to 94.7,391,87.4,94.7,2,2,12,2020,2021
9,2020-2021,Age,19 Months,92.0,87.8 to 94.9,391,87.8,94.9,3,1,12,2020,2021
12,2020-2021,Age,0-2 Days,86.9,81.6 to 90.9,391,81.6,90.9,3,4,12,2020,2021
13,2020-2021,Age,19 Months,72.9,66.0 to 78.9,391,66.0,78.9,7,5,12,2020,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...
128102,2020-2021,Age,35 Months,95.5,92.5 to 97.5,315,92.5,97.5,3,1,61,2020,2021
128114,2020-2021,Age,35 Months,94.6,91.2 to 97.1,315,91.2,97.1,1,8,61,2020,2021
128122,2020-2021,Age,35 Months,96.4,93.1 to 98.4,315,93.1,98.4,4,1,61,2020,2021
128123,2020-2021,Age,13 Months,90.9,85.7 to 94.3,315,85.7,94.3,7,1,61,2020,2021


In [127]:
df_cumulative.drop(columns=['Birth Year/Birth Cohort'], inplace = True)
df_cumulative = df_cumulative.reset_index(drop=True)
df_cumulative['cumulative_id'] = df_cumulative.index + 1
print(df_cumulative.shape[0])
display(df_cumulative)

68715


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cumulative.drop(columns=['Birth Year/Birth Cohort'], inplace = True)


Unnamed: 0,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,start_cohort_year,end_cohort_year,cumulative_id
0,Age,19 Months,91.8,88.3 to 94.3,556,88.3,94.3,1,1,12,2018,2019,1
1,Age,13 Months,91.7,87.4 to 94.7,391,87.4,94.7,2,2,12,2020,2021,2
2,Age,19 Months,92.0,87.8 to 94.9,391,87.8,94.9,3,1,12,2020,2021,3
3,Age,0-2 Days,86.9,81.6 to 90.9,391,81.6,90.9,3,4,12,2020,2021,4
4,Age,19 Months,72.9,66.0 to 78.9,391,66.0,78.9,7,5,12,2020,2021,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
68710,Age,35 Months,95.5,92.5 to 97.5,315,92.5,97.5,3,1,61,2020,2021,68711
68711,Age,35 Months,94.6,91.2 to 97.1,315,91.2,97.1,1,8,61,2020,2021,68712
68712,Age,35 Months,96.4,93.1 to 98.4,315,93.1,98.4,4,1,61,2020,2021,68713
68713,Age,13 Months,90.9,85.7 to 94.3,315,85.7,94.3,7,1,61,2020,2021,68714


In [128]:
# sanity check: ensure that the bug was fixed correctly
df_cumulative['start_cohort_year'].unique()

array(['2018', '2020', '2016', '2014', '2017', '2015', '2013', '2019'],
      dtype=object)

In [129]:
# making sure that we don't drop any rows
df_transactional.shape[0] + df_cumulative.shape[0]

128130

In [130]:
df_cumulative['Dimension Type'].unique()

array(['Age', 'Insurance Coverage', 'Overall', 'Poverty',
       'Race and Ethnicity', 'Urbanicity'], dtype=object)

In [131]:
# handle the age dimension in df_cumulative
df_cumulative_age = df_cumulative[df_cumulative['Dimension Type'] == 'Age']
df_cumulative_age.head()

Unnamed: 0,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,start_cohort_year,end_cohort_year,cumulative_id
0,Age,19 Months,91.8,88.3 to 94.3,556,88.3,94.3,1,1,12,2018,2019,1
1,Age,13 Months,91.7,87.4 to 94.7,391,87.4,94.7,2,2,12,2020,2021,2
2,Age,19 Months,92.0,87.8 to 94.9,391,87.8,94.9,3,1,12,2020,2021,3
3,Age,0-2 Days,86.9,81.6 to 90.9,391,81.6,90.9,3,4,12,2020,2021,4
4,Age,19 Months,72.9,66.0 to 78.9,391,66.0,78.9,7,5,12,2020,2021,5


In [132]:
age_nrow = df_cumulative_age.shape[0]
age_nrow

43830

In [133]:
# do the same thing as what we did in the transactional
df_cumulative_age.rename(columns={'Dimension': 'Age'}, inplace=True)
df_cumulative_age.drop(columns=['Dimension Type'], inplace=True)
print(df_cumulative_age.shape[0])
df_cumulative_age.head()

43830


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cumulative_age.rename(columns={'Dimension': 'Age'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cumulative_age.drop(columns=['Dimension Type'], inplace=True)


Unnamed: 0,Age,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,start_cohort_year,end_cohort_year,cumulative_id
0,19 Months,91.8,88.3 to 94.3,556,88.3,94.3,1,1,12,2018,2019,1
1,13 Months,91.7,87.4 to 94.7,391,87.4,94.7,2,2,12,2020,2021,2
2,19 Months,92.0,87.8 to 94.9,391,87.8,94.9,3,1,12,2020,2021,3
3,0-2 Days,86.9,81.6 to 90.9,391,81.6,90.9,3,4,12,2020,2021,4
4,19 Months,72.9,66.0 to 78.9,391,66.0,78.9,7,5,12,2020,2021,5


In [134]:
def check_dimension_rows(df, nrow):
    if df.shape[0] == nrow:
        print('Rows matched correctly')
    else:
        print("Rows don't match! Check code above!")

In [135]:
check_dimension_rows(df_cumulative_age, age_nrow)

Rows matched correctly


In [136]:
# handle insurance
df_cumulative_insurance = df_cumulative[df_cumulative['Dimension Type'] == 'Insurance Coverage']
df_cumulative_insurance.head()

Unnamed: 0,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,start_cohort_year,end_cohort_year,cumulative_id
54,Insurance Coverage,Private Insurance Only,95.8,93.6 to 97.9,695,93.6,97.9,4,1,12,2016,2019,55
55,Insurance Coverage,Private Insurance Only,87.6,83.8 to 91.5,695,83.8,91.5,7,5,12,2016,2019,56
61,Insurance Coverage,Private Insurance Only,90.7,87.7 to 93.6,695,87.7,93.6,4,8,12,2016,2019,62
67,Insurance Coverage,Private Insurance Only,72.9,68.6 to 77.2,562,68.6,77.2,14,6,12,2014,2017,68
68,Insurance Coverage,Other,52.1,33.1 to 71.1,42,33.1,71.1,14,6,12,2014,2017,69


In [137]:
nrow_insurance = df_cumulative_insurance.shape[0]

In [138]:
cursor.execute("SELECT insurance_coverage, insurance_id FROM insurance_dim")
mapping_data = cursor.fetchall()

insurance_mapping = pd.DataFrame(mapping_data, columns=["insurance_coverage", "insurance_id"])
insurance_mapping

Unnamed: 0,insurance_coverage,insurance_id
0,Uninsured,4
1,Other,2
2,Any Medicaid,3
3,Private Insurance Only,1


In [139]:
# we merge df_children with the insurance_mapping
df_cumulative_insurance = pd.merge(df_cumulative_insurance, insurance_mapping, left_on = ['Dimension'], right_on=['insurance_coverage'], how='left')
df_cumulative_insurance.head()

Unnamed: 0,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,start_cohort_year,end_cohort_year,cumulative_id,insurance_coverage,insurance_id
0,Insurance Coverage,Private Insurance Only,95.8,93.6 to 97.9,695,93.6,97.9,4,1,12,2016,2019,55,Private Insurance Only,1
1,Insurance Coverage,Private Insurance Only,87.6,83.8 to 91.5,695,83.8,91.5,7,5,12,2016,2019,56,Private Insurance Only,1
2,Insurance Coverage,Private Insurance Only,90.7,87.7 to 93.6,695,87.7,93.6,4,8,12,2016,2019,62,Private Insurance Only,1
3,Insurance Coverage,Private Insurance Only,72.9,68.6 to 77.2,562,68.6,77.2,14,6,12,2014,2017,68,Private Insurance Only,1
4,Insurance Coverage,Other,52.1,33.1 to 71.1,42,33.1,71.1,14,6,12,2014,2017,69,Other,2


In [140]:
df_cumulative_insurance.drop(columns=['Dimension Type', 'Dimension', 'insurance_coverage'], inplace=True)
print(df_cumulative_insurance.shape[0])
df_cumulative_insurance.head()

5339


Unnamed: 0,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,start_cohort_year,end_cohort_year,cumulative_id,insurance_id
0,95.8,93.6 to 97.9,695,93.6,97.9,4,1,12,2016,2019,55,1
1,87.6,83.8 to 91.5,695,83.8,91.5,7,5,12,2016,2019,56,1
2,90.7,87.7 to 93.6,695,87.7,93.6,4,8,12,2016,2019,62,1
3,72.9,68.6 to 77.2,562,68.6,77.2,14,6,12,2014,2017,68,1
4,52.1,33.1 to 71.1,42,33.1,71.1,14,6,12,2014,2017,69,2


In [141]:
check_dimension_rows(df_cumulative_insurance, nrow_insurance)

Rows matched correctly


In [142]:
# handle poverty next
df_cumulative_poverty = df_cumulative[df_cumulative['Dimension Type'] == 'Poverty']
print(df_cumulative_poverty.shape[0])
nrow_poverty = df_cumulative_poverty.shape[0]
df_cumulative_poverty.head()

5655


Unnamed: 0,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,start_cohort_year,end_cohort_year,cumulative_id
57,Poverty,<133% FPL,54.8,45.7 to 64.0,155,45.7,64.0,14,6,12,2014,2017,58
58,Poverty,133% to <400% FPL,65.2,59.7 to 70.8,408,59.7,70.8,14,6,12,2014,2017,59
75,Poverty,>400% FPL,76.5,70.7 to 82.4,267,70.7,82.4,14,6,12,2014,2017,76
129,Poverty,>400% FPL,87.3,82.8 to 91.8,267,82.8,91.8,7,5,12,2014,2017,130
140,Poverty,<133% FPL,80.8,73.7 to 87.8,155,73.7,87.8,7,5,12,2014,2017,141


In [143]:
cursor.execute("SELECT poverty_status, poverty_id FROM poverty_dim")
mapping_data = cursor.fetchall()

poverty_mapping = pd.DataFrame(mapping_data, columns=["poverty_status", "poverty_id"])
poverty_mapping

Unnamed: 0,poverty_status,poverty_id
0,<133% FPL,1
1,133% to <400% FPL,2
2,>400% FPL,3
3,Below Poverty Level,4
4,Living At or Above Poverty Level,5


In [144]:
# we merge df_children with the poverty_mapping
df_cumulative_poverty = pd.merge(df_cumulative_poverty, poverty_mapping, left_on = ['Dimension'], right_on=['poverty_status'], how='left')
df_cumulative_poverty.head()

Unnamed: 0,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,start_cohort_year,end_cohort_year,cumulative_id,poverty_status,poverty_id
0,Poverty,<133% FPL,54.8,45.7 to 64.0,155,45.7,64.0,14,6,12,2014,2017,58,<133% FPL,1
1,Poverty,133% to <400% FPL,65.2,59.7 to 70.8,408,59.7,70.8,14,6,12,2014,2017,59,133% to <400% FPL,2
2,Poverty,>400% FPL,76.5,70.7 to 82.4,267,70.7,82.4,14,6,12,2014,2017,76,>400% FPL,3
3,Poverty,>400% FPL,87.3,82.8 to 91.8,267,82.8,91.8,7,5,12,2014,2017,130,>400% FPL,3
4,Poverty,<133% FPL,80.8,73.7 to 87.8,155,73.7,87.8,7,5,12,2014,2017,141,<133% FPL,1


In [145]:
df_cumulative_poverty.drop(columns=['Dimension Type', 'Dimension', 'poverty_status'], inplace=True)
df_cumulative_poverty.head()

Unnamed: 0,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,start_cohort_year,end_cohort_year,cumulative_id,poverty_id
0,54.8,45.7 to 64.0,155,45.7,64.0,14,6,12,2014,2017,58,1
1,65.2,59.7 to 70.8,408,59.7,70.8,14,6,12,2014,2017,59,2
2,76.5,70.7 to 82.4,267,70.7,82.4,14,6,12,2014,2017,76,3
3,87.3,82.8 to 91.8,267,82.8,91.8,7,5,12,2014,2017,130,3
4,80.8,73.7 to 87.8,155,73.7,87.8,7,5,12,2014,2017,141,1


In [146]:
check_dimension_rows(df_cumulative_poverty, nrow_poverty)

Rows matched correctly


In [147]:
# handle race and ethnicity
df_cumulative_race = df_cumulative[df_cumulative['Dimension Type'] == 'Race and Ethnicity']
nrow_race = df_cumulative_race.shape[0]
df_cumulative_race.head()

Unnamed: 0,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,start_cohort_year,end_cohort_year,cumulative_id
60,Race and Ethnicity,"Other or Multiple Races, Non-Hispanic",63.7,49.5 to 77.9,120,49.5,77.9,8,2,12,2016,2019,61
62,Race and Ethnicity,"White, Non-Hispanic",68.4,64.1 to 72.6,637,64.1,72.6,14,6,12,2014,2017,63
66,Race and Ethnicity,"Other or Multiple Races, Non-Hispanic",78.9,69.4 to 88.3,120,69.4,88.3,7,5,12,2016,2019,67
69,Race and Ethnicity,Hispanic,53.6,37.2 to 69.9,48,37.2,69.9,14,6,12,2014,2017,70
70,Race and Ethnicity,"Other or Multiple Races, Non-Hispanic",65.0,54.5 to 75.4,123,54.5,75.4,14,6,12,2014,2017,71


In [148]:
cursor.execute("SELECT race_ethnicity, race_ethnicity_id FROM race_ethnicity_dim")
mapping_data = cursor.fetchall()

race_ethnicity_mapping = pd.DataFrame(mapping_data, columns=["race_ethnicity", "race_ethnicity_id"])
race_ethnicity_mapping

Unnamed: 0,race_ethnicity,race_ethnicity_id
0,"American Indian or Alaska Native, Non-Hispanic",5
1,"Asian, Non-Hispanic",6
2,"White, Non-Hispanic",2
3,Hispanic,3
4,"Other or Multiple Races, Non-Hispanic",1
5,"Black, Non-Hispanic",4


In [149]:
df_cumulative_race = pd.merge(df_cumulative_race, race_ethnicity_mapping, left_on = ['Dimension'], right_on=['race_ethnicity'], how='left')
df_cumulative_race.head()

Unnamed: 0,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,start_cohort_year,end_cohort_year,cumulative_id,race_ethnicity,race_ethnicity_id
0,Race and Ethnicity,"Other or Multiple Races, Non-Hispanic",63.7,49.5 to 77.9,120,49.5,77.9,8,2,12,2016,2019,61,"Other or Multiple Races, Non-Hispanic",1
1,Race and Ethnicity,"White, Non-Hispanic",68.4,64.1 to 72.6,637,64.1,72.6,14,6,12,2014,2017,63,"White, Non-Hispanic",2
2,Race and Ethnicity,"Other or Multiple Races, Non-Hispanic",78.9,69.4 to 88.3,120,69.4,88.3,7,5,12,2016,2019,67,"Other or Multiple Races, Non-Hispanic",1
3,Race and Ethnicity,Hispanic,53.6,37.2 to 69.9,48,37.2,69.9,14,6,12,2014,2017,70,Hispanic,3
4,Race and Ethnicity,"Other or Multiple Races, Non-Hispanic",65.0,54.5 to 75.4,123,54.5,75.4,14,6,12,2014,2017,71,"Other or Multiple Races, Non-Hispanic",1


In [150]:
df_cumulative_race.drop(columns=['Dimension Type', 'Dimension', 'race_ethnicity'], inplace=True)
df_cumulative_race.head()

Unnamed: 0,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,start_cohort_year,end_cohort_year,cumulative_id,race_ethnicity_id
0,63.7,49.5 to 77.9,120,49.5,77.9,8,2,12,2016,2019,61,1
1,68.4,64.1 to 72.6,637,64.1,72.6,14,6,12,2014,2017,63,2
2,78.9,69.4 to 88.3,120,69.4,88.3,7,5,12,2016,2019,67,1
3,53.6,37.2 to 69.9,48,37.2,69.9,14,6,12,2014,2017,70,3
4,65.0,54.5 to 75.4,123,54.5,75.4,14,6,12,2014,2017,71,1


In [151]:
check_dimension_rows(df_cumulative_race, nrow_race)

Rows matched correctly


In [152]:
# handle urbanicity
df_cumulative_urban = df_cumulative[df_cumulative['Dimension Type'] == 'Urbanicity']
nrow_ubanicity = df_cumulative_urban.shape[0]
df_cumulative_urban.head()

Unnamed: 0,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,start_cohort_year,end_cohort_year,cumulative_id
63,Urbanicity,Living In a MSA Principal City,75.9,70.2 to 81.6,285,70.2,81.6,14,6,12,2014,2017,64
64,Urbanicity,Living In a MSA Non-Principal City,67.0,56.9 to 77.1,124,56.9,77.1,14,6,12,2014,2017,65
65,Urbanicity,Living In a Non-MSA,57.2,51.5 to 62.9,421,51.5,62.9,14,6,12,2014,2017,66
124,Urbanicity,Living In a MSA Non-Principal City,79.4,70.9 to 87.8,124,70.9,87.8,7,5,12,2014,2017,125
128,Urbanicity,Living In a MSA Principal City,94.6,91.5 to 97.8,285,91.5,97.8,2,1,12,2014,2017,129


In [153]:
cursor.execute("SELECT urbanicity, urbanicity_id FROM urbanicity_dim")
mapping_data = cursor.fetchall()

urbanicity_mapping = pd.DataFrame(mapping_data, columns=["urbanicity", "urbanicity_id"])
urbanicity_mapping

Unnamed: 0,urbanicity,urbanicity_id
0,Living In a MSA Principal City,1
1,Living In a MSA Non-Principal City,2
2,Living In a Non-MSA,3


In [154]:
df_cumulative_urban = pd.merge(df_cumulative_urban, urbanicity_mapping, left_on = ['Dimension'], right_on=['urbanicity'], how='left')
df_cumulative_urban.head()

Unnamed: 0,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,start_cohort_year,end_cohort_year,cumulative_id,urbanicity,urbanicity_id
0,Urbanicity,Living In a MSA Principal City,75.9,70.2 to 81.6,285,70.2,81.6,14,6,12,2014,2017,64,Living In a MSA Principal City,1
1,Urbanicity,Living In a MSA Non-Principal City,67.0,56.9 to 77.1,124,56.9,77.1,14,6,12,2014,2017,65,Living In a MSA Non-Principal City,2
2,Urbanicity,Living In a Non-MSA,57.2,51.5 to 62.9,421,51.5,62.9,14,6,12,2014,2017,66,Living In a Non-MSA,3
3,Urbanicity,Living In a MSA Non-Principal City,79.4,70.9 to 87.8,124,70.9,87.8,7,5,12,2014,2017,125,Living In a MSA Non-Principal City,2
4,Urbanicity,Living In a MSA Principal City,94.6,91.5 to 97.8,285,91.5,97.8,2,1,12,2014,2017,129,Living In a MSA Principal City,1


In [155]:
df_cumulative_urban.drop(columns=['Dimension Type', 'Dimension', 'urbanicity'], inplace=True)
df_cumulative_urban.head()

Unnamed: 0,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,start_cohort_year,end_cohort_year,cumulative_id,urbanicity_id
0,75.9,70.2 to 81.6,285,70.2,81.6,14,6,12,2014,2017,64,1
1,67.0,56.9 to 77.1,124,56.9,77.1,14,6,12,2014,2017,65,2
2,57.2,51.5 to 62.9,421,51.5,62.9,14,6,12,2014,2017,66,3
3,79.4,70.9 to 87.8,124,70.9,87.8,7,5,12,2014,2017,125,2
4,94.6,91.5 to 97.8,285,91.5,97.8,2,1,12,2014,2017,129,1


In [156]:
check_dimension_rows(df_cumulative_urban, nrow_ubanicity)

Rows matched correctly


In [157]:
# handle overall
df_cumulative_overall = df_cumulative[df_cumulative['Dimension Type'] == 'Overall']
nrow_overall = df_cumulative_overall.shape[0]
df_cumulative_overall.head()

Unnamed: 0,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,start_cohort_year,end_cohort_year,cumulative_id
56,Overall,Overall,66.1,62.3 to 70.0,830,62.3,70.0,14,6,12,2014,2017,57
138,Overall,Overall,84.7,81.6 to 87.8,830,81.6,87.8,7,5,12,2014,2017,139
180,Overall,Overall,94.2,92.3 to 96.0,830,92.3,96.0,1,1,12,2014,2017,181
216,Overall,Overall,89.3,86.9 to 91.8,830,86.9,91.8,8,3,12,2014,2017,217
231,Overall,Overall,86.2,82.2 to 90.2,830,82.2,90.2,8,2,12,2014,2017,232


In [158]:
df_cumulative_overall.rename(columns={'Dimension': 'Overall'}, inplace=True)
df_cumulative_overall.drop(columns=['Dimension Type'], inplace=True)
print(df_cumulative_overall.shape[0])

1890


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cumulative_overall.rename(columns={'Dimension': 'Overall'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cumulative_overall.drop(columns=['Dimension Type'], inplace=True)


In [159]:
check_dimension_rows(df_cumulative_overall, nrow_overall)

Rows matched correctly


In [160]:
df_cumulative_dimension = pd.concat([df_cumulative_age, df_cumulative_insurance, df_cumulative_poverty, df_cumulative_race, df_cumulative_urban, df_cumulative_overall])
df_cumulative_dimension.head()

Unnamed: 0,Age,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id,start_cohort_year,end_cohort_year,cumulative_id,insurance_id,poverty_id,race_ethnicity_id,urbanicity_id,Overall
0,19 Months,91.8,88.3 to 94.3,556,88.3,94.3,1,1,12,2018,2019,1,,,,,
1,13 Months,91.7,87.4 to 94.7,391,87.4,94.7,2,2,12,2020,2021,2,,,,,
2,19 Months,92.0,87.8 to 94.9,391,87.8,94.9,3,1,12,2020,2021,3,,,,,
3,0-2 Days,86.9,81.6 to 90.9,391,81.6,90.9,3,4,12,2020,2021,4,,,,,
4,19 Months,72.9,66.0 to 78.9,391,66.0,78.9,7,5,12,2020,2021,5,,,,,


In [161]:
check_dimension_rows(df_cumulative_dimension, (nrow_insurance+nrow_overall+nrow_poverty+nrow_race+nrow_ubanicity+age_nrow))

Rows matched correctly


In [162]:
if df_cumulative_dimension.shape[0] + df_transactional.shape[0] == filled_df_nrow:
    print("Rows still match!")
else:
    print("Rows don't match! Check code!")

Rows still match!


In [166]:
df_transactional

Unnamed: 0,Birth Year/Birth Cohort,Age,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,vaccine_id,dose_id,geography_id
0,2019,19 Months,93.5,88.0 to 96.6,263,88.0,96.6,1,1,12
1,2018,19 Months,95.2,91.0 to 97.5,293,91.0,97.5,1,1,12
3,2021,19 Months,89.4,81.9 to 94.1,143,81.9,94.1,2,1,12
4,2021,5 Months,79.3,69.0 to 86.8,143,69.0,86.8,2,2,12
6,2021,5 Months,79.3,69.0 to 86.8,143,69.0,86.8,1,2,12
...,...,...,...,...,...,...,...,...,...,...
128124,2021,24 Months,92.3,85.7 to 96.6,126,85.7,96.6,8,3,61
128125,2021,24 Months,46.7,36.7 to 58.0,126,36.7,58.0,8,2,61
128126,2021,19 Months,95.0,90.2 to 97.5,126,90.2,97.5,1,1,61
128128,2021,7 Months,70.0,59.6 to 78.7,126,59.6,78.7,4,1,61


## Loading tables into fact tables

In [167]:
df_transactional.columns

Index(['Birth Year/Birth Cohort', 'Age', 'Estimate (%)', '95% CI (%)',
       'Sample Size', 'ci_lower', 'ci_upper', 'vaccine_id', 'dose_id',
       'geography_id'],
      dtype='object')

In [169]:
# loading transaction fact table
try:
    insert_query='''
    INSERT INTO vaccine_transaction_fact(
        vaccine_transaction_birth_year,
        vaccine_transaction_age,
        vaccine_transaction_estimate_pct,
        vaccine_transaction_ci_lower,
        vaccine_transaction_ci_upper,
        vaccine_transaction_sample_size,
        vaccine_transaction_vaccine_id,
        vaccine_transaction_dose_id,
        vaccine_transaction_geography_id
    )
    VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s)
    '''
    for index, row in df_transactional.iterrows():
        vaccine_id = int(row['vaccine_id']) if pd.notnull(row['vaccine_id']) else None
        dose_id = int(row['dose_id']) if pd.notnull(row['dose_id']) else None
        geography_id = int(row['geography_id']) if pd.notnull(row['geography_id']) else None
        
        cursor.execute(insert_query,(
        row['Birth Year/Birth Cohort'],
        row['Age'],
        row['Estimate (%)'],
        row['ci_lower'],
        row['ci_upper'],
        row['Sample Size'],
        vaccine_id,
        dose_id,
        geography_id))
    conn.commit()
    print(f"{len(df_transactional)} records inserted into vaccine_transaction_fact.")

except Exception as e:
    print(f"Error occurred: {e}")
    traceback.print_exc(file=sys.stdout)
    conn.rollback()

59415 records inserted into vaccine_transaction_fact.


In [81]:
df_cumulative_dimension.columns

Index(['Age', 'Estimate (%)', '95% CI (%)', 'Sample Size', 'ci_lower',
       'ci_upper', 'vaccine_id', 'dose_id', 'geography_id',
       'start_cohort_year', 'end_cohort_year', 'cumulative_id', 'insurance_id',
       'poverty_id', 'race_ethnicity_id', 'urbanicity_id', 'Overall'],
      dtype='object')

In [87]:
# loading cumulative fact table
try:
    insert_query='''
    INSERT INTO vaccine_cumulative_fact(
        vaccine_cumulative_start_cohort_year,
        vaccine_cumulative_end_cohort_year,
        vaccine_cumulative_estimate_pct,
        vaccine_cumulative_sample_size,
        vaccine_cumulative_ci_lower,
        vaccine_cumulative_ci_upper,
        vaccine_cumulative_vaccine_id,
        vaccine_cumulative_dose_id,
        vaccine_cumulative_geography_id,
        vaccine_cumulative_insurance_id,
        vaccine_cumulative_poverty_id,
        vaccine_cumulative_race_ethnicity_id,
        vaccine_cumulative_urbanicity_id,
        vaccine_cumulative_overall
    )
    VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    '''
    for index, row in df_cumulative_dimension.iterrows():
        start_cohort_year = int(row['start_cohort_year']) if pd.notnull(row['start_cohort_year']) else None
        end_cohort_year = int(row['end_cohort_year']) if pd.notnull(row['end_cohort_year']) else None
        vaccine_id = int(row['vaccine_id']) if pd.notnull(row['vaccine_id']) else None
        dose_id = int(row['dose_id']) if pd.notnull(row['dose_id']) else None
        geography_id = int(row['geography_id']) if pd.notnull(row['geography_id']) else None
        poverty_id = int(row['poverty_id']) if pd.notnull(row['poverty_id']) else None
        race_ethnicity_id = int(row['race_ethnicity_id']) if pd.notnull(row['race_ethnicity_id']) else None
        urbanicity_id = int(row['urbanicity_id']) if pd.notnull(row['urbanicity_id']) else None
        insurance_id = int(row['insurance_id']) if pd.notnull(row['insurance_id']) else None
        sample_size = int(row['Sample Size']) if pd.notnull(row['Sample Size']) else None

        cursor.execute(insert_query,(
        start_cohort_year,
        end_cohort_year,
        row['Estimate (%)'],
        sample_size,
        row['ci_lower'],
        row['ci_upper'],
        vaccine_id,
        dose_id,
        geography_id,
        insurance_id,
        poverty_id,
        race_ethnicity_id,
        urbanicity_id,
        row['Overall']))
    conn.commit()
    print(f"{len(df_cumulative_dimension)} records inserted into vaccine_cumulative_fact.")

except Exception as e:
    print(f"Error occurred: {e}")
    traceback.print_exc(file=sys.stdout)
    conn.rollback()


68715 records inserted into vaccine_cumulative_fact.
