# Create influenza vaccination cumulative fact table

In [1]:
import pandas as pd
import numpy as np
import re
import traceback
import sys
from variables import * 


In [2]:
df = pd.read_csv('dataset/Influenza_Vaccination_Coverage.csv')
df.head()

Unnamed: 0,Vaccine,Geography Type,Geography,FIPS,Season/Survey Year,Month,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size
0,Seasonal Influenza,Counties,New Haven,9009,2018,1,>=18 Years,Non-Medical Setting,45.5,43.9 to 47.2,
1,Seasonal Influenza,Counties,New Haven,9009,2021,1,>=18 Years,Non-Medical Setting,53.0,46.0 to 60.9,
2,Seasonal Influenza,Counties,New Haven,9009,2020,1,Age,>=18 Years,52.4,50.6 to 54.3,
3,Seasonal Influenza,Counties,New Haven,9009,2021,1,Age,>=18 Years,50.2,45.4 to 55.8,
4,Seasonal Influenza,Counties,New Haven,9009,2018,1,Age,>=18 Years,34.0,32.6 to 35.5,


In [3]:
df.shape[0]

220729

In [4]:
df[df['Estimate (%)'].isna()]

Unnamed: 0,Vaccine,Geography Type,Geography,FIPS,Season/Survey Year,Month,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220729 entries, 0 to 220728
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Vaccine             220729 non-null  object 
 1   Geography Type      220729 non-null  object 
 2   Geography           220729 non-null  object 
 3   FIPS                220729 non-null  int64  
 4   Season/Survey Year  220729 non-null  object 
 5   Month               220729 non-null  int64  
 6   Dimension Type      220729 non-null  object 
 7   Dimension           220729 non-null  object 
 8   Estimate (%)        220729 non-null  object 
 9   95% CI (%)          220495 non-null  object 
 10  Sample Size         197270 non-null  float64
dtypes: float64(1), int64(2), object(8)
memory usage: 18.5+ MB


In [6]:
df['Vaccine'].unique()

array(['Seasonal Influenza',
       'Any Influenza Vaccination, Seasonal or H1N1',
       'Influenza A (H1N1) 2009 Monovalent'], dtype=object)

In [7]:
# the H1N1 vaccine is only in 2009-2010( H1N1 pandemic)
df['Season/Survey Year'][df['Vaccine'] == 'Influenza A (H1N1) 2009 Monovalent'].unique()

array(['2009-10'], dtype=object)

In [8]:
# and the influenza + h1n1 combi is only for 2009-2010
df['Season/Survey Year'][df['Vaccine'] == 'Any Influenza Vaccination, Seasonal or H1N1'].unique()

array(['2009-10'], dtype=object)

In [9]:
df['Season/Survey Year'][df['Vaccine'] == 'Seasonal Influenza'].unique()

array(['2018', '2021', '2020', '2019', '2009-10', '2020-21', '2017-18',
       '2014-15', '2015-16', '2019-20', '2013-14', '2018-19', '2012-13',
       '2016-17', '2011-12', '2010-11', '2021-22', '2023-24', '2022',
       '2022-23'], dtype=object)

In [10]:
filled_df_nrow = df.shape[0]
filled_df_nrow

220729

In [11]:
df['Sample Size'] = pd.to_numeric(df['Sample Size'], errors='coerce').astype('Int64')
df.head()

Unnamed: 0,Vaccine,Geography Type,Geography,FIPS,Season/Survey Year,Month,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size
0,Seasonal Influenza,Counties,New Haven,9009,2018,1,>=18 Years,Non-Medical Setting,45.5,43.9 to 47.2,
1,Seasonal Influenza,Counties,New Haven,9009,2021,1,>=18 Years,Non-Medical Setting,53.0,46.0 to 60.9,
2,Seasonal Influenza,Counties,New Haven,9009,2020,1,Age,>=18 Years,52.4,50.6 to 54.3,
3,Seasonal Influenza,Counties,New Haven,9009,2021,1,Age,>=18 Years,50.2,45.4 to 55.8,
4,Seasonal Influenza,Counties,New Haven,9009,2018,1,Age,>=18 Years,34.0,32.6 to 35.5,


- * Estimates not reliable because sample size is <30. 
- † Estimates not reliable because relative standard error is >0.3. 
- ‡ Estimates might not be reliable because confidence interval half-width is >10
We can discard them first

In [12]:
df['Estimate (%)'] = pd.to_numeric(df['Estimate (%)'], errors='coerce').astype(float)

In [13]:
ci = df['95% CI (%)'].astype(str).str.replace('%', '', regex=False).str.strip()
df[['ci_lower', 'ci_upper']] = ci.str.extract(r'([\d.]+)\s+to\s+([\d.]+)')
df[['ci_lower', 'ci_upper']] = ci.str.split(r'\s+to\s+', expand=True)

In [14]:
df.head()

Unnamed: 0,Vaccine,Geography Type,Geography,FIPS,Season/Survey Year,Month,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper
0,Seasonal Influenza,Counties,New Haven,9009,2018,1,>=18 Years,Non-Medical Setting,45.5,43.9 to 47.2,,43.9,47.2
1,Seasonal Influenza,Counties,New Haven,9009,2021,1,>=18 Years,Non-Medical Setting,53.0,46.0 to 60.9,,46.0,60.9
2,Seasonal Influenza,Counties,New Haven,9009,2020,1,Age,>=18 Years,52.4,50.6 to 54.3,,50.6,54.3
3,Seasonal Influenza,Counties,New Haven,9009,2021,1,Age,>=18 Years,50.2,45.4 to 55.8,,45.4,55.8
4,Seasonal Influenza,Counties,New Haven,9009,2018,1,Age,>=18 Years,34.0,32.6 to 35.5,,32.6,35.5


In [15]:
# getting the vaccine dimension table from postgres
import psycopg2 as pg
from creds import POSTGRES_USERNAME, POSTGRES_PW
conn = pg.connect(
    dbname='cs689_project',  
    user=POSTGRES_USERNAME,
    password=POSTGRES_PW,
    host='localhost',
    port='5432'
)
cursor = conn.cursor()

In [16]:
cursor.execute("SELECT vaccine_id, cleaned_vaccine, start_date, vaccine_strains FROM vaccine_dim")
mapping_data = cursor.fetchall()

vaccine_mapping = pd.DataFrame(mapping_data, columns=["vaccine_id", "cleaned_vaccine", "start_date", "vaccine_strains"])
vaccine_mapping

Unnamed: 0,vaccine_id,cleaned_vaccine,start_date,vaccine_strains
0,1,DTaP,2011-09-01,"Diphtheria, Tetanus, and Bordetella pertussis ..."
1,2,Polio,2011-09-01,"Poliovirus Types 1, 2, and 3 (inactivated)"
2,3,Hep B,2011-09-01,Hepatitis B virus (HBV)
3,4,PCV,2011-09-01,Streptococcus pneumoniae — multiple serotypes ...
4,7,Hib,2011-09-01,Haemophilus influenzae type b
5,10,Combined 7 Series,2011-09-01,"Includes: DTaP, IPV, MMR, Hib, Hep B, Varicell..."
6,11,Rotavirus,2011-09-01,"Rotavirus types G1, G2, G3, G4, G9 (Rotarix); ..."
7,9,Influenza,2011-09-01,"A/California/7/2009, A/Victoria/361/2011 (H3N2..."
8,12,Influenza,2012-09-01,"A/California/7/2009, A/Victoria/361/2011 (H3N2..."
9,13,Influenza,2013-09-01,"A/California/7/2009, A/Texas/50/2012 (H3N2), B..."


In [17]:
# forming the vaccine for 2009-2011
vaccine_dim = ['Influenza', 'Any Influenza Vaccination, Seasonal or H1N1',
'Influenza A (H1N1) 2009 Monovalent']
vaccine_dim = pd.DataFrame(vaccine_dim)
vaccine_dim.columns = ['Vaccine']
vaccine_dim['vaccine_strain'] = ['A/Brisbane/59/2007 (H1N1)-like, A/Brisbane/10/2007 (H3N2), B/Brisbane/60/2008-like', 'A/California/7/2009 (H1N1)pdm09-like', 
'A/Brisbane/59/2007 (H1N1)-like, A/Brisbane/10/2007 (H3N2), A/California/7/2009 (H1N1)pdm09-like']
vaccine_dim['description'] = ['Protect against the three influenza strains anticipated to be most prevalent during the season',
'In response to the 2009 H1N1 pandemic, a separate monovalent vaccine was developed specifically to target the new H1N1 strain',
'Both influenza strains and the H1N1 monovalent vaccine']
vaccine_dim['start_datetime'] = pd.to_datetime('2009-09')
vaccine_dim['end_datetime'] = pd.to_datetime('2100-12')

In [18]:
vaccine_dim

Unnamed: 0,Vaccine,vaccine_strain,description,start_datetime,end_datetime
0,Influenza,"A/Brisbane/59/2007 (H1N1)-like, A/Brisbane/10/...",Protect against the three influenza strains an...,2009-09-01,2100-12-01
1,"Any Influenza Vaccination, Seasonal or H1N1",A/California/7/2009 (H1N1)pdm09-like,"In response to the 2009 H1N1 pandemic, a separ...",2009-09-01,2100-12-01
2,Influenza A (H1N1) 2009 Monovalent,"A/Brisbane/59/2007 (H1N1)-like, A/Brisbane/10/...",Both influenza strains and the H1N1 monovalent...,2009-09-01,2100-12-01


In [19]:
vaccine_2011 = pd.DataFrame([{
    'Vaccine': 'Influenza',
    'description': 'Protects against seasonal flu viruses',
    'vaccine_strain': 'A/California/7/2009 (H1N1)pdm09-like virus, A/Perth/16/2009 (H3N2)-like virus, B/Brisbane/60/2008-like virus'
, 'start_datetime':pd.to_datetime('2010-09'),
'end_datetime':pd.to_datetime('2100-12')}])
vaccine_dim = pd.concat([vaccine_dim, vaccine_2011], ignore_index=True)
vaccine_dim['vaccine_name'] = ['Fluzone,Fluarix,FluLaval,Fluvirin,Afluria', 
'Fluzone, FluMist, Fluarix, FluLaval, Afluria, Fluvirin',
'Sanofi Pasteur H1N1 Monovalent, Novartis H1N1 Monovalent, CSL Limited H1N1 Monovalent, ID Biomedical H1N1 Monovalent, MedImmune H1N1 Monovalent',
'Fluzone, FluMist, Fluarix, FluLaval, Afluria, Fluvirin']
vaccine_dim

Unnamed: 0,Vaccine,vaccine_strain,description,start_datetime,end_datetime,vaccine_name
0,Influenza,"A/Brisbane/59/2007 (H1N1)-like, A/Brisbane/10/...",Protect against the three influenza strains an...,2009-09-01,2100-12-01,"Fluzone,Fluarix,FluLaval,Fluvirin,Afluria"
1,"Any Influenza Vaccination, Seasonal or H1N1",A/California/7/2009 (H1N1)pdm09-like,"In response to the 2009 H1N1 pandemic, a separ...",2009-09-01,2100-12-01,"Fluzone, FluMist, Fluarix, FluLaval, Afluria, ..."
2,Influenza A (H1N1) 2009 Monovalent,"A/Brisbane/59/2007 (H1N1)-like, A/Brisbane/10/...",Both influenza strains and the H1N1 monovalent...,2009-09-01,2100-12-01,"Sanofi Pasteur H1N1 Monovalent, Novartis H1N1 ..."
3,Influenza,"A/California/7/2009 (H1N1)pdm09-like virus, A/...",Protects against seasonal flu viruses,2010-09-01,2100-12-01,"Fluzone, FluMist, Fluarix, FluLaval, Afluria, ..."


In [20]:
def update_vaccine_dim(df_update_vaccine):
    update_query = """
    UPDATE vaccine_dim
    SET end_date     = %s - INTERVAL '1 month',
        current_flag = 'N'
    WHERE cleaned_vaccine = %s
    AND current_flag   = 'Y';
    """

    insert_query = """
    INSERT INTO vaccine_dim
    (cleaned_vaccine,
    vaccine_description,
    vaccine_strains,
    vaccine_names,
    start_date,
    end_date,
    current_flag)
    VALUES (%s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (cleaned_vaccine, start_date) DO NOTHING;
    """
    try:
        for _, row in df_update_vaccine.iterrows():
            cursor.execute(
                update_query,
                (row['start_datetime'], row['Vaccine'])
            )
            cursor.execute(
                insert_query,
                (
                    row['Vaccine'],
                    row['description'],
                    row['vaccine_strain'],
                    row['vaccine_name'],
                    row['start_datetime'],
                    row['end_datetime'],
                    'Y'
                )
            )
        conn.commit()
        print(f"{len(df_update_vaccine)} SCD2 records processed.")

    except Exception as e:
        conn.rollback()
        print("Error occurred:", e)

In [21]:
update_vaccine_dim(vaccine_dim)

4 SCD2 records processed.


In [22]:
# clean the vaccines first
df['Vaccine'] = df['Vaccine'].replace('Seasonal Influenza', 'Influenza')
df.head()

Unnamed: 0,Vaccine,Geography Type,Geography,FIPS,Season/Survey Year,Month,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper
0,Influenza,Counties,New Haven,9009,2018,1,>=18 Years,Non-Medical Setting,45.5,43.9 to 47.2,,43.9,47.2
1,Influenza,Counties,New Haven,9009,2021,1,>=18 Years,Non-Medical Setting,53.0,46.0 to 60.9,,46.0,60.9
2,Influenza,Counties,New Haven,9009,2020,1,Age,>=18 Years,52.4,50.6 to 54.3,,50.6,54.3
3,Influenza,Counties,New Haven,9009,2021,1,Age,>=18 Years,50.2,45.4 to 55.8,,45.4,55.8
4,Influenza,Counties,New Haven,9009,2018,1,Age,>=18 Years,34.0,32.6 to 35.5,,32.6,35.5


In [23]:
cursor.execute("SELECT vaccine_id, cleaned_vaccine, start_date, vaccine_strains FROM vaccine_dim")
mapping_data = cursor.fetchall()

vaccine_mapping = pd.DataFrame(mapping_data, columns=["vaccine_id", "cleaned_vaccine", "start_date", "vaccine_strains"])
vaccine_mapping

Unnamed: 0,vaccine_id,cleaned_vaccine,start_date,vaccine_strains
0,1,DTaP,2011-09-01,"Diphtheria, Tetanus, and Bordetella pertussis ..."
1,2,Polio,2011-09-01,"Poliovirus Types 1, 2, and 3 (inactivated)"
2,3,Hep B,2011-09-01,Hepatitis B virus (HBV)
3,4,PCV,2011-09-01,Streptococcus pneumoniae — multiple serotypes ...
4,7,Hib,2011-09-01,Haemophilus influenzae type b
5,10,Combined 7 Series,2011-09-01,"Includes: DTaP, IPV, MMR, Hib, Hep B, Varicell..."
6,11,Rotavirus,2011-09-01,"Rotavirus types G1, G2, G3, G4, G9 (Rotarix); ..."
7,9,Influenza,2011-09-01,"A/California/7/2009, A/Victoria/361/2011 (H3N2..."
8,12,Influenza,2012-09-01,"A/California/7/2009, A/Victoria/361/2011 (H3N2..."
9,13,Influenza,2013-09-01,"A/California/7/2009, A/Texas/50/2012 (H3N2), B..."


In [24]:
vaccine_mapping['start_date'] = pd.to_datetime(vaccine_mapping['start_date'])
vaccine_mapping['year'] = vaccine_mapping['start_date'].dt.year
vaccine_mapping

Unnamed: 0,vaccine_id,cleaned_vaccine,start_date,vaccine_strains,year
0,1,DTaP,2011-09-01,"Diphtheria, Tetanus, and Bordetella pertussis ...",2011
1,2,Polio,2011-09-01,"Poliovirus Types 1, 2, and 3 (inactivated)",2011
2,3,Hep B,2011-09-01,Hepatitis B virus (HBV),2011
3,4,PCV,2011-09-01,Streptococcus pneumoniae — multiple serotypes ...,2011
4,7,Hib,2011-09-01,Haemophilus influenzae type b,2011
5,10,Combined 7 Series,2011-09-01,"Includes: DTaP, IPV, MMR, Hib, Hep B, Varicell...",2011
6,11,Rotavirus,2011-09-01,"Rotavirus types G1, G2, G3, G4, G9 (Rotarix); ...",2011
7,9,Influenza,2011-09-01,"A/California/7/2009, A/Victoria/361/2011 (H3N2...",2011
8,12,Influenza,2012-09-01,"A/California/7/2009, A/Victoria/361/2011 (H3N2...",2012
9,13,Influenza,2013-09-01,"A/California/7/2009, A/Texas/50/2012 (H3N2), B...",2013


Deal with cumulative dataset first (scoped to HHS region and state)

In [25]:
df_cumulative = df[df['Geography Type'] != 'Counties']
df_cumulative.head()

Unnamed: 0,Vaccine,Geography Type,Geography,FIPS,Season/Survey Year,Month,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper
6,Influenza,States/Local Areas,New Jersey,34,2009-10,5,Race and Ethnicity,"American Indian or Alaska Native, Non-Hispanic",57.7,40.2 to 75.2 ‡,48,40.2,75.2 ‡
7,Influenza,States/Local Areas,New Jersey,34,2009-10,4,Race and Ethnicity,"American Indian or Alaska Native, Non-Hispanic",57.7,40.2 to 75.2 ‡,48,40.2,75.2 ‡
8,Influenza,States/Local Areas,New Jersey,34,2009-10,11,Race and Ethnicity,"Asian, Non-Hispanic",34.7,29.1 to 40.3,552,29.1,40.3
9,Influenza,States/Local Areas,New Jersey,34,2009-10,10,Race and Ethnicity,"Asian, Non-Hispanic",24.8,19.7 to 29.9,552,19.7,29.9
10,Influenza,States/Local Areas,New Jersey,34,2009-10,9,Race and Ethnicity,"Asian, Non-Hispanic",10.8,6.6 to 15.0,552,6.6,15.0


In [26]:
for _, row in df_cumulative.iterrows():
    start_year = row['Season/Survey Year'][:4]
    end_year = '20' + row['Season/Survey Year'][-2:]
    df_cumulative.loc[_, 'start_cohort_year'] = start_year
    df_cumulative.loc[_, 'start_cohort_datetime'] = str(start_year) + '-' + str(row['Month'])
    df_cumulative.loc[_, 'end_cohort_datetime'] = str(end_year) + '-' + str(row['Month'])
df_cumulative

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cumulative.loc[_, 'start_cohort_year'] = start_year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cumulative.loc[_, 'start_cohort_datetime'] = str(start_year) + '-' + str(row['Month'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cumulative.loc[_, 'end_cohort_datetime'] = str(end_year) 

Unnamed: 0,Vaccine,Geography Type,Geography,FIPS,Season/Survey Year,Month,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper,start_cohort_year,start_cohort_datetime,end_cohort_datetime
6,Influenza,States/Local Areas,New Jersey,34,2009-10,5,Race and Ethnicity,"American Indian or Alaska Native, Non-Hispanic",57.7,40.2 to 75.2 ‡,48,40.2,75.2 ‡,2009,2009-5,2010-5
7,Influenza,States/Local Areas,New Jersey,34,2009-10,4,Race and Ethnicity,"American Indian or Alaska Native, Non-Hispanic",57.7,40.2 to 75.2 ‡,48,40.2,75.2 ‡,2009,2009-4,2010-4
8,Influenza,States/Local Areas,New Jersey,34,2009-10,11,Race and Ethnicity,"Asian, Non-Hispanic",34.7,29.1 to 40.3,552,29.1,40.3,2009,2009-11,2010-11
9,Influenza,States/Local Areas,New Jersey,34,2009-10,10,Race and Ethnicity,"Asian, Non-Hispanic",24.8,19.7 to 29.9,552,19.7,29.9,2009,2009-10,2010-10
10,Influenza,States/Local Areas,New Jersey,34,2009-10,9,Race and Ethnicity,"Asian, Non-Hispanic",10.8,6.6 to 15.0,552,6.6,15.0,2009,2009-9,2010-9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220724,Influenza,States/Local Areas,Maine,23,2017-18,1,>=65 Years,Non-Medical Setting,41.4,36.8 to 46.0,,36.8,46.0,2017,2017-1,2018-1
220725,Influenza,States/Local Areas,Maine,23,2020-21,1,>=65 Years,Non-Medical Setting,30.0,26.6 to 33.4,2064,26.6,33.4,2020,2020-1,2021-1
220726,Influenza,States/Local Areas,Maine,23,2017-18,1,18-64 Years,Non-Medical Setting,45.6,39.7 to 51.5,,39.7,51.5,2017,2017-1,2018-1
220727,Influenza,States/Local Areas,Maine,23,2020-21,1,18-64 Years,Non-Medical Setting,33.7,29.7 to 37.7,2772,29.7,37.7,2020,2020-1,2021-1


In [27]:
df_cumulative['start_cohort_year'] = (pd.to_numeric(df_cumulative['start_cohort_year'], errors='coerce').astype('Int64'))
df_cumulative = pd.merge(df_cumulative, vaccine_mapping, left_on = ['Vaccine', 'start_cohort_year'], right_on=['cleaned_vaccine', 'year'], how='left')
df_cumulative.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cumulative['start_cohort_year'] = (pd.to_numeric(df_cumulative['start_cohort_year'], errors='coerce').astype('Int64'))


Unnamed: 0,Vaccine,Geography Type,Geography,FIPS,Season/Survey Year,Month,Dimension Type,Dimension,Estimate (%),95% CI (%),...,ci_lower,ci_upper,start_cohort_year,start_cohort_datetime,end_cohort_datetime,vaccine_id,cleaned_vaccine,start_date,vaccine_strains,year
0,Influenza,States/Local Areas,New Jersey,34,2009-10,5,Race and Ethnicity,"American Indian or Alaska Native, Non-Hispanic",57.7,40.2 to 75.2 ‡,...,40.2,75.2 ‡,2009,2009-5,2010-5,34,Influenza,2009-09-01,"A/Brisbane/59/2007 (H1N1)-like, A/Brisbane/10/...",2009
1,Influenza,States/Local Areas,New Jersey,34,2009-10,4,Race and Ethnicity,"American Indian or Alaska Native, Non-Hispanic",57.7,40.2 to 75.2 ‡,...,40.2,75.2 ‡,2009,2009-4,2010-4,34,Influenza,2009-09-01,"A/Brisbane/59/2007 (H1N1)-like, A/Brisbane/10/...",2009
2,Influenza,States/Local Areas,New Jersey,34,2009-10,11,Race and Ethnicity,"Asian, Non-Hispanic",34.7,29.1 to 40.3,...,29.1,40.3,2009,2009-11,2010-11,34,Influenza,2009-09-01,"A/Brisbane/59/2007 (H1N1)-like, A/Brisbane/10/...",2009
3,Influenza,States/Local Areas,New Jersey,34,2009-10,10,Race and Ethnicity,"Asian, Non-Hispanic",24.8,19.7 to 29.9,...,19.7,29.9,2009,2009-10,2010-10,34,Influenza,2009-09-01,"A/Brisbane/59/2007 (H1N1)-like, A/Brisbane/10/...",2009
4,Influenza,States/Local Areas,New Jersey,34,2009-10,9,Race and Ethnicity,"Asian, Non-Hispanic",10.8,6.6 to 15.0,...,6.6,15.0,2009,2009-9,2010-9,34,Influenza,2009-09-01,"A/Brisbane/59/2007 (H1N1)-like, A/Brisbane/10/...",2009


In [28]:
df_cumulative.head()

Unnamed: 0,Vaccine,Geography Type,Geography,FIPS,Season/Survey Year,Month,Dimension Type,Dimension,Estimate (%),95% CI (%),...,ci_lower,ci_upper,start_cohort_year,start_cohort_datetime,end_cohort_datetime,vaccine_id,cleaned_vaccine,start_date,vaccine_strains,year
0,Influenza,States/Local Areas,New Jersey,34,2009-10,5,Race and Ethnicity,"American Indian or Alaska Native, Non-Hispanic",57.7,40.2 to 75.2 ‡,...,40.2,75.2 ‡,2009,2009-5,2010-5,34,Influenza,2009-09-01,"A/Brisbane/59/2007 (H1N1)-like, A/Brisbane/10/...",2009
1,Influenza,States/Local Areas,New Jersey,34,2009-10,4,Race and Ethnicity,"American Indian or Alaska Native, Non-Hispanic",57.7,40.2 to 75.2 ‡,...,40.2,75.2 ‡,2009,2009-4,2010-4,34,Influenza,2009-09-01,"A/Brisbane/59/2007 (H1N1)-like, A/Brisbane/10/...",2009
2,Influenza,States/Local Areas,New Jersey,34,2009-10,11,Race and Ethnicity,"Asian, Non-Hispanic",34.7,29.1 to 40.3,...,29.1,40.3,2009,2009-11,2010-11,34,Influenza,2009-09-01,"A/Brisbane/59/2007 (H1N1)-like, A/Brisbane/10/...",2009
3,Influenza,States/Local Areas,New Jersey,34,2009-10,10,Race and Ethnicity,"Asian, Non-Hispanic",24.8,19.7 to 29.9,...,19.7,29.9,2009,2009-10,2010-10,34,Influenza,2009-09-01,"A/Brisbane/59/2007 (H1N1)-like, A/Brisbane/10/...",2009
4,Influenza,States/Local Areas,New Jersey,34,2009-10,9,Race and Ethnicity,"Asian, Non-Hispanic",10.8,6.6 to 15.0,...,6.6,15.0,2009,2009-9,2010-9,34,Influenza,2009-09-01,"A/Brisbane/59/2007 (H1N1)-like, A/Brisbane/10/...",2009


In [29]:
df_cumulative[df_cumulative['vaccine_id'].isna()]

Unnamed: 0,Vaccine,Geography Type,Geography,FIPS,Season/Survey Year,Month,Dimension Type,Dimension,Estimate (%),95% CI (%),...,ci_lower,ci_upper,start_cohort_year,start_cohort_datetime,end_cohort_datetime,vaccine_id,cleaned_vaccine,start_date,vaccine_strains,year


In [32]:
df_cumulative.drop(columns=['Vaccine', 'cleaned_vaccine','vaccine_strains', 'year', 'start_date', 'Season/Survey Year', 'start_cohort_year', 'Month', '95% CI (%)'], inplace=True)
df_cumulative.head()

Unnamed: 0,Geography Type,Geography,FIPS,Dimension Type,Dimension,Estimate (%),Sample Size,ci_lower,ci_upper,start_cohort_datetime,end_cohort_datetime,vaccine_id
0,States/Local Areas,New Jersey,34,Race and Ethnicity,"American Indian or Alaska Native, Non-Hispanic",57.7,48,40.2,75.2 ‡,2009-5,2010-5,34
1,States/Local Areas,New Jersey,34,Race and Ethnicity,"American Indian or Alaska Native, Non-Hispanic",57.7,48,40.2,75.2 ‡,2009-4,2010-4,34
2,States/Local Areas,New Jersey,34,Race and Ethnicity,"Asian, Non-Hispanic",34.7,552,29.1,40.3,2009-11,2010-11,34
3,States/Local Areas,New Jersey,34,Race and Ethnicity,"Asian, Non-Hispanic",24.8,552,19.7,29.9,2009-10,2010-10,34
4,States/Local Areas,New Jersey,34,Race and Ethnicity,"Asian, Non-Hispanic",10.8,552,6.6,15.0,2009-9,2010-9,34


In [110]:
df_cumulative['start_cohort_datetime'] = pd.to_datetime(df_cumulative['start_cohort_datetime']).dt.strftime('%Y-%m-%d')
df_cumulative['end_cohort_datetime'] = pd.to_datetime(df_cumulative['end_cohort_datetime']).dt.strftime('%Y-%m-%d')

In [33]:
df_cumulative.head()

Unnamed: 0,Geography Type,Geography,FIPS,Dimension Type,Dimension,Estimate (%),Sample Size,ci_lower,ci_upper,start_cohort_datetime,end_cohort_datetime,vaccine_id
0,States/Local Areas,New Jersey,34,Race and Ethnicity,"American Indian or Alaska Native, Non-Hispanic",57.7,48,40.2,75.2 ‡,2009-5,2010-5,34
1,States/Local Areas,New Jersey,34,Race and Ethnicity,"American Indian or Alaska Native, Non-Hispanic",57.7,48,40.2,75.2 ‡,2009-4,2010-4,34
2,States/Local Areas,New Jersey,34,Race and Ethnicity,"Asian, Non-Hispanic",34.7,552,29.1,40.3,2009-11,2010-11,34
3,States/Local Areas,New Jersey,34,Race and Ethnicity,"Asian, Non-Hispanic",24.8,552,19.7,29.9,2009-10,2010-10,34
4,States/Local Areas,New Jersey,34,Race and Ethnicity,"Asian, Non-Hispanic",10.8,552,6.6,15.0,2009-9,2010-9,34


In [34]:
cursor.execute("SELECT geography_id, original_geography, original_geography_type FROM geography_dim")
mapping_data = cursor.fetchall()

geography_mapping = pd.DataFrame(mapping_data, columns=["geography_id", "original_geography", "original_geography_type"])
geography_mapping

Unnamed: 0,geography_id,original_geography,original_geography_type
0,4,Region 7,HHS Regions/National
1,7,Region 1,HHS Regions/National
2,8,Region 8,HHS Regions/National
3,3,Region 10,HHS Regions/National
4,1,Region 6,HHS Regions/National
...,...,...,...
75,55,Maryland,States/Local Areas
76,58,NY-Rest of state,States/Local Areas
77,12,North Dakota,States/Local Areas
78,77,TX-Dallas County,States/Local Areas


In [35]:
df_cumulative = pd.merge(df_cumulative, geography_mapping, left_on = ['Geography', 'Geography Type'], right_on=['original_geography', 'original_geography_type'], how='left')
df_cumulative.head()

Unnamed: 0,Geography Type,Geography,FIPS,Dimension Type,Dimension,Estimate (%),Sample Size,ci_lower,ci_upper,start_cohort_datetime,end_cohort_datetime,vaccine_id,geography_id,original_geography,original_geography_type
0,States/Local Areas,New Jersey,34,Race and Ethnicity,"American Indian or Alaska Native, Non-Hispanic",57.7,48,40.2,75.2 ‡,2009-5,2010-5,34,14,New Jersey,States/Local Areas
1,States/Local Areas,New Jersey,34,Race and Ethnicity,"American Indian or Alaska Native, Non-Hispanic",57.7,48,40.2,75.2 ‡,2009-4,2010-4,34,14,New Jersey,States/Local Areas
2,States/Local Areas,New Jersey,34,Race and Ethnicity,"Asian, Non-Hispanic",34.7,552,29.1,40.3,2009-11,2010-11,34,14,New Jersey,States/Local Areas
3,States/Local Areas,New Jersey,34,Race and Ethnicity,"Asian, Non-Hispanic",24.8,552,19.7,29.9,2009-10,2010-10,34,14,New Jersey,States/Local Areas
4,States/Local Areas,New Jersey,34,Race and Ethnicity,"Asian, Non-Hispanic",10.8,552,6.6,15.0,2009-9,2010-9,34,14,New Jersey,States/Local Areas


In [36]:
df_cumulative.drop(columns=['Geography', 'Geography Type', 'original_geography', 'original_geography_type'], inplace=True)
df_cumulative.head()

Unnamed: 0,FIPS,Dimension Type,Dimension,Estimate (%),Sample Size,ci_lower,ci_upper,start_cohort_datetime,end_cohort_datetime,vaccine_id,geography_id
0,34,Race and Ethnicity,"American Indian or Alaska Native, Non-Hispanic",57.7,48,40.2,75.2 ‡,2009-5,2010-5,34,14
1,34,Race and Ethnicity,"American Indian or Alaska Native, Non-Hispanic",57.7,48,40.2,75.2 ‡,2009-4,2010-4,34,14
2,34,Race and Ethnicity,"Asian, Non-Hispanic",34.7,552,29.1,40.3,2009-11,2010-11,34,14
3,34,Race and Ethnicity,"Asian, Non-Hispanic",24.8,552,19.7,29.9,2009-10,2010-10,34,14
4,34,Race and Ethnicity,"Asian, Non-Hispanic",10.8,552,6.6,15.0,2009-9,2010-9,34,14


In [37]:
df_race = df_cumulative[df_cumulative['Dimension Type'] == 'Race and Ethnicity']
df_race.head()

Unnamed: 0,FIPS,Dimension Type,Dimension,Estimate (%),Sample Size,ci_lower,ci_upper,start_cohort_datetime,end_cohort_datetime,vaccine_id,geography_id
0,34,Race and Ethnicity,"American Indian or Alaska Native, Non-Hispanic",57.7,48,40.2,75.2 ‡,2009-5,2010-5,34,14
1,34,Race and Ethnicity,"American Indian or Alaska Native, Non-Hispanic",57.7,48,40.2,75.2 ‡,2009-4,2010-4,34,14
2,34,Race and Ethnicity,"Asian, Non-Hispanic",34.7,552,29.1,40.3,2009-11,2010-11,34,14
3,34,Race and Ethnicity,"Asian, Non-Hispanic",24.8,552,19.7,29.9,2009-10,2010-10,34,14
4,34,Race and Ethnicity,"Asian, Non-Hispanic",10.8,552,6.6,15.0,2009-9,2010-9,34,14


In [38]:
cursor.execute("SELECT race_ethnicity, race_ethnicity_id FROM race_ethnicity_dim")
mapping_data = cursor.fetchall()

race_ethnicity_mapping = pd.DataFrame(mapping_data, columns=["race_ethnicity", "race_ethnicity_id"])
race_ethnicity_mapping

Unnamed: 0,race_ethnicity,race_ethnicity_id
0,"American Indian or Alaska Native, Non-Hispanic",5
1,"Asian, Non-Hispanic",6
2,"White, Non-Hispanic",2
3,Hispanic,3
4,"Other or Multiple Races, Non-Hispanic",1
5,"Black, Non-Hispanic",4


In [39]:
race_dim = df_race['Dimension'].unique()
race_dim
# form into a dataframe
race_dim = pd.DataFrame(race_dim)
race_dim.columns = ['Race and Ethnicity']
race_dim = race_dim.reset_index(drop=True)
race_dim['id'] = race_dim.index + 1
display(race_dim)

Unnamed: 0,Race and Ethnicity,id
0,"American Indian or Alaska Native, Non-Hispanic",1
1,"Asian, Non-Hispanic",2
2,"White, Non-Hispanic",3
3,Hispanic,4
4,"Other or Multiple Races, Non-Hispanic",5
5,"Black, Non-Hispanic",6


In [122]:
try:
    insert_query = '''
    INSERT INTO race_ethnicity_dim (
        race_ethnicity
    )
    VALUES (%s)
    ON CONFLICT (race_ethnicity) 
    DO UPDATE SET
        race_ethnicity = EXCLUDED.race_ethnicity;
    '''
    for index, row in race_dim.iterrows():
        print(row['Race and Ethnicity'])
        cursor.execute(insert_query, (row['Race and Ethnicity'],))
    conn.commit()
    print(f"{len(race_dim)} records inserted into race_dim.")

except Exception as e:
    print(f"Error occurred: {e}")
    conn.rollback()

American Indian or Alaska Native, Non-Hispanic
Asian, Non-Hispanic
White, Non-Hispanic
Hispanic
Other or Multiple Races, Non-Hispanic
Black, Non-Hispanic
6 records inserted into race_dim.


In [118]:
cursor.execute("SELECT race_ethnicity, race_ethnicity_id FROM race_ethnicity_dim")
mapping_data = cursor.fetchall()

race_ethnicity_mapping = pd.DataFrame(mapping_data, columns=["race_ethnicity", "race_ethnicity_id"])
race_ethnicity_mapping

Unnamed: 0,race_ethnicity,race_ethnicity_id
0,"American Indian or Alaska Native, Non-Hispanic",5
1,"Asian, Non-Hispanic",6
2,"White, Non-Hispanic",2
3,Hispanic,3
4,"Other or Multiple Races, Non-Hispanic",1
5,"Black, Non-Hispanic",4


In [40]:
df_race = pd.merge(df_race, race_ethnicity_mapping, left_on = ['Dimension'], right_on=['race_ethnicity'], how='left')
df_race.head()

Unnamed: 0,FIPS,Dimension Type,Dimension,Estimate (%),Sample Size,ci_lower,ci_upper,start_cohort_datetime,end_cohort_datetime,vaccine_id,geography_id,race_ethnicity,race_ethnicity_id
0,34,Race and Ethnicity,"American Indian or Alaska Native, Non-Hispanic",57.7,48,40.2,75.2 ‡,2009-5,2010-5,34,14,"American Indian or Alaska Native, Non-Hispanic",5
1,34,Race and Ethnicity,"American Indian or Alaska Native, Non-Hispanic",57.7,48,40.2,75.2 ‡,2009-4,2010-4,34,14,"American Indian or Alaska Native, Non-Hispanic",5
2,34,Race and Ethnicity,"Asian, Non-Hispanic",34.7,552,29.1,40.3,2009-11,2010-11,34,14,"Asian, Non-Hispanic",6
3,34,Race and Ethnicity,"Asian, Non-Hispanic",24.8,552,19.7,29.9,2009-10,2010-10,34,14,"Asian, Non-Hispanic",6
4,34,Race and Ethnicity,"Asian, Non-Hispanic",10.8,552,6.6,15.0,2009-9,2010-9,34,14,"Asian, Non-Hispanic",6


In [41]:
df_race.drop(columns=['Dimension Type', 'Dimension', 'race_ethnicity'], inplace=True)
df_race.head()

Unnamed: 0,FIPS,Estimate (%),Sample Size,ci_lower,ci_upper,start_cohort_datetime,end_cohort_datetime,vaccine_id,geography_id,race_ethnicity_id
0,34,57.7,48,40.2,75.2 ‡,2009-5,2010-5,34,14,5
1,34,57.7,48,40.2,75.2 ‡,2009-4,2010-4,34,14,5
2,34,34.7,552,29.1,40.3,2009-11,2010-11,34,14,6
3,34,24.8,552,19.7,29.9,2009-10,2010-10,34,14,6
4,34,10.8,552,6.6,15.0,2009-9,2010-9,34,14,6


In [42]:
df_age = df_cumulative[df_cumulative['Dimension Type'] == 'Age']
df_age.head()

Unnamed: 0,FIPS,Dimension Type,Dimension,Estimate (%),Sample Size,ci_lower,ci_upper,start_cohort_datetime,end_cohort_datetime,vaccine_id,geography_id
13,34,Age,>=65 Years,65.2,2554,62.5,67.9,2009-5,2010-5,34,14
14,34,Age,>=65 Years,65.2,2554,62.5,67.9,2009-4,2010-4,34,14
15,34,Age,>=65 Years,65.1,2554,62.4,67.8,2009-3,2010-3,34,14
16,34,Age,>=65 Years,64.4,2554,61.7,67.1,2009-2,2010-2,34,14
17,34,Age,>=65 Years,63.8,2554,61.1,66.5,2009-1,2010-1,34,14


In [43]:
# do the same thing as what we did in the transactional
df_age.rename(columns={'Dimension': 'Age'}, inplace=True)
df_age.drop(columns=['Dimension Type'], inplace=True)
print(df_age.shape[0])
df_age.head()

148036


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_age.rename(columns={'Dimension': 'Age'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_age.drop(columns=['Dimension Type'], inplace=True)


Unnamed: 0,FIPS,Age,Estimate (%),Sample Size,ci_lower,ci_upper,start_cohort_datetime,end_cohort_datetime,vaccine_id,geography_id
13,34,>=65 Years,65.2,2554,62.5,67.9,2009-5,2010-5,34,14
14,34,>=65 Years,65.2,2554,62.5,67.9,2009-4,2010-4,34,14
15,34,>=65 Years,65.1,2554,62.4,67.8,2009-3,2010-3,34,14
16,34,>=65 Years,64.4,2554,61.7,67.1,2009-2,2010-2,34,14
17,34,>=65 Years,63.8,2554,61.1,66.5,2009-1,2010-1,34,14


In [58]:
df_location = df_cumulative[~df_cumulative['Dimension Type'].isin(['Race and Ethnicity', 'Age'])]
df_location.rename(columns={'Dimension Type': 'Age', 'Dimension':'vaccine_location'}, inplace=True)
df_location.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_location.rename(columns={'Dimension Type': 'Age', 'Dimension':'vaccine_location'}, inplace=True)


Unnamed: 0,FIPS,Age,vaccine_location,Estimate (%),Sample Size,ci_lower,ci_upper,start_cohort_datetime,end_cohort_datetime,vaccine_id,geography_id
66,34,>=18 Years,Non-Medical Setting,53.7,3285.0,49.6,57.8,2020-1,2021-1,20,14
118,34,>=18 Years,Non-Medical Setting,42.4,,36.3,48.5,2017-1,2018-1,17,14
131,34,6 Months - 17 Years,Non-Medical Setting,3.4,51.0,2.2,4.6,2014-1,2015-1,14,14
132,34,18-49 Years,Non-Medical Setting,49.4,,37.3,61.5,2017-1,2018-1,17,14
133,34,6 Months - 17 Years,Non-Medical Setting,8.8,1814.0,7.0,11.1,2020-1,2021-1,20,14


In [59]:
vaccine_location_dim = df_location['vaccine_location'].unique()
vaccine_location_dim = pd.DataFrame(vaccine_location_dim)
vaccine_location_dim.columns = ['vaccine_location']
vaccine_location_dim = vaccine_location_dim.reset_index(drop=True)
display(vaccine_location_dim)

Unnamed: 0,vaccine_location
0,Non-Medical Setting
1,School
2,Pharmacy/Store
3,Workplace
4,Medical Setting


In [125]:
try:
    insert_query = '''
    INSERT INTO vaccine_location_dim (
        vaccine_location
    )
    VALUES (%s);
    '''
    for index, row in vaccine_location_dim.iterrows():
        print(row['vaccine_location'])
        cursor.execute(insert_query, (row['vaccine_location'],))
    conn.commit()
    print(f"{len(vaccine_location_dim)} records inserted into vaccine_location_dim.")

except Exception as e:
    print(f"Error occurred: {e}")
    conn.rollback()

Non-Medical Setting
Error occurred: duplicate key value violates unique constraint "unique_vaccine_location_dim"
DETAIL:  Key (vaccine_location)=(Non-Medical Setting) already exists.



In [60]:
cursor.execute("SELECT vaccine_location, vaccine_location_id FROM vaccine_location_dim")
mapping_data = cursor.fetchall()

vac_location = pd.DataFrame(mapping_data, columns=["vaccine_location", "vaccine_location_id"])
vac_location

Unnamed: 0,vaccine_location,vaccine_location_id
0,Non-Medical Setting,1
1,School,2
2,Pharmacy/Store,3
3,Workplace,4
4,Medical Setting,5


In [61]:
# we merge with the vaccine_dim
df_location = pd.merge(df_location, vac_location, left_on = ['vaccine_location'], right_on=['vaccine_location'], how='left')
df_location.head()

Unnamed: 0,FIPS,Age,vaccine_location,Estimate (%),Sample Size,ci_lower,ci_upper,start_cohort_datetime,end_cohort_datetime,vaccine_id,geography_id,vaccine_location_id
0,34,>=18 Years,Non-Medical Setting,53.7,3285.0,49.6,57.8,2020-1,2021-1,20,14,1
1,34,>=18 Years,Non-Medical Setting,42.4,,36.3,48.5,2017-1,2018-1,17,14,1
2,34,6 Months - 17 Years,Non-Medical Setting,3.4,51.0,2.2,4.6,2014-1,2015-1,14,14,1
3,34,18-49 Years,Non-Medical Setting,49.4,,37.3,61.5,2017-1,2018-1,17,14,1
4,34,6 Months - 17 Years,Non-Medical Setting,8.8,1814.0,7.0,11.1,2020-1,2021-1,20,14,1


In [62]:
df_location.drop(columns=['vaccine_location'], inplace=True)
df_location.head()


Unnamed: 0,FIPS,Age,Estimate (%),Sample Size,ci_lower,ci_upper,start_cohort_datetime,end_cohort_datetime,vaccine_id,geography_id,vaccine_location_id
0,34,>=18 Years,53.7,3285.0,49.6,57.8,2020-1,2021-1,20,14,1
1,34,>=18 Years,42.4,,36.3,48.5,2017-1,2018-1,17,14,1
2,34,6 Months - 17 Years,3.4,51.0,2.2,4.6,2014-1,2015-1,14,14,1
3,34,18-49 Years,49.4,,37.3,61.5,2017-1,2018-1,17,14,1
4,34,6 Months - 17 Years,8.8,1814.0,7.0,11.1,2020-1,2021-1,20,14,1


In [49]:
df_cumulative_dimension = pd.concat([df_age, df_race, df_location])
df_cumulative_dimension.head()

Unnamed: 0,FIPS,Age,Estimate (%),Sample Size,ci_lower,ci_upper,start_cohort_datetime,end_cohort_datetime,vaccine_id,geography_id,race_ethnicity_id,vaccine_location_id
13,34,>=65 Years,65.2,2554,62.5,67.9,2009-5,2010-5,34,14,,
14,34,>=65 Years,65.2,2554,62.5,67.9,2009-4,2010-4,34,14,,
15,34,>=65 Years,65.1,2554,62.4,67.8,2009-3,2010-3,34,14,,
16,34,>=65 Years,64.4,2554,61.7,67.1,2009-2,2010-2,34,14,,
17,34,>=65 Years,63.8,2554,61.1,66.5,2009-1,2010-1,34,14,,


In [63]:
df_cumulative_dimension.shape[0]

198911

In [51]:
df_cumulative.shape[0]

198911

In [52]:
df_cumulative_dimension.head()

Unnamed: 0,FIPS,Age,Estimate (%),Sample Size,ci_lower,ci_upper,start_cohort_datetime,end_cohort_datetime,vaccine_id,geography_id,race_ethnicity_id,vaccine_location_id
13,34,>=65 Years,65.2,2554,62.5,67.9,2009-5,2010-5,34,14,,
14,34,>=65 Years,65.2,2554,62.5,67.9,2009-4,2010-4,34,14,,
15,34,>=65 Years,65.1,2554,62.4,67.8,2009-3,2010-3,34,14,,
16,34,>=65 Years,64.4,2554,61.7,67.1,2009-2,2010-2,34,14,,
17,34,>=65 Years,63.8,2554,61.1,66.5,2009-1,2010-1,34,14,,


In [133]:
df_cumulative_dimension.columns

Index(['FIPS', 'Age', 'Estimate (%)', 'Sample Size', 'ci_lower', 'ci_upper',
       'start_cohort_datetime', 'end_cohort_datetime', 'vaccine_id',
       'geography_id', 'race_ethnicity_id', 'vaccine_location_id'],
      dtype='object')

In [58]:
df['Dimension Type'].unique()


array(['>=18 Years', 'Age', 'Race and Ethnicity', '6 Months - 17 Years',
       '18-49 Years', '>=65 Years', '50-64 Years', '18-64 Years'],
      dtype=object)

In [59]:
age_groups = ['>=18 Years', '6 Months - 17 Years', '18-49 Years', '>=65 Years', '50-64 Years', '18-64 Years']
df_age = df[df['Dimension Type'].isin(age_groups)]
df_age.head()

Unnamed: 0,Vaccine,Geography Type,Geography,FIPS,Season/Survey Year,Month,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size,ci_lower,ci_upper
0,Influenza,Counties,New Haven,9009,2018,1,>=18 Years,Non-Medical Setting,45.5,43.9 to 47.2,,43.9,47.2
1,Influenza,Counties,New Haven,9009,2021,1,>=18 Years,Non-Medical Setting,53.0,46.0 to 60.9,,46.0,60.9
72,Influenza,States/Local Areas,New Jersey,34,2020-21,1,>=18 Years,Non-Medical Setting,53.7,49.6 to 57.8,3285.0,49.6,57.8
124,Influenza,States/Local Areas,New Jersey,34,2017-18,1,>=18 Years,Non-Medical Setting,42.4,36.3 to 48.5,,36.3,48.5
137,Influenza,States/Local Areas,New Jersey,34,2014-15,1,6 Months - 17 Years,Non-Medical Setting,3.4,2.2 to 4.6,51.0,2.2,4.6


In [60]:
df_age['Dimension'].unique()

array(['Non-Medical Setting', 'School', 'Pharmacy/Store', 'Workplace',
       'Medical Setting'], dtype=object)

In [134]:
df_cumulative_dimension.shape[0]

198911

In [135]:
df_cumulative_dimension.head()

Unnamed: 0,FIPS,Age,Estimate (%),Sample Size,ci_lower,ci_upper,start_cohort_datetime,end_cohort_datetime,vaccine_id,geography_id,race_ethnicity_id,vaccine_location_id
13,34,>=65 Years,65.2,2554,62.5,67.9,2009-05-01,2010-05-01,34,14,,
14,34,>=65 Years,65.2,2554,62.5,67.9,2009-04-01,2010-04-01,34,14,,
15,34,>=65 Years,65.1,2554,62.4,67.8,2009-03-01,2010-03-01,34,14,,
16,34,>=65 Years,64.4,2554,61.7,67.1,2009-02-01,2010-02-01,34,14,,
17,34,>=65 Years,63.8,2554,61.1,66.5,2009-01-01,2010-01-01,34,14,,


In [53]:
df_cumulative_dimension.columns

Index(['FIPS', 'Age', 'Estimate (%)', 'Sample Size', 'ci_lower', 'ci_upper',
       'start_cohort_datetime', 'end_cohort_datetime', 'vaccine_id',
       'geography_id', 'race_ethnicity_id', 'vaccine_location_id'],
      dtype='object')

In [54]:
df_cumulative_dimension['ci_lower'] = pd.to_numeric(df['ci_lower'], errors='coerce').astype(float)
df_cumulative_dimension['ci_upper'] = pd.to_numeric(df['ci_upper'], errors='coerce').astype(float)

In [55]:
df_cumulative_dimension['ci_lower'] = df_cumulative_dimension['ci_lower'].fillna(0)
df_cumulative_dimension['ci_upper'] = df_cumulative_dimension['ci_upper'].fillna(0)
df_cumulative_dimension['Estimate (%)'] = df_cumulative_dimension['Estimate (%)'].fillna(0)

In [56]:
df_cumulative_dimension['ci_lower']

13      40.2
14      36.7
15      24.8
16       0.0
17      40.2
        ... 
3075    40.3
3076    45.3
3077    48.2
3078     0.4
3079     2.6
Name: ci_lower, Length: 198911, dtype: float64

In [142]:
# loading cumulative fact table
try:
    insert_query='''
    INSERT INTO influenza_cumulative_fact_test(
        age,
        influenza_cumulative_estimate_pct,
        influenza_cumulative_sample_size,
        influenza_cumulative_ci_lower,
        influenza_cumulative_ci_upper,
        influenza_cumulative_start_cohort_datetime,
        influenza_cumulative_end_cohort_datetime,
        influenza_cumulative_vaccine_id,
        influenza_cumulative_geography_id,
        influenza_cumulative_race_ethnicity_id,
        influenza_cumulative_vaccine_location_id
    )
    VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    '''
    for index, row in df_cumulative_dimension.iterrows():
        start_cohort_date = pd.to_datetime(row['start_cohort_datetime']).date() if pd.notnull(row['start_cohort_datetime']) else None
        end_cohort_date = pd.to_datetime(row['end_cohort_datetime']).date() if pd.notnull(row['end_cohort_datetime']) else None
        vaccine_id = int(row['vaccine_id']) if pd.notnull(row['vaccine_id']) else None
        geography_id = int(row['geography_id']) if pd.notnull(row['geography_id']) else None
        race_ethnicity_id = int(row['race_ethnicity_id']) if pd.notnull(row['race_ethnicity_id']) else None
        sample_size = int(row['Sample Size']) if pd.notnull(row['Sample Size']) else None
        vaccine_location_id = int(row['vaccine_location_id']) if pd.notnull(row['vaccine_location_id']) else None

        cursor.execute(insert_query,(
        row['Age'],
        row['Estimate (%)'],
        sample_size,
        row['ci_lower'],
        row['ci_upper'],
        start_cohort_date,
        end_cohort_date,
        vaccine_id,
        geography_id,
        race_ethnicity_id,
        vaccine_location_id 
        ))
    conn.commit()
    print(f"{len(df_cumulative_dimension)} records inserted into influenza_cumulative_fact.")

except Exception as e:
    print(f"Error occurred: {e}")
    traceback.print_exc(file=sys.stdout)
    conn.rollback()


198911 records inserted into influenza_cumulative_fact.
