#### Objective
Gather data and create summary table for Visit History.xlsx

#### Pre-requisites
The following packages need to be installed in your notebook environment:
1. <a href = https://dev.mysql.com/doc/connector-python/en/>mysql.connector</a>
2. <a href = https://pandas.pydata.org/>pandas</a>
3. <a href = https://numpy.org/>numpy</a>
4. <a href = https://pypi.org/project/tableone/>tableone</a>
5. <a href = https://pypi.org/project/openpyxl/>openpyxl</a>

Be sure to install these packages in one of the directories on the system path (run `import sys; print(sys.path)` to list them) for the notebook environment.

You will need access (username and password) to seronet vaccine response database, `seronetdb-Vaccine_Response`, accessible at the AWS endpoint: `seronet-dev-instance.cwe7vdnqsvxr.us-east-1.rds.amazonaws.com`. <br>
**Note:** Store all MySQL connector parameters in a `.env`-style file as shown below (the loading cell currently opens a file named `test.env`). <i>The file needs to be in the same directory as the notebook</i>.

Ensure that the Excel workbook <i>Release_Data_Dictionary.xlsx</i> is in the same folder as this notebook. This workbook is also versioned on GitHub.

In [1]:
import mysql.connector as connection
import pandas as pd
import numpy as np
import os
from tableone import TableOne
from openpyxl import load_workbook, Workbook
from openpyxl.styles import Font
import sqlalchemy as sd
import datetime

In [2]:
# Date of the last edit to this notebook (appears to be MM/DD/YYYY).
last_edit = '12/13/2024'

In [3]:
# Release selector read by the configuration cell below:
# True -> release 4.2.0 settings, False -> release 5.2.0 settings.
V4_fitler = False

In [4]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

In [5]:
# Load the MySQL connection settings (hostname, database, username, password) from a
# plain-text KEY=VALUE file. The file is opened as "test.env" and must be stored in the
# same folder as the notebook.
# ==============
# Format:
# HOST=a.b.c.d
# DB=<my.database>
# USER=john
# PWD=abc#123
# ==============
# Note: no space around the "=" sign. Do not put any of the values in quotes.

env = {}
with open("test.env") as f:
    for line in f:
        line = line.strip()
        # Blank lines, "#" comment lines and lines without "=" carry no setting; skip them
        # instead of failing on the tuple unpack below.
        if not line or line.startswith("#") or "=" not in line:
            continue
        # Split on the FIRST "=" only, so a value that itself contains "=" (e.g. a
        # password) is kept intact.
        k, v = line.split("=", 1)
        env[k.strip()] = v.strip()

# A missing key raises KeyError here, which points directly at the incomplete file.
HOST = env["HOST"]
DB = env["DB"]
USER = env["USER"]
PWD = env["PWD"]

In [6]:
# Release-specific settings: the V4 filter selects release 4.2.0, anything else 5.2.0.
# The output folder differs between the two releases only by the version suffix.
version_num = "4.2.0" if V4_fitler == True else "5.2.0"
output_folder = r"C:\Users\breadsp2\Desktop\Release_" + version_num

# Settings shared by both releases.
file_sep = os.path.sep
sheet_name = "Vaccination_Data"
release_date = '2024-10-01'

In [7]:
# Connection settings gathered from the values loaded from the env file.
creds = {'usr': USER, 'pwd': PWD, 'hst': HOST, "prt": 3306, 'dbn': DB}

# Build the URL with URL.create() rather than string formatting: each component is
# escaped, so credentials containing URL-reserved characters (e.g. "#", "@", "/", ":")
# no longer corrupt the connection string.
db_url = sd.engine.URL.create(
    drivername="mysql+mysqlconnector",
    username=creds['usr'],
    password=creds['pwd'],
    host=creds['hst'],
    port=creds['prt'],
    database=creds['dbn'],
)
engine = sd.create_engine(db_url)
conn = engine.connect()

In [8]:
# Read the "Detailed_Report" sheet of this release's Participant_Visit_Info workbook.
curr_visit_file = f"{output_folder}{file_sep}Participant_Visit_Info_{version_num}.xlsx"
curr_visit_data = pd.read_excel(curr_visit_file, sheet_name="Detailed_Report")

In [9]:
# Read the "Detailed_Report" sheet of this release's Demographics workbook.
demo_file = f"{output_folder}{file_sep}Demographics_{version_num}.xlsx"
demo_data = pd.read_excel(demo_file, sheet_name="Detailed_Report")

In [10]:
# Load every Covid_Vaccination_Status row except those whose comment marks the record as
# submitted in error, non-existent, duplicated, or not linkable to a visit (rows with a
# NULL comment are kept). Visit_One_Offset_Correction is left-joined per participant to add
# Vaccine_Duration_From_Visit_1 = vaccination duration from index - Offset_Value.
All_Vaccine = pd.read_sql(("SELECT cv.*,   (cv.`SARS-CoV-2_Vaccination_Date_Duration_From_Index` -  voff.Offset_Value) as 'Vaccine_Duration_From_Visit_1' " +
        
                           "FROM Covid_Vaccination_Status as cv  " +
                           "left join Visit_One_Offset_Correction as voff on cv.Research_Participant_ID = voff.Research_Participant_ID "
                           "where cv.Covid_Vaccination_Status_Comments not in ('Record previously submitted in error.', " +
                           "'Vaccine Status does not exist, input error', 'Record previously submitted: Duplicated.', 'Record occurs after last Visit: Not able to Link') or cv.Covid_Vaccination_Status_Comments is NULL"), conn)

In [11]:
# Rows that describe no vaccination have no meaningful duration from visit 1: blank it out.
no_vaccine_rows = All_Vaccine["Vaccination_Status"].isin(['No vaccination event reported', 'Unvaccinated'])
All_Vaccine.loc[no_vaccine_rows, 'Vaccine_Duration_From_Visit_1'] = np.nan

In [12]:
# Load the normalized visit table and relabel its two vaccine columns as "last received"
# fields so they do not collide with the per-vaccination columns of All_Vaccine.
visit_data = pd.read_sql("SELECT * FROM Normalized_Visit_Info;", conn).rename(
    columns={
        "Vaccination_Status": "Last Vaccine Received",
        "SARS-CoV-2_Vaccine_Type": "Last Vaccine_Type Received",
    }
)

In [13]:

# Collapse every "nothing reported" spelling into a single label ...
no_event = "No vaccination event reported"
visit_data["Last Vaccine Received"] = visit_data["Last Vaccine Received"].replace(
    ["Unvaccinated", "No Vaccination Data", "No Vaccination Data Reported"], no_event
)

# ... and give the vaccine-type column the same label on those rows.
no_event_rows = visit_data["Last Vaccine Received"] == no_event
visit_data.loc[no_event_rows, "Last Vaccine_Type Received"] = no_event

In [14]:
# Same normalisation for the vaccination table: "Unvaccinated" / "Not reported" become the
# single "No vaccination event reported" label, which is then copied to the vaccine type.
no_event = "No vaccination event reported"
All_Vaccine["Vaccination_Status"] = All_Vaccine["Vaccination_Status"].replace(
    ["Unvaccinated", "Not reported"], no_event
)

no_event_rows = All_Vaccine["Vaccination_Status"] == no_event
All_Vaccine.loc[no_event_rows, "SARS-CoV-2_Vaccine_Type"] = no_event

In [15]:
# Attach each participant's Seronet ID. merge() defaults to an inner join on the columns the
# two frames share, so vaccination rows without a Participant record are dropped
# (author's note: only include participants with valid start date).
part_table = pd.read_sql("SELECT * FROM Participant", conn)
participant_ids = part_table[["Research_Participant_ID", "Seronet_Participant_ID"]]
All_Vaccine = All_Vaccine.merge(participant_ids)

In [16]:
# Outer-join vaccinations with visits on participant + visit ID: vaccinations without a
# matching visit and visits without a matching vaccination are both kept (with NaNs).
All_Vaccine =  All_Vaccine.merge(visit_data, on=["Research_Participant_ID", "Visit_Info_ID"], how="outer")

In [17]:
# Default (inner) merge: keep only participants listed in the Demographics workbook.
All_Vaccine = All_Vaccine.merge(demo_data["Seronet_Participant_ID"])
# Right join: keep every (participant, visit index) row of the visit workbook, even when no
# vaccination row matches. No `on=` is given, so pandas joins on all columns the two frames
# have in common -- NOTE(review): confirm that is exactly the two columns selected here.
All_Vaccine = All_Vaccine.merge(curr_visit_data[["Seronet_Participant_ID", 'Normalized_Visit_Index']], how="right")

In [18]:
# Remove rows that are exact duplicates across all columns.
All_Vaccine.drop_duplicates(inplace=True)
# Write a snapshot of the merged table to the current working directory.
All_Vaccine.to_csv("all_vaccine.csv")

In [19]:
#All_Vaccine = All_Vaccine.query("Seronet_Participant_ID == 'SN615520'")#

In [20]:
# Build one summary row per (participant, normalized visit) describing the vaccination
# history recorded up to that visit; all_data stacks those rows.
#
# NOTE: the misspelled labels 'No Vacccine_History' and 'XBB1.5 Booosters' are kept as-is
# because later cells match on those exact strings.

hist_col = "Visit_Vaccine_History"
dur_col = "SARS-CoV-2_Vaccination_Date_Duration_From_Index"
no_event = 'No vaccination event reported'


def _set_where(frame, condition, column, value):
    """Assign `value` to `column` on the rows of `frame` that satisfy the query `condition`."""
    # .loc (not frame[column][rows] = ...) so the write also lands under pandas Copy-on-Write.
    frame.loc[frame.query(condition).index, column] = value


def _count_columns(frame, keep):
    """Per row, count the non-null cells among the columns whose name satisfies `keep`.

    NOTE(review): on a crosstab this counts how many matching columns exist, not how many
    doses those columns tally -- confirm that is the intent.
    """
    return frame[[col for col in frame.columns if keep(col)]].count(axis=1)


# (columns that must all be present, row condition, label); applied in order, so a later
# match overwrites an earlier one.
primary_series_rules = [
    (["No vaccination event reportedNo vaccination event reported"],
     "`No vaccination event reportedNo vaccination event reported` >= 1",
     "No Vacccine_History"),
    (["UnvaccinatedN/A"],
     "`UnvaccinatedN/A` >= 1",
     "No Vacccine_History"),
    (["Dose 1 of 1Johnson & Johnson"],
     "`Dose 1 of 1Johnson & Johnson` >= 1",
     "Primary Series: Johnson & Johnson (1 Dose)"),
    (["Dose 1 of 2Pfizer", "Dose 2 of 2Pfizer"],
     "`Dose 1 of 2Pfizer` >= 1 and `Dose 2 of 2Pfizer` >= 1",
     "Primary Series: Pfizer (2 Doses)"),
    (["Dose 1 of 2Moderna", "Dose 2 of 2Moderna"],
     "`Dose 1 of 2Moderna` >= 1 and `Dose 2 of 2Moderna` >= 1",
     "Primary Series: Moderna (2 Doses)"),
]

# First dose of a two-dose series received, second dose not (yet) recorded.
one_dose_rules = [
    ("Dose 1 of 2Moderna", "Primary Series: Moderna (1 Dose)"),
    ("Dose 1 of 2Pfizer", "Primary Series: Pfizer (1 Dose)"),
    ("Dose 1 of 2Unknown", "Primary Series: Unknown / Other (1 Dose)"),
]
other_brand_first_doses = ("Dose 1 of 2Sinovac", "Dose 1 of 2Novavax", "Dose 1 of 2Sputnik V")

# Exactly one extra dose on top of a completed primary series: (current label, new prefix).
extra_dose_rules = [
    ('Primary Series: Johnson & Johnson (1 Dose)', "Primary Series: Johnson & Johnson (2 Dose)* and "),
    ('Primary Series: Moderna (2 Doses)', "Primary Series: Moderna (3 Doses)* and "),
    ('Primary Series: Pfizer (2 Doses)', "Primary Series: Pfizer (3 Doses)* and "),
    # Fixed: this rule used to compare against the label without its "Primary Series: "
    # prefix, a value that is never assigned, so it could not match.
    ('Primary Series: Mixed/Unknown/Other (2 Doses)', "Primary Series: Mixed/Unknown/Other (3 Doses)* and "),
]

# Dose sequences with a gap cannot be interpreted.
unresolved_conditions = [
    "`Dose 1 of 2` == 0 and `Dose 1 of 1` == 0 and `Dose 2 of 2` > 0",       # dose 2 without dose 1
    "`Dose 1 of 2` == 0 and `Dose 1 of 1` == 0 and `Extra Doses` > 0",       # extra dose without dose 1
    "`Dose 1 of 1` == 0 and `Dose 2 of 2` == 0 and `Extra Doses` > 0",       # extra dose without dose 2
    "`Dose 1 of 2` > 0 and `Dose 2 of 2` == 0 and `Booster Count` > 0",      # booster without dose 2
    "`Dose 1 of 1` == 0 and `Dose 1 of 2` == 0 and `Dose 2 of 2` == 0 and `Booster Count` > 0",  # booster only
]

moderna_list = ["Dose 1 of 2Moderna", "Dose 2 of 2Moderna", "Dose 2Moderna", "Dose 3Moderna",
                "Dose 2:Monovalent XBB.1.5Moderna", "Dose 3:BivalentModerna", "Dose 3:Monovalent XBB.1.5Moderna"]
pfizer_list = ["Dose 1 of 2Pfizer", "Dose 2 of 2Pfizer", "Dose 2Pfizer", "Dose 3Pfizer",
               "Dose 2:Monovalent XBB.1.5Pfizer", "Dose 3:BivalentPfizer", "Dose 3:Monovalent XBB.1.5Pfizer"]
JandJ_list = ["Dose 1 of 1Johnson & Johnson", "Dose 1 of 2Johnson & Johnson", "Dose 2Johnson & Johnson",
              "Dose 2 of 2Johnson & Johnson"]
Sinovac_list = ["Dose 1 of 2Sinovac", "Dose 2 of 2Sinovac"]
Other_list = ["Dose 1 of 2Unknown", "Dose 2 of 2Unknown"]
Novavax_list = ["Dose 1 of 2Novavax", "Dose 2 of 2Novavax"]
Sputnik_list = ["Dose 1 of 2Sputnik V", "Dose 2 of 2Sputnik V"]

# Brand written into the "Dose 1" / "Dose 2" / "Dose 3" columns. Order matters: when several
# brands occupy the same dose slot, the brand listed last wins.
brand_columns = [
    ("Pfizer", pfizer_list),
    ("Moderna", moderna_list),
    ("Johnson & Johnson", JandJ_list),
    ("Sinovac", Sinovac_list),
    ("Novavax", Novavax_list),
    ("Sputnik V", Sputnik_list),
    ("Unknown", Other_list),
]

visit_frames = []
# NaN IDs can never select any row, and an empty selection would crash the min/max below.
uni_id = list(set(All_Vaccine["Research_Participant_ID"].dropna()))
# For debugging, restrict uni_id here (e.g. uni_id = uni_id[:50]).

for curr_part in uni_id:
    # Independent copy, so the clean-up below only touches this participant's rows.
    visit_1_info = All_Vaccine[All_Vaccine["Research_Participant_ID"] == curr_part].copy()

    # Set unvaccinated / missing durations to -1000 so the column can be cast to int and
    # those rows sort ahead of real vaccinations within a visit.
    durations = visit_1_info[dur_col].fillna(-1000).replace('N/A', -1000)
    visit_1_info[dur_col] = [int(i) for i in durations]
    visit_1_info = visit_1_info.sort_values(["Normalized_Visit_Index", dur_col])

    visit_indexes = visit_1_info["Normalized_Visit_Index"].tolist()
    seen_visits = set(visit_indexes)
    first_visit = int(np.nanmin(visit_1_info["Normalized_Visit_Index"]))
    last_visit = int(np.nanmax(visit_1_info["Normalized_Visit_Index"]))

    for curr_visit in range(first_visit, last_visit + 1):
        if curr_visit == 0 or curr_visit not in seen_visits:
            continue

        # All rows up to and including the FIRST row recorded for this visit.
        # NOTE(review): further rows attached to the same visit are only picked up by the
        # next visit's summary -- confirm that is intended.
        samp_visit = visit_1_info[:visit_indexes.index(curr_visit) + 1].copy()
        samp_visit["Vaccination_Status"] = samp_visit["Vaccination_Status"].fillna(no_event)
        samp_visit["SARS-CoV-2_Vaccine_Type"] = samp_visit["SARS-CoV-2_Vaccine_Type"].fillna(no_event)

        # One row for the participant, one column per "<status><vaccine type>" combination.
        visit_summary = pd.crosstab(
            samp_visit["Research_Participant_ID"],
            samp_visit["Vaccination_Status"] + samp_visit["SARS-CoV-2_Vaccine_Type"],
        )
        visit_summary["Normalized_Visit"] = curr_visit
        visit_summary[hist_col] = "Missing Vaccination Data"

        for needed, condition, label in primary_series_rules:
            if all(col in visit_summary.columns for col in needed):
                _set_where(visit_summary, condition, hist_col, label)

        # Dose tallies. The statements must stay in this order: each name filter is evaluated
        # against the columns that exist at that moment.
        visit_summary["Dose 1 of 1"] = _count_columns(visit_summary, lambda c: c.startswith('Dose 1 of 1'))
        visit_summary["Dose 1 of 2"] = _count_columns(visit_summary, lambda c: c.startswith('Dose 1 of 2'))
        visit_summary["Dose 2 of 2"] = _count_columns(visit_summary, lambda c: c.startswith('Dose 2 of 2'))
        visit_summary["Extra Dose 2"] = _count_columns(
            visit_summary, lambda c: c.startswith('Dose 2') and not c.startswith('Dose 2 of 2'))
        visit_summary["Extra Dose 3"] = _count_columns(visit_summary, lambda c: c.startswith('Dose 3'))

        # A "Dose 2 of 2" after a one-dose Johnson & Johnson series is counted as an extra dose.
        x = visit_summary.query("Visit_Vaccine_History == 'Primary Series: Johnson & Johnson (1 Dose)' and `Dose 2 of 2` > 0")
        visit_summary.loc[x.index, "Dose 2 of 2"] = 0
        visit_summary.loc[x.index, "Extra Dose 2"] = 1

        visit_summary["Booster Count"] = _count_columns(visit_summary, lambda c: c.startswith('Booster'))
        visit_summary["Bivalent Boosters"] = _count_columns(
            visit_summary, lambda c: c.startswith('Booster') and ("Bivalent" in c))
        visit_summary["XBB1.5 Booosters"] = _count_columns(
            visit_summary, lambda c: c.startswith('Booster') and ("XBB.1.5" in c))

        visit_summary["Extra Doses"] = visit_summary["Extra Dose 2"] + visit_summary["Extra Dose 3"]

        for brand_col, label in one_dose_rules:
            if brand_col in visit_summary.columns:
                _set_where(visit_summary,
                           f"`Dose 1 of 2` == 1 and `Dose 2 of 2` == 0 and `{brand_col}` >= 1",
                           hist_col, label)

        if any(col in visit_summary.columns for col in other_brand_first_doses):
            _set_where(visit_summary, "`Dose 1 of 2` >= 1 and `Dose 2 of 2` == 0",
                       hist_col, "Primary Series: Unknown / Other (1 Dose)")

        # Two doses recorded but no single-brand rule matched above.
        _set_where(visit_summary,
                   "`Dose 1 of 2` == 1 and `Dose 2 of 2` == 1 and `Visit_Vaccine_History` in ['Missing Vaccination Data', 'No Vacccine_History']",
                   hist_col, "Primary Series: Mixed/Unknown/Other (2 Doses)")

        visit_summary["Primary Series"] = visit_summary[hist_col]

        # Append the booster count to the label.
        booster_suffix = visit_summary["Booster Count"].astype(str) + " Boosters"
        x = visit_summary.query("`Extra Doses` == 0")
        visit_summary.loc[x.index, hist_col] = visit_summary[hist_col] + " and " + booster_suffix

        for base_label, new_prefix in extra_dose_rules:
            x = visit_summary.query(f"`Extra Doses` == 1 and Visit_Vaccine_History == '{base_label}'")
            visit_summary.loc[x.index, hist_col] = new_prefix + booster_suffix

        # Tidy the wording: drop "and 0 Boosters", singularise "1 Boosters".
        for old, new in [(' and 0.0 Boosters', ''), (' and 0 Boosters', ''), ('1 Boosters', '1 Booster')]:
            visit_summary[hist_col] = [label.replace(old, new) for label in visit_summary[hist_col]]

        # Brand received for each of the first three dose slots ("" = undetermined).
        visit_summary["Dose 1"] = ""
        visit_summary["Dose 2"] = ""
        visit_summary["Dose 3"] = ""
        for brand, dose_columns in brand_columns:
            for curr_vacc in dose_columns:
                if curr_vacc in visit_summary.columns and len(visit_summary[visit_summary[curr_vacc] >= 1]) == 1:
                    # The first six characters of the column name are "Dose 1" / "Dose 2" / "Dose 3".
                    visit_summary[curr_vacc[:6]] = brand

        if len(visit_summary.query("`Dose 1 of 2` == 0 and `Dose 1 of 1` == 0")) == 1:
            visit_summary["Dose 1"] = "Not Received"
        if len(visit_summary.query("`Dose 2 of 2` == 0 and `Extra Dose 2` == 0")) == 1:
            visit_summary["Dose 2"] = "Not Received"
        if len(visit_summary.query("`Extra Dose 3` == 0")) == 1:
            visit_summary["Dose 3"] = "Not Received"

        for condition in unresolved_conditions:
            _set_where(visit_summary, condition, hist_col, 'Unable to Resolve Vaccination History')

        # Flatten to a plain frame (participant ID becomes a column, no column-index name).
        visit_summary = visit_summary.reset_index(drop=False)
        visit_summary = pd.DataFrame(visit_summary.values, columns=visit_summary.columns.tolist())
        visit_frames.append(visit_summary)

# Concatenate once at the end; growing a DataFrame inside the loop is quadratic.
all_data = pd.concat(visit_frames) if visit_frames else pd.DataFrame()
            


In [21]:
#if V4_fitler == True:
#    all_data = all_data.merge(v4_visits[['Research_Participant_ID', 'Visit_Info_ID', 'Normalized_Visit_Index']], 
#                              left_on = ["Research_Participant_ID", "Normalized_Visit"], right_on=['Research_Participant_ID', 'Normalized_Visit_Index'], how="left")

# Attach Seronet_Participant_ID to every summary row (default inner merge on the shared
# columns, so rows without a Participant record are dropped), then discard rows that are
# exact duplicates.
id_lookup = part_table[["Research_Participant_ID", "Seronet_Participant_ID"]]
all_data = all_data.merge(id_lookup)
all_data = all_data.drop_duplicates()

In [22]:
# Add a CBC column to the dataframe based on the Research_Participant_ID prefix.
# Prefixes are tested in the order listed; the first match wins.
cbc_by_prefix = {
    '14': "Mount_Sinai",
    '27': "University of Minnesota",
    '41': "Feinstein_Northwell",
    '32_22': "ASU: Midwestern",
    '32_33': "ASU: Dignity Health",
    '32_44': "ASU: ValleyWise",
    '32_77': "ASU: Phoenix Childrens Hospital",
    '32_55': "Columbia University",
}

conditions = [all_data['Research_Participant_ID'].str.startswith(prefix) for prefix in cbc_by_prefix]
values = list(cbc_by_prefix.values())

# An explicit string default is required: without it np.select falls back to the integer 0,
# which replaces the intended "Not Reported" label and, on recent NumPy versions, raises a
# TypeError because an int default cannot be promoted with string choices.
all_data['CBC'] = np.select(conditions, values, default="Not Reported")

In [23]:
# Move the current index into an "index" column, then split the booster tally:
# "Booster Count" is renamed to "Original Boosters" and reduced by the bivalent and
# XBB.1.5 boosters, which are reported in their own columns.
all_data = all_data.reset_index().rename(columns={"Booster Count": "Original Boosters"})
variant_boosters = all_data["Bivalent Boosters"] + all_data["XBB1.5 Booosters"]
all_data["Original Boosters"] = all_data["Original Boosters"] - variant_boosters

#all_data.to_csv("all_data_1.csv")


In [24]:
# If any visit of a participant could not be resolved, mark ALL of that participant's visits
# as unresolved. A single vectorised mask replaces the former query-per-participant loop,
# which was quadratic and built its query from the raw ID string.
all_data.reset_index(drop = True, inplace=True)
uni_id = list(set(all_data["Research_Participant_ID"]))

unresolved_label = 'Unable to Resolve Vaccination History'
unresolved_ids = all_data.loc[
    all_data["Visit_Vaccine_History"] == unresolved_label, "Research_Participant_ID"
].dropna()
affected_rows = all_data["Research_Participant_ID"].isin(unresolved_ids)
all_data.loc[affected_rows, "Visit_Vaccine_History"] = unresolved_label

In [25]:
# Left-join the visit table onto the summary rows by participant and visit number
# (Normalized_Visit on the left, Normalized_Visit_Index on the right); summary rows without
# a matching visit are kept with NaNs in the visit columns.
all_data = all_data.merge(visit_data, left_on=["Research_Participant_ID",'Normalized_Visit'], 
               right_on = ["Research_Participant_ID",'Normalized_Visit_Index'], how="left")

In [26]:
#if V4_fitler == True:
#    all_data = all_data.merge(v4_visits[['Research_Participant_ID', 'Visit_Info_ID']])
#    All_Vaccine = All_Vaccine.merge(v4_visits[['Research_Participant_ID', 'Visit_Info_ID']])

In [27]:
#col_list = ["CBC", "Research_Participant_ID" , "SeroNet_Cohort", "Normalized_Visit", 'Visit_Info_ID', "Visit_Vaccine_History", 
 #           "Primary Series", "Dose 1", "Dose 2", "Dose 3",  "Original Boosters", "Bivalent Boosters", "XBB1.5 Booosters"]

# Columns, in order, kept in the summary table; every other column of all_data is dropped.
col_list = ["Seronet_Participant_ID" , "SeroNet_Cohort", "Normalized_Visit", "Visit_Vaccine_History", 
            "Primary Series", "Dose 1", "Dose 2", "Dose 3",  "Original Boosters", "Bivalent Boosters", "XBB1.5 Booosters"]

all_data = all_data[col_list]

In [28]:
# Start every row with an empty window category; rows that no rule in the next cell matches
# keep this empty string.
All_Vaccine['Vaccine_Window_Cat'] = ""

In [29]:
# Bucket each vaccination by the number of days between the vaccine and the visit.
status_excluded = "['Unvaccinated', 'No Vaccination Data']"
days = "Duration_Between_Vaccine_and_Visit"

# Rows flagged as having no vaccination get a missing category.
x = All_Vaccine.query(f"Vaccination_Status in {status_excluded}")
All_Vaccine.loc[x.index, 'Vaccine_Window_Cat'] = np.nan  #no vaccination event

# Closed day ranges (first day, last day, label).
day_ranges = [
    (15, 45, "15 to 45 days (1 month)"),
    (46, 74, "46 to 74 days"),
    (75, 105, "75 to 105 days (3 month)"),
    (106, 164, "106 to 164 days"),
    (165, 195, "165 to 195 days (6 month)"),
    (196, 344, "196 to 344 days"),
    (345, 375, "345 to 375 days (12 month)"),
    (376, 524, "376 to 524 days"),
    (525, 555, "525 to 555 days (18 month)"),
    (556, 704, "556 to 704 days"),
    (705, 735, "705 to 735 days (24 month)"),
]

# (day condition, label). Rules are applied in order, so a later match overwrites an earlier
# one: the final ">= 5000" rule takes precedence over "736 days or more".
window_rules = [
    (f"{days} == 0", "Same Day"),
    (f"{days} > 0 and {days} < 15", "1 to 14 days"),
]
window_rules += [
    (f"{days} >= {first_day} and {days} <= {last_day}", label)
    for first_day, last_day, label in day_ranges
]
window_rules += [
    (f"{days} >= 736", "736 days or more"),
    (f"{days} >= 5000", "Unable to determine (Vaccine Date not provided)"),
]

for day_condition, label in window_rules:
    x = All_Vaccine.query(f"Vaccination_Status not in {status_excluded} and {day_condition}")
    All_Vaccine.loc[x.index, 'Vaccine_Window_Cat'] = label

In [30]:
# Order both tables by participant and visit; vaccination rows within a visit are further
# ordered by their duration from baseline.
all_data = all_data.sort_values(["Seronet_Participant_ID", "Normalized_Visit"])
All_Vaccine = All_Vaccine.sort_values(["Seronet_Participant_ID", "Normalized_Visit_Index", 'Vaccine_Duration_From_Baseline'])

In [31]:
#All_Vaccine.query("Research_Participant_ID == '32_221297'")

In [32]:
# Rows whose vaccine date was not provided cannot carry a meaningful duration: clear both
# duration columns for them.
undated_rows = All_Vaccine['Vaccine_Window_Cat'] == 'Unable to determine (Vaccine Date not provided)'
for duration_column in ("Duration_Between_Vaccine_and_Visit", "Vaccine_Duration_From_Baseline"):
    All_Vaccine.loc[undated_rows, duration_column] = np.nan


In [33]:
#All_Vaccine = All_Vaccine.query("Vaccination_Status not in ['No vaccination event reported', 'Unvaccinated']")

In [34]:
# Re-apply the "nothing reported" normalisation to the merged table, for both the
# per-vaccination status and the per-visit "last received" status.
no_event = "No vaccination event reported"
All_Vaccine["Vaccination_Status"] = All_Vaccine["Vaccination_Status"].replace("Unvaccinated", no_event)

All_Vaccine["Last Vaccine Received"] = All_Vaccine["Last Vaccine Received"].replace(
    ["Unvaccinated", "No Vaccination Data", "No Vaccination Data Reported"], no_event
)

In [35]:
# Visits with no vaccination event: label the vaccine type accordingly, clear both
# durations, and use a dedicated window category.
no_event_rows = All_Vaccine["Last Vaccine Received"] == 'No vaccination event reported'

All_Vaccine.loc[no_event_rows, "Last Vaccine_Type Received"] = 'No vaccination event reported'
for duration_column in ("Vaccine_Duration_From_Baseline", "Duration_Between_Vaccine_and_Visit"):
    All_Vaccine.loc[no_event_rows, duration_column] = np.nan

All_Vaccine.loc[no_event_rows, 'Vaccine_Window_Cat'] = "No Vaccination Event"

In [36]:
# Where the per-vaccination status says no event was reported, overwrite it with the
# visit-level "Last Vaccine Received" value of the same row.
x = All_Vaccine.query("Vaccination_Status == 'No vaccination event reported'") 
All_Vaccine.loc[x.index, "Vaccination_Status"] = x["Last Vaccine Received"]

In [37]:
#All_Vaccine = All_Vaccine.query("Vaccination_Status == `Last Vaccine Received`")
# Keep only rows that have a visit index; a missing value never equals itself,
# so this is the same filter as comparing the column with itself.
All_Vaccine = All_Vaccine[All_Vaccine["Normalized_Visit_Index"].notna()]

In [38]:
len(all_data)

15518

In [39]:
# Split each "Last Vaccine Received" label on ":".
#   - text before the first colon  -> "Last Vaccine Received"
#   - text after the first colon   -> "Last Booster Type Received" ("N/A" when there is no colon)
# The former placeholder column "Last Booster Type" was never read and was discarded by the
# column selection that follows, so it is no longer created.
label_parts = [label.split(":") for label in All_Vaccine["Last Vaccine Received"]]
All_Vaccine["Last Vaccine Received"] = [parts[0].strip() for parts in label_parts]
All_Vaccine["Last Booster Type Received"] = [
    parts[1].strip() if len(parts) > 1 else "N/A" for parts in label_parts
]

# Visits without a vaccination event have neither a booster type nor a vaccine type.
x = All_Vaccine.query("`Last Vaccine Received` == 'No vaccination event reported'")
All_Vaccine.loc[x.index, "Last Booster Type Received"] = 'N/A'
All_Vaccine.loc[x.index, "Last Vaccine_Type Received"] = 'N/A'

In [40]:
# Columns (and their order) kept in the vaccine frame from here on.
new_cols = [
    "Seronet_Participant_ID",
    "SeroNet_Cohort",
    "Normalized_Visit_Index",
    "Last Vaccine Received",
    "Last Vaccine_Type Received",
    "Last Booster Type Received",
    "Vaccine_Duration_From_Baseline",
    "Duration_Between_Vaccine_and_Visit",
    "Vaccine_Window_Cat",
    "SARS-CoV-2_Vaccination_Side_Effects",
    "Other_SARS-CoV-2_Vaccination_Side_Effects",
    "Covid_Vaccination_Status_Comments",
]

All_Vaccine = All_Vaccine.loc[:, new_cols]

In [41]:
all_data.columns

Index(['Seronet_Participant_ID', 'SeroNet_Cohort', 'Normalized_Visit',
       'Visit_Vaccine_History', 'Primary Series', 'Dose 1', 'Dose 2', 'Dose 3',
       'Original Boosters', 'Bivalent Boosters', 'XBB1.5 Booosters'],
      dtype='object')

In [42]:
# Keep the first row per participant/visit (and, for the vaccine frame, per last vaccine received).
all_data = all_data.drop_duplicates(subset=['Seronet_Participant_ID', "Normalized_Visit"])
All_Vaccine = All_Vaccine.drop_duplicates(
    subset=['Seronet_Participant_ID', "Normalized_Visit_Index", "Last Vaccine Received"]
)

In [43]:
# Remove leading/trailing whitespace from every vaccine-history label.
all_data["Visit_Vaccine_History"] = all_data["Visit_Vaccine_History"].map(lambda label: label.strip())

In [44]:
# Any of these values, in any column, is reported as an unresolvable vaccination history.
unresolved_aliases = [
    'Missing Vaccination Data',
    'No Vacccine_History with 1 Booster',
    'No Vacccine_History with 2 Boosters',
]
all_data.replace(unresolved_aliases, 'Unable to Resolve Vaccination History', inplace=True)

In [45]:
# Subset of rows belonging to the first normalized visit.
visit_1_data = all_data[all_data["Normalized_Visit"] == 1]

In [46]:
# Display order for the "Visit_Vaccine_History" rows of the summary tables.
# Labels ending in "*" carry an additional primary-series dose (explained in the
# notes appended to the workbook at the end of the notebook).
order_list = [
    'No Vacccine_History',

    # Johnson & Johnson, single dose
    'Primary Series: Johnson & Johnson (1 Dose)',
    'Primary Series: Johnson & Johnson (1 Dose) and 1 Booster',
    'Primary Series: Johnson & Johnson (1 Dose) and 2 Boosters',
    'Primary Series: Johnson & Johnson (1 Dose) and 3 Boosters',
    'Primary Series: Johnson & Johnson (1 Dose) and 4 Boosters',
    'Primary Series: Johnson & Johnson (1 Dose) and 5 Boosters',

    # Johnson & Johnson, additional dose
    'Primary Series: Johnson & Johnson (2 Dose)*',
    'Primary Series: Johnson & Johnson (2 Dose)* and 1 Booster',
    'Primary Series: Johnson & Johnson (2 Dose)* and 2 Boosters',
    'Primary Series: Johnson & Johnson (2 Dose)* and 3 Boosters',

    # Moderna
    'Primary Series: Moderna (1 Dose)',
    'Primary Series: Moderna (2 Doses)',
    'Primary Series: Moderna (2 Doses) and 1 Booster',
    'Primary Series: Moderna (2 Doses) and 2 Boosters',
    'Primary Series: Moderna (2 Doses) and 3 Boosters',
    'Primary Series: Moderna (2 Doses) and 4 Boosters',
    'Primary Series: Moderna (2 Doses) and 5 Boosters',

    # Moderna, additional dose
    'Primary Series: Moderna (3 Doses)*',
    'Primary Series: Moderna (3 Doses)* and 1 Booster',
    'Primary Series: Moderna (3 Doses)* and 2 Boosters',
    'Primary Series: Moderna (3 Doses)* and 3 Boosters',
    'Primary Series: Moderna (3 Doses)* and 4 Boosters',
    'Primary Series: Moderna (3 Doses)* and 5 Boosters',

    # Pfizer
    'Primary Series: Pfizer (1 Dose)',
    'Primary Series: Pfizer (2 Doses)',
    'Primary Series: Pfizer (2 Doses) and 1 Booster',
    'Primary Series: Pfizer (2 Doses) and 2 Boosters',
    'Primary Series: Pfizer (2 Doses) and 3 Boosters',
    'Primary Series: Pfizer (2 Doses) and 4 Boosters',
    'Primary Series: Pfizer (2 Doses) and 5 Boosters',
    'Primary Series: Pfizer (2 Doses) and 6 Boosters',
    'Primary Series: Pfizer (2 Doses) and 7 Boosters',

    # Pfizer, additional dose
    'Primary Series: Pfizer (3 Doses)*',
    'Primary Series: Pfizer (3 Doses)* and 1 Booster',
    'Primary Series: Pfizer (3 Doses)* and 2 Boosters',
    'Primary Series: Pfizer (3 Doses)* and 3 Boosters',
    'Primary Series: Pfizer (3 Doses)* and 4 Boosters',
    'Primary Series: Pfizer (3 Doses)* and 5 Boosters',

    # Unknown / mixed / other manufacturers
    'Primary Series: Unknown / Other (1 Dose)',
    'Primary Series: Mixed/Unknown/Other (2 Doses)',
    'Primary Series: Mixed/Unknown/Other (2 Doses) and 1 Booster',
    'Primary Series: Mixed/Unknown/Other (2 Doses) and 2 Boosters',
    'Primary Series: Mixed/Unknown/Other (2 Doses) and 3 Boosters',

    'Unable to Resolve Vaccination History',
]

# Distinct history labels actually present at baseline / across all visits.
has_value_1 = list(set(visit_1_data['Visit_Vaccine_History']))
has_value_A = list(set(all_data['Visit_Vaccine_History']))

# Per-variable row order: booster counts ascending, history labels in order_list
# order restricted to the labels that occur in the respective frame.
order_dict_1 = {
    'Original Boosters': sorted(set(visit_1_data['Original Boosters'])),
    "Visit_Vaccine_History": [label for label in order_list if label in has_value_1],
}
order_dict_a = {
    'Original Boosters': sorted(set(all_data['Original Boosters'])),
    "Visit_Vaccine_History": [label for label in order_list if label in has_value_A],
}

In [47]:
list(set(visit_1_data['Original Boosters']))

[0, 1]

In [48]:
# Summarise the vaccine-history and booster-count columns per cohort,
# once for baseline visits only and once for all visits.
columns = ['Visit_Vaccine_History', 'Original Boosters', 'Bivalent Boosters', 'XBB1.5 Booosters']
categorical = ['Visit_Vaccine_History', 'Original Boosters', 'Bivalent Boosters', 'XBB1.5 Booosters']
groupby = ['SeroNet_Cohort']

# Arguments are passed by keyword so that each one binds to the intended TableOne
# parameter regardless of the constructor's positional parameter order.
visit_1_summary_table1 = TableOne(
    visit_1_data,
    columns=columns,
    categorical=categorical,
    groupby=groupby,
    order=order_dict_1,
)
all_visit_summary_table1 = TableOne(
    all_data,
    columns=columns,
    categorical=categorical,
    groupby=groupby,
    order=order_dict_a,
)

In [49]:
All_Vaccine.columns

Index(['Seronet_Participant_ID', 'SeroNet_Cohort', 'Normalized_Visit_Index',
       'Last Vaccine Received', 'Last Vaccine_Type Received',
       'Last Booster Type Received', 'Vaccine_Duration_From_Baseline',
       'Duration_Between_Vaccine_and_Visit', 'Vaccine_Window_Cat',
       'SARS-CoV-2_Vaccination_Side_Effects',
       'Other_SARS-CoV-2_Vaccination_Side_Effects',
       'Covid_Vaccination_Status_Comments'],
      dtype='object')

In [50]:
# Load the detailed visit report produced for this release version.
visit_file = f"{output_folder}{file_sep}Participant_Visit_Info_{version_num}.xlsx"
all_visit_data = pd.read_excel(visit_file, sheet_name="Detailed_Report")

# Inner join on the three columns both frames share, so only participant/cohort/visit
# combinations present in the visit report remain in the vaccine frame.
visit_keys = ["Seronet_Participant_ID", "SeroNet_Cohort", "Normalized_Visit_Index"]
All_Vaccine = All_Vaccine.merge(all_visit_data[visit_keys], how="inner", on=visit_keys)

In [51]:
# Summarise the vaccine-to-visit window category per cohort.
columns = ['Vaccine_Window_Cat']
categorical = ['Vaccine_Window_Cat']
groupby = ['SeroNet_Cohort']

# Row order of the window categories (shortest to longest gap) and the cohort order.
order_dict = {}

order_dict['Vaccine_Window_Cat'] = [
    'No Vaccination Event', 'Same Day', '1 to 14 days', '15 to 45 days (1 month)', '46 to 74 days',
    '75 to 105 days (3 month)', '106 to 164 days', '165 to 195 days (6 month)', '196 to 344 days',
    '345 to 375 days (12 month)', '376 to 524 days', '525 to 555 days (18 month)',
    '556 to 704 days', '705 to 735 days (24 month)', '736 days or more',
    'Unable to determine (Vaccine Date not provided)']

order_dict["SeroNet_Cohort"] = ["Healthy Cohort", "Comorbidity Cohort", "Cancer", "IBD", "HIV", "Transplant"]

# Arguments are passed by keyword so that each one binds to the intended TableOne
# parameter regardless of the constructor's positional parameter order.
all_visit_window_table1 = TableOne(
    All_Vaccine,
    columns=columns,
    categorical=categorical,
    groupby=groupby,
    order=order_dict,
)

In [52]:
# The formatting cells that follow call workbook.add_format() and worksheet.set_column(),
# which are xlsxwriter APIs. Pin the engine so the writer does not depend on which
# Excel engine pandas happens to pick by default in this environment.
vaccine_writer = pd.ExcelWriter(
    f'{output_folder}{file_sep}{sheet_name}_{version_num}.xlsx',
    engine="xlsxwriter",
)

In [53]:
# Write each summary table followed by the row-level data it was built from.
# sheet_name is passed by keyword: pandas deprecates positional arguments to
# to_excel() other than the writer itself.
visit_1_summary_table1.to_excel(vaccine_writer, sheet_name='Baseline_Summary')
visit_1_data.to_excel(vaccine_writer, sheet_name='Baseline_Vaccination_History', index=False)

all_visit_summary_table1.to_excel(vaccine_writer, sheet_name='All_Visit_Summary')
all_data.to_excel(vaccine_writer, sheet_name='All_Vaccination History', index=False)

all_visit_window_table1.to_excel(vaccine_writer, sheet_name='Visit Window Summary')
All_Vaccine.to_excel(vaccine_writer, sheet_name='Visit Vacc and Side Effects', index=False)

In [54]:
# Copy the "Vaccine_Status" sheet of the data dictionary into the output workbook.
# Release_Data_Dictionary_External.xlsx is read via a relative path, so it must be
# present in the notebook's working directory.
dictionary = pd.read_excel("Release_Data_Dictionary_External.xlsx", sheet_name="Vaccine_Status")
dictionary.to_excel(vaccine_writer, sheet_name='Data_Dictionary', index=False)

In [55]:
# Column widths and a centred Arial 10pt format for the 'Baseline_Summary' sheet.
workbook = vaccine_writer.book
worksheet = vaccine_writer.sheets['Baseline_Summary']
column_fmt = workbook.add_format(dict(font_name='Arial', font_size=10, align='center'))

# (first column, last column, width): header names, sub-header names, data
for first_col, last_col, width in [(0, 0, 30), (1, 1, 65), (2, 3, 12)]:
    worksheet.set_column(first_col, last_col, width, column_fmt)
# Remaining data columns; left as the cell's final expression so the displayed result is unchanged.
worksheet.set_column(4, 10, 20, column_fmt)

In [56]:
# Column widths and a centred Arial 10pt format for the 'All_Visit_Summary' sheet.
workbook = vaccine_writer.book
worksheet = vaccine_writer.sheets['All_Visit_Summary']
column_fmt = workbook.add_format(dict(font_name='Arial', font_size=10, align='center'))

# (first column, last column, width): header names, sub-header names, data
for first_col, last_col, width in [(0, 0, 30), (1, 1, 65), (2, 3, 12)]:
    worksheet.set_column(first_col, last_col, width, column_fmt)
# Remaining data columns; left as the cell's final expression so the displayed result is unchanged.
worksheet.set_column(4, 10, 20, column_fmt)

0

In [57]:
# Column widths and a centred Arial 10pt format for the 'Visit Window Summary' sheet.
workbook = vaccine_writer.book
worksheet = vaccine_writer.sheets['Visit Window Summary']
column_fmt = workbook.add_format(dict(font_name='Arial', font_size=10, align='center'))

# (first column, last column, width): header names, sub-header names, data
for first_col, last_col, width in [(0, 0, 30), (1, 1, 50), (2, 3, 12)]:
    worksheet.set_column(first_col, last_col, width, column_fmt)
# Remaining data columns; left as the cell's final expression so the displayed result is unchanged.
worksheet.set_column(4, 10, 20, column_fmt)

0

In [58]:
# Close the writer before the workbook is reopened from disk with load_workbook in a later cell.
vaccine_writer.close()

In [59]:
'''
Open the workbook to add additional notes to the Summary page.
Each note shoud be on a separate line. Add note as a single value array: noteX = ['noteX: This is a sample note.']
Append note to sheet.
Save workbook.
'''
#wb = load_workbook(f'{output_folder}{file_sep}{sheet_name}_{version_num}.xlsx')
#sheet = wb['Baseline_Summary']
#note1 = ['Note *:   In addition to receiving the Primary Series, Immunodeficient participants at MSSM received an additional dose. ']
#note2 = ["                For Pfizer and Moderna this is a 3rd Dose, and for Johnson and Johnson this is a 2nd dose as part of the primary series (additional doses received prior to any boosters). "]
#
#sheet.append(note1)
#sheet.append(note2)
#
#wb.save(filename = f'{output_folder}{file_sep}{sheet_name}_{version_num}.xlsx')

"\nOpen the workbook to add additional notes to the Summary page.\nEach note shoud be on a separate line. Add note as a single value array: noteX = ['noteX: This is a sample note.']\nAppend note to sheet.\nSave workbook.\n"

In [60]:
# Reopen the saved workbook and append explanatory notes to the 'All_Visit_Summary' sheet.
# Each note is a single-value list, so sheet.append() writes it as its own row.
# The workbook is then saved back to the same path.
workbook_path = f'{output_folder}{file_sep}{sheet_name}_{version_num}.xlsx'
wb = load_workbook(workbook_path)
sheet = wb['All_Visit_Summary']

note1 = ['Note *:   In addition to receiving the Primary Series, Immunodeficient participants at MSSM received an additional dose. ']
note2 = ["                For Pfizer and Moderna this is a 3rd Dose, and for Johnson and Johnson this is a 2nd dose as part of the primary series (additional doses received prior to any boosters). "]
note3 = ['']
note4 = ["Note 1: Unable to Resolve Vaccination History means there is a vaccination event missing or dates are out of order"]
note5 = ["            For example Participant has Dose 2 but is missing Dose 1 information, or Dose 1 was occurs after Dose 2 was reported"]

for note_row in (note1, note2, note3, note4, note5):
    sheet.append(note_row)

wb.save(filename=workbook_path)