#### Objective
Gather data and create summary table for Covid_History.xlsx

#### Pre-requisites
The following packages need to be installed in your notebook environment:
1. <a href = https://dev.mysql.com/doc/connector-python/en/>mysql.connector</a>
2. <a href = https://pandas.pydata.org/>pandas</a>
3. <a href = https://numpy.org/>numpy</a>
4. <a href = https://pypi.org/project/tableone/>tableone</a>
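5. <a href = https://pypi.org/project/openpyxl/>openpyxl</a>
6. <a href = https://www.sqlalchemy.org/>sqlalchemy</a> (used to open the database connection passed to `pandas.read_sql`)
7. <a href = https://pypi.org/project/XlsxWriter/>xlsxwriter</a> (used to format the Summary sheet)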

Be sure to install these packages in one of the directories on the system path for the notebook environment (run `import sys; print(sys.path)` to list them).

You will need access (username and password) to seronet vaccine response database, `seronetdb-Vaccine_Response`, accessible at the AWS endpoint: `seronet-dev-instance.cwe7vdnqsvxr.us-east-1.rds.amazonaws.com`. <br>
**Note:** Store all MySQL connector parameters in an environment file, in the format shown below. The notebook reads this file under the name `test.env`. <i>The file needs to be in the same directory as the notebooks</i>.

Ensure that the Excel workbook <i>Release_Data_Dictionary_External.xlsx</i> is in the same folder as this notebook. This workbook is also versioned on GitHub.

#### Post-processing files
<u>Summary sheet:</u>
1. Remove all borders from Table 1.
2. Shade table rows in grey/white.
3. Right justify second column.
4. Center all columns reporting values.
5. Autoformat column widths.
6. Highlight all notes in bold.

<u>Detailed report sheets:</u>
1. All columns left justified.
2. Autoformat column widths.
3. Fill all empty cells with "Not Reported".


In [1]:
import mysql.connector as connection
import pandas as pd
import numpy as np
import os
from tableone import TableOne
from openpyxl import load_workbook, Workbook
from openpyxl.styles import Font
import sqlalchemy as sd
import datetime

In [2]:
# Last-edit date stamp (MM/DD/YYYY); not referenced elsewhere in the visible notebook.
last_edit = "11/20/2024"

In [3]:
# Detailed visit sheet of release 4.0.0, loaded from its hard-coded local path.
v4_visits = pd.read_excel(
    r"C:\Users\breadsp2\Documents\Release_4.0.0\Participant_Visit_Info_4.0.0.xlsx",
    sheet_name="Detailed_Report",
)
# Switch read further down to choose which release version/folder is used.
V4_fitler = False

In [4]:
# Turn off pandas' chained-assignment warning (the default setting is 'warn').
pd.set_option("mode.chained_assignment", None)

In [5]:
# Connection settings are read from the plain-text file "test.env", stored in
# the same folder as this notebook.
# ==============
# Format (one KEY=value pair per line):
# HOST=a.b.c.d
# DB=<my.database>
# USER=john
# PWD=abc#123
# ==============
# Note: do not put any of the values in quotes. Whitespace around keys and
# values is stripped, and blank lines are ignored.

env = {}
with open("test.env") as f:
    for lineno, line in enumerate(f, start=1):
        line = line.strip()
        # A blank (e.g. trailing) line carries no setting.
        if not line:
            continue
        # Split on the first "=" only, so a value may itself contain "=".
        k, sep, v = line.partition("=")
        if not sep:
            # The line content is left out of the message: it may hold a secret.
            raise ValueError(f"test.env line {lineno}: expected KEY=value")
        env[k.strip()] = v.strip()

# A missing key raises KeyError here, before any connection is attempted.
HOST = env["HOST"]
DB = env["DB"]
USER = env["USER"]
PWD = env["PWD"]

In [6]:
# Release being built: 4.2.0 when the V4 filter is switched on, otherwise 5.2.0.
version_num = "4.2.0" if V4_fitler else "5.2.0"
# The release folder is named after the version, so derive it instead of
# repeating the literal in both branches. Input workbooks are read from this
# folder and the output workbook is written to it.
output_folder = rf"C:\Users\breadsp2\Desktop\Release_{version_num}"
file_sep = os.path.sep
# Not referenced elsewhere in the visible notebook.
release_date = '2025-01-01'

In [7]:
# Prefix of the output workbook's file name (combined with the version number).
sheet_name = 'Covid_History'

In [8]:
# Turn off pandas' chained-assignment warning (the default setting is 'warn').
pd.set_option("mode.chained_assignment", None)
# Direct connection through mysql.connector, forcing its pure-Python implementation.
# NOTE(review): mydb is not referenced again in the visible notebook and is
# never closed - confirm whether it is still needed.
mydb = connection.connect(
    host=HOST,
    database=DB,
    user=USER,
    passwd=PWD,
    use_pure=True,
)

In [9]:
from urllib.parse import quote

# SQLAlchemy connection handed to the pd.read_sql calls in this notebook.
# User name and password are percent-encoded before being placed in the URL:
# characters such as "@", "/", ":" or "%" would otherwise be read as URL
# syntax and the connection string would be parsed incorrectly.
creds = {
    'usr': quote(USER, safe=""),
    'pwd': quote(PWD, safe=""),
    'hst': HOST,
    "prt": 3306,
    'dbn': DB,
}
connstr = "mysql+mysqlconnector://{usr}:{pwd}@{hst}:{prt}/{dbn}"
engine = sd.create_engine(connstr.format(**creds))
conn = engine.connect()

In [10]:
# One row per normalized visit, with the participant's Seronet ID attached
# through the Participant table.
All_Visits = pd.read_sql(
    sql=(
        "SELECT p.Seronet_Participant_ID, nv.Research_Participant_ID, nv.Seronet_Cohort, nv.Normalized_Visit_Index,  "
        "nv.Visit_Info_ID  From Normalized_Visit_Info as nv "
        "join Participant as p on nv.Research_Participant_ID = p.Research_Participant_ID"
    ),
    con=conn,
)

In [11]:
#All_Visits["Primary_Cohort"].replace("IBD", "Autoimmune", inplace=True)
#All_Visits["Primary_Cohort"].replace("Convalescent", "Healthy Control", inplace=True)
#All_Visits["Primary_Cohort"].replace("Inflammatory", "Healthy Control", inplace=True)
#All_Visits["Primary_Cohort"].replace("Chronic Conditions", "Healthy Control", inplace=True)

In [12]:
# Demographics workbook of the selected release, read from the release folder.
demo_file = f"{output_folder}{file_sep}Demographics_{version_num}.xlsx"
demo_data = pd.read_excel(demo_file, sheet_name="Detailed_Report")

In [13]:
# Visit-info workbook of the selected release, read from the release folder.
curr_visit_file = f"{output_folder}{file_sep}Participant_Visit_Info_{version_num}.xlsx"
curr_visit_data = pd.read_excel(curr_visit_file, sheet_name="Detailed_Report")

In [14]:
# Inner join: keep only visits whose participant appears in the demographics sheet.
All_Visits = All_Visits.merge(
    demo_data["Seronet_Participant_ID"],
    how="inner",
    on="Seronet_Participant_ID",
)
# Right join: one row per (participant, visit index) pair listed in the
# release's visit-info sheet, whether or not the database query matched it.
All_Visits = All_Visits.merge(
    curr_visit_data[["Seronet_Participant_ID", "Normalized_Visit_Index"]],
    how="right",
    on=["Seronet_Participant_ID", "Normalized_Visit_Index"],
)

In [15]:
# Covid_History rows that describe an actual COVID event; the two "nothing
# reported / nothing collected" statuses are filtered out in SQL.
query1 = (
    "SELECT * FROM Covid_History as ch "
    "where ch.COVID_Status not in "
    "('No COVID event reported', 'No COVID data collected')"
)

In [16]:
# Full contents of the Visit_One_Offset_Correction table.
offset = pd.read_sql(sql="SELECT * FROM Visit_One_Offset_Correction", con=conn)

In [17]:
# Run the Covid_History query defined in query1.
covid_history = pd.read_sql(sql=query1, con=conn)

In [18]:
# Left join on the shared columns: every visit is kept, and visits without a
# COVID event get missing values in the Covid_History columns.
covid_history = pd.merge(All_Visits, covid_history, how="left")

In [19]:
# Attach the offset-correction columns; rows without a match keep missing values.
covid_history = pd.merge(covid_history, offset, how="left")

In [20]:
# Add a CBC column chosen by the two-character prefix of Research_Participant_ID.
# na=False makes rows with a missing ID evaluate to False, so every condition
# is a plain boolean array as np.select requires.
conditions = [
    covid_history['Research_Participant_ID'].str.startswith('14', na=False),
    covid_history['Research_Participant_ID'].str.startswith('27', na=False),
    covid_history['Research_Participant_ID'].str.startswith('41', na=False),
    covid_history['Research_Participant_ID'].str.startswith('32', na=False),
]

values = ["Mount_Sinai", "UMN", "Feinstein_Northwell", "ASU"]

# Without an explicit default, np.select fills unmatched rows with 0 and so
# discards the intended "Not Reported" label.
covid_history['CBC'] = np.select(conditions, values, default="Not Reported")

In [21]:
# Empty frame that the per-visit status rows are appended to.
covid_visits = pd.DataFrame(
    columns=[
        "Seronet_Participant_ID",
        "Seronet_Cohort",
        "Normalized_Visit_Index",
        "COVID_Status",
    ]
)

In [22]:
# Visit identifiers present in the merged data, de-duplicated through a set
# (so their order is arbitrary).
uni_visit = [*set(covid_history["Visit_Info_ID"])]
# Only the columns needed to build the per-visit COVID status.
filt_covid_history = covid_history.loc[
    :,
    [
        "Visit_Info_ID",
        "Seronet_Participant_ID",
        "Seronet_Cohort",
        "Normalized_Visit_Index",
        "COVID_Status",
    ],
]

In [23]:
# Collapse the COVID events of each visit into one "A | B" status string.
# groupby walks every Visit_Info_ID once (rows with a missing ID form no group),
# instead of re-querying the whole frame per visit, and the pieces are
# concatenated in a single call rather than inside the loop.
per_visit = [covid_visits]
for curr_visit, x in filt_covid_history.groupby("Visit_Info_ID", sort=False):
    # Missing statuses belong to visits with no matching Covid_History row.
    # Dropping them first means a visit mixing missing and real statuses no
    # longer fails in the join, and an all-missing visit yields an empty list.
    covid_list = sorted(x["COVID_Status"].dropna())
    if not covid_list:
        covid_list = ["No Covid Event Reported"]
    # NOTE(review): repeated statuses are kept in the joined text
    # (e.g. "X | X"), as before - confirm whether they should be collapsed.
    x = x.drop_duplicates(["Visit_Info_ID", "Seronet_Participant_ID", "Seronet_Cohort", "Normalized_Visit_Index", "COVID_Status"])
    # Every remaining row of the visit carries the same combined status.
    # An unexpected non-text status now raises instead of silently ending the loop.
    per_visit.append(x.assign(COVID_Status=(" | ").join(covid_list)))

covid_visits = pd.concat(per_visit)

In [24]:
# Renumber rows 0..n-1; the cells below select rows by these positions.
covid_visits = covid_visits.reset_index(drop=True)

In [25]:
# One flag per visit and test type: does the combined status mention that test?
pcr_test = covid_visits["COVID_Status"].str.contains("Positive by PCR")
rapid_test = covid_visits["COVID_Status"].str.contains("Positive by Rapid Antigen Test")
anti_test = covid_visits["COVID_Status"].str.contains("Positive by Antibody Test")

# The three flags side by side, row-aligned with covid_visits.
z = pd.DataFrame(
    {
        "PCR": pcr_test,
        "Rapid_Antigen": rapid_test,
        "Antibody": anti_test,
    }
)

#covid_visits["Covid Test Summary"] = [i.find('Positive') for i in covid_visits["COVID_Status"]]

In [26]:
# Default label for every visit; the following cells overwrite it for visits
# with positive tests, self-reported likely positives, or no COVID event.
covid_visits["Covid Test Summary"] = "Negative Test"

In [27]:
# Label visits that are positive by exactly one test type.
# Rows are written through .loc: the previous covid_visits[col][rows] = ...
# form is chained assignment, which no longer updates the frame once pandas
# Copy-on-Write is active.
x = z.query("PCR == True and Rapid_Antigen == False and Antibody == False")
covid_visits.loc[x.index, "Covid Test Summary"] = "Positive by PCR Test"

x = z.query("PCR == False and Rapid_Antigen == True and Antibody == False")
covid_visits.loc[x.index, "Covid Test Summary"] = "Positive by Rapid Antigen Test"

x = z.query("PCR == False and Rapid_Antigen == False and Antibody == True")
covid_visits.loc[x.index, "Covid Test Summary"] = "Positive by Antibody Test"

# Visits whose whole status is the self-reported "Likely COVID Positive".
no_test = covid_visits.query("COVID_Status == 'Likely COVID Positive'")
covid_visits.loc[no_test.index, "Covid Test Summary"] = "No Test, self reported: Likely COVID Positive"

In [28]:
# Count the positive test types per visit. Summing the three flag columns
# explicitly keeps the count correct when this cell is re-run, because a
# previously added "all_tests" column is not included in the sum.
z["all_tests"] = z[["PCR", "Rapid_Antigen", "Antibody"]].sum(axis=1)
x = z.query("all_tests > 1")
# .loc instead of chained assignment, which is ignored under pandas Copy-on-Write.
covid_visits.loc[x.index, "Covid Test Summary"] = "Two or More Positive Tests"

In [29]:
# Visits with no COVID event keep that wording in the summary column.
no_test = covid_visits.query("COVID_Status == 'No Covid Event Reported'")
# .loc instead of chained assignment, which is ignored under pandas Copy-on-Write.
covid_visits.loc[no_test.index, "Covid Test Summary"] = "No Covid Event Reported"

In [30]:
# Inputs for the TableOne summary: one categorical variable, grouped by cohort.
groupby = 'Seronet_Cohort'
columns = ['Covid Test Summary']
categorical = ['Covid Test Summary']

# Display order of the levels, keyed by variable name.
# NOTE(review): the second key is spelled "SeroNet_Cohort" while the grouping
# column is "Seronet_Cohort" - confirm whether this ordering is ever applied.
order_dict = {
    'Covid Test Summary': [
        'No Covid Event Reported',
        'No Test, self reported: Likely COVID Positive',
        'Negative Test',
        'Positive by Antibody Test',
        'Positive by PCR Test',
        'Positive by Rapid Antigen Test',
        'Two or More Positive Tests',
    ],
    "SeroNet_Cohort": [
        "Healthy Cohort",
        "Comorbidity Cohort",
        "Cancer",
        "IBD",
        "HIV",
        "Transplant",
    ],
}


In [31]:
# For each test type, subtract Offset_Value from the test-date duration;
# the "N/A" placeholder becomes NaN instead.
for _normalized_col, _duration_col in (
    ("PCR_Duration_Normalized", "PCR_Test_Date_Duration_From_Index"),
    ("Rapid_Antigen_Duration_Normalized", "Rapid_Antigen_Test_Date_Duration_From_Index"),
    ("Antibody_Duration_Normalized", "Antibody_Test_Date_Duration_From_Index"),
):
    covid_history[_normalized_col] = [
        np.nan if a == "N/A" else a - b
        for a, b in zip(covid_history[_duration_col], covid_history["Offset_Value"])
    ]

In [32]:
# Columns kept in covid_history, in this order.
col_names = [
    # visit identification
    "Seronet_Participant_ID", "Seronet_Cohort", "Normalized_Visit_Index", "COVID_Status",
    # normalized test durations
    "PCR_Duration_Normalized", "Rapid_Antigen_Duration_Normalized", "Antibody_Duration_Normalized",
    # course of disease
    "Breakthrough_COVID", "Symptomatic_COVID", "Recovered_From_COVID", "Duration_of_Disease",
    "Recovery_Date_Duration_From_Index", "Disease_Severity", "Level_Of_Care",
    # symptoms, complications and therapy
    "Symptoms", "Other_Symptoms", "COVID_complications", "Long_COVID_symptoms",
    "Other_Long_COVID_symptoms", "COVID_Therapy",
    "Covid_History_Comments",
]

covid_history = covid_history.loc[:, col_names]

In [33]:
# Remove the Visit_Info_ID column from the per-visit table.
covid_visits = covid_visits.drop(columns=["Visit_Info_ID"])

In [34]:
# Fill missing statuses by assigning the result back. The previous
# column.fillna(..., inplace=True) form acts on a temporary object under pandas
# Copy-on-Write, which would leave NaN values that break the search below.
covid_history["COVID_Status"] = covid_history["COVID_Status"].fillna("No Covid Event Reported")
# Position of "Positive" in the status text, or -1 when absent.
covid_history["COVID_Summary"] = covid_history["COVID_Status"].str.find('Positive')
# Rows describing a positive event. An explicit copy lets later cells modify
# this table independently of covid_history. COVID_Summary stays in the copy
# because a later cell drops it from there.
pos_covid_visits = covid_history.query("COVID_Summary >= 0").copy()

covid_history = covid_history.drop(columns="COVID_Summary")

In [35]:
# Remove fully identical rows from both tables.
# NOTE(review): pos_covid_visits was derived before this step and is not
# de-duplicated here - confirm whether that is intended.
covid_visits = covid_visits.drop_duplicates()
covid_history = covid_history.drop_duplicates()

In [36]:
# Summary table of "Covid Test Summary" per cohort.
# Arguments are passed by keyword so none of them depends on the parameter
# order of TableOne.
# NOTE(review): newer tableone releases appear to place a `continuous`
# parameter ahead of `groupby`, which would misread a fourth positional
# value - confirm against the installed version.
covid_history_table1 = TableOne(
    covid_visits,
    columns=columns,
    categorical=categorical,
    groupby=groupby,
    order=order_dict,
)



In [37]:
# Normalise the symptom columns by assigning the results back. The previous
# column.fillna/replace(..., inplace=True) calls act on a temporary object
# under pandas Copy-on-Write and would leave the table unchanged.
pos_covid_visits["Symptoms"] = (
    pos_covid_visits["Symptoms"]
    .fillna("No symptoms reported")
    .replace("N/A", "No symptoms reported")
)
pos_covid_visits["Other_Symptoms"] = pos_covid_visits["Other_Symptoms"].fillna("N/A")

# Placeholder entries that are not counted as symptoms.
not_symptoms = {"N/A", "Other", "No symptoms reported"}

# Number of "|"-separated entries across both symptom columns, placeholders excluded.
pos_covid_visits["Number_Of_Symptoms"] = [
    sum(entry not in not_symptoms for entry in main.split("|") + other.split("|"))
    for main, other in zip(pos_covid_visits["Symptoms"], pos_covid_visits["Other_Symptoms"])
]

# COVID_Summary was only needed to select the positive rows.
pos_covid_visits = pos_covid_visits.drop(columns=["COVID_Summary"])

In [38]:
# Map the free-text COVID_Therapy values onto harmonized therapy names.
# Keys must match the raw values exactly (including their original spelling);
# values not listed here pass through unchanged. Combined therapies are written
# as names joined by " | " in alphabetical order.
# Corrected harmonized values: "hydroxychloroquine" spelling, the single-space
# " | " separator for NyQuil/DayQuil, and alphabetical order for the
# aspirin/azithromycin combination.
pos_covid_visits["Harmonized Therapy Name"] = pos_covid_visits["COVID_Therapy"].replace({
    'N/A': 'N/A',
    'Not Reported': 'Not Reported',
    'Paxlovid oral antiviral': 'paxlovid',
    'Paxlovid': 'paxlovid',
    'Remdesivir': 'remdesivir',
    'Azithromycin': 'azithromycin',
    'No treatment or therapy': 'No Therapy',
    'Molnupiravir': 'molnupiravir',
    'Monoclonal antibodies': 'monoclonal antibody therapy, not specified',
    'Aspirin': 'aspirin',
    'Paxlovid|Molnupiravir': 'molnupiravir | paxlovid',
    'Bamlanivimab': 'bamlanivimab',
    'Dulera and Albuterol inhalers': 'albuterol | mometasone furoate and formoterol fumarate dihydrate',
    'Sotrovimab': 'sotrovimab',
    'Hydroxychloroquine': 'hydroxychloroquine',
    'Convalescent serum/plasma': 'convalescent plasma',
    'Monoclonal antibodies, unspecified': 'monoclonal antibody therapy, not specified',
    'Unknown': 'Unknown',
    'Monoclonal infusion': 'monoclonal antibody therapy, not specified',
    'Monoclonal antibody therapy': 'monoclonal antibody therapy, not specified',
    # The key keeps the raw data's misspelling; only the harmonized name is corrected.
    'Remdesivir|Molnupiravir|Hydroxychloriquine': 'hydroxychloroquine | molnupiravir | remdesivir',
    'Remdesivir|Dexamethasone': 'dexamethasone | remdesivir',
    'steroids for COVID-19 related tightening of chest and history of asthma': 'steroids, not specified',
    'Hydroxychloroquine,Azithromycin': 'azithromycin | hydroxychloroquine',
    'Azithromycin,Other (specify)': 'azithromycin | Other',
    'Prednisone and cough medicine': 'cough medicine, not specified | prednisone',
    'Other COVID-19 treatment': 'Other',
    'Bebtelovimab': 'bebtelovimab',
    'NyQuil and DayQuil': 'acetaminophen, dextromethorphan, and doxylamine | acetaminophen, dextromethorphan, and pseudoephedrine',
    'Hydroxychloroquine,Azithromycin,Aspirin,Blood Thinners': 'aspirin | azithromycin | blood thinners, not specified | hydroxychloroquine',
    'Paxlovid|prednisone': 'paxlovid | prednisone',
    'Monoclonoantibody infusion': 'monoclonal antibody therapy, not specified',
    'Molnupiravir|Monoclonal antibodies mAb': 'molnupiravir | monoclonal antibody therapy, not specified',
})


In [39]:
# The workbook Release_Data_Dictionary_External.xlsx must be present in the
# notebook's working directory; its Covid_History_Dictionary sheet is written
# to the output workbook as the data dictionary.
dictionary = pd.read_excel(
    "Release_Data_Dictionary_External.xlsx",
    sheet_name="Covid_History_Dictionary",
)

In [40]:
# The output workbook is named <sheet_name>_<version_num>.xlsx and written to the release folder.
# The xlsxwriter engine is requested explicitly: the formatting cell that
# follows calls add_format/set_column, which are xlsxwriter APIs, so relying on
# whichever engine pandas picks by default would fail there when xlsxwriter is
# missing.
covid_history_writer = pd.ExcelWriter(
    f'{output_folder}{file_sep}{sheet_name}_{version_num}.xlsx',
    engine="xlsxwriter",
)

# sheet_name is passed by keyword; recent pandas releases deprecate passing it positionally.
covid_history_table1.to_excel(covid_history_writer, sheet_name="Summary")
covid_visits.to_excel(covid_history_writer, sheet_name='Covid Test All Visit', index=False)
pos_covid_visits.to_excel(covid_history_writer, sheet_name='Positive Covid Visits', index=False)
dictionary.to_excel(covid_history_writer, sheet_name='Data_Dictionary', index=False)

In [41]:
# Format the Summary sheet: one shared cell format (Arial 10, centred) applied
# to four column ranges with different widths.
workbook = covid_history_writer.book
worksheet = covid_history_writer.sheets['Summary']
column_fmt = workbook.add_format({'font_name': 'Arial', 'font_size': 10, 'align': 'center'})
for _first_col, _last_col, _width in (
    (0, 0, 30),    # column header names
    (1, 1, 42),    # sub column header names
    (2, 3, 12),    # all the data
    (4, 17, 20),   # all the data
):
    worksheet.set_column(_first_col, _last_col, _width, column_fmt)

0

In [42]:
# Close the writer before the workbook is re-opened with openpyxl below.
covid_history_writer.close()

In [43]:

# Re-open the written workbook and append a blank row plus an explanatory note
# at the bottom of the Summary sheet, then save it back to the same path.
output_path = f'{output_folder}{file_sep}{sheet_name}_{version_num}.xlsx'
wb = load_workbook(output_path)
sheet = wb['Summary']
note0 = [""]
note1 = ["Note: Likely COVID Positive means the participant was not tested, but was around Positive Individuals"]
for _note_row in (note0, note1):
    sheet.append(_note_row)
wb.save(filename=output_path)