In [1]:
import warnings
import mysql.connector as connection
import pandas as pd
import numpy as np
import os
import sqlalchemy as sd
from tableone import TableOne
from openpyxl import load_workbook
import papermill as pm

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# Turn off pandas' chained-assignment check for the whole notebook (pandas default is 'warn').
pd.set_option("mode.chained_assignment", None)

In [3]:
# Connection settings (hostname, database, username, password) are read from a
# plain-text file named "test.env" located in the notebook's working directory.
# ==============
# Format:
# HOST=a.b.c.d
# DB=<my.database>
# USER=john
# PWD=abc#123
# ==============
# Note: no space around the "=" sign. Do not put any of the values in quotes.
# Blank lines and lines starting with "#" are ignored.

env = {}
with open("test.env") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        # A trailing newline or an empty separator line must not abort the load.
        if not line or line.startswith("#"):
            continue
        # Split on the FIRST "=" only, so a value that itself contains "="
        # (e.g. a password) is kept intact instead of raising on unpacking.
        k, sep, v = line.partition("=")
        if not sep:
            # Report the line number only; the text might be part of a secret.
            raise ValueError(f"test.env line {line_no}: expected KEY=VALUE")
        env[k.strip()] = v.strip()

# A missing key raises KeyError here, before any connection is attempted.
HOST = env["HOST"]
DB = env["DB"]
USER = env["USER"]
PWD = env["PWD"]



In [4]:
# Date of the last manual edit to this notebook, written as MM/DD/YYYY text.
last_edit = "12/16/2024"

In [5]:
# Load the "Detailed_Report" sheet of the release 4.0.0 participant visit workbook.
v4_visits = pd.read_excel(
    r"C:\Users\breadsp2\Documents\Release_4.0.0\Participant_Visit_Info_4.0.0.xlsx",
    sheet_name="Detailed_Report",
)

# Switch read by the release-settings cell to choose between the two releases.
V4_fitler = False

In [6]:
# Release number and output folder depend on the V4 filter switch:
# the filtered run targets release 4.2.0, otherwise release 5.2.0 is used.
version_num, output_folder = (
    ("4.2.0", r"C:\Users\breadsp2\Desktop\Release_4.2.0")
    if V4_fitler == True
    else ("5.2.0", r"C:\Users\breadsp2\Desktop\Release_5.2.0")
)

release_date = '2024-10-01'
# Platform path separator used when output file paths are assembled by hand.
file_sep = os.path.sep

In [7]:
# Select which report this run builds; leave exactly one assignment active.
# NOTE: the spelling "Comborbidities" is compared verbatim by later cells,
# so it must not be corrected here without updating those comparisons too.
file_name = 'Baseline_Comborbidities'
#file_name = "Baseline_Infections"
#file_name = "Baseline_Chronic_list"

In [8]:
# Per-report configuration: file_name -> (conditions to process, sheet_name).
# sheet_name is handed to the per-condition notebook as its `sheet_1` parameter
# and is also used to build the workbook file names further down.
REPORT_CONFIG = {
    "Baseline_Comborbidities": (
        ["Diabetes", "Hypertension", "Cardiovascular_Disease", "Acute_Liver_Disease",
         "Immunosuppressive_Condition", "Autoimmune_Disorder", "Inflammatory_Disease", "Cancer"],
        "All_Participant_Comorbidities",
    ),
    "Baseline_Infections": (
        ["Viral_Infection", "Bacterial_Infection"],
        "All_Participant_Infections",
    ),
    "Baseline_Chronic_list": (
        ["Chronic_Lung_Disease", "Chronic_Kidney_Disease", "Chronic_Liver_Disease",
         "Chronic_Neurological_Condition", "Chronic_Oxygen_Requirement"],
        "All_Participant_Chronic_Conditions",
    ),
}

# Fail loudly on an unrecognised report name. Without this check the two
# variables below would stay undefined (fresh kernel) or silently keep the
# values from a previous run (stale kernel state).
if file_name not in REPORT_CONFIG:
    raise ValueError(
        f"Unknown file_name {file_name!r}; expected one of {sorted(REPORT_CONFIG)}"
    )

_conditions, sheet_name = REPORT_CONFIG[file_name]
# Copy so later edits to list_of_conditions cannot alter the configuration table.
list_of_conditions = list(_conditions)

In [9]:
# Execute the per-condition notebook once for every selected condition.
# All runs write to the same 'Output_file.ipynb', so each run overwrites the
# executed copy left by the previous one.
# NOTE(review): papermill stores the injected parameters in the executed
# notebook, so PWD probably ends up in clear text in Output_file.ipynb.
# Confirm, and keep that file out of shared folders / version control.
for curr_cond in list_of_conditions:
    notebook_params = {
        "Condition": curr_cond,
        "USER": USER,
        "PWD": PWD,
        "HOST": HOST,
        "DB": DB,
        "sheet_1": sheet_name,
    }
    res = pm.execute_notebook(
        'Comorbid_baseline.ipynb',
        'Output_file.ipynb',
        parameters=notebook_params,
    )
    print(f"finished {curr_cond}")

Executing:   0%|          | 0/28 [00:00<?, ?cell/s]

finished Diabetes


Executing:   0%|          | 0/28 [00:00<?, ?cell/s]

finished Hypertension


Executing:   0%|          | 0/28 [00:00<?, ?cell/s]

finished Cardiovascular_Disease


Executing:   0%|          | 0/28 [00:00<?, ?cell/s]

finished Acute_Liver_Disease


Executing:   0%|          | 0/28 [00:00<?, ?cell/s]

finished Immunosuppressive_Condition


Executing:   0%|          | 0/28 [00:00<?, ?cell/s]

finished Autoimmune_Disorder


Executing:   0%|          | 0/28 [00:00<?, ?cell/s]

finished Inflammatory_Disease


Executing:   0%|          | 0/28 [00:00<?, ?cell/s]

finished Cancer


In [10]:
# Workbook holding one "<condition>_Summary" sheet per condition
# (presumably written by the Comorbid_baseline.ipynb runs - TODO confirm).
comorbid_file = f"{output_folder}{file_sep}{sheet_name}_{version_num}.xlsx"

# Conditions whose summary sheet is not named "<condition>_Summary".
SUMMARY_SHEET_OVERRIDES = {
    "Immunosuppressive_Condition": "Immunosuppressive_Summary",
    "Chronic_Neurological_Condition": "Chronic_Neurological_Summary",
    "Chronic_Oxygen_Requirement": "Chronic_Oxygen_Summary",
}

# Status given to every row whose Harmonized_Value matches none of the rules below.
DEFAULT_STATUS = " Yes: Participant has Condition"

# Harmonized_Value -> status label. The leading spaces are part of the labels
# and are reproduced verbatim.
# NOTE(review): pd.read_excel normally loads blank cells as NaN rather than '',
# so the '' rule may never match and un-harmonized rows may be counted under
# DEFAULT_STATUS. Confirm against the data before changing it.
STATUS_BY_HARMONIZED_VALUE = {
    "": "  Yes: Term Not Harmonized",
    "Participant does not have Condition (answered No at baseline)": "  No: Does not have Condition",
    "Not Cancer": "  No: Does not have Condition",
    "Participant did not answer question, Unable to determine if condition exists": "Status not Provided: Not Reported",
    "Not Reported": "Status not Provided: Not Reported",
    "Participant is unsure if condition exists (answered Unknown Status)": " Unknown: Participant is Unsure",
    "Unknown": " Unknown: Participant is Unsure",
}

# None (not an empty list) marks "nothing merged yet". Testing len() == 0
# would also be true for an empty first frame and would then drop that
# condition's status column from the result.
all_data = None

for curr_cond in list_of_conditions:
    summary_sheet = SUMMARY_SHEET_OVERRIDES.get(curr_cond, curr_cond + "_Summary")
    comorbid_data = pd.read_excel(comorbid_file, sheet_name=summary_sheet)
    comorbid_data = comorbid_data.reset_index(drop=True)

    status_col = curr_cond + "_Status"
    comorbid_data[status_col] = DEFAULT_STATUS

    # Single .loc assignment per rule. The previous chained form
    # (frame[col][index] = value) depends on pandas returning a view and is
    # silently ineffective when copy-on-write is active.
    for harmonized_value, status_label in STATUS_BY_HARMONIZED_VALUE.items():
        matches = comorbid_data["Harmonized_Value"] == harmonized_value
        comorbid_data.loc[matches, status_col] = status_label

    #x = comorbid_data.query(f"Harmonized_Value in ['Condition Not Described', '{curr_cond}, Type Unspecified']")
    #comorbid_data[curr_cond + "_Status"][x.index] = 'Yes: Type Unspecified'

    comorbid_data = comorbid_data[["Seronet_Participant_ID", status_col]]
    if all_data is None:
        all_data = comorbid_data
    else:
        # Participant ID is the only column the frames share; naming it makes
        # the join key explicit instead of relying on pandas' inference.
        all_data = all_data.merge(comorbid_data, how="outer", on="Seronet_Participant_ID")

if all_data is None:
    raise ValueError("list_of_conditions is empty; nothing to summarize")

In [11]:
# Open a SQLAlchemy connection to the MySQL database (mysql-connector driver, port 3306).
creds = {'usr': USER, 'pwd': PWD, 'hst': HOST, "prt": 3306, 'dbn': DB}

# Build the URL from its parts and do not format the credentials into a
# string. Characters such as "@" or "%" in the password would otherwise be
# parsed as URL syntax and break or change the login.
# NOTE(review): values are passed verbatim, so any manual percent-encoding
# that was added to test.env as a workaround must be removed.
db_url = sd.engine.URL.create(
    drivername="mysql+mysqlconnector",
    username=creds['usr'],
    password=creds['pwd'],
    host=creds['hst'],
    port=creds['prt'],
    database=creds['dbn'],
)
engine = sd.create_engine(db_url)
conn = engine.connect()

In [12]:
# Enrich the status table with participant, cohort and visit information.
# No join key is given to merge(), so pandas joins on every column name the
# two frames have in common; how="left" keeps all rows already in all_data.
part_data = pd.read_sql("SELECT * FROM Participant;", conn)
all_data = all_data.merge(part_data, how="left")

# cohort_data is reused for both remaining tables and ends up holding the
# Normalized_Visit_Info rows.
for table_name in ("Participant_Cohort", "Normalized_Visit_Info"):
    cohort_data = pd.read_sql(f"SELECT * FROM {table_name};", conn)
    all_data = all_data.merge(cohort_data, how="left")

In [13]:
# Keep only the rows whose normalized visit index equals 1.
all_data = all_data[all_data["Normalized_Visit_Index"] == 1]

In [14]:
# Reduce the table to the ID, cohort, visit index and one status column per
# condition, then drop rows that are exact duplicates across those columns.
status_cols = [f"{cond}_Status" for cond in list_of_conditions]
new_cols = ['Seronet_Participant_ID', 'SeroNet_Cohort', 'Normalized_Visit_Index', *status_cols]

all_data = all_data[new_cols].drop_duplicates()

In [15]:
# Inputs for the TableOne summary: every "<condition>_Status" column is both
# reported and treated as categorical; results are grouped by cohort.
groupby = 'SeroNet_Cohort'
columns = [f"{cond}_Status" for cond in list_of_conditions]
categorical = list(columns)  # separate list object with the same names

In [16]:
# Explicit ordering of the cohort values, passed to TableOne through `order`.
order_dict = {
    "SeroNet_Cohort": [
        "Healthy Cohort",
        "Comorbidity Cohort",
        "Cancer",
        "IBD",
        "HIV",
        "Transplant",
    ],
}

In [17]:
# Sheet of the external data dictionary workbook that documents each report type.
DICTIONARY_SHEETS = {
    "Baseline_Chronic_list": "Chronic_Cond",
    "Baseline_Comborbidities": "Comorbids",
    "Baseline_Infections": "Infections",
}

# Without this check an unrecognised file_name would leave `dictionary`
# undefined, or silently reuse the frame loaded by an earlier run.
if file_name not in DICTIONARY_SHEETS:
    raise ValueError(
        f"No data dictionary sheet configured for file_name {file_name!r}; "
        f"expected one of {sorted(DICTIONARY_SHEETS)}"
    )

dictionary = pd.read_excel(
    "Release_Data_Dictionary_External.xlsx",
    sheet_name=DICTIONARY_SHEETS[file_name],
)

In [18]:
# Write the summary workbook: the TableOne summary, the underlying rows and
# the data dictionary, each on its own sheet. The writer stays open so the
# following cells can format the 'Summary' sheet and then close the file.
# The engine is set explicitly because that formatting uses the xlsxwriter
# API (add_format / set_column), which other engines do not provide.
cond_writer = pd.ExcelWriter(
    f'{output_folder}{file_sep}{sheet_name}_Summary_{version_num}.xlsx',
    engine="xlsxwriter",
)

# Keyword arguments keep each value bound to the intended TableOne parameter
# even if the positional order of its signature differs between versions.
comorbid_table = TableOne(
    all_data,
    columns=columns,
    categorical=categorical,
    groupby=groupby,
    order=order_dict,
    pval=False,
)

# sheet_name is passed by keyword; passing it positionally to to_excel is deprecated in pandas.
comorbid_table.to_excel(cond_writer, sheet_name='Summary')
all_data.to_excel(cond_writer, sheet_name='Raw_Data', index=False)
dictionary.to_excel(cond_writer, sheet_name='Data_Dictionary', index=False)

In [19]:
# Apply a centered Arial 10 format and fixed widths to the 'Summary' sheet columns.
workbook = cond_writer.book
worksheet = cond_writer.sheets['Summary']
column_fmt = workbook.add_format({'font_name': 'Arial', 'font_size': 10, 'align': 'center'})

# (first column, last column, width) for each group of columns
summary_column_widths = [
    (0, 0, 45),   # column header names
    (1, 1, 35),   # sub column header names
    (2, 3, 12),   # data
    (4, 11, 20),  # data
]
for first_col, last_col, width in summary_column_widths:
    worksheet.set_column(first_col, last_col, width, column_fmt)

0

In [20]:
# Close the Excel writer opened earlier for the summary workbook; the sheets
# and the column formatting were all added to this same writer.
cond_writer.close()

In [21]:
#for curr_cond in Chronic_list:
#    res = pm.execute_notebook('Comorbid_baseline.ipynb',  'Output_file_2.ipynb',
#        parameters = dict(Condition=curr_cond, sheet_1 = "Baseline_Chronic_Conditions"))
#    print(f"finished {curr_cond}")

In [22]:
#for curr_cond in Infection_list:
#    res = pm.execute_notebook('Comorbid_baseline.ipynb',  'Output_file_3.ipynb',
#        parameters = dict(Condition=curr_cond,sheet_1 = "Baseline_Infections"))
#    print(f"finished {curr_cond}")