In [1]:
import datetime
import os
from urllib.parse import quote

import icd10
import mysql.connector as connection
import numpy as np
import pandas as pd
import sqlalchemy as sd
from openpyxl import load_workbook, Workbook
from openpyxl.styles import Font
from tableone import TableOne

In [2]:
# Date this notebook was last edited (MM/DD/YYYY).
last_edit = "11/20/2024"

In [3]:
# "Detailed_Report" sheet of the release 4.0.0 participant visit workbook.
v4_visits = pd.read_excel(
    r"C:\Users\breadsp2\Documents\Release_4.0.0\Participant_Visit_Info_4.0.0.xlsx",
    sheet_name="Detailed_Report",
)
# Flag read further down to choose which release version / output folder to use.
V4_fitler = False

In [4]:
# Turn off pandas' chained-assignment warning (the pandas default is 'warn').
pd.set_option("mode.chained_assignment", None)

In [5]:
'''
Create a test.env file to store hostname, database, username, and password.
The file is opened by relative path, so it must be in the notebook's working
directory (normally the same folder as the notebook).
==============
Format:
HOST=a.b.c.d
DB=<my.database>
USER=john
PWD=abc#123
==============
Notes:
- One KEY=value pair per line. Do not put any of the values in quotes.
- Whitespace around the key and the value is stripped.
- Only the first "=" separates key from value, so a value may itself contain "=".
- Blank lines and lines starting with "#" are ignored.
'''

env = {}
with open("test.env") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        # Skip blank lines (e.g. a trailing newline at end of file) and comments.
        if not line or line.startswith("#"):
            continue
        # partition() splits on the first "=" only, so values containing "="
        # (common in generated passwords) are kept intact.
        k, sep, v = line.partition("=")
        if not sep:
            # The line content is deliberately left out of the message: it may be a secret.
            raise ValueError(f"test.env line {line_no}: expected KEY=value")
        env[k.strip()] = v.strip()

# A missing key raises KeyError naming the absent setting.
HOST = env["HOST"]
DB = env["DB"]
USER = env["USER"]
PWD = env["PWD"]

In [6]:
# Release-specific settings: the V4 flag switches both the version label and
# the folder that input/output workbooks are read from and written to.
use_v4_release = V4_fitler == True
version_num = "4.2.0" if use_v4_release else "5.2.0"
output_folder = (
    r"C:\Users\breadsp2\Desktop\Release_4.2.0"
    if use_v4_release
    else r"C:\Users\breadsp2\Desktop\Release_5.2.0"
)
file_sep = os.sep
release_date = '2024-10-01'

In [7]:
# Base name of the output workbook (the version number and ".xlsx" are appended when it is written).
sheet_name = 'Participants_in_the_Comorbidity_Cohort'

In [8]:
# Connection settings for the MySQL database (mysql-connector driver, port 3306).
creds = {'usr': USER, 'pwd': PWD, 'hst': HOST, "prt": 3306, 'dbn': DB}
connstr = "mysql+mysqlconnector://{usr}:{pwd}@{hst}:{prt}/{dbn}"
# Percent-encode the user name and password before substituting them into the
# URL. Characters such as "@", "/", ":" or "%" in a raw credential would
# otherwise be read as URL syntax and the connection string would be parsed
# incorrectly. safe="" makes quote() encode "/" as well.
url_fields = dict(creds, usr=quote(USER, safe=""), pwd=quote(PWD, safe=""))
engine = sd.create_engine(connstr.format(**url_fields))
conn = engine.connect()

In [9]:
# The query is the UNION of three SELECTs that share one output shape
# (participant, cohort, visit, comorbidity category, original name, harmonized
# value), each restricted to SeroNet_Cohort = 'Comorbidity Cohort'.
# The SQL text is assembled from the same fragments in the same order.

# 1) Rows from the normalized comorbidity table.
comorbidity_select = (
    "SELECT p.Seronet_Participant_ID, nv.SeroNet_Cohort, nv.Research_Participant_ID, nv.Normalized_Visit_Index, nv.Visit_Info_ID,  ncv.Comorbidity, ncv.`Original_Names (Unique)`,  Harmonized_Value   "
    "FROM Normalized_Comorbidity_Visits as ncv left join Normalized_Visit_Info as nv   "
    "on ncv.Visit_Info_ID = nv.Visit_Info_ID left join Participant as p on nv.Research_Participant_ID = p.Research_Participant_ID "
    "where nv.SeroNet_Cohort = 'Comorbidity Cohort' "
)

# 2) Cancer names, labelled with the fixed category 'Cancer'.
cancer_select = (
    "SELECT p.Seronet_Participant_ID, nv.SeroNet_Cohort, nv.Research_Participant_ID, nv.Normalized_Visit_Index, nv.Visit_Info_ID, 'Cancer' as 'Comorbidity', `Original Cancer Name` as 'Original_Names (Unique)',  `Harmonized Cancer Name` as Harmonized_Value  "
    " FROM Normalized_Cancer_Names as cancer left join Normalized_Visit_Info as nv on cancer.Visit_Info_ID = nv.Visit_Info_ID   "
    " left join Participant as p on nv.Research_Participant_ID = p.Research_Participant_ID "
    " where nv.SeroNet_Cohort = 'Comorbidity Cohort'"
)

# 3) Organ-transplant descriptions, labelled 'Transplant'; the same source
#    column is used for both the original and the harmonized value.
transplant_select = (
    "SELECT p.Seronet_Participant_ID, nv.SeroNet_Cohort, nv.Research_Participant_ID, nv.Normalized_Visit_Index, nv.Visit_Info_ID, 'Transplant' as 'Comorbidity', `Organ_Transplant_Description_Or_ICD10_codes` as 'Original_Names (Unique)',  `Organ_Transplant_Description_Or_ICD10_codes` as Harmonized_Value  "
    "FROM Participant_Other_Condition_Names as trans left join Normalized_Visit_Info as nv on trans.Visit_Info_ID = nv.Visit_Info_ID   "
    " left join Participant as p on nv.Research_Participant_ID = p.Research_Participant_ID "
    "where nv.SeroNet_Cohort = 'Comorbidity Cohort'"
)

cohort_sql = comorbidity_select + " Union " + cancer_select + " Union  " + transplant_select
cohort_data = pd.read_sql(cohort_sql, conn)

In [10]:
# Demographics workbook for the selected release; only the "Detailed_Report" sheet is read.
demo_file = f"{output_folder}{file_sep}Demographics_{version_num}.xlsx"
demo_data = pd.read_excel(io=demo_file, sheet_name="Detailed_Report")

In [11]:
# Participant visit workbook for the selected release; only the "Detailed_Report" sheet is read.
curr_visit_file = f"{output_folder}{file_sep}Participant_Visit_Info_{version_num}.xlsx"
curr_visit_data = pd.read_excel(io=curr_visit_file, sheet_name="Detailed_Report")

In [12]:
# Keep only participants that appear in the demographics report (inner join on the participant ID).
released_participants = demo_data[["Seronet_Participant_ID"]]
cohort_data = cohort_data.merge(
    released_participants,
    how="inner",
    on="Seronet_Participant_ID",
)

# Right join against the released visits: every (participant, visit index) row of
# the visit report is kept, and visits with no matching comorbidity row get
# missing values in the comorbidity columns.
released_visits = curr_visit_data[["Seronet_Participant_ID", 'Normalized_Visit_Index']]
cohort_data = cohort_data.merge(
    released_visits,
    how="right",
    on=["Seronet_Participant_ID", "Normalized_Visit_Index"],
)

In [13]:
# Harmonized values that carry no reportable condition; rows holding one of them are dropped.
uninformative_values = [
    'Participant is unsure if condition exists (answered Unknown Status)',
    'Participant did not answer question, Unable to determine if condition exists',
    'Participant does not have Condition (answered No at baseline)',
    'N/A',
    'Not Reported',
]
cohort_data = cohort_data[~cohort_data["Harmonized_Value"].isin(uninformative_values)]

In [14]:
# Drop the infection categories and rows whose original condition name is a placeholder.
excluded_categories = ['Bacterial_Infection', 'Viral_Infection']
placeholder_names = ['N/A', 'Not Reported', 'Unknown']
keep_row = (
    ~cohort_data["Comorbidity"].isin(excluded_categories)
    & ~cohort_data["Original_Names (Unique)"].isin(placeholder_names)
)
cohort_data = cohort_data[keep_row]

In [15]:
# Remove the two ID columns that are not written to the output workbook.
cohort_data = cohort_data.drop(columns=["Research_Participant_ID", "Visit_Info_ID"])

In [16]:
# Order rows by participant, then cohort, then visit index.
sort_keys = ["Seronet_Participant_ID", "SeroNet_Cohort", "Normalized_Visit_Index"]
cohort_data = cohort_data.sort_values(by=sort_keys)

In [17]:
# Keep only rows that have a cohort value (a missing value never equals itself,
# which is what the self-comparison form of this filter relied on).
cohort_data = cohort_data[cohort_data["SeroNet_Cohort"].notna()]

In [18]:
# One row per distinct (participant, comorbidity category) pair.
pair_columns = ['Seronet_Participant_ID', 'Comorbidity']
part_list = cohort_data.loc[:, pair_columns].drop_duplicates()

In [19]:
# Participant x category frequency table: rows are participants, columns are comorbidity categories.
comorbid_table = pd.crosstab(
    index=part_list['Seronet_Participant_ID'],
    columns=part_list['Comorbidity'],
)

In [20]:
# Move the participant ID out of the index so it becomes the first column.
comorbid_table = comorbid_table.reset_index()
# Sum every column after the participant ID to get the per-participant total.
comorbid_table['Comorbidity Count'] = comorbid_table.iloc[:, 1:].sum(axis=1)
# Every row in this table belongs to the same cohort.
comorbid_table["SeroNet_Cohort"] = 'Comorbidity Cohort'

In [21]:
# Fixed column layout of the "Summary_of_Conditions" output: identifiers and
# total first, then one column per comorbidity category.
new_cols = ["Seronet_Participant_ID","SeroNet_Cohort", "Comorbidity Count",
            "Acute_Liver_Disease", "Autoimmune_Disorder", "Cancer", "Cardiovascular_Disease", 
            "Chronic_Kidney_Disease", "Chronic_Liver_Disease", "Chronic_Lung_Disease", "Chronic_Neurological_Condition", 
            "Chronic_Oxygen_Requirement", "Diabetes", "Hypertension", "Immunosuppressive_Condition", "Inflammatory_Disease",
            "Transplant"]
# reindex() rather than comorbid_table[new_cols]: plain column selection raises
# KeyError when a category has no participants in this release (the crosstab
# only creates columns for categories that occur in the data). A category that
# is absent is reported as 0 for every participant.
comorbid_table = comorbid_table.reindex(columns=new_cols, fill_value=0)

In [22]:
# Arguments for the TableOne summary: the comorbidity count is the only
# variable, it is treated as categorical, and the table is grouped by cohort.
groupby = 'SeroNet_Cohort'
columns = ['Comorbidity Count']
categorical = list(columns)

In [23]:
# Rebuild the frame from its raw values and plain column labels, which gives it
# a fresh default index and drops the name attached to the columns axis.
plain_labels = list(comorbid_table.columns)
comorbid_table = pd.DataFrame(data=comorbid_table.to_numpy(), columns=plain_labels)

In [24]:
# Summary table of comorbidity counts, grouped by cohort.
# Arguments are passed by keyword: with positional passing, `groupby` binds to
# whatever the fourth parameter of the installed tableone version is.
# NOTE(review): newer tableone releases appear to place `continuous` before
# `groupby` -- confirm against the installed version.
base_table1 = TableOne(
    comorbid_table,
    columns=columns,
    categorical=categorical,
    groupby=groupby,
)


In [25]:
# Data dictionary sheet for this cohort, read from the working directory.
dictionary = pd.read_excel(
    io="Release_Data_Dictionary_External.xlsx",
    sheet_name="comorbid_cohort",
)

In [26]:
# Output workbook for this cohort. The engine is pinned to XlsxWriter because
# the formatting step that follows calls XlsxWriter-only methods (add_format,
# set_column, set_row). Left to pandas' default, the engine depends on which
# packages happen to be installed.
comorbid_writer = pd.ExcelWriter(
    f'{output_folder}{file_sep}{sheet_name}_{version_num}.xlsx',
    engine="xlsxwriter",
)
# sheet_name is passed by keyword: recent pandas versions deprecate passing
# to_excel() arguments other than the writer positionally.
base_table1.to_excel(comorbid_writer, sheet_name="Comorbidity Conditions Counts")
comorbid_table.to_excel(comorbid_writer, sheet_name="Summary_of_Conditions", index=False)
cohort_data.to_excel(comorbid_writer, sheet_name="All Comorbidities", index=False)
dictionary.to_excel(comorbid_writer, sheet_name='Data_Dictionary', index=False)

In [27]:
# Formatting for the 'Comorbidity Conditions Counts' sheet.
workbook = comorbid_writer.book
worksheet = comorbid_writer.sheets['Comorbidity Conditions Counts']
column_fmt = workbook.add_format({'font_name': 'Arial', 'font_size': 10, 'align': 'center'})

# (first column, last column, width) for each group of columns; all share column_fmt.
column_layout = (
    (0, 0, 25),    # column header names
    (1, 1, 12),    # sub column header names
    (2, 3, 12),    # data
    (4, 17, 20),   # data
)
for first_col, last_col, width in column_layout:
    worksheet.set_column(first_col, last_col, width, column_fmt)

# Row index 1 is made taller (height 30) with wrapped text.
row_fmt = workbook.add_format({'text_wrap': True})
worksheet.set_row(1, 30, row_fmt)

0

In [28]:
# Close the writer so the workbook is saved before it is reopened with openpyxl below.
comorbid_writer.close()

In [29]:
'''
Reopen the saved workbook to add notes below the table on the
'Comorbidity Conditions Counts' sheet.
Each note should be on a separate line. Add note as a single value array: noteX = ['noteX: This is a sample note.']
Append note to sheet.
Save workbook.
'''
# Build the path once so the file that is loaded is the file that is saved.
report_path = f'{output_folder}{file_sep}{sheet_name}_{version_num}.xlsx'
wb = load_workbook(report_path)
sheet = wb['Comorbidity Conditions Counts']
note0 = ['']  # blank spacer row between the table and the notes
note1 = ['Note1: Counts represent unique comorbidity categories.']
note2 = ['Note2: A Participant can have multiple conditions for a single category (i.e multiple autoimmune conditions)']
note3 = ['Note3: Sheet: All Comorbidities has all reported conditions for this cohort']

# Append every note defined above, in order; each append adds one row at the
# bottom of the sheet.
for note in (note0, note1, note2, note3):
    sheet.append(note)
wb.save(filename=report_path)