In [1]:
import datetime
import os
from urllib.parse import quote

import icd10
import mysql.connector as connection
import numpy as np
import pandas as pd
import sqlalchemy as sd
from openpyxl import load_workbook, Workbook
from openpyxl.styles import Font
from tableone import TableOne

In [2]:
# Manually maintained last-edit stamp (presumably MM/DD/YYYY -- confirm);
# not referenced by any of the cells shown in this notebook.
last_edit = '11/20/2024'

In [3]:
# Flag read by the release-configuration cell: True selects the 4.2.0 settings,
# False the 5.2.0 ones.  The spelling is kept as-is because other cells use it.
V4_fitler = False

# "Detailed_Report" sheet of the 4.0.0 release visit workbook.
v4_visits = pd.read_excel(
    r"C:\Users\breadsp2\Documents\Release_4.0.0\Participant_Visit_Info_4.0.0.xlsx",
    sheet_name="Detailed_Report",
)

In [4]:
# Turn off pandas' chained-assignment warning for the rest of the session.
# NOTE(review): this also hides genuine chained-assignment mistakes in later cells.
pd.options.mode.chained_assignment = None  # default='warn'

In [5]:
# Read the database settings from the KEY=VALUE pairs stored in test.env.
env = {}
with open("test.env") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        # Blank lines and "#" comments carry no setting; skip them rather than
        # failing to unpack them.
        if not line or line.startswith("#"):
            continue
        # Split on the FIRST "=" only, so values that themselves contain "="
        # (common in passwords and tokens) are kept whole.
        k, sep, v = line.partition("=")
        if not sep:
            # The line content is deliberately not echoed: it may hold a secret.
            raise ValueError(f"test.env line {line_no}: expected KEY=VALUE")
        env[k.strip()] = v.strip()

# A missing key raises KeyError here, before any connection is attempted.
HOST = env["HOST"]
DB = env["DB"]
USER = env["USER"]
PWD = env["PWD"]

In [6]:
# Select the release being summarised: the flag picks both the version label
# and the folder holding that release's workbooks.
version_num, output_folder = (
    ("4.2.0", r"C:\Users\breadsp2\Desktop\Release_4.2.0")
    if V4_fitler == True
    else ("5.2.0", r"C:\Users\breadsp2\Desktop\Release_5.2.0")
)

# Separator used when the other cells concatenate folder and file names.
file_sep = os.path.sep
release_date = '2024-10-01'

In [7]:
# Despite the name, this is the stem of the OUTPUT WORKBOOK's file name: the
# export cell appends "_<version_num>.xlsx" to it.
sheet_name = "Summary_File"

In [8]:
# Open a SQLAlchemy connection to the MySQL database named in test.env.
creds = {'usr': USER, 'pwd': PWD, 'hst': HOST, "prt": 3306, 'dbn': DB}
connstr = "mysql+mysqlconnector://{usr}:{pwd}@{hst}:{prt}/{dbn}"

# The user name and password must be percent-encoded before they go into the
# URL; otherwise characters such as "@", ":", "/" or "%" are parsed as URL
# delimiters and the connection targets the wrong host or credentials.
# quote(..., safe="") is used rather than quote_plus because SQLAlchemy
# decodes with plain unquote, so a "+" would NOT turn back into a space.
# `creds` keeps the raw values in case other cells use it.
url_parts = dict(creds, usr=quote(USER, safe=""), pwd=quote(PWD, safe=""))
engine = sd.create_engine(connstr.format(**url_parts))
conn = engine.connect()

In [9]:
# "Detailed_Report" sheet of the demographics workbook for the selected release.
demo_file = f"{output_folder}{file_sep}Demographics_{version_num}.xlsx"
demo_data = pd.read_excel(demo_file, sheet_name="Detailed_Report")

In [10]:
# Reduce the demographics frame to the participant key plus age and sex.
demo_columns = ["Seronet_Participant_ID", "Age", "Sex_At_Birth"]
demo_data = demo_data[demo_columns]

In [11]:
# "Detailed_Report" sheet of the visit workbook for the selected release.
curr_visit_file = f"{output_folder}{file_sep}Participant_Visit_Info_{version_num}.xlsx"
curr_visit_data = pd.read_excel(curr_visit_file, sheet_name="Detailed_Report")

In [12]:
# One row per participant; "Normalized_Visit_Index" now holds the number of
# non-null visit indexes recorded for that participant.
visit_data = (
    curr_visit_data
    .groupby("Seronet_Participant_ID", as_index=False)["Normalized_Visit_Index"]
    .count()
)

In [13]:
# Left join keeps every demographics row.  No `on=` is given, so pandas joins
# on the columns the two frames have in common.
demo_data = pd.merge(demo_data, visit_data, how="left")

In [14]:
def _read_raw_summary(prefix):
    """Return the "Raw_Data" sheet of `<prefix><version_num>.xlsx` in the release folder."""
    workbook_path = f"{output_folder}{file_sep}{prefix}{version_num}.xlsx"
    return pd.read_excel(workbook_path, sheet_name="Raw_Data")


# The three per-participant condition summaries share one layout and are
# loaded the same way.
comorbid_data = _read_raw_summary("All_Participant_Comorbidities_Summary_")
chronic_data = _read_raw_summary("All_Participant_Chronic_Conditions_Summary_")
infection_data = _read_raw_summary("All_Participant_Infections_Summary_")

In [15]:
def _tally_statuses(frame):
    """Add per-row counts of each answer string to `frame`, in place.

    For every row, counts how many cells equal each of the four answer
    strings and stores the totals in "Has Cond", "No Cond", "Ukn Data" and
    "Not Reported".  "Other" is the sum of the last two.
    """
    # The answer strings include their leading spaces; they must match the
    # workbook cell text exactly.
    answers = (
        ("Has Cond", " Yes: Participant has Condition"),
        ("No Cond", "  No: Does not have Condition"),
        ("Ukn Data", " Unknown: Participant is Unsure"),
        ("Not Reported", "Status not Provided: Not Reported"),
    )
    for column, answer in answers:
        # Masking leaves NaN wherever the cell differs, so count() yields the
        # number of matching cells in each row.
        frame[column] = frame[frame == answer].count(axis=1)
    frame["Other"] = frame["Not Reported"] + frame["Ukn Data"]


for _frame in (comorbid_data, chronic_data, infection_data):
    _tally_statuses(_frame)

In [16]:
# Convert the three tallies to text so they can be joined into one label.
for _frame in (comorbid_data, chronic_data, infection_data):
    for _column in ("No Cond", "Has Cond", "Other"):
        _frame[_column] = _frame[_column].astype(str)

In [17]:
# Build one "yes / no / other" text cell per participant in each frame.
# The column titles are reused verbatim by the merge cell, so they are
# reproduced exactly.
for _frame, _title in (
    (comorbid_data, "# of Comobididites (Yes / No / No Answer)"),
    (chronic_data, "# of Chronic_Conditions (Yes / No / No Answer)"),
    (infection_data, "# of Infections (Yes / No / No Answer)"),
):
    _frame[_title] = _frame[['Has Cond', 'No Cond', "Other"]].agg(' / '.join, axis=1)

In [18]:
# Attach each summary label to the demographics frame.  The left joins keep
# every participant, including those absent from a summary.
for _frame, _title in (
    (comorbid_data, "# of Comobididites (Yes / No / No Answer)"),
    (chronic_data, "# of Chronic_Conditions (Yes / No / No Answer)"),
    (infection_data, "# of Infections (Yes / No / No Answer)"),
):
    demo_data = demo_data.merge(_frame[["Seronet_Participant_ID", _title]], how="left")

In [19]:
# "All Cancer Information" sheet of the cancer-cohort workbook for this release.
cancer_file = f"{output_folder}{file_sep}Participants_in_the_Cancer_Cohort_{version_num}.xlsx"
cancer_data = pd.read_excel(cancer_file, sheet_name="All Cancer Information")

In [20]:
# One row per distinct (participant, SEER category) pair.
cancer_key_columns = ["Seronet_Participant_ID", "SEER Category"]
cancer_data_filt = cancer_data[cancer_key_columns].drop_duplicates()

In [21]:
# Count the rows of the de-duplicated cancer table per participant, as a
# two-column frame.  The columns are named explicitly, so the result does not
# depend on how value_counts() labels its output.
x = (
    cancer_data_filt["Seronet_Participant_ID"]
    .value_counts()
    .to_frame()
    .reset_index()
)
x.columns = ['Seronet_Participant_ID', "Number of Cancers"]

In [22]:
# Attach the per-participant cancer count; participants with no row in `x`
# come out of the left join with NaN.
demo_data = demo_data.merge(x[["Seronet_Participant_ID", "Number of Cancers"]], how="left")

# Assign the filled column back instead of calling fillna(inplace=True) on
# `demo_data[...]`.  That chained in-place call works on a temporary object
# under pandas Copy-on-Write and leaves the NaN values in place.
demo_data["Number of Cancers"] = demo_data["Number of Cancers"].fillna("N/A")

In [23]:
# "All_Vaccination History" sheet of the vaccination workbook for this release.
vacc_file = f"{output_folder}{file_sep}Vaccination_Data_{version_num}.xlsx"
vacc_data = pd.read_excel(vacc_file, sheet_name="All_Vaccination History")

In [24]:
# Order each participant's rows by visit so that the last row per participant
# is the one with the highest Normalized_Visit.
vacc_data = (
    vacc_data[["Seronet_Participant_ID", "Normalized_Visit", "Visit_Vaccine_History"]]
    .sort_values(["Seronet_Participant_ID", "Normalized_Visit"])
)

# Keep only that final row per participant and attach it to the summary.
latest_vacc = vacc_data.drop_duplicates(["Seronet_Participant_ID"], keep="last")
demo_data = demo_data.merge(latest_vacc, how="left")

In [25]:
# Final column tidy-up before export.
# NOTE(review): this drops the per-participant visit count merged in earlier,
# so that count never reaches the output workbook -- confirm that is intended.
demo_data = (
    demo_data
    .drop(columns="Normalized_Visit_Index")
    .rename(columns={"Visit_Vaccine_History": "Complete_Vaccine_History"})
)

In [26]:
# Write the assembled summary to "<sheet_name>_<version_num>.xlsx" in the
# release folder.  The context manager saves and closes the workbook even if
# to_excel raises, so the file handle is never leaked.
with pd.ExcelWriter(f'{output_folder}{file_sep}{sheet_name}_{version_num}.xlsx') as summary_writer:
    # The sheet name is passed by keyword: positional use is deprecated in
    # recent pandas releases.
    demo_data.to_excel(summary_writer, sheet_name="Overall Data Summary", index=False)