#### Pre-requisites
The following packages need to be installed in your notebook environment:
1. <a href = https://dev.mysql.com/doc/connector-python/en/>mysql.connector</a>
2. <a href = https://pandas.pydata.org/>pandas</a>
3. <a href = https://numpy.org/>numpy</a>
4. <a href = https://pypi.org/project/tableone/>tableone</a>
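5. <a href = https://pypi.org/project/openpyxl/>openpyxl</a>
6. <a href = https://www.sqlalchemy.org/>sqlalchemy</a> (used to create the database engine)
7. <a href = https://pypi.org/project/XlsxWriter/>xlsxwriter</a> (the summary-sheet formatting cell uses the XlsxWriter workbook API)
8. icd10 (imported by the first code cell)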

Be sure to install these packages in one of the directories on the system path (`import sys; print(sys.path)`) for the notebook environment.

You will need access (username and password) to the SeroNet vaccine response database, `seronetdb-Vaccine_Response`, accessible at the AWS endpoint: `seronet-dev-instance.cwe7vdnqsvxr.us-east-1.rds.amazonaws.com`. <br>
**Note:** Store all mysql connector parameters in a `.env` file as shown below. <i>The `.env` needs to be in the same directory as the notebooks</i>.

Ensure that the Excel workbook <i>Release_Data_Dictionary_External.xlsx</i> (the file name this notebook reads) is in the same folder as this notebook. This workbook is also versioned on GitHub.

#### Post-processing files
<u>Summary sheet:</u>
1. Remove all borders from Table 1.
2. Shade table rows in grey/white.
3. Right justify second column.
4. Center all columns reporting values.
5. Autoformat column widths.
6. Highlight all notes in bold.

<u>Detailed report sheets:</u>
1. All columns left justified.
2. Autoformat column widths.
3. Fill all empty cells with "Not Reported".

In [1]:
import mysql.connector as connection
from collections import Counter
import pandas as pd
import numpy as np
import os
from tableone import TableOne
from openpyxl import load_workbook, Workbook
from openpyxl.styles import Font
import sqlalchemy as sd
import datetime
import icd10

In [2]:
# Date string recording when this notebook was last edited (MM/DD/YYYY text, not a date object).
last_edit = '11/20/2024'

In [3]:
# Load the "Detailed_Report" sheet of the Release 4.0.0 participant-visit workbook.
# NOTE: `v4_visits` first holds the path string and is then rebound to the loaded DataFrame.
v4_visits = r"C:\Users\breadsp2\Documents\Release_4.0.0\Participant_Visit_Info_4.0.0.xlsx"
v4_visits = pd.read_excel(v4_visits, sheet_name="Detailed_Report")
# Switch read further down to choose the release version number and output folder.
# The name is spelled "fitler" (sic); keep that spelling wherever the flag is referenced.
V4_fitler = False

In [4]:
# Silence pandas' SettingWithCopyWarning for the whole notebook (pandas' default is 'warn').
pd.options.mode.chained_assignment = None  # default='warn'

In [5]:
'''
Create a .env file to store hostname, database, username, and password as environment variables.  
The .env file is stored in same folder as notebook (this cell opens the file named "test.env").
==============
Format:
HOST=a.b.c.d
DB=<my.database>
USER=john
PWD=abc#123
==============
Note: no space around the "=" sign. Do not put any of the values in quotes.
'''

# Parse KEY=VALUE lines into a dict. Only the first "=" separates key from value, so a
# value (e.g. a password) may itself contain "=". Blank lines are skipped instead of
# crashing the tuple unpacking.
env = {}
with open("test.env") as f:
    for line_number, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue
        if "=" not in line:
            raise ValueError(f"test.env line {line_number}: expected KEY=VALUE, got {line!r}")
        k, v = line.split("=", 1)
        env[k.strip()] = v.strip()

# A missing key raises KeyError here, which names the absent setting.
HOST = env["HOST"]
DB = env["DB"]
USER = env["USER"]
PWD = env["PWD"]

In [6]:
# Release settings keyed by whether the V4 filter flag equals True:
# (version number, folder that holds the release workbooks).
release_settings = {
    True: ("4.2.0", r"C:\Users\breadsp2\Desktop\Release_4.2.0"),
    False: ("5.2.0", r"C:\Users\breadsp2\Desktop\Release_5.2.0"),
}
version_num, output_folder = release_settings[V4_fitler == True]  # noqa: E712

# Platform path separator used when building file names, and the release date string.
file_sep = os.path.sep
release_date = '2024-10-01'

In [7]:
# Base name of the output workbook written near the end of the notebook
# (combined there with the version number to form the file name).
sheet_name = "Treatment_History"

In [8]:
# Build the SQLAlchemy engine for the MySQL database described by the .env settings.
from urllib.parse import quote

creds = {'usr': USER, 'pwd': PWD, 'hst': HOST, "prt": 3306, 'dbn': DB}
connstr = "mysql+mysqlconnector://{usr}:{pwd}@{hst}:{prt}/{dbn}"

# Percent-encode the user name and password before placing them in the URL: characters
# such as "@", "/", ":" or "%" would otherwise be parsed as URL structure and the
# connection would fail or use the wrong credentials. `creds` itself keeps the raw values.
url_creds = dict(creds, usr=quote(USER, safe=""), pwd=quote(PWD, safe=""))
engine = sd.create_engine(connstr.format(**url_creds))
conn = engine.connect()

In [9]:
# Pull every row of Normalized_Visit_Info joined to Participant on Research_Participant_ID,
# so each visit row carries the Seronet_Participant_ID alongside the research ID, cohort,
# normalized visit index, event date and Visit_Info_ID.
All_Visits = pd.read_sql(("SELECT p.Seronet_Participant_ID, nv.Research_Participant_ID, nv.Seronet_Cohort, nv.Normalized_Visit_Index, nv.Date_Of_Event, " +
                          "nv.Visit_Info_ID  From Normalized_Visit_Info as nv " + 
                          "join Participant as p on nv.Research_Participant_ID = p.Research_Participant_ID"), conn)

In [10]:
# Read the "Detailed_Report" sheet of the Demographics workbook in the release output folder.
demo_file = f"{output_folder}{file_sep}Demographics_{version_num}.xlsx"
demo_data = pd.read_excel(demo_file, sheet_name="Detailed_Report")

In [11]:
# Read the "Detailed_Report" sheet of the Participant_Visit_Info workbook in the release output folder.
curr_visit_file = f"{output_folder}{file_sep}Participant_Visit_Info_{version_num}.xlsx"
curr_visit_data = pd.read_excel(curr_visit_file, sheet_name="Detailed_Report")

In [12]:
# Inner merge (pandas default) on the shared Seronet_Participant_ID column: keeps only
# visits whose participant appears in the demographics Detailed_Report sheet.
All_Visits = All_Visits.merge(demo_data["Seronet_Participant_ID"])
# Right merge on the columns shared with the (participant, visit index) pairs of the
# current visit file: every pair in that file is retained, and pairs with no match in
# All_Visits get missing values in the remaining columns.
All_Visits = All_Visits.merge(curr_visit_data[["Seronet_Participant_ID", 'Normalized_Visit_Index']], how="right")

In [13]:
# Load every column and row of the Treatment_History table.
treatment_data = pd.read_sql(("SELECT * FROM Treatment_History"), conn)

In [14]:
# Inner merge on whatever columns the two frames share (no explicit `on=`).
# NOTE(review): the join keys therefore depend on the Treatment_History schema, which is
# not visible in this notebook — confirm they are the intended ones.
treatment_data = All_Visits.merge(treatment_data)

In [15]:
'''
This block adds a CBC column to dataframe based on the Research_Participant_ID.
'''
# Research_Participant_ID prefix -> site name written to the CBC column.
# Dicts preserve insertion order, so the first matching prefix wins in np.select.
cbc_by_prefix = {
    '14': "Mount_Sinai",
    '27': "University of Minnesota",
    '41': "Feinstein_Northwell",
    '32_22': "ASU: Midwestern",                  # ASU_MidWestern
    '32_33': "ASU: Dignity Health",              # ASU_Dignity Health
    '32_44': "ASU: ValleyWise",                  # ASU_ValleyWise
    '32_77': "ASU: Phoenix Childrens Hospital",  # ASU_Phoenix Childrens Hospital
    '32_55': "Columbia University",              # ASU_Columbia
}

# na=False makes missing IDs evaluate to False, so every condition is a pure boolean
# mask, which np.select requires.
participant_ids = treatment_data['Research_Participant_ID']
conditions = [participant_ids.str.startswith(prefix, na=False) for prefix in cbc_by_prefix]
values = list(cbc_by_prefix.values())

# Pass the fallback explicitly: np.select's own default is 0, which would replace the
# intended "Not Reported" label for IDs that match no prefix.
treatment_data['CBC'] = np.select(conditions, values, default="Not Reported")

In [16]:
# One row per distinct (participant, health condition) pair.
unique_treat = (
    treatment_data[["Research_Participant_ID", "Health_Condition_Or_Disease"]]
    .drop_duplicates()
)


In [17]:
# Count how many distinct participants report each health condition, then turn the
# counts into a two-column frame: condition name and its frequency.
condition_counts = Counter(unique_treat["Health_Condition_Or_Disease"])
x = pd.DataFrame.from_dict(condition_counts, orient='index').reset_index()
x.columns = ["Health_Condition_Or_Disease", "Frequency"]

In [18]:
# Label column used by the summary table: starts as the raw condition name.
freq_label_col = "Health_Condition_Or_Disease Frequency"
x[freq_label_col] = x["Health_Condition_Or_Disease"]

# Conditions whose participant count is 0-5 are relabelled into one bucket per count.
# The label text (including its spelling) is kept as-is because the table ordering
# defined later in the notebook lists these exact strings.
for idx in range(0, 6):
    x.loc[x["Frequency"] == idx, freq_label_col] = f"Condtion only in {idx} participants"

# Assign the result back instead of calling replace(..., inplace=True) on the column
# selection: under pandas Copy-on-Write the inplace form acts on a temporary object and
# leaves `x` unchanged.
x[freq_label_col] = x[freq_label_col].replace("Unknown", "Unknown Heath Condition")

In [19]:
# Attach the per-condition frequency and label columns from `x` to the treatment rows.
# Outer merge on the columns the two frames share (no explicit `on=`), so rows from
# either side without a match are kept.
treatment_data_2 = treatment_data.merge(x, how="outer")

In [20]:
# Inner merge (pandas default) on the columns shared with the distinct
# (participant, condition) pairs; `unique_treat` is rebound to the merged result.
unique_treat = treatment_data_2.merge(unique_treat)

In [21]:
# Keep only the columns needed for the per-participant condition report and
# collapse repeated rows.
condition_report_cols = [
    "Research_Participant_ID",
    "Seronet_Cohort",
    "Health_Condition_Or_Disease",
    "Health_Condition_Or_Disease Frequency",
]
unique_treat = unique_treat[condition_report_cols].drop_duplicates()

In [22]:
# Inputs for the summary table: one categorical column, grouped by cohort.
columns = ["Health_Condition_Or_Disease Frequency"]
categorical = ["Health_Condition_Or_Disease Frequency"]
groupby = 'Seronet_Cohort'

# Display order of the category levels: named conditions first, then the
# low-frequency buckets, then the unknown-condition label.
order_dict = {
    "Health_Condition_Or_Disease Frequency": [
        'Breast Cancer',
        'Crohns Disease',
        'Endometrial Cancer',
        'Hypothyroidism',
        'IBD',
        'Multiple Myeloma',
        'HIV',
        'Anemia',
        'Diarrhea',
        'GERD',
        'Transplant (induction immunosuppression)',
        'Transplant (maintenance immunosuppression)',
        'Condtion only in 1 participants',
        'Condtion only in 2 participants',
        'Condtion only in 3 participants',
        'Condtion only in 4 participants',
        'Condtion only in 5 participants',
        'Unknown Heath Condition',
    ],
}


In [23]:
# Summary table of condition labels per cohort.
# Arguments are passed by keyword so the call does not depend on the positional order of
# TableOne's constructor parameters.
# NOTE(review): newer tableone releases appear to place a `continuous` parameter between
# `categorical` and `groupby`, which would silently misroute a positional `groupby` — confirm
# against the installed version.
baseline_treat_table1 = TableOne(unique_treat, columns=columns, categorical=categorical,
                                 groupby=groupby, order=order_dict)

In [24]:
# Load every column and row of the Normalized_Treatment_Visit_Info table.
norm_data = pd.read_sql(("select * from Normalized_Treatment_Visit_Info"), conn)

In [25]:
# Relabel the raw "Treatment" column as "Original Treatment Name".
treatment_data = treatment_data.rename(columns={"Treatment": "Original Treatment Name"})

In [26]:
# Load every column and row of the Visit_One_Offset_Correction table.
offset_data = pd.read_sql(("Select * from  Visit_One_Offset_Correction;"), conn)

In [27]:
# Left-merge the normalized treatment info and the visit-one offsets onto the treatment
# rows (join keys are the columns each pair of frames shares), then drop exact duplicates.
treatment_data = (
    treatment_data
    .merge(norm_data, how="left")
    .merge(offset_data, how="left")
    .drop_duplicates()
)

In [28]:
def _duration_from_baseline(durations, offsets):
    """Return index-relative durations shifted by the per-row offset.

    The markers "Ongoing" and "Not Reported" are passed through unchanged; every
    other value is converted with float() and has the row's offset subtracted.
    Raises whatever float() raises for a value that is neither a marker nor numeric.
    """
    markers = ("Ongoing", "Not Reported")
    return [
        duration if duration in markers else float(duration) - offset
        for duration, offset in zip(durations, offsets)
    ]


# Build each column in one pass rather than writing cell-by-cell with .loc, which is slow
# and can try to store a string into a column that was first created as float.
# dtype=object lets one column hold both the text markers and the numeric durations.
# The columns are also created when the frame is empty.
treatment_data["Start_Duration_From_Baseline"] = pd.Series(
    _duration_from_baseline(treatment_data["Start_Date_Duration_From_Index"],
                            treatment_data["Offset_Value"]),
    index=treatment_data.index, dtype=object)

treatment_data["Stop_Duration_From_Baseline"] = pd.Series(
    _duration_from_baseline(treatment_data["Stop_Date_Duration_From_Index"],
                            treatment_data["Offset_Value"]),
    index=treatment_data.index, dtype=object)

In [29]:
# Columns (and their order) kept for the detailed treatment sheet.
new_cols = [
    "Seronet_Participant_ID",
    "Seronet_Cohort",
    "Normalized_Visit_Index",
    "Health_Condition_Or_Disease",
    "Original Treatment Name",
    "Treatment_Provenance",
    "Harmonized Treatment",
    "Dosage",
    "Dosage_Units",
    "Dosage_Regimen",
    "Start_Duration_From_Baseline",
    "Stop_Duration_From_Baseline",
    "Update",
    "Treatment_History_Comments",
]

treatment_data = treatment_data[new_cols]

In [30]:
# Missing provenance and the placeholder strings "N/A" / "nan" all become "Self-Reported".
# The result is assigned back to the column: calling fillna(..., inplace=True) on a column
# selection acts on a temporary object under pandas Copy-on-Write and would leave the
# frame unchanged.
treatment_data["Treatment_Provenance"] = (
    treatment_data["Treatment_Provenance"]
    .fillna("Self-Reported")
    .replace(["N/A", "nan"], "Self-Reported")
)

In [31]:
#x = treatment_data.query("Dosage_Units == 'mg/0.4ml'")
#treatment_data.loc[x,index, "Dosage"] = 0.4

In [32]:
# Lower-case every unit string except the two placeholder labels. Non-string values
# (e.g. missing entries) are left untouched instead of raising on `.lower()`.
placeholder_units = ('Unknown', 'Not Reported')
treatment_data["Dosage_Units"] = [
    unit.lower() if isinstance(unit, str) and unit not in placeholder_units else unit
    for unit in treatment_data["Dosage_Units"]
]

# Harmonize unit spellings. Keys must be lower-case because the values were just
# lower-cased; the former 'milliGRAM(s)' key could never match.
# NOTE(review): the 'mg/<volume>ml' entries only relabel the unit as 'mg/ml'; the Dosage
# values are not rescaled by this cell — confirm that is intended.
unit_dict = {'mg/m2': 'mg/m^2', 'milligram(s)': 'mg',  'gram': 'g', 'mg/0.4ml': 'mg/ml', 'mg/0.5ml': 'mg/ml',
             'mg/0.8ml': 'mg/ml', 'mg/1.7ml': 'mg/ml', 'mg/10ml': 'mg/ml', 'mg/20ml': 'mg/ml', 'mg/2ml': 'mg/ml',
             'mg/50ml': 'mg/ml'}

# Assign back rather than using replace(..., inplace=True) on a column selection, which
# does not modify the frame under pandas Copy-on-Write.
treatment_data["Dosage_Units"] = treatment_data["Dosage_Units"].replace(unit_dict)




In [33]:
# Load the "Treatment" sheet of the data-dictionary workbook. The path is relative, so the
# workbook must be in the notebook's working directory.
dictionary_detailed = pd.read_excel("Release_Data_Dictionary_External.xlsx", sheet_name = "Treatment")

In [34]:
# Write the release workbook <output_folder>/<sheet_name>_<version_num>.xlsx
# (i.e. Treatment_History_<release_number>.xlsx).
# The engine is pinned to xlsxwriter because the formatting cell that follows calls
# XlsxWriter-only methods (workbook.add_format, worksheet.set_column / set_row).
treat_writer = pd.ExcelWriter(f'{output_folder}{file_sep}{sheet_name}_{version_num}.xlsx', engine="xlsxwriter")

# sheet_name is passed by keyword: pandas deprecates positional arguments to to_excel
# other than the writer itself.
baseline_treat_table1.to_excel(treat_writer, sheet_name="Summary_of_Unique_Conditions")
#baseline_treat_table2.to_excel(treat_writer, "Summary_of_Treatments")
unique_treat.to_excel(treat_writer, sheet_name="Conditions by Participant", index=False)
treatment_data.to_excel(treat_writer, sheet_name="All Treatment Data", index=False)
dictionary_detailed.to_excel(treat_writer, sheet_name='Data_Dictionary', index=False)


#baseline_trans.to_excel(trans_writer, "Transplant Status at Baseline")
#transplant_data.to_excel(trans_writer, "All Transplant Visits")

In [35]:
# Handles to the workbook and the summary worksheet behind the Excel writer.
workbook = treat_writer.book
worksheet = treat_writer.sheets['Summary_of_Unique_Conditions']

# Shared cell format for all summary columns: Arial 10pt, centered.
column_fmt = workbook.add_format({'font_name': 'Arial', 'font_size': 10, 'align': 'center'})

# (first column, last column, width) for each column range of the summary sheet.
column_layout = (
    (0, 0, 30),    # column header names
    (1, 1, 50),    # sub column header names
    (2, 3, 12),    # data
    (4, 17, 20),   # data
)
for first_col, last_col, width in column_layout:
    worksheet.set_column(first_col, last_col, width, column_fmt)

# Wrap text in row index 1 and set its height to 30.
row_fmt = workbook.add_format({'text_wrap': True})
worksheet.set_row(1, 30, row_fmt)

0

In [36]:
# Close the Excel writer, which saves the workbook to disk.
# NOTE(review): the database connection `conn` opened earlier is not closed anywhere in
# the visible notebook — consider closing it once the last query has run.
treat_writer.close()