#### Pre-requisites
The following packages need to be installed in your notebook environment:
1. <a href = https://dev.mysql.com/doc/connector-python/en/>mysql.connector</a>
2. <a href = https://pandas.pydata.org/>pandas</a>
3. <a href = https://numpy.org/>numpy</a>
4. <a href = https://pypi.org/project/tableone/>tableone</a>
5. <a href = https://pypi.org/project/openpyxl/>openpyxl</a>

Be sure to install these packages in one of the directories on the system path (`import sys; print(sys.path)`) for the notebook environment.

You will need access (username and password) to seronet vaccine response database, `seronetdb-Vaccine_Response`, accessible at the AWS endpoint: `seronet-dev-instance.cwe7vdnqsvxr.us-east-1.rds.amazonaws.com`. <br>
**Note:** Store all mysql connector parameters in a `.env` file as shown below. <i>The `.env` needs to be in the same directory as the notebooks</i>.

Ensure that the Excel workbook <i>Release_Data_Dictionary_External.xlsx</i> is in the same folder as this notebook. This workbook is also versioned on GitHub.

#### Post-processing files
<u>Summary sheet:</u>
1. Remove all borders from Table 1.
2. Shade table rows in grey/white.
3. Right justify second column.
4. Center all columns reporting values.
5. Autoformat column widths.
6. Highlight all notes in bold.

<u>Detailed report sheets:</u>
1. All columns left justified.
2. Autoformat column widths.
3. Fill all empty cells with "Not Reported".

In [1]:
import mysql.connector as connection
import pandas as pd
import numpy as np
import os
from tableone import TableOne
from openpyxl import load_workbook, Workbook
from openpyxl.styles import Font
import sqlalchemy as sd
import datetime
import icd10

In [2]:
# Manually maintained last-edit date stamp (MM/DD/YYYY string); it is not
# referenced by any other cell visible in this notebook section.
last_edit = '12/13/2024'

In [3]:
# Load the "Detailed_Report" sheet of the 4.0.0 visit workbook straight into a
# DataFrame (the path is passed inline rather than rebinding one name twice).
v4_visits = pd.read_excel(
    r"C:\Users\breadsp2\Documents\Release_4.0.0\Participant_Visit_Info_4.0.0.xlsx",
    sheet_name="Detailed_Report",
)

# Release switch read by the release-settings cell: True selects the 4.2.0
# version/output folder, False selects 5.2.0.
V4_fitler = False

In [4]:
# Silence pandas' chained-assignment warning for the whole notebook
# (pandas' default for this option is 'warn').
pd.set_option("mode.chained_assignment", None)

In [5]:
'''
Create an environment file to store hostname, database, username, and password.
This cell reads the file named "test.env" from the notebook's working directory.
==============
Format:
HOST=a.b.c.d
DB=<my.database>
USER=john
PWD=abc#123
==============
Note: no space around the "=" sign. Do not put any of the values in quotes.
Blank lines and lines starting with "#" are ignored. Only the first "=" on a
line separates key from value, so values (e.g. passwords) may contain "=".
'''

env = {}
with open("test.env") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        # Tolerate trailing newlines / spacer lines and whole-line comments.
        if not line or line.startswith("#"):
            continue
        # partition() splits on the FIRST "=" only; a plain split("=") would
        # fail to unpack whenever the value itself contains "=".
        k, sep, v = line.partition("=")
        if not sep:
            raise ValueError(f"test.env line {line_no}: expected KEY=VALUE, got {line!r}")
        env[k.strip()] = v.strip()

# A missing key raises KeyError here, naming the absent setting.
HOST = env["HOST"]
DB = env["DB"]
USER = env["USER"]
PWD = env["PWD"]

In [6]:
# Release settings. The version number is chosen once from the V4_fitler
# switch and the output folder is derived from it, so the two cannot drift
# apart (previously the version literal was repeated inside each folder path).
version_num = "4.2.0" if V4_fitler else "5.2.0"
output_folder = r"C:\Users\breadsp2\Desktop\Release_" + version_num

# Path separator used by later cells when building file names.
file_sep = os.path.sep
release_date = '2024-10-01'

In [7]:
# Base name of the output workbook; the ExcelWriter cell builds the file name
# as f"{sheet_name}_{version_num}.xlsx" inside output_folder.
sheet_name = "Participants_in_the_Transplant_Cohort"

In [8]:
from urllib.parse import quote

# Raw connection settings as loaded from the environment file.
creds = {'usr': USER, 'pwd': PWD, 'hst': HOST, "prt": 3306, 'dbn': DB}
connstr = "mysql+mysqlconnector://{usr}:{pwd}@{hst}:{prt}/{dbn}"

# Percent-encode the user name and password before placing them in the URL.
# Characters such as "#", "@", "/" or ":" in a password (e.g. the documented
# example "abc#123") would otherwise be parsed as URL delimiters and the
# connection would fail or use the wrong credentials. safe="" encodes "/" too.
url_parts = dict(creds, usr=quote(USER, safe=""), pwd=quote(PWD, safe=""))

engine = sd.create_engine(connstr.format(**url_parts))
conn = engine.connect()

In [9]:
# One row per normalized visit, with the Seronet participant ID attached by
# joining Normalized_Visit_Info to Participant on Research_Participant_ID.
# Adjacent string literals are concatenated by the parser, yielding the same
# SQL text as the original "+" chain.
visit_query = (
    "SELECT p.Seronet_Participant_ID, nv.Research_Participant_ID, nv.Seronet_Cohort, nv.Normalized_Visit_Index,  "
    "nv.Visit_Info_ID  From Normalized_Visit_Info as nv "
    "join Participant as p on nv.Research_Participant_ID = p.Research_Participant_ID"
)
All_Visits = pd.read_sql(visit_query, conn)

In [10]:
# Demographics workbook for the selected release; only the detailed sheet is used.
demo_file = f"{output_folder}{file_sep}Demographics_{version_num}.xlsx"
demo_data = pd.read_excel(demo_file, sheet_name="Detailed_Report")

In [11]:
# Visit-info workbook for the selected release; only the detailed sheet is used.
curr_visit_file = f"{output_folder}{file_sep}Participant_Visit_Info_{version_num}.xlsx"
curr_visit_data = pd.read_excel(curr_visit_file, sheet_name="Detailed_Report")

In [12]:
# Step 1: inner-merge against the participant IDs found in the demographics
# report. With no `on=` given, pandas joins on the shared column name(s).
demo_participant_ids = demo_data["Seronet_Participant_ID"]
All_Visits = All_Visits.merge(demo_participant_ids)

# Step 2: right-merge against the (participant, visit index) pairs of the
# current visit report, so every row of that report is retained.
report_visit_keys = curr_visit_data[["Seronet_Participant_ID", 'Normalized_Visit_Index']]
All_Visits = All_Visits.merge(report_visit_keys, how="right")

In [13]:
#tranplant_data = pd.read_sql(("SELECT Visit_Info_ID, Organ_Transplant_Description_Or_ICD10_codes FROM Participant_Other_Condition_Names"), conn)

In [14]:
#All_Visits = All_Visits.merge(tranplant_data)

In [15]:
# Pull the full Organ_Transplant_Cohort table from the database.
cohort_query = "SELECT * FROM Organ_Transplant_Cohort"
cohort_data = pd.read_sql(cohort_query, conn)

In [16]:
# Inner-merge visits with the transplant cohort table on their shared column
# name(s) (no explicit `on=` is given, so pandas uses the common columns).
transplant_data = pd.merge(All_Visits, cohort_data)

In [17]:
'''
This block adds a CBC column to dataframe based on the Research_Participant_ID.
Each ID prefix maps to one site name; IDs matching no prefix get "Not Reported".
'''
# (ID prefix, site name) pairs, checked in order by np.select.
cbc_by_prefix = [
    ('14', "Mount_Sinai"),
    ('27', "University of Minnesota"),
    ('41', "Feinstein_Northwell"),
    ('32_22', "ASU: Midwestern"),
    ('32_33', "ASU: Dignity Health"),
    ('32_44', "ASU: ValleyWise"),
    ('32_77', "ASU: Phoenix Childrens Hospital"),
    ('32_55', "Columbia University"),  # 32_-prefixed, but labelled Columbia
]

participant_ids = transplant_data['Research_Participant_ID']
conditions = [participant_ids.str.startswith(prefix) for prefix, _ in cbc_by_prefix]
values = [site for _, site in cbc_by_prefix]

# np.select's default fill value is the integer 0, which silently replaced the
# intended "Not Reported" placeholder (and cannot be combined with string
# choices on recent NumPy). Pass the placeholder explicitly instead.
transplant_data['CBC'] = np.select(conditions, values, default="Not Reported")

In [18]:
# Keep only the baseline visit (normalized visit index 1). The explicit
# .copy() gives later cells an independent frame to add and overwrite columns
# on, instead of a query result that pandas flags as a possible slice of
# transplant_data.
baseline_trans = transplant_data.query("Normalized_Visit_Index == 1").copy()

In [19]:
# Replace missing transplant descriptions with a placeholder so the "|" split
# in the following cell always receives a string. The result is assigned back
# to the column: calling fillna(inplace=True) on a column selection is
# deprecated and leaves the frame unchanged under pandas Copy-on-Write.
for text_col in ("Organ Transplant", "Organ_Transplant_Other"):
    baseline_trans[text_col] = baseline_trans[text_col].fillna("No Data")

In [20]:
#baseline_trans["Organ_Transplant_Description_Or_ICD10_codes"] = [i.split("|") for i in baseline_trans["Organ_Transplant_Description_Or_ICD10_codes"]] 
# Turn each "|"-delimited string into a list of entries, column by column.
for pipe_col in ("Organ Transplant", "Organ_Transplant_Other"):
    baseline_trans[pipe_col] = [entry.split("|") for entry in baseline_trans[pipe_col]]

In [21]:
# Row by row, concatenate the two per-row lists into one combined list.
baseline_trans["All Organ Transplants"] = [
    primary + other
    for primary, other in zip(baseline_trans["Organ Transplant"], baseline_trans["Organ_Transplant_Other"])
]

In [22]:
# Spot check: display the baseline row(s) for a single participant.
baseline_trans[baseline_trans["Seronet_Participant_ID"] == 'SN796387']

Unnamed: 0,Seronet_Participant_ID,Research_Participant_ID,Seronet_Cohort,Normalized_Visit_Index,Visit_Info_ID,Organ Transplant,Organ_Transplant_Other,Number_of_Hematopoietic_Cell_Transplants,Number_Of_Solid_Organ_Transplants,Date_of_Latest_Hematopoietic_Cell_Transplant_Duration_From_Index,Date_of_Latest_Solid_Organ_Transplant_Duration_From_Index,Update,Organ_Transplant_Cohort_Comments,CBC,All Organ Transplants
233,SN796387,14_T66411,Transplant,1,14_T66411 : B01,[Other],[Trachea],0,1,,-229,Baseline Information,,Mount_Sinai,"[Other, Trachea]"


In [23]:
# Collapse each row's list of transplant entries into one " | "-joined label:
#   * duplicates are dropped with dict.fromkeys, which keeps first-seen order.
#     The previous list(set(...)) produced an arbitrary order that can change
#     between interpreter runs (string hash randomisation), so the same
#     participant could get "A | B" in one run and "B | A" in the next, which
#     then shows up as different categories in the summary table;
#   * placeholder entries carry no organ information and are filtered out.
# Rows left with no real entry become '' (removed by the following cell).
placeholder_entries = {'Other', 'N/A', 'Not Reported', 'No Data'}

baseline_trans["All Organ Transplants"] = [
    " | ".join(entry for entry in dict.fromkeys(entries) if entry not in placeholder_entries)
    for entries in baseline_trans["All Organ Transplants"]
]


In [24]:
# Label missing transplant counts and durations as "Not Reported".
# Fixes two problems: the placeholder was misspelled "Not Repoted" (and is
# written verbatim into the released workbook), and fillna(inplace=True) on a
# column selection is deprecated and does not update the frame under pandas
# Copy-on-Write, so the filled column is assigned back explicitly.
for report_col in (
    "Number_of_Hematopoietic_Cell_Transplants",
    "Number_Of_Solid_Organ_Transplants",
    "Date_of_Latest_Hematopoietic_Cell_Transplant_Duration_From_Index",
    "Date_of_Latest_Solid_Organ_Transplant_Duration_From_Index",
):
    baseline_trans[report_col] = baseline_trans[report_col].fillna("Not Reported")

In [25]:
# Discard rows whose combined transplant label ended up as an empty string.
has_transplant_label = baseline_trans["All Organ Transplants"] != ''
baseline_trans = baseline_trans[has_transplant_label]

In [26]:
# Columns kept for the report sheet, in output order.
new_cols = [
    "Seronet_Participant_ID",
    "Seronet_Cohort",
    "Normalized_Visit_Index",
    "All Organ Transplants",
    "Number_of_Hematopoietic_Cell_Transplants",
    "Number_Of_Solid_Organ_Transplants",
    "Date_of_Latest_Hematopoietic_Cell_Transplant_Duration_From_Index",
    "Date_of_Latest_Solid_Organ_Transplant_Duration_From_Index",
]
baseline_trans = baseline_trans.loc[:, new_cols]

In [27]:
# Inputs for the summary table: every summarised column is also listed as
# categorical, and the table is grouped by cohort.
categorical = [
    "Number_of_Hematopoietic_Cell_Transplants",
    "Number_Of_Solid_Organ_Transplants",
    "All Organ Transplants",
]
columns = list(categorical)  # same names, separate list object
groupby = 'Seronet_Cohort'

In [28]:
# Build the summary table. Arguments are passed by keyword because the
# positional order of TableOne's parameters is not stable across tableone
# releases (newer versions place `continuous` between `categorical` and
# `groupby`), so a positional call can bind 'Seronet_Cohort' to the wrong
# parameter.
baseline_trans_table1 = TableOne(
    baseline_trans,
    columns=columns,
    categorical=categorical,
    groupby=groupby,
)

In [29]:
# Data dictionary sheet for the transplant report; the workbook is read from
# the notebook's working directory.
dictionary = pd.read_excel(
    "Release_Data_Dictionary_External.xlsx",
    sheet_name="Organ_Transplant_Dictionary",
)

In [30]:
# The output workbook is written to
# <output_folder>/<sheet_name>_<version_num>.xlsx.
# The engine is pinned to xlsxwriter because the formatting cell that follows
# uses the xlsxwriter API (workbook.add_format / worksheet.set_column); letting
# pandas choose the engine would break that cell when it falls back to openpyxl.
trans_writer = pd.ExcelWriter(
    f'{output_folder}{file_sep}{sheet_name}_{version_num}.xlsx',
    engine="xlsxwriter",
)

# sheet_name is passed by keyword: positional arguments after the writer are
# deprecated in DataFrame.to_excel.
baseline_trans_table1.to_excel(trans_writer, sheet_name="Summary")
baseline_trans.to_excel(trans_writer, sheet_name="Transplant Status at Baseline", index=False)
dictionary.to_excel(trans_writer, sheet_name='Data_Dictionary', index=False)

In [31]:
# Apply fonts, alignment and widths to the "Summary" sheet.
workbook = trans_writer.book
worksheet = trans_writer.sheets['Summary']

column_fmt = workbook.add_format({'font_name': 'Arial', 'font_size': 10, 'align': 'center'})

# (first column, last column, width) -- zero-based, inclusive ranges.
summary_column_widths = (
    (0, 0, 47),    # column header names
    (1, 1, 50),    # sub column header names
    (2, 3, 12),    # data
    (4, 17, 20),   # data
)
for first_col, last_col, width in summary_column_widths:
    worksheet.set_column(first_col, last_col, width, column_fmt)

# Row index 1 gets height 30 with text wrapping enabled.
row_fmt = workbook.add_format({'text_wrap': True})
worksheet.set_row(1, 30, row_fmt)

0

In [32]:
# Finalise the workbook: closing the writer is what saves the .xlsx to disk.
# NOTE(review): the database connection (`conn`) and `engine` opened earlier
# are not closed in any cell visible here -- confirm whether a later cell
# handles that, or add conn.close() / engine.dispose() once no further
# queries are needed.
trans_writer.close()