This is the **1st** Notebook in the clustering pipeline. It allows you to take the semi-raw riskfactor data (prepped by Patricia), as well as the raw biomarker data, and prep it for future analysis, by putting it into a DataSet object.

Use <u>***pappas_tadam***</u> virtual environment.

In [None]:
# Set this to whatever directory GoodCopy is in, make sure to add a / at the end.
# Every path in this notebook is built as home_dir + "GoodCopy/...", so the
# trailing slash is required.

home_dir = "/home/l/lungboy/tadam/Project/"

# Importing Packages

In [None]:
import numpy as np
import pandas as pd

import sys

sys.path.append(home_dir + 'GoodCopy/Functions')

import FunctionsOOPGood as func

# Risk Factor Data

This data was pre-prepped by Patricia to remove missingness, impute NAs and scale data.

In [None]:
# Read in data prepped by Patricia
# `data` is the scaled one-hot table and `data_unscaled` the unscaled one-hot table.
# NOTE(review): later cells assign raw_data_removed.index to both tables, which
# presumes they have the same rows in the same order as risk_factors.csv - confirm.

data = pd.read_csv(home_dir + "GoodCopy/Data/risk_factors.cleaned.scaled.one_hot.csv")
data_unscaled = pd.read_csv(home_dir + "GoodCopy/Data/risk_factors.cleaned.unscaled.one_hot.csv")

In [None]:
# Display the unscaled risk-factor table for a visual check
data_unscaled

# Biomarker Data

I prepped this data in the following way

### Importing

In [None]:
# Importing Biomarker Data
# header=5: the column names are taken from the sixth row of each file and the
# rows above it are skipped.

bio_data1 = pd.read_excel(home_dir + "GoodCopy/Data/BiomarkerRaw/AlereData_readable.xlsx",header = 5)
bio_data2 = pd.read_csv(home_dir + "GoodCopy/Data/BiomarkerRaw/AlereData_2.csv",header= 5)

In [None]:
# Find the columns that appear in both biomarker files, remove them from the
# second file, then join the two files side by side.

bio2_columns = bio_data2.columns.tolist()
drop_list = [name for name in bio_data1.columns.tolist() if name in bio2_columns]

# verify_integrity=True makes concat raise if any column label is still duplicated
bio_data2.drop(columns=drop_list, inplace=True)
bio_data = pd.concat([bio_data1, bio_data2], axis=1, verify_integrity=True)

### Cleaning

The following process is a bit messy: the biomarker data contains overlapping patient IDs, because some IDs were assigned to two different patients at two different sites. We therefore had to cross-reference each ID with its site to select the correct patient.

In [None]:
# Load the raw risk-factor file, indexed by REGID (this is the file that
# carries the REGIDs), and collect the IDs of the patients we use.
# NOTE(review): this read uses encoding 'unicode_escape' while the site-label and
# PE-label cells read the same file with "cp1252" - confirm which one is intended.

raw_data_removed = pd.read_csv(
    home_dir + "GoodCopy/Data/risk_factors.csv",
    encoding='unicode_escape',
    index_col="regid",
).drop(columns="Unnamed: 0")
ids = raw_data_removed.index.to_numpy()

In [None]:
# Display the raw risk-factor table (indexed by regid) for a visual check
raw_data_removed

In [None]:
# Cutting biomarker list down to those patients only
#
# A biomarker row is kept when (a) its subject ID is one of the risk-factor
# regids, (b) no earlier row with that ID has already been kept, and (c) its
# site matches the patient's "centre" in the risk-factor data, either exactly
# or through one of the known spelling differences below. Everything else is
# removed.

# Biomarker "Site" spelling -> risk-factor "centre" spelling for the same site
site_aliases = {
    "Adelaide Research & Innovation": "Adelaide University",
    "University of Manchester": "Manchester University",
    "Kings College London": "Kings College, London",
    "Cork University": "University College, Cork",
    "Leeds (St. James)": "University of Leeds",
}

# Build the lookups once instead of once per row
subject_ids = bio_data["SCOPE Subject ID"].tolist()
sites = bio_data["Site"].tolist()
known_ids = set(ids.tolist())

remove_list = []   # row positions to remove
check_list = []    # IDs that were checked already and kept (in order)
kept_ids = set()   # same IDs as check_list, for fast membership tests

for i, (ID, site) in enumerate(zip(subject_ids, sites)):
    # Unknown patient, or a second row for a patient we already kept
    if ID not in known_ids or ID in kept_ids:
        remove_list.append(i)
        continue

    centre = raw_data_removed.loc[ID, "centre"]
    if centre == site or site_aliases.get(site) == centre:
        check_list.append(ID)
        kept_ids.add(ID)
    else:
        # Same ID but a different site: this is a different patient
        remove_list.append(i)

print(len(remove_list))

# remove_list holds row positions; translate them to index labels so the drop
# is correct even if bio_data does not carry the default 0..n-1 index.
bio_data_removed = bio_data.drop(index=bio_data.index[remove_list])

In [None]:
# Drop columns with more than 3000 NAs
# Count the missing values per column once, then remove all offending
# columns in a single in-place drop.

na_per_column = bio_data_removed.isna().sum()
too_sparse = [name for name in bio_data_removed.columns if na_per_column[name] > 3000]
bio_data_removed.drop(columns=too_sparse, inplace=True)

In [None]:
# MinMax Scale
# Rescale every biomarker column to the range [1, 2]. The first six columns
# are excluded from scaling and do not appear in the scaled table.
# NOTE(review): presumably those six are the ID/site/metadata columns - confirm.

from sklearn.preprocessing import MinMaxScaler

biomarker_values = bio_data_removed.iloc[:, 6:]

scaler = MinMaxScaler(feature_range=(1, 2))
scaled_bio_data = pd.DataFrame(
    scaler.fit_transform(biomarker_values),
    index=bio_data_removed.index,
    columns=biomarker_values.columns.tolist(),
)
scaled_bio_data.to_csv(home_dir + "GoodCopy/Data/biomarker.rmNA.scaled.csv")

### Normalizing

We chose to normalize each patient's data by biomarkers that hold a similar rank across patients. For example, if "Adam 9 3b" is the 3rd-highest biomarker in 95% of patients, its rank is stable from patient to patient, so we can use it as a reference to normalize patients against.

In [None]:
# For every patient, list the biomarker names ordered from highest to lowest
# scaled value. In norm_df each row is a patient and column k holds the name
# of that patient's (k+1)-th highest biomarker.

norm_list = [
    patient_values.sort_values(ascending=False).index.tolist()
    for _, patient_values in scaled_bio_data.iterrows()
]

norm_df = pd.DataFrame(norm_list)
display(norm_df)

In [None]:
# Find the most commonly occurring protein in each rank, i.e. for the 4th highest biomarker, the most
# frequent biomarker among all patients is C-Met 111a.
# If several biomarkers tie at a rank, mode() returns one row per tied value.

norm_df.mode(axis=0)

In [None]:
# Build ranked_vars_df with one row per rank position:
#   variable - the biomarker that most often occupies that rank position
#   count    - how many patients have that biomarker at that position
#   rank     - the rank position (column of norm_df)
# Rows are ordered by count, highest first; ties are broken by the lower rank.
# Biomarkers with high counts sit at the same rank in most patients.

# Per-rank mode, computed once (row 0 holds the first mode of every column)
mode_table = norm_df.mode()

ranked_vars = [
    [mode_table.at[0, rank], int(norm_df[rank].value_counts().iloc[0]), rank]
    for rank in norm_df.columns
]

# list.sort is stable, so the tie-break on rank is deterministic. This replaces
# the manual repeated-maximum search, which relied on np.NINF (removed in
# NumPy 2.0).
ranked_vars.sort(key=lambda row: (-row[1], row[2]))

ranked_vars_df = pd.DataFrame(ranked_vars, columns=["variable", "count", "rank"])
ranked_vars_df.head()

In [None]:
# Normalize by top 3 most frequent variables from ranked_vars_df
#
# Select the three reference biomarkers explicitly (highest count, ties going
# to the lower rank) instead of slicing index labels 0..2, whose result
# depends on how ranked_vars_df happens to be ordered and labelled.
top3_variables = (
    ranked_vars_df
    .sort_values(by=["count", "rank"], ascending=[False, True])["variable"]
    .head(3)
    .tolist()
)

# Divide every patient's row by the sum of that patient's three reference
# biomarkers (one vectorised operation instead of a per-patient loop).
# NOTE: re-running this cell normalizes the already-normalized table again.
reference_totals = scaled_bio_data[top3_variables].sum(axis=1)
scaled_bio_data = scaled_bio_data.div(reference_totals, axis=0)

# NOTE(review): the output name has no ".csv" extension; left unchanged in case
# other notebooks read this exact path - confirm.
scaled_bio_data.to_csv(home_dir + "GoodCopy/Data/biomarker.rmNA.scaled.normalized_top3")
scaled_bio_data

# Site Labels

prepping site labels for DataSet object

In [None]:
# Pull the "centre" column (site label per patient) out of the raw
# risk-factor file and save it on its own.

site_labels = pd.read_csv(home_dir + "GoodCopy/Data/risk_factors.csv", encoding = "cp1252")["centre"]
site_labels.to_csv(home_dir + "GoodCopy/Data/site_labels.csv")

# PE Labels

prepping PE labels for DataSet object

In [None]:
# Pull the "f34_pet" column (PE label per patient) out of the raw
# risk-factor file and save it on its own.

pe_labels = pd.read_csv(home_dir + "GoodCopy/Data/risk_factors.csv", encoding = "cp1252")["f34_pet"]
pe_labels.to_csv(home_dir + "GoodCopy/Data/pe_labels.csv")

# Outcome Variables

The workflow I used to prep outcome variables

In [None]:
# Outcome variables to carry forward, stored in sorted order. They were picked
# by starting after week 24 and looking for variables that could help
# differentiate severity and type of disease. Add more here if needed.

outcome_variables = sorted([
    "f34_pet",
    "f34c_gest_diag_pet",
    "f34c_mat_adm_gest_PET_PTB_SGA",
    "f34c_gest_dev_gh",
    "f35_Max_sBP_Adm",
    "f35_dBP_Adm",
    "f35_Max_dBP_Adm",
    "f35_sBP_Adm",
    "f35_Max_Pulse_Adm",
    "f35_Max_proturia_dipstick_Adm",
    "f35c_f24_any_proturia",
    "f37_Hb_Lowest_ap",
    "f37_Hb_Highest_ap",
    "f37_hct_Lowest_ap",
    "f37_hct_Highest_ap",
    "f37_wcc_lowest_ap",
    "f37_wcc_highest_ap",
    "f37_platelets_lowest_ap",
    "f37_platelets_highest_ap",
    "f37_prot_creat_lowest_ap",
    "f37_prot_creat_highest_ap",
    "f37_24hproturia_lowest_ap",
    "f37_24hproturia_highest_ap",
    "f37_creat_Lowest_ap_umol",
    "f37_creat_highest_ap_umol",
    "f37_urate_lowest_ap_mmol",
    "f37_urate_highest_ap_mmol",
    "f37_AST_lowest_ap",
    "f37_AST_highest_ap",
    "f37_ALT_lowest_ap",
    "f37_ALT_highest_ap",
    "f37_GGT_lowest_ap",
    "f37_GGT_highest_ap",
    "f37_billi_lowest_ap_umol",
    "f37_billi_highest_ap_umol",
    "f37_Alb_lowest_ap",
    "f37_Alb_highest_ap",
    "f37_LDH_lowest_ap",
    "f37_LDH_highest_ap",
    "f37_Haptoglobin_lowest_ap",
    "f37_APTT_highest_ap",
    "f37_PR_highest_ap",
    "f37_Ddimer_highest_ap",
    "f37_CRP_lowest_ap",
    "f39_fetal_outcome",
    "f39c_final_del_gest",
    "f39_status_after_2nd_vst",
    "f39_pet",
    "f39c_pet_lt_37w",
    "f39c_pet_ge_37w",
    "f39c_SGA_AGA_LGA",
    "f25_Placental_Wgt",
    "f26_Birthwgt",
    "f26_Length",
    "f38c_hellp",
    "f38c_ellp",
])

In [None]:
# Load in data dictionary
# The following cell looks variables up by the "Variable SAS name" column and
# reads the " Variable Explanation\n" and "Database Categories" columns.

data_dict = pd.read_csv(home_dir + "GoodCopy/Data/DataDict_csv.csv")

In [None]:
# Build a table with one row per outcome variable, holding its explanation
# and its database category as recorded in the data dictionary.

# Pull the three data-dictionary columns out once
sas_names = list(data_dict["Variable SAS name"])
explanations = list(data_dict[" Variable Explanation\n"])
categories = list(data_dict["Database Categories"])

# Row of the data dictionary describing each outcome variable (first match);
# list.index raises ValueError if a variable is missing from the dictionary.
dict_rows = [sas_names.index(name) for name in outcome_variables]

variable_explanation = [explanations[row] for row in dict_rows]
variable_type = [categories[row] for row in dict_rows]

outcomes = pd.DataFrame({
    "variable": outcome_variables,
    "descriptions": variable_explanation,
    "type": variable_type,
})
outcomes.head()

In [None]:
# Keep only the outcome-variable columns of the raw risk-factor table
# (all patients, original column order).

is_outcome_column = raw_data_removed.columns.isin(outcome_variables)
data_outcome = raw_data_removed.loc[:, is_outcome_column].copy()

data_outcome

In [None]:
# Replace every entry that is not >= 0 with NA (negative values are used as
# missing-value codes in this data).

data_outcome = data_outcome.where(data_outcome >= 0)

In [None]:
# Save the outcome table (all patients, negative codes already replaced by NA)
data_outcome.to_csv(home_dir + "GoodCopy/Data/outcome_variables.unscaled.csv")

### Separating into continuous/categorical

This needs to be done because hypothesis testing requires different tests for different variable types.

In [None]:
# Category strings that mark a variable as ordinal. The next cell compares each
# outcome variable's "type" against these strings with an exact match, so they
# must stay character-for-character identical to the data-dictionary entries.
ordinal_vars = ["1 SGA\n2 AGA\n3 LGA\n-77 missing birthweight customised centile comprised:\nmiscarriage or termination <20w and not a case (n=15)\nmiscarriage or termination <20w and spont PTB case but no birthweight customised centile (n=3)\ntermination 20-22w or FDIU 20-22w with no birthweight customised centile (n=4)",
               "1 Neg/trace \n2 1+ or 0.3 g/L  \n3 2+ or 1 g/L \n4 3+ or >=3 g/L  \n-67  No result among PET cases\n-99 No result and case but not PET (n=525, 9.3%)\n-909 Not a case (n=4580, 81.4%)",
               '1 "Yes, pregnancy outcome known >=20w"  \n2 Pregnancy ended <20w  \n3 "Pregnancy ended <20w, but CASE"  \n4 Lost to follow up']

In [None]:
# Keep the outcome columns whose data-dictionary type either mentions
# "ontinuous" (matches both "Continuous" and "continuous") or is one of the
# ordinal category strings.
outcome_names = outcomes["variable"].tolist()

cont_ord_columns = []
for col in data_outcome.columns:
    col_type = outcomes["type"][outcome_names.index(col)]
    if "ontinuous" in col_type or col_type in ordinal_vars:
        cont_ord_columns.append(col)

outcome_data_cont = data_outcome.loc[:, cont_ord_columns].copy()

outcome_data_cont.to_csv(home_dir + "GoodCopy/Data/outcome_variables.unscaled.cont_ord.csv")

In [None]:
# The binary/categorical outcomes are the outcome columns that did not go
# into the continuous/ordinal table.
outcome_data_bin = data_outcome.drop(columns=outcome_data_cont.columns)

outcome_data_bin.to_csv(home_dir + "GoodCopy/Data/outcome_variables.unscaled.bin_cat.csv")

# Reprepping for BioMarker Missingness

After we started going through the biomarker data, we noticed that some patients in the risk-factor data were not present in the biomarker data, so we had to remove them. The same has to be done for the PE labels, the site labels and the outcome tables.

In [None]:
# Select for only those patients in biomarker data
#
# Every table is first given the regid index of raw_data_removed, then cut
# down to the patients whose regid appears as a "SCOPE Subject ID" in
# bio_data_removed. The *_removed copies are what gets written out.

# getting regids for all datasets as index
# NOTE(review): this is a positional assignment - it presumes every table has
# the same rows in the same order as raw_data_removed. Confirm.
for table in (data, data_unscaled, site_labels, pe_labels,
              data_outcome, outcome_data_bin, outcome_data_cont):
    table.index = raw_data_removed.index

# One membership mask shared by all tables (they now share the same index),
# instead of rebuilding the ID list and dropping row by row for each patient.
biomarker_ids = set(bio_data_removed["SCOPE Subject ID"])
in_biomarker = np.array([p in biomarker_ids for p in data.index], dtype=bool)

risk_data_removed = data.loc[in_biomarker].copy()
risk_data_removed_unscaled = data_unscaled.loc[in_biomarker].copy()
site_labels_removed = site_labels.loc[in_biomarker].copy()
pe_labels_removed = pe_labels.loc[in_biomarker].copy()
data_outcome_removed = data_outcome.loc[in_biomarker].copy()
outcome_data_bin_removed = outcome_data_bin.loc[in_biomarker].copy()
outcome_data_cont_removed = outcome_data_cont.loc[in_biomarker].copy()

# Number of patients removed (kept under the original counter name)
i = int((~in_biomarker).sum())

risk_data_removed.to_csv(home_dir + "GoodCopy/Data/risk_factors.cleaned.scaled.one_hot.biomarker.csv")
risk_data_removed_unscaled.to_csv(home_dir + "GoodCopy/Data/risk_factors.cleaned.unscaled.one_hot.biomarker.csv")
site_labels_removed.to_csv(home_dir + "GoodCopy/Data/site_labels.biomarker.csv")
pe_labels_removed.to_csv(home_dir + "GoodCopy/Data/pe_labels.biomarker.csv")
data_outcome_removed.to_csv(home_dir + "GoodCopy/Data/outcome_variables.unscaled.biomarker.csv")
outcome_data_bin_removed.to_csv(home_dir + "GoodCopy/Data/outcome_variables.unscaled.bin_cat.biomarker.csv")
outcome_data_cont_removed.to_csv(home_dir + "GoodCopy/Data/outcome_variables.unscaled.cont_ord.biomarker.csv")

# Cutting to only RFE selected variables

Patricia did the RFE

In [None]:
# Load the RFE summary (indexed by variable name) and order it from most to
# fewest votes.

rfe_sum = pd.read_csv(home_dir + "GoodCopy/Data/rfe_summary_v2.csv", index_col='varname')
rfe_sum = rfe_sum.sort_values("votes", ascending=False)
rfe_sum

In [None]:
# Remove every variable that received 0 RFE votes. Only the scaled data is
# cut down, as the unscaled data is mainly used for hypothesis testing on
# cluster assignments.

zero_vote_variables = [name for name in rfe_sum.index if rfe_sum.loc[name, "votes"] == 0]
risk_data_removed_drop = risk_data_removed.drop(columns=zero_vote_variables)

risk_data_removed_drop.to_csv(home_dir + "GoodCopy/Data/risk_factors.cleaned.scaled.one_hot.biomarker.RFE.csv")

# Putting everything into a dataset object and Creating a matched dataset object

In [None]:
# Bundle everything prepared in this notebook into one DataSet object.
# NOTE: this rebinds `data`, which until now held the scaled risk-factor
# DataFrame; earlier cells must be re-run from the top before being reused.

# The leftover "Unnamed: 0" column is removed from both risk-factor tables
scaled_inputs = risk_data_removed_drop.drop(columns="Unnamed: 0")
unscaled_inputs = risk_data_removed_unscaled.drop(columns="Unnamed: 0")

data = func.DataSet(
    input_data=scaled_inputs,
    input_data_unscaled=unscaled_inputs,
    bio_data=scaled_bio_data,
    pe_labels=pe_labels_removed,
    site_labels=site_labels_removed,
    outcome_bin_cat=outcome_data_bin_removed,
    outcome_cont_ord=outcome_data_cont_removed,
    data_dict=data_dict,
)

In [None]:
# Save the DataSet object under GoodCopy/Objects (here `data` is the DataSet, not a DataFrame)
data.save_DataSet(home_dir + "GoodCopy/Objects/data_object")