Code for generating all datasets for the Thyroid Machine Learning Models. The code assumes that the .env file in the root directory of the project defines both RAW_THYROID_PATH (the path of the raw data) and THYROID_PATH (the destination folder, which must already exist). Later cells use variables from previous cells (especially from the Unprocessed Probes Dataset cell).

In [1]:
import sys
import os
from pathlib import Path
import pandas as pd
import numpy as np

# Add the parent of the current working directory to sys.path so that the
# `config` module imported below can be found (presumably config.py lives
# there and exposes the values read from the .env file -- TODO confirm).
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Import the necessary paths (raw data folder and dataset destination folder)
from config import RAW_THYROID_PATH,THYROID_PATH

We first import the unfiltered datasets, fix the indexes and add the relevant columns for creating the ML datasets.

# Unprocessed Probes Dataset - Cancer vs Normal

We load the cvPTC, fvPTC and normal datasets. We fix the index and set the target variables, then we save the variables (X) and outcome (y) as numpy arrays in a new file.

In [2]:
cvPTC_path_unprocessed = Path(RAW_THYROID_PATH,'cvPTC_beta_values_unprocessed.txt')
fvPTC_path_unprocessed = Path(RAW_THYROID_PATH,'fvPTC_beta_values_unprocessed.txt')
norm_path_unprocessed = Path(RAW_THYROID_PATH,'norm_beta_values_unprocessed.txt')

# Label columns appended to every sample table; they are targets, not probes.
label_columns = ['cancer', 'follicolar', 'type']


def _load_unprocessed(path, cancer, follicolar, sample_type):
    """Load one unprocessed beta-value table as a samples x probes frame.

    The tab-separated file is read with its first column as index, re-indexed
    on its 'ProbeID' column and transposed so that each row is one sample.
    The three label columns are then appended with the given constant values.

    Args:
        path: Location of the tab-separated beta-value file.
        cancer: Value stored in the 'cancer' column (1 = tumour, 0 = normal).
        follicolar: Value stored in the 'follicolar' column.
        sample_type: String stored in the 'type' column.

    Returns:
        A DataFrame with one row per sample, one column per probe, plus the
        'cancer', 'follicolar' and 'type' columns.
    """
    table = pd.read_csv(path, sep='\t', index_col=0)
    table = table.set_index('ProbeID').T
    table['cancer'] = cancer
    table['follicolar'] = follicolar
    table['type'] = sample_type
    return table


# Classic variant, follicular variant and normal tissue samples.
cvPTC = _load_unprocessed(cvPTC_path_unprocessed, 1, 0, 'classic')
fvPTC = _load_unprocessed(fvPTC_path_unprocessed, 1, 1, 'follicolar')
normal = _load_unprocessed(norm_path_unprocessed, 0, 0, 'normal')

# NOTE: all_probes is taken after the label columns were appended, so it also
# contains 'cancer', 'follicolar' and 'type'. Later cells reindex on it and
# then drop those three columns, so it is intentionally left as is.
all_probes = normal.columns
num_all_probes = len(all_probes)

dataset_full_unprocessed = pd.concat([cvPTC, fvPTC, normal])

features = dataset_full_unprocessed.drop(label_columns, axis=1)
X = features.to_numpy().astype(np.float32)
y = dataset_full_unprocessed['cancer'].to_numpy().astype(np.float32)
# Take the names from the very frame that produced X so that feature_names[i]
# always describes X[:, i]; all_probes cannot be used here because it also
# holds the three label column names.
feature_names = features.columns.to_numpy()
UnfilteredPath = Path(THYROID_PATH,'UnfilteredCancerData.npy')

# np.savez writes an .npz archive into the handle; the '.npy' file name is
# kept unchanged so that existing loaders still find the file.
with open(UnfilteredPath, 'wb') as f:
    np.savez(f, X = X, y = y, feature_names = feature_names)

# Filtered Probes Dataset - Cancer vs Normal

In addition to the filtered dataset we create a zero-filled dataset, in which the probes that were filtered out are re-added as zero-filled columns, to ensure dimensional compatibility with the first layer of the neural net. We use the reindex function of pandas to add zero-filled columns where the probe has been filtered. Note that all_probes holds the column names of the normal unprocessed dataset from the unfiltered section.

In [3]:
cvPTC_path_processed = Path(RAW_THYROID_PATH,'cvPTC_beta_values_processed.txt')
fvPTC_path_processed = Path(RAW_THYROID_PATH,'fvPTC_beta_values_processed.txt')
norm_path_processed = Path(RAW_THYROID_PATH,'norm_beta_values_processed.txt')

# Label columns appended to every sample table; they are targets, not probes.
label_columns = ['cancer', 'follicolar', 'type']


def _load_processed(path, cancer, follicolar, sample_type):
    """Load one processed (filtered) beta-value table as samples x probes.

    The tab-separated file is read with its first column as index and
    transposed so that each row is one sample; the three label columns are
    then appended with the given constant values.

    Args:
        path: Location of the tab-separated beta-value file.
        cancer: Value stored in the 'cancer' column (1 = tumour, 0 = normal).
        follicolar: Value stored in the 'follicolar' column.
        sample_type: String stored in the 'type' column.

    Returns:
        A DataFrame with one row per sample, one column per probe, plus the
        'cancer', 'follicolar' and 'type' columns.
    """
    table = pd.read_csv(path, sep='\t', index_col=0).T
    table['cancer'] = cancer
    table['follicolar'] = follicolar
    table['type'] = sample_type
    return table


# Classic variant, follicular variant and normal tissue samples.
cvPTC = _load_processed(cvPTC_path_processed, 1, 0, 'classic')
fvPTC = _load_processed(fvPTC_path_processed, 1, 1, 'follicolar')
normal = _load_processed(norm_path_processed, 0, 0, 'normal')

# NOTE: filtered_probes is taken after the label columns were appended, so it
# also contains 'cancer', 'follicolar' and 'type'.
filtered_probes = normal.columns
num_filtered_probes = len(filtered_probes)

dataset_full_filtered = pd.concat([cvPTC, fvPTC, normal])

features = dataset_full_filtered.drop(label_columns, axis=1)
X = features.to_numpy().astype(np.float32)
y = dataset_full_filtered['cancer'].to_numpy().astype(np.float32)
# Take the names from the frame that produced X so that feature_names[i]
# describes X[:, i]; filtered_probes also holds the three label names.
feature_names = features.columns.to_numpy()

# Expand to the full probe set defined in the unprocessed cell. reindex puts
# NaN in newly created columns unless told otherwise, so fill_value=0 is
# required to obtain the zero-filled input the neural net expects.
# all_probes includes the label columns, hence the drop afterwards.
zeroes_data = dataset_full_filtered.reindex(columns=all_probes, fill_value=0)

X_zeroes = zeroes_data.drop(label_columns, axis=1).to_numpy().astype(np.float32)

# Dedicated variable for the filtered output so the path of the unfiltered
# dataset defined in the previous cell is not overwritten.
# np.savez writes an .npz archive; the '.npy' name is kept for existing loaders.
FilteredPath = Path(THYROID_PATH,'FilteredCancerData.npy')
with open(FilteredPath, 'wb') as f:
    np.savez(f, X = X, y = y, X_nn = X_zeroes, feature_names = feature_names)

# Subtype Unprocessed Probes - cvPTC vs fvPTC

For the subtype task we drop all normal samples and create a new dataset.

In [4]:
# Keep only tumour samples (classic and follicular variants) for the
# cvPTC-vs-fvPTC task.
is_tumour = dataset_full_unprocessed['type'] != 'normal'
dataset_full_unfiltered_subtype = dataset_full_unprocessed[is_tumour]

# Label columns are targets, not probes, so they are removed from the features.
label_columns = ['cancer', 'follicolar', 'type']
features = dataset_full_unfiltered_subtype.drop(label_columns, axis=1)
X = features.to_numpy().astype(np.float32)
# Target: 1 for the follicular variant, 0 for the classic variant.
y = dataset_full_unfiltered_subtype['follicolar'].to_numpy().astype(np.float32)
# Take the names from the frame that produced X so that feature_names[i]
# describes X[:, i]; all_probes cannot be used because it also holds the
# three label column names.
feature_names = features.columns.to_numpy()

# np.savez writes an .npz archive; the '.npy' name is kept for existing loaders.
FollicolarPath = Path(THYROID_PATH,'UnfilteredSubtypeData.npy')
with open(FollicolarPath, 'wb') as f:
    np.savez(f, X = X, y = y, feature_names = feature_names)

# Subtype Filtered Probes - cvPTC vs fvPTC

In [5]:
# Keep only tumour samples (classic and follicular variants) of the filtered
# dataset for the cvPTC-vs-fvPTC task.
is_tumour = dataset_full_filtered['type'] != 'normal'
dataset_full_filtered_subtype = dataset_full_filtered[is_tumour]

# Label columns are targets, not probes, so they are removed from the features.
label_columns = ['cancer', 'follicolar', 'type']
features = dataset_full_filtered_subtype.drop(label_columns, axis=1)
X = features.to_numpy().astype(np.float32)
# Target: 1 for the follicular variant, 0 for the classic variant.
y = dataset_full_filtered_subtype['follicolar'].to_numpy().astype(np.float32)
# Take the names from the frame that produced X so that feature_names[i]
# describes X[:, i]; filtered_probes also holds the three label names.
feature_names = features.columns.to_numpy()

# Expand to the full probe set. reindex puts NaN in newly created columns
# unless told otherwise, so fill_value=0 is required to obtain the zero-filled
# input the neural net expects. all_probes includes the label columns, hence
# the drop afterwards.
zeroes_data = dataset_full_filtered_subtype.reindex(columns=all_probes, fill_value=0)

X_zeroes = zeroes_data.drop(label_columns, axis=1).to_numpy().astype(np.float32)

# np.savez writes an .npz archive; the '.npy' name is kept for existing loaders.
FollicolarFilteredPath = Path(THYROID_PATH,'FilteredSubtypeData.npy')
with open(FollicolarFilteredPath, 'wb') as f:
    np.savez(f, X = X, y = y, X_nn = X_zeroes, feature_names = feature_names)

# Differential Methylation - Cancer vs Normal

In [6]:
# Load the list of differentially methylated CpGs; the probe identifiers are
# read from the first column of the file (used as index).
diff_cpg_path = Path(RAW_THYROID_PATH,'dmCpGs_PTC_vs_Norm_logFC1_FDR05.txt')
list_diff_cg = pd.read_csv(diff_cpg_path, sep='\t',index_col=0)
diff_cpg_index = list(list_diff_cg.index)

# Label columns are targets, not probes.
label_columns = ['cancer', 'follicolar', 'type']

# Select the differential probes (plus labels) from the full unprocessed
# dataset. This raises KeyError if a listed probe is absent from the dataset.
dataset_full_diff = dataset_full_unprocessed[diff_cpg_index + label_columns]

X = dataset_full_diff.drop(label_columns, axis=1).to_numpy().astype(np.float32)
y = dataset_full_diff['cancer'].to_numpy().astype(np.float32)

feature_names = np.array(diff_cpg_index)

# Expand to the full probe set. reindex puts NaN in newly created columns
# unless told otherwise, so fill_value=0 is required to obtain a zero-filled
# input for the neural net. all_probes includes the label columns, hence the
# drop afterwards.
X_zeroes = dataset_full_diff.reindex(columns=all_probes, fill_value=0)
X_zeroes = X_zeroes.drop(label_columns, axis=1).to_numpy().astype(np.float32)

# np.savez writes an .npz archive; the '.npy' name is kept for existing loaders.
DifferentialCancerPath = Path(THYROID_PATH,'DifferentialCancerData.npy')
with open(DifferentialCancerPath, 'wb') as f:
    np.savez(f, X = X, y = y, X_nn = X_zeroes, feature_names = feature_names)

# Differential Methylation - cvPTC vs fvPTC

In [7]:
# Keep only tumour samples (classic and follicular variants) of the
# differential-methylation dataset for the cvPTC-vs-fvPTC task.
is_tumour = dataset_full_diff['type'] != 'normal'
dataset_full_diff_subtype = dataset_full_diff[is_tumour]

# Label columns are targets, not probes.
label_columns = ['cancer', 'follicolar', 'type']

X = dataset_full_diff_subtype.drop(label_columns, axis=1).to_numpy().astype(np.float32)
# Target: 1 for the follicular variant, 0 for the classic variant.
y = dataset_full_diff_subtype['follicolar'].to_numpy().astype(np.float32)
feature_names = np.array(diff_cpg_index)

# Expand to the full probe set. reindex puts NaN in newly created columns
# unless told otherwise, so fill_value=0 is required to obtain a zero-filled
# input for the neural net. all_probes includes the label columns, hence the
# drop afterwards.
X_zeroes = dataset_full_diff_subtype.reindex(columns=all_probes, fill_value=0)
X_zeroes = X_zeroes.drop(label_columns, axis=1).to_numpy().astype(np.float32)

# np.savez writes an .npz archive; the '.npy' name is kept for existing loaders.
DifferentialSubtypePath = Path(THYROID_PATH,'DifferentialSubtypeData.npy')
with open(DifferentialSubtypePath, 'wb') as f:
    np.savez(f, X = X, y=y, X_nn = X_zeroes,feature_names=feature_names)