In [None]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
#Sklearn imports
from sklearn.metrics import make_scorer, recall_score, precision_score, roc_auc_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold
from sklearn.dummy import DummyClassifier
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import tensorflow as tf
from tensorflow import keras
from scikeras.wrappers import KerasClassifier

from lightgbm import LGBMClassifier

#Add the parent directory to access ENV variables
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

#Import of necessary paths ( GDC data Path and Dataset folder)
from config import THYROID_PATH, MODEL_PATH, RESULTS_PATH

In [None]:
# Scoring metrics handed to `cross_validate`. Per-class recall / precision / F1
# are obtained by restricting the metric to a single label.
scoring = {
    'acc': 'accuracy',
    # Use the built-in scorer, which ranks samples with decision_function /
    # predict_proba. make_scorer(roc_auc_score) would instead be fed the hard
    # labels returned by predict(), collapsing the ROC curve to a single
    # operating point and misreporting the AUC.
    'roc': 'roc_auc',
    'recall0': make_scorer(recall_score, average=None, labels=[0]),
    'recall1': make_scorer(recall_score, average=None, labels=[1]),
    'precision0': make_scorer(precision_score, average=None, labels=[0], zero_division=0),
    'precision1': make_scorer(precision_score, average=None, labels=[1], zero_division=0),
    'f0': make_scorer(f1_score, average=None, labels=[0]),
    'f1': make_scorer(f1_score, average=None, labels=[1]),
}

# Missing values are replaced by a constant (fill_value is left at its default).
imputer = SimpleImputer(
    strategy='constant',
)

# 5-fold stratified cross-validation, repeated 5 times, with a fixed seed for the splits.
cv = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=5,
    random_state=2024,
)

def save_report(report,folder,name,experiment="5fold_Repeated"):
    """Write a cross-validation report to `<folder>/<name>_<experiment>.csv`.

    Parameters
    ----------
    report : mapping
        Anything `pd.DataFrame` accepts (e.g. the dict returned by
        `cross_validate`). It is transposed before saving, so each key of
        the mapping becomes one row of the CSV.
    folder : str or path-like
        Destination directory. It is created, including missing parents,
        if it does not exist yet.
    name : str
        Model name, used as the first part of the file name.
    experiment : str, default "5fold_Repeated"
        Experiment tag, used as the second part of the file name.
    """
    # Create the destination up front so a missing results folder does not
    # throw away a finished (and possibly expensive) cross-validation run.
    os.makedirs(folder, exist_ok=True)
    df = pd.DataFrame(report).transpose()
    savepath = os.path.join(folder,f'{name}_{experiment}.csv')
    df.to_csv(savepath)

In [None]:
# Seed for the stochastic baselines; same value as the CV splitter's
# random_state so that a full re-run reproduces the saved reports.
MODEL_SEED = 2024

# Registry of the classical baselines, keyed by the name used in the report files.
models = {}
models['Dummy_prior'] = DummyClassifier(strategy="prior")
# The 'stratified' strategy draws random predictions, so it needs a seed.
models['Dummy_prob']= DummyClassifier(strategy="stratified", random_state=MODEL_SEED)
# The saga solver shuffles the data, so it is seeded as well.
models['Logistic_elastic'] = LogisticRegression(penalty='elasticnet',solver='saga',class_weight='balanced', C=0.02, max_iter=200,l1_ratio=0.7, random_state=MODEL_SEED)
models['QDA'] = QuadraticDiscriminantAnalysis()
models['SVC'] = SVC(C=0.2,class_weight='balanced')
models['RF'] = RandomForestClassifier(50, max_depth=3,max_features='log2', random_state=MODEL_SEED)

def get_uncompiled_model(reset_last_layer=False):
    """Load the saved 'pan-cancer-solid-only' Keras model from MODEL_PATH.

    Parameters
    ----------
    reset_last_layer : bool, default False
        When truthy, the loaded model's final layer is bypassed: a new
        single-unit sigmoid Dense layer named 'output_follicolar' is attached
        to the output of the second-to-last layer, and a new Model is built
        from the original input to that new output.

    Returns
    -------
    The loaded model, or the re-headed model when `reset_last_layer` is truthy.
    """
    pretrained = keras.models.load_model(os.path.join(MODEL_PATH, 'pan-cancer-solid-only'))
    if not reset_last_layer:
        return pretrained

    penultimate_output = pretrained.layers[-2].output
    new_head = keras.layers.Dense(1, activation='sigmoid', name='output_follicolar')(penultimate_output)
    return keras.models.Model(inputs=pretrained.input, outputs=[new_head])

def _default_metrics():
    """Build the default list of Keras metrics tracked during training."""
    return [
        keras.metrics.TruePositives(name='tp'),
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.TrueNegatives(name='tn'),
        keras.metrics.FalseNegatives(name='fn'),
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc'),
        keras.metrics.AUC(name='prc', curve='PR'),  # area under the precision-recall curve
    ]


def get_compiled_model(metrics=None,reset_last_layer=False):
    """Return the model from `get_uncompiled_model`, compiled for binary classification.

    Parameters
    ----------
    metrics : list or None, default None
        Keras metrics to track. When None, a default set is used: confusion
        matrix counts, binary accuracy, precision, recall, ROC AUC ('auc')
        and PR AUC ('prc').
    reset_last_layer : bool, default False
        Forwarded unchanged to `get_uncompiled_model`.

    Returns
    -------
    The model compiled with binary cross-entropy loss and Adam (learning rate 3e-5).
    """
    if metrics is None:
        metrics = _default_metrics()

    network = get_uncompiled_model(reset_last_layer)
    network.compile(
        loss='binary_crossentropy',
        optimizer=keras.optimizers.Adam(3e-5),
        metrics=metrics,
    )
    return network

# Early stopping on the 'prc' metric (higher is better): training stops after
# 10 epochs without improvement and the best weights seen are restored.
# NOTE(review): validation_split is commented out in `kwargs` below, so 'prc'
# appears to be the training-set metric rather than a validation one — confirm.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor='prc',
    mode='max',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

# Constructor arguments shared by every KerasClassifier built in this notebook.
# 'fit__class_weight' is a placeholder that each dataset section overwrites
# with its own class weights before instantiating the classifier.
kwargs = {
    'model': get_compiled_model,
    'epochs': 40,
    'verbose': True,
    'batch_size': 8,
    'callbacks': [early_stopping_cb],
    'shuffle': True,
    # validation_split=0.2 is currently disabled
    'fit__class_weight': None,
}

# Unfiltered Cancer (Cancer vs Normal)

In [None]:
UnfilteredCancerPath = Path(THYROID_PATH,'UnfilteredCancerData.npy')
# SECURITY NOTE: allow_pickle=True makes np.load unpickle object arrays, which
# is unsafe on untrusted files. Only load data produced by this project.
npzfiles = np.load(UnfilteredCancerPath,allow_pickle=True)

X = npzfiles['X']
y = npzfiles['y']

output_folder = Path(RESULTS_PATH,'UnfilteredCancer/')
# parents=True also creates RESULTS_PATH itself when it does not exist yet;
# without it mkdir raises FileNotFoundError on a fresh checkout.
output_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Inverse-frequency class weights, scaled so that weight * class_count == total / 2.
total = len(y)
pos = np.sum(y)  # count of class 1 — assumes y holds 0/1 labels (TODO confirm)
neg = total - pos

half_total = total / 2.0
weight_for_0 = (1 / neg) * half_total
weight_for_1 = (1 / pos) * half_total
class_weight = {0: weight_for_0, 1: weight_for_1}

# Store the weights in the shared kwargs, then build the network wrapper from them.
kwargs['fit__class_weight'] = class_weight
models['NeuralNetwork'] = KerasClassifier(**kwargs)

In [None]:
# Cross-validate every registered model on the current dataset and save one CSV per model.
for name, clf in models.items():
    # QDA is preceded by an L1-logistic feature selector; all other models
    # receive the imputed features unchanged.
    selector = (
        SelectFromModel(LogisticRegression(penalty='l1', solver='saga', class_weight='balanced'))
        if name == 'QDA'
        else 'passthrough'
    )

    pipe = Pipeline(steps=[
        ('imputation', imputer),
        ('selector', selector),
        ('classifier', clf),
    ])
    print(name)
    report = cross_validate(pipe, X, y, cv=cv, scoring=scoring)
    save_report(report, output_folder, name)


# Unfiltered Subtype (FvPTC vs CvPTC)

In [None]:
UnfilteredSubtypePath = Path(THYROID_PATH,'UnfilteredSubtypeData.npy')
# SECURITY NOTE: allow_pickle=True makes np.load unpickle object arrays, which
# is unsafe on untrusted files. Only load data produced by this project.
npzfiles = np.load(UnfilteredSubtypePath,allow_pickle=True)

X = npzfiles['X']
y = npzfiles['y']

output_folder = Path(RESULTS_PATH,'UnfilteredSubtype/')
# parents=True also creates RESULTS_PATH itself when it does not exist yet;
# without it mkdir raises FileNotFoundError on a fresh checkout.
output_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Inverse-frequency class weights, scaled so that weight * class_count == total / 2.
total = len(y)
pos = np.sum(y)  # count of class 1 — assumes y holds 0/1 labels (TODO confirm)
neg = total - pos

half_total = total / 2.0
weight_for_0 = (1 / neg) * half_total
weight_for_1 = (1 / pos) * half_total
class_weight = {0: weight_for_0, 1: weight_for_1}

# Store the weights in the shared kwargs, then build the network wrapper.
# reset_last_layer=True is passed so the model is built with a fresh output layer.
kwargs['fit__class_weight'] = class_weight
models['NeuralNetwork'] = KerasClassifier(**kwargs, reset_last_layer=True)

In [None]:
# Cross-validate every registered model on the current dataset and save one CSV per model.
for name, clf in models.items():
    # QDA is preceded by an L1-logistic feature selector; all other models
    # receive the imputed features unchanged.
    selector = (
        SelectFromModel(LogisticRegression(penalty='l1', solver='saga', class_weight='balanced'))
        if name == 'QDA'
        else 'passthrough'
    )

    pipe = Pipeline(steps=[
        ('imputation', imputer),
        ('selector', selector),
        ('classifier', clf),
    ])
    print(name)
    report = cross_validate(pipe, X, y, cv=cv, scoring=scoring)
    save_report(report, output_folder, name)


# Filtered Cancer (Cancer vs Normal)

In [None]:
FilteredCancerPath = Path(THYROID_PATH,'FilteredCancerData.npy')
# SECURITY NOTE: allow_pickle=True makes np.load unpickle object arrays, which
# is unsafe on untrusted files. Only load data produced by this project.
npzfiles = np.load(FilteredCancerPath,allow_pickle=True)

X = npzfiles['X']
y = npzfiles['y']
X_nn = npzfiles['X_nn'] #Zero-padded dataset for Neural Net Dimensionality

output_folder = Path(RESULTS_PATH,'FilteredCancer/')
# parents=True also creates RESULTS_PATH itself when it does not exist yet;
# without it mkdir raises FileNotFoundError on a fresh checkout.
output_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Inverse-frequency class weights, scaled so that weight * class_count == total / 2.
total = len(y)
pos = np.sum(y)  # count of class 1 — assumes y holds 0/1 labels (TODO confirm)
neg = total - pos

half_total = total / 2.0
weight_for_0 = (1 / neg) * half_total
weight_for_1 = (1 / pos) * half_total
class_weight = {0: weight_for_0, 1: weight_for_1}

# Store the weights in the shared kwargs, then build the network wrapper from them.
kwargs['fit__class_weight'] = class_weight
models['NeuralNetwork'] = KerasClassifier(**kwargs)

In [None]:
# Cross-validate every registered model on the current dataset and save one CSV per model.
for name, clf in models.items():
    # QDA is preceded by an L1-logistic feature selector; all other models
    # receive the imputed features unchanged.
    selector = (
        SelectFromModel(LogisticRegression(penalty='l1', solver='saga', class_weight='balanced'))
        if name == 'QDA'
        else 'passthrough'
    )

    pipe = Pipeline(steps=[
        ('imputation', imputer),
        ('selector', selector),
        ('classifier', clf),
    ])
    print(name)

    # The neural network is fed the zero-padded matrix; every other model uses X.
    features = X_nn if name == 'NeuralNetwork' else X
    report = cross_validate(pipe, features, y, cv=cv, scoring=scoring)
    save_report(report, output_folder, name)

# Filtered Subtype (FvPTC vs CvPTC)

In [None]:
FilteredSubtypePath = Path(THYROID_PATH,'FilteredSubtypeData.npy')
# SECURITY NOTE: allow_pickle=True makes np.load unpickle object arrays, which
# is unsafe on untrusted files. Only load data produced by this project.
npzfiles = np.load(FilteredSubtypePath,allow_pickle=True)

X = npzfiles['X']
y = npzfiles['y']
X_nn = npzfiles['X_nn'] #Zero-padded dataset for Neural Net Dimensionality

output_folder = Path(RESULTS_PATH,'FilteredSubtype/')
# parents=True also creates RESULTS_PATH itself when it does not exist yet;
# without it mkdir raises FileNotFoundError on a fresh checkout.
output_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Inverse-frequency class weights, scaled so that weight * class_count == total / 2.
total = len(y)
pos = np.sum(y)  # count of class 1 — assumes y holds 0/1 labels (TODO confirm)
neg = total - pos

half_total = total / 2.0
weight_for_0 = (1 / neg) * half_total
weight_for_1 = (1 / pos) * half_total
class_weight = {0: weight_for_0, 1: weight_for_1}

# Store the weights in the shared kwargs, then build the network wrapper.
# reset_last_layer=True is passed so the model is built with a fresh output layer.
kwargs['fit__class_weight'] = class_weight
models['NeuralNetwork'] = KerasClassifier(**kwargs, reset_last_layer=True)

In [None]:
# Cross-validate every registered model on the current dataset and save one CSV per model.
for name, clf in models.items():
    # QDA is preceded by an L1-logistic feature selector; all other models
    # receive the imputed features unchanged.
    selector = (
        SelectFromModel(LogisticRegression(penalty='l1', solver='saga', class_weight='balanced'))
        if name == 'QDA'
        else 'passthrough'
    )

    pipe = Pipeline(steps=[
        ('imputation', imputer),
        ('selector', selector),
        ('classifier', clf),
    ])
    print(name)

    # The neural network is fed the zero-padded matrix; every other model uses X.
    features = X_nn if name == 'NeuralNetwork' else X
    report = cross_validate(pipe, features, y, cv=cv, scoring=scoring)
    save_report(report, output_folder, name)

# Differential Methylation Cancer (Cancer vs Normal)

In [None]:
DifferentialCancerPath = Path(THYROID_PATH,'DifferentialCancerData.npy')
# SECURITY NOTE: allow_pickle=True makes np.load unpickle object arrays, which
# is unsafe on untrusted files. Only load data produced by this project.
npzfiles = np.load(DifferentialCancerPath,allow_pickle=True)

X = npzfiles['X']
y = npzfiles['y']
X_nn = npzfiles['X_nn'] #Zero-padded dataset for Neural Net Dimensionality

output_folder = Path(RESULTS_PATH,'DifferentialCancer/')
# parents=True also creates RESULTS_PATH itself when it does not exist yet;
# without it mkdir raises FileNotFoundError on a fresh checkout.
output_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Inverse-frequency class weights, scaled so that weight * class_count == total / 2.
total = len(y)
pos = np.sum(y)  # count of class 1 — assumes y holds 0/1 labels (TODO confirm)
neg = total - pos

half_total = total / 2.0
weight_for_0 = (1 / neg) * half_total
weight_for_1 = (1 / pos) * half_total
class_weight = {0: weight_for_0, 1: weight_for_1}

# Store the weights in the shared kwargs, then build the network wrapper from them.
kwargs['fit__class_weight'] = class_weight
models['NeuralNetwork'] = KerasClassifier(**kwargs)


In [None]:
# Evaluate every registered model plus LightGBM on the current dataset.
#
# The previous version started with `models = {}`, which discarded all the
# baselines and the NeuralNetwork built in the preceding cell: only LGBM was
# evaluated here, and the following section silently lost the baselines too.
# LightGBM is now added through a local copy, so the shared `models` registry
# is left untouched and no cleanup `del` is needed.
models_with_lgbm = {**models, 'LGBM': LGBMClassifier(n_jobs=-1)}

for name,clf in models_with_lgbm.items():
    if name == 'QDA':
        # QDA is preceded by an L1-logistic feature selector.
        lr = LogisticRegression(penalty='l1',solver='saga',class_weight='balanced')
        selector = SelectFromModel(lr)
    else:
        selector = 'passthrough'

    pipe = Pipeline(steps=[
    ('imputation',imputer),
    ('selector',selector),
    ('classifier', clf)])
    print(name)

    # The neural network is fed the zero-padded matrix; every other model uses X.
    if(name == 'NeuralNetwork'):
        report = cross_validate(pipe,X_nn,y, cv=cv, scoring=scoring)
    else:
        report = cross_validate(pipe,X,y, cv=cv, scoring=scoring)
    save_report(report,output_folder,name)

# Differential Methylation Subtype (FvPTC vs CvPTC)

In [None]:
DifferentialSubtypePath = Path(THYROID_PATH,'DifferentialSubtypeData.npy')
# SECURITY NOTE: allow_pickle=True makes np.load unpickle object arrays, which
# is unsafe on untrusted files. Only load data produced by this project.
npzfiles = np.load(DifferentialSubtypePath,allow_pickle=True)

X = npzfiles['X']
y = npzfiles['y']
X_nn = npzfiles['X_nn'] #Zero-padded dataset for Neural Net Dimensionality

output_folder = Path(RESULTS_PATH,'DifferentialSubtype/')
# parents=True also creates RESULTS_PATH itself when it does not exist yet;
# without it mkdir raises FileNotFoundError on a fresh checkout.
output_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Inverse-frequency class weights, scaled so that weight * class_count == total / 2.
total = len(y)
pos = np.sum(y)  # count of class 1 — assumes y holds 0/1 labels (TODO confirm)
neg = total - pos

half_total = total / 2.0
weight_for_0 = (1 / neg) * half_total
weight_for_1 = (1 / pos) * half_total
class_weight = {0: weight_for_0, 1: weight_for_1}

# Store the weights in the shared kwargs, then build the network wrapper.
# reset_last_layer=True is passed so the model is built with a fresh output layer.
kwargs['fit__class_weight'] = class_weight
models['NeuralNetwork'] = KerasClassifier(**kwargs, reset_last_layer=True)


In [None]:
# LightGBM is registered only for the duration of this evaluation and removed afterwards.
models['LGBM'] = LGBMClassifier(n_jobs=-1)

for name, clf in models.items():
    # QDA is preceded by an L1-logistic feature selector; all other models
    # receive the imputed features unchanged.
    selector = (
        SelectFromModel(LogisticRegression(penalty='l1', solver='saga', class_weight='balanced'))
        if name == 'QDA'
        else 'passthrough'
    )

    pipe = Pipeline(steps=[
        ('imputation', imputer),
        ('selector', selector),
        ('classifier', clf),
    ])
    print(name)

    # The neural network is fed the zero-padded matrix; every other model uses X.
    features = X_nn if name == 'NeuralNetwork' else X
    report = cross_validate(pipe, features, y, cv=cv, scoring=scoring)
    save_report(report, output_folder, name)

del models['LGBM']