# hiPCA development notebook

Libraries needed

In [154]:
import pandas as pd
import numpy as np
import json
import pickle
from scipy.stats import kstest
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.stats import chi2, norm
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.ensemble import RandomForestClassifier
import statistics

import pandas as pd
import pickle
import joblib
import numpy as np
import json

import warnings
warnings.filterwarnings("ignore")

Function to perform the Kolmogorov-Smirnov test and select important features

In [155]:
def ks_test(df, healthy, non_healthy, method_ks = 'asymp', p_val = 0.001):
    healthy_df = df[[x for x in df.columns if x in healthy]].T
    nonhealthy_df = df[[x for x in df.columns if x in non_healthy]].T
    healthy_features = []
    nonhealthy_features = []
    for feature in list(df.index):
        if kstest(list(healthy_df[feature]), list(nonhealthy_df[feature]), alternative = 'less', method = method_ks).pvalue <= p_val:
            healthy_features.append(feature)
        if kstest(list(nonhealthy_df[feature]), list(healthy_df[feature]), alternative = 'less', method = method_ks).pvalue <= p_val:
            nonhealthy_features.append(feature)
    
    return healthy_features, nonhealthy_features



Function to transform the data according to the original paper

In [156]:
def custom_transform(x):
    if x <= 1:
        return np.log2(2 * x + 0.00001)
    else:
        return np.sqrt(x)

def transform_data(df, features):
    scaler = StandardScaler()
    aux = pd.DataFrame()
    for item in list(set(features)):
        if item in df.index:
            aux[item] = list(df.T[item])
        else:
            aux[item] = [0 for x in range(len(df.T))]
    selected = aux.applymap(custom_transform)

    scaler.fit(selected)
    with open(f'model_data/scaler_{model_name}.pkl', 'wb') as file:
        pickle.dump(scaler, file)
    selected2 = scaler.transform(selected)
    
    pd.DataFrame(zip(selected.columns, scaler.mean_, scaler.scale_), columns = ['specie', 'mean', 'std']).to_csv(f'model_data/scaling_parameters_{model_name}.csv', index = False)

    selected2 = pd.DataFrame(selected2, columns = selected.columns)
    selected2.index = df.T.index

    return selected2




Function to perform PCA analysis

In [157]:
def calculate_pca_stats(df, variance_for_pc = 0.9, alpha = 0.05):
    pca = PCA()

    pca.fit(df)

    with open(f'model_data/pca_model_{model_name}.pkl', 'wb') as file:
        pickle.dump(pca, file)

    eigenvalues = pca.explained_variance_
    eigenvectors = pca.components_
    singular = pca.singular_values_
    
    pca_data = pd.DataFrame(zip(eigenvectors, eigenvalues, singular), columns = ('Eigenvectors', 'Explained_variance', 'Singular_values')).sort_values('Explained_variance', ascending = False)
    pca_data['%variance'] = pca_data['Explained_variance'] / sum(pca_data['Explained_variance'])
    pca_data = pca_data.sort_values('%variance', ascending = False)
    pca_data['%variance_cumulative'] = pca_data['%variance'].cumsum()
    
    principal_components = list(pca_data[pca_data['%variance_cumulative'] < variance_for_pc]['Eigenvectors'])
    print(f'# Principal Components selected: {len(principal_components)}')
    
    principal_values = list(pca_data[pca_data['%variance_cumulative'] < variance_for_pc]['Explained_variance'])
    D = np.array(principal_components).T @ np.linalg.inv(np.diag(principal_values)) @ np.array(principal_components)
    deg_free = len(principal_components) 
    # alpha = 0.05
    t2_threshold = chi2.ppf(1-alpha, deg_free)
#     print(1-alpha, deg_free)
#     print(t2_threshold)
    
    principal_components_residual = list(pca_data[pca_data['%variance_cumulative'] >= variance_for_pc]['Eigenvectors'])
    principal_values_residual = list(pca_data[pca_data['%variance_cumulative'] >= variance_for_pc]['Explained_variance'])
    principal_singvalues_residual = list(pca_data[pca_data['%variance_cumulative'] >= variance_for_pc]['Singular_values'])
    
    C = np.array(principal_components_residual).T @ np.array(principal_components_residual)
    Theta1 = sum(principal_values_residual)
    Theta2 = sum([x**2 for x in principal_values_residual])
    Theta3 = sum([x**3 for x in principal_values_residual])
    
    c_alpha = norm.ppf(1-alpha)
    
    h0 = 1-((2*Theta1*Theta3)/(3*(Theta2**2)))

    '''INTENTO'''
    Q_alpha = Theta1*(((((c_alpha*np.sqrt(2*Theta2*(h0**2)))/Theta1)+1+((Theta2*h0*(h0-1))/(Theta1**2))))**(1/h0))
    # print(Q_alpha)
    Q_alpha = Theta1*(((((np.sqrt(c_alpha*(2*Theta2*(h0**2))))/Theta1)+1-((Theta2*h0*(h0-1))/(Theta1**2))))**(1/h0))
    # print(Q_alpha)
    Q_alpha = (Theta2/Theta1) * chi2.ppf(alpha, len(principal_components_residual)) * ((Theta1**2)/Theta2)
    # print(Q_alpha)
    
    # Q_alpha = Theta1*(((((c_alpha*np.sqrt(2*Theta2*(h0**2)))/Theta1)+1+((Theta2*h0*(h0-1))/(Theta1**2))))**(1/h0))
    
    #fi = D/t2_threshold + (np.eye(len(principal_components[0])) - (np.array(principal_components).T @ np.array(principal_components)))/Q_alpha
    fi = D/t2_threshold  + C/Q_alpha
    g = ((len(principal_components) / t2_threshold**2) + (Theta2 / Q_alpha**2)) / ((len(principal_components)/t2_threshold) + (Theta1 / Q_alpha))
    h = ((len(principal_components)/t2_threshold) + (Theta1 / Q_alpha))**2 / ((len(principal_components) / t2_threshold**2) + (Theta2 / Q_alpha**2))

    chi_value = chi2.ppf(1-alpha, h)
    threshold_combined = g*chi_value
    
    return pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined




Main function to train hiPCA index

In [158]:
def hiPCA(df, healthy, non_healthy, features = [], ks = False, method = 'auto', p_val = 0.001, only_nonhealthy_features = False):
    if ks:
        healthy_features, non_healthy_features = ks_test(df, healthy, non_healthy, method_ks = method, p_val = p_val)


    if only_nonhealthy_features:
        print(f'# Unhealthy features selected by KS: {len(non_healthy_features)}')
        healthy_features = []
        if ks:
            features = healthy_features + non_healthy_features
        selected = transform_data(df[[x for x in healthy if x in df.columns]], features)
        
    else:
        if ks:
            features = healthy_features + non_healthy_features
        print(f'# Healthy features selected by KS: {len(healthy_features)}')
        print(f'# Unhealthy features selected by KS: {len(non_healthy_features)}')
        selected = transform_data(df[[x for x in healthy if x in df.columns]], features)
    
    # print(selected)
    pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined = calculate_pca_stats(selected)
    np.save(f'model_data/D_matrix_{model_name}.npy', D)
    np.save(f'model_data/C_matrix_{model_name}.npy', C)
    np.save(f'model_data/fi_matrix_{model_name}.npy', fi)

    thresholds = {'t2':t2_threshold, 'c':Q_alpha, 'combined':threshold_combined}

    with open(f'model_data/thresholds_{model_name}.json', 'w') as json_file:
        json.dump(thresholds, json_file)

    # print(t2_threshold, Q_alpha, threshold_combined)

        
    return features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected



In [160]:
model_name = 'camda_fold1'
taxonomy = pd.read_csv('../../DataSets/CAMDA/taxonomy.txt', sep = '\t', index_col = 0)
metadata = pd.read_csv('../../DataSets/CAMDA/metadata.csv')

In [161]:
good_samples = []
for c in taxonomy.columns:
    if sum(taxonomy[c]) > 90:
        good_samples.append(c)
taxonomy_aux = taxonomy[good_samples]

obese = list(metadata[metadata['Diagnosis'] == 'Obese']['SampleID'])
healthy = list(metadata[metadata['Diagnosis'] == 'Healthy']['SampleID'])
non_healthy = list(metadata[metadata['Diagnosis'] != 'Healthy']['SampleID'])
non_healthy = [x for x in non_healthy if x not in obese]

taxonomy_not_obese = taxonomy_aux[[x for x in taxonomy_aux.columns if x not in obese]]


# features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected = hiPCA(taxonomy_not_obese, healthy, non_healthy, ks = True, method = 'asymp', only_nonhealthy_features = True)


In [162]:
len(taxonomy.T)

613

In [163]:
labels = []
for sample in taxonomy.T.index:
    labels.append(metadata[metadata['SampleID'] == sample]['Diagnosis'].iloc[0])

In [164]:
labels = pd.DataFrame(labels)

In [165]:
taxonomy.T

Species,Butyrivibrio_crossotus,GGB3614_SGB4886,GGB1630_SGB2238,Paraprevotella_clara,Ruminococcus_bromii,GGB1380_SGB1883,Phocaeicola_vulgatus,GGB1146_SGB1472,Phocaeicola_dorei,GGB6608_SGB9342,...,GGB28369_SGB40949,GGB4250_SGB5751,GGB45600_SGB15302,Clostridium_porci,Blautia_liquoris,Clostridium_cuniculi,Mediterraneibacter_catenae,GGB27782_SGB40196,Leclercia_adecarboxylata,Enterobacter_kobei
SRR5946989,0.0,0.00000,0.0,0.00000,0.00000,0.00000,6.65468,0.0,0.00000,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR5983265,0.0,0.00000,0.0,0.00000,0.48611,0.00000,0.23675,0.0,7.44563,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR5946777,0.0,0.00000,0.0,3.48134,0.12196,0.00000,7.53330,0.0,0.33739,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR5946822,0.0,0.00000,0.0,0.00000,0.00000,0.00000,26.48135,0.0,0.00000,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR5946857,0.0,0.00000,0.0,0.00000,5.68323,0.00000,20.94219,0.0,0.00000,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR5946648,0.0,0.00000,0.0,0.00000,0.00000,0.00000,52.33682,0.0,15.11590,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR5946925,0.0,0.00000,0.0,0.00000,0.00000,0.00000,29.45264,0.0,0.00175,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERR209694,0.0,0.00000,0.0,0.30373,7.09285,0.07645,0.79036,0.0,0.31640,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR5946668,0.0,0.00000,0.0,0.00000,0.00000,0.00000,0.00000,0.0,0.00000,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Function to transform input data

In [166]:
def transform_data_calculate(df):
    scaling_data = pd.read_csv(f'{path}/scaling_parameters_{model_name}.csv')
    features = list(scaling_data['specie'])
    # print(df.index)
    with open(f'{path}/scaler_{model_name}.pkl', 'rb') as file:
        scaler = pickle.load(file)
    # print(features)
    # scaler = StandardScaler()
    aux = pd.DataFrame()
    for item in list(set(features)):
        if item in df.index:
            aux[item] = list(df.T[item])
        else:
            aux[item] = [0 for x in range(len(df.T))]
    selected = aux.applymap(custom_transform)

    # scaler.fit(np.array(selected))
    selected = selected[features]
    selected2 = scaler.transform(selected)
    selected2 = pd.DataFrame(selected2, columns = selected.columns)
    selected2.index = df.T.index
    
    # pd.DataFrame(zip(selected.columns, scaler.mean_, scaler.scale_), columns = ['specie', 'mean', 'std']).to_csv('scaling_parameters.csv', index = False)


    # selected.index = df.T.index

    return selected2

def calculate_index(data_transformed):
    D = np.load(f'{path}/D_matrix_{model_name}.npy')
    C = np.load(f'{path}/C_matrix_{model_name}.npy')
    fi = np.load(f'{path}/fi_matrix_{model_name}.npy')
    pca = joblib.load(f'{path}/pca_model_{model_name}.pkl')


    with open(f'{path}/thresholds_{model_name}.json', 'r') as file:
        thresholds = json.load(file)
        t2_threshold = thresholds['t2']
        Q_alpha = thresholds['c']
        threshold_combined = thresholds['combined']

    T2, Q, combined = [], [], []
    pred_t2, pred_Q, pred_combined = [], [], []

    try:
        for item in pca.transform(data_transformed):
            index = item.T @ D @ item
            index2 = item.T @ C @ item
            index3 = item.T @ fi @ item
            T2.append(index)
            Q.append(index2)
            combined.append(index3)
            if index > t2_threshold:
                pred_t2.append('Unhealthy')
            else:
                pred_t2.append('Healthy')

            if index2 > Q_alpha:
                pred_Q.append('Unhealthy')
            else:
                pred_Q.append('Healthy')

            if index3 > threshold_combined:
                pred_combined.append('Unhealthy')
            else:
                pred_combined.append('Healthy') 
    except:
        for item in np.array(data_transformed):
            index = item.T @ D @ item
            index2 = item.T @ C @ item
            index3 = item.T @ fi @ item
            T2.append(index)
            Q.append(index2)
            combined.append(index3)
            if index > t2_threshold:
                pred_t2.append('Unhealthy')
            else:
                pred_t2.append('Healthy')

            if index2 > Q_alpha:
                pred_Q.append('Unhealthy')
            else:
                pred_Q.append('Healthy')

            if index3 > threshold_combined:
                pred_combined.append('Unhealthy')
            else:
                pred_combined.append('Healthy') 

    return pd.DataFrame(zip(data_transformed.index, T2, pred_t2, Q, pred_Q, combined, pred_combined), columns = ['SampleID', 'T2', 'Prediction T2', 'Q', 'Prediction Q', 'Combined Index', 'Combined Prediction'])

def calculate_hiPCA(model, path_, data):
    global model_name, path
    model_name = model
    path = path_
    data = transform_data(data)
    results = calculate_index(data)
    return results

## Experiments and evaluations

In [167]:
path = 'model_data'

In [168]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)

t2_evaluations = []
q_evaluations = []
combined_evaluations = []
# Loop over each fold
for fold, (train_index, test_index) in enumerate(skf.split(taxonomy.T, labels)):
    model_name = f'camda_fold{fold+1}'
    X_train, X_test = taxonomy.T.iloc[train_index], taxonomy.T.iloc[test_index]
    y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

    healthy_sub = [x for x in X_train.index if x in healthy]
    nonhealthy_sub = [x for x in X_train.index if x in non_healthy]
    
    print(f"Fold {fold + 1}")
    
    features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected = hiPCA(X_train.T, healthy_sub, nonhealthy_sub, ks = True, method = 'asymp', only_nonhealthy_features = True)

    data_transformed = transform_data_calculate(X_test.T)
    results = calculate_index(data_transformed)

    # print(list(y_test))
    # print(list(results['Prediction T2']))
    true = ['Healthy' if x == 'Healthy' else 'Unhealthy' for x in list(y_test[0])]
    # print(true)
    t2_score = balanced_accuracy_score(true, results['Prediction T2'])
    t2_evaluations.append(t2_score)
    print('Balanced Accuracy Scores:')
    print(f'T\u00b2 -> {t2_score}')
    q_score = balanced_accuracy_score(true, results['Prediction Q'])
    q_evaluations.append(q_score)
    print(f'Q -> {q_score}')
    combined = balanced_accuracy_score(true, results['Combined Prediction'])
    combined_evaluations.append(combined)
    print(f'Combined index -> {combined}')
    print('-------------------------------------------------------')


print('Final evaluation')
mean = statistics.mean(t2_evaluations)
stdev = statistics.stdev(t2_evaluations)
print(f'T\u00b2 mean performance {mean} with an standard deviation of {stdev}')

mean = statistics.mean(q_evaluations)
stdev = statistics.stdev(q_evaluations)
print(f'Q mean performance {mean} with an standard deviation of {stdev}')

mean = statistics.mean(combined_evaluations)
stdev = statistics.stdev(combined_evaluations)
print(f'Combined mean performance {mean} with an standard deviation of {stdev}')


Fold 1
# Unhealthy features selected by KS: 18
# Principal Components selected: 11
Balanced Accuracy Scores:
T² -> 0.8023809523809524
Q -> 0.7742063492063491
Combined index -> 0.7976190476190477
-------------------------------------------------------
Fold 2
# Unhealthy features selected by KS: 15
# Principal Components selected: 10
Balanced Accuracy Scores:
T² -> 0.736904761904762
Q -> 0.6535714285714286
Combined index -> 0.7797619047619048
-------------------------------------------------------
Fold 3
# Unhealthy features selected by KS: 23
# Principal Components selected: 14
Balanced Accuracy Scores:
T² -> 0.6880952380952381
Q -> 0.698015873015873
Combined index -> 0.6742063492063493
-------------------------------------------------------
Fold 4
# Unhealthy features selected by KS: 20
# Principal Components selected: 12
Balanced Accuracy Scores:
T² -> 0.7188172043010752
Q -> 0.6975806451612903
Combined index -> 0.6970430107526882
------------------------------------------------------

In [169]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)

t2_evaluations = []
q_evaluations = []
combined_evaluations = []
# Loop over each fold
for fold, (train_index, test_index) in enumerate(skf.split(taxonomy.T, labels)):
    model_name = f'camda_fold{fold+1}'
    X_train, X_test = taxonomy.T.iloc[train_index], taxonomy.T.iloc[test_index]
    y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

    healthy_sub = [x for x in X_train.index if x in healthy]
    nonhealthy_sub = [x for x in X_train.index if x in non_healthy]
    
    print(f"Fold {fold + 1}")
    
    features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected = hiPCA(X_train.T, healthy_sub, nonhealthy_sub, ks = True, method = 'asymp', only_nonhealthy_features = False)

    data_transformed = transform_data_calculate(X_test.T)
    results = calculate_index(data_transformed)

    # print(list(y_test))
    # print(list(results['Prediction T2']))
    true = ['Healthy' if x == 'Healthy' else 'Unhealthy' for x in list(y_test[0])]
    # print(true)
    t2_score = balanced_accuracy_score(true, results['Prediction T2'])
    t2_evaluations.append(t2_score)
    print('Balanced Accuracy Scores:')
    print(f'T\u00b2 -> {t2_score}')
    q_score = balanced_accuracy_score(true, results['Prediction Q'])
    q_evaluations.append(q_score)
    print(f'Q -> {q_score}')
    combined = balanced_accuracy_score(true, results['Combined Prediction'])
    combined_evaluations.append(combined)
    print(f'Combined index -> {combined}')
    print('-------------------------------------------------------')
          

print('Final evaluation')
mean = statistics.mean(t2_evaluations)
stdev = statistics.stdev(t2_evaluations)
print(f'T\u00b2 mean performance {mean} with an standard deviation of {stdev}')

mean = statistics.mean(q_evaluations)
stdev = statistics.stdev(q_evaluations)
print(f'Q mean performance {mean} with an standard deviation of {stdev}')

mean = statistics.mean(combined_evaluations)
stdev = statistics.stdev(combined_evaluations)
print(f'Combined mean performance {mean} with an standard deviation of {stdev}')

Fold 1
# Healthy features selected by KS: 159
# Unhealthy features selected by KS: 18
# Principal Components selected: 58
Balanced Accuracy Scores:
T² -> 0.7253968253968254
Q -> 0.5
Combined index -> 0.7257936507936508
-------------------------------------------------------
Fold 2
# Healthy features selected by KS: 148
# Unhealthy features selected by KS: 15
# Principal Components selected: 57
Balanced Accuracy Scores:
T² -> 0.6678571428571428
Q -> 0.5
Combined index -> 0.7178571428571429
-------------------------------------------------------
Fold 3
# Healthy features selected by KS: 156
# Unhealthy features selected by KS: 23
# Principal Components selected: 59
Balanced Accuracy Scores:
T² -> 0.6178571428571429
Q -> 0.5
Combined index -> 0.6932539682539682
-------------------------------------------------------
Fold 4
# Healthy features selected by KS: 154
# Unhealthy features selected by KS: 20
# Principal Components selected: 59
Balanced Accuracy Scores:
T² -> 0.7422043010752688
Q 

In [170]:
sum(t2_evaluations)/len(t2_evaluations)

0.6808243727598566

In [171]:
sum(q_evaluations)/len(q_evaluations)

0.5

In [172]:
sum(combined_evaluations)/len(combined_evaluations)

0.7178648233486943

In [173]:
pathways = pd.read_csv('../../DataSets/CAMDA/pathways.txt', sep = '\t', index_col = 0)

In [174]:
labels = []
for sample in pathways.T.index:
    labels.append(metadata[metadata['SampleID'] == sample]['Diagnosis'].iloc[0])

In [175]:
labels = pd.DataFrame(labels)

In [176]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)

t2_evaluations = []
q_evaluations = []
combined_evaluations = []
# Loop over each fold
for fold, (train_index, test_index) in enumerate(skf.split(pathways.T, labels)):
    model_name = f'camda_pathways_fold{fold+1}'
    X_train, X_test = pathways.T.iloc[train_index], pathways.T.iloc[test_index]
    y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

    healthy_sub = [x for x in X_train.index if x in healthy]
    nonhealthy_sub = [x for x in X_train.index if x in non_healthy]
    
    print(f"Fold {fold + 1}")
    
    features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected = hiPCA(X_train.T, healthy_sub, nonhealthy_sub, ks = True, method = 'asymp', only_nonhealthy_features = True)

    data_transformed = transform_data_calculate(X_test.T)
    results = calculate_index(data_transformed)

    # print(list(y_test))
    # print(list(results['Prediction T2']))
    true = ['Healthy' if x == 'Healthy' else 'Unhealthy' for x in list(y_test[0])]
    # print(true)
    t2_score = balanced_accuracy_score(true, results['Prediction T2'])
    t2_evaluations.append(t2_score)
    print('Balanced Accuracy Scores:')
    print(f'T\u00b2 -> {t2_score}')
    q_score = balanced_accuracy_score(true, results['Prediction Q'])
    q_evaluations.append(q_score)
    print(f'Q -> {q_score}')
    combined = balanced_accuracy_score(true, results['Combined Prediction'])
    combined_evaluations.append(combined)
    print(f'Combined index -> {combined}')
    print('-------------------------------------------------------')


print('Final evaluation')
mean = statistics.mean(t2_evaluations)
stdev = statistics.stdev(t2_evaluations)
print(f'T\u00b2 mean performance {mean} with an standard deviation of {stdev}')

mean = statistics.mean(q_evaluations)
stdev = statistics.stdev(q_evaluations)
print(f'Q mean performance {mean} with an standard deviation of {stdev}')

mean = statistics.mean(combined_evaluations)
stdev = statistics.stdev(combined_evaluations)
print(f'Combined mean performance {mean} with an standard deviation of {stdev}')
          

Fold 1
# Unhealthy features selected by KS: 676
# Principal Components selected: 9
Balanced Accuracy Scores:
T² -> 0.7603174603174603
Q -> 0.5833333333333334
Combined index -> 0.7603174603174603
-------------------------------------------------------
Fold 2
# Unhealthy features selected by KS: 635
# Principal Components selected: 9
Balanced Accuracy Scores:
T² -> 0.7503968253968254
Q -> 0.5583333333333333
Combined index -> 0.7503968253968254
-------------------------------------------------------
Fold 3
# Unhealthy features selected by KS: 695
# Principal Components selected: 9
Balanced Accuracy Scores:
T² -> 0.7019841269841269
Q -> 0.5166666666666667
Combined index -> 0.7019841269841269
-------------------------------------------------------
Fold 4
# Unhealthy features selected by KS: 600
# Principal Components selected: 8
Balanced Accuracy Scores:
T² -> 0.7268817204301075
Q -> 0.5583333333333333
Combined index -> 0.7268817204301075
----------------------------------------------------

In [177]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)

t2_evaluations = []
q_evaluations = []
combined_evaluations = []
# Loop over each fold
for fold, (train_index, test_index) in enumerate(skf.split(pathways.T, labels)):
    model_name = f'camda_pathways_fold{fold+1}'
    X_train, X_test = pathways.T.iloc[train_index], pathways.T.iloc[test_index]
    y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

    healthy_sub = [x for x in X_train.index if x in healthy]
    nonhealthy_sub = [x for x in X_train.index if x in non_healthy]
    
    print(f"Fold {fold + 1}")
    
    features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected = hiPCA(X_train.T, healthy_sub, nonhealthy_sub, ks = True, method = 'asymp', only_nonhealthy_features = False)

    data_transformed = transform_data_calculate(X_test.T)
    results = calculate_index(data_transformed)

    # print(list(y_test))
    # print(list(results['Prediction T2']))
    true = ['Healthy' if x == 'Healthy' else 'Unhealthy' for x in list(y_test[0])]
    # print(true)
    t2_score = balanced_accuracy_score(true, results['Prediction T2'])
    t2_evaluations.append(t2_score)
    print('Balanced Accuracy Scores:')
    print(f'T\u00b2 -> {t2_score}')
    q_score = balanced_accuracy_score(true, results['Prediction Q'])
    q_evaluations.append(q_score)
    print(f'Q -> {q_score}')
    combined = balanced_accuracy_score(true, results['Combined Prediction'])
    combined_evaluations.append(combined)
    print(f'Combined index -> {combined}')
    print('-------------------------------------------------------')


print('Final evaluation')
mean = statistics.mean(t2_evaluations)
stdev = statistics.stdev(t2_evaluations)
print(f'T\u00b2 mean performance {mean} with an standard deviation of {stdev}')

mean = statistics.mean(q_evaluations)
stdev = statistics.stdev(q_evaluations)
print(f'Q mean performance {mean} with an standard deviation of {stdev}')

mean = statistics.mean(combined_evaluations)
stdev = statistics.stdev(combined_evaluations)
print(f'Combined mean performance {mean} with an standard deviation of {stdev}')

Fold 1
# Healthy features selected by KS: 1190
# Unhealthy features selected by KS: 676
# Principal Components selected: 32
Balanced Accuracy Scores:
T² -> 0.7777777777777777
Q -> 0.5083333333333333
Combined index -> 0.7777777777777777
-------------------------------------------------------
Fold 2
# Healthy features selected by KS: 1190
# Unhealthy features selected by KS: 635
# Principal Components selected: 31
Balanced Accuracy Scores:
T² -> 0.8095238095238095
Q -> 0.5083333333333333
Combined index -> 0.8095238095238095
-------------------------------------------------------
Fold 3
# Healthy features selected by KS: 1251
# Unhealthy features selected by KS: 695
# Principal Components selected: 32
Balanced Accuracy Scores:
T² -> 0.7440476190476191
Q -> 0.5
Combined index -> 0.7440476190476191
-------------------------------------------------------
Fold 4
# Healthy features selected by KS: 1224
# Unhealthy features selected by KS: 600
# Principal Components selected: 30
Balanced Accura

In [184]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)

evaluations = []
# Loop over each fold
for fold, (train_index, test_index) in enumerate(skf.split(pathways.T, labels)):
    # model_name = f'camda_pathways_fold{fold+1}'
    X_train_pathways, X_test_pathways = pathways.T.iloc[train_index], pathways.T.iloc[test_index]
    X_train_tax, X_test_tax = taxonomy.T.iloc[train_index], taxonomy.T.iloc[test_index]

    # X_train_un = X_train_pathways[[x for x in X_train.columns if 'UNINTEGRATED' in x]]
    # X_train_in = X_train_pathways[[x for x in X_train.columns if 'UNINTEGRATED' not in x]]

    # X_test_un = X_test_pathways[[x for x in X_test_pathways.columns if 'UNINTEGRATED' in x]]
    # X_test_in = X_test_pathways[[x for x in X_test_pathways.columns if 'UNINTEGRATED' not in x]]
    
    y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

    healthy_sub = [x for x in X_train.index if x in healthy]
    nonhealthy_sub = [x for x in X_train.index if x in non_healthy]

    print(f"Fold {fold + 1}")
    model_name = f'camda_pathways_fold{fold+1}'
    features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected = hiPCA(X_train_pathways.T, healthy_sub, nonhealthy_sub, ks = True, method = 'asymp', only_nonhealthy_features = False)
    data_transformed = transform_data_calculate(X_test_pathways.T)
    results_pathways = calculate_index(data_transformed)

    data_transformed = transform_data_calculate(X_train_pathways.T)
    results_pathways_train = calculate_index(data_transformed)

    model_name = f'camda_tax_fold{fold+1}'
    features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected = hiPCA(X_train_tax.T, healthy_sub, nonhealthy_sub, ks = True, method = 'asymp', only_nonhealthy_features = True)
    data_transformed = transform_data_calculate(X_test_tax.T)
    results_tax = calculate_index(data_transformed)
    data_transformed = transform_data_calculate(X_train_tax.T)
    results_tax_train = calculate_index(data_transformed)

    t2_tax = list(results_tax_train['T2'])
    q_tax = list(results_tax_train['Q'])
    combined_tax = list(results_tax_train['Combined Index'])

    t2_path = list(results_pathways_train['T2'])
    q_path = list(results_pathways_train['Q'])
    combined_path = list(results_pathways_train['Combined Index'])
    
    t2_tax_test = list(results_tax['T2'])
    q_tax_test = list(results_tax['Q'])
    combined_tax_test = list(results_tax['Combined Index'])

    t2_path_test = list(results_pathways['T2'])
    q_path_test = list(results_pathways['Q'])
    combined_path_test = list(results_pathways['Combined Index'])

    

    # print(results_tax)

    # print(X_train_tax)
    
    

    new_data_train = pd.DataFrame(zip(t2_tax, q_tax, combined_tax, t2_path, q_path, combined_path), columns = ['T2 taxonomy', 'Q taxonomy', 'Combined taxonomy', 'T2 pathways', 'Q pathways', 'Combined pathways'])
    
    new_data_train.index = X_train_tax.index

    new_data_test = pd.DataFrame(zip(t2_tax_test, q_tax_test, combined_tax_test, t2_path_test, q_path_test, combined_path_test), columns = ['T2 taxonomy', 'Q taxonomy', 'Combined taxonomy', 'T2 pathways', 'Q pathways', 'Combined pathways'])
    new_data_test.index = X_test_tax.index
    # # print(new_data_test.head(3))

    clf = RandomForestClassifier(max_depth=5, min_samples_leaf = 3, random_state=0)
    # # clf = DecisionTreeClassifier(max_depth=5, min_samples_leaf = 3, random_state=0)
    # print( list(y_train[0]) )
    # print(['Healthy' if x == 'Healthy' else 'Unhealthy' for x in list(y_train[0])])
    # print(new_data_train)
    clf.fit(new_data_train, ['Healthy' if x == 'Healthy' else 'Unhealthy' for x in list(y_train[0])])

    pred = clf.predict(new_data_test)

    ba = balanced_accuracy_score(['Healthy' if x == 'Healthy' else 'Unhealthy' for x in list(y_test[0])], pred)
    evaluations.append(ba)
    print(f'Balanced Accuracy -> {ba}')


    

print('Final evaluation')
mean = statistics.mean(evaluations)
stdev = statistics.stdev(evaluations)
print(f'Balanced Accuracy mean performance {mean} with an standard deviation of {stdev}')


Fold 1
# Healthy features selected by KS: 943
# Unhealthy features selected by KS: 587
# Principal Components selected: 26
# Unhealthy features selected by KS: 13
# Principal Components selected: 8
Balanced Accuracy -> 0.819047619047619
Fold 2
# Healthy features selected by KS: 972
# Unhealthy features selected by KS: 486
# Principal Components selected: 24
# Unhealthy features selected by KS: 12
# Principal Components selected: 7
Balanced Accuracy -> 0.8269841269841269
Fold 3
# Healthy features selected by KS: 1016
# Unhealthy features selected by KS: 576
# Principal Components selected: 27
# Unhealthy features selected by KS: 17
# Principal Components selected: 11
Balanced Accuracy -> 0.7611111111111111
Fold 4
# Healthy features selected by KS: 1045
# Unhealthy features selected by KS: 463
# Principal Components selected: 25
# Unhealthy features selected by KS: 14
# Principal Components selected: 9
Balanced Accuracy -> 0.8021505376344086
Fold 5
# Healthy features selected by KS: 1202