# KS - hiPCA

In this notebook we replicate the Kolgomorov-Smirnov hiPCA index as described in (Zhu et al, 2023). We also capture all the experiments made in pursue to obtain the best result for the CAMDA 2024 challenge

First we import the necessary libraries to run this notebook

In [220]:
import warnings
warnings.filterwarnings("ignore")

In [221]:
import pandas as pd
import numpy as np
from scipy.stats import kstest
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.stats import chi2, norm
from sklearn.metrics import balanced_accuracy_score

Now we need to read the data

In [222]:
taxonomy = pd.read_csv('../../DataSets/CAMDA/taxonomy.txt', sep = '\t', index_col = 0)
metadata = pd.read_csv('../../DataSets/CAMDA/metadata.csv')

We select the samples which have most of the species identified

In [223]:
good_samples = []
for c in taxonomy.columns:
    if sum(taxonomy[c]) > 90:
        good_samples.append(c)
taxonomy_aux = taxonomy[good_samples]

From previous experiments we obtained better performance taking out the samples labeled as Obese, that is the reason we are not going to consider them either

In [224]:
obese = list(metadata[metadata['Diagnosis'] == 'Obese']['SampleID'])
healthy = list(metadata[metadata['Diagnosis'] == 'Healthy']['SampleID'])
non_healthy = list(metadata[metadata['Diagnosis'] != 'Healthy']['SampleID'])
non_healthy = [x for x in non_healthy if x not in obese]

Next we will define a function to perform Kolmogorov-Smirnov test to find the most important features for the PCA model

In [225]:
def ks_test(df, healthy, non_healthy, method_ks = 'auto', p_val = 0.001):
    healthy_df = df[[x for x in df.columns if x in healthy]].T
    nonhealthy_df = df[[x for x in df.columns if x in non_healthy]].T
    healthy_features = []
    nonhealthy_features = []
    for feature in list(df.index):
        if kstest(list(healthy_df[feature]), list(nonhealthy_df[feature]), alternative = 'less', method = method_ks).pvalue <= p_val:
            healthy_features.append(feature)
        if kstest(list(nonhealthy_df[feature]), list(healthy_df[feature]), alternative = 'less', method = method_ks).pvalue <= p_val:
            nonhealthy_features.append(feature)
    print(f'# Healthy features selected by KS: {len(healthy_features)}')
    print(f'# Unheatlhy features selected by KS: {len(nonhealthy_features)}')
    return healthy_features, nonhealthy_features

Now we define the data preprocessing workflow as defined in the paper

In [226]:
def custom_transform(x):
    if x <= 1:
        return np.log2(2 * x + 0.00001)
    else:
        return np.sqrt(x)

In [227]:
def transform_data(df, features):
    scaler = StandardScaler()
    # selected = df.T[[x for x in list(set(healthy_features + nonhealthy_features)) if x in df.T.columns]]
    aux = pd.DataFrame()
    for item in list(set(features)):
        if item in df.index:
            aux[item] = list(df.T[item])
        else:
            aux[item] = [0 for x in range(len(df.T))]
    selected = aux.applymap(custom_transform)

    for c in selected.columns:
        scaler.fit(np.array(selected[c]).reshape(-1, 1))
        selected[c] = scaler.transform(np.array(selected[c]).reshape(-1, 1))
        
    selected.index = df.T.index

    return selected

Then we define a function to perform PCA over the selected features only

In [228]:
def get_pca_data(df):
    pca = PCA()

    pca.fit(df)

    eigenvalues = pca.explained_variance_
    eigenvectors = pca.components_
    singular = pca.singular_values_
    
    pca_data = pd.DataFrame(zip(eigenvectors, eigenvalues, singular), columns = ('Eigenvectors', 'Explained_variance', 'Singular_values')).sort_values('Explained_variance', ascending = False)
    pca_data['%variance'] = pca_data['Explained_variance'] / sum(pca_data['Explained_variance'])
    pca_data = pca_data.sort_values('%variance', ascending = False)
    pca_data['%variance_cumulative'] = pca_data['%variance'].cumsum()
    
    return pca_data, pca

In [229]:
def calculate_pca_stats(df, variance_for_pc = 0.9, alpha = 0.05):
    pca = PCA()

    pca.fit(df)

    eigenvalues = pca.explained_variance_
    eigenvectors = pca.components_
    singular = pca.singular_values_
    
    pca_data = pd.DataFrame(zip(eigenvectors, eigenvalues, singular), columns = ('Eigenvectors', 'Explained_variance', 'Singular_values')).sort_values('Explained_variance', ascending = False)
    pca_data['%variance'] = pca_data['Explained_variance'] / sum(pca_data['Explained_variance'])
    pca_data = pca_data.sort_values('%variance', ascending = False)
    pca_data['%variance_cumulative'] = pca_data['%variance'].cumsum()
    
    principal_components = list(pca_data[pca_data['%variance_cumulative'] < variance_for_pc]['Eigenvectors'])
    print(f'# Principal Components selected: {len(principal_components)}')
    
    principal_values = list(pca_data[pca_data['%variance_cumulative'] < variance_for_pc]['Explained_variance'])
    D = np.array(principal_components).T @ np.linalg.inv(np.diag(principal_values)) @ np.array(principal_components)
    deg_free = len(principal_components) 
    # alpha = 0.05
    t2_threshold = chi2.ppf(1-alpha, deg_free)
#     print(1-alpha, deg_free)
#     print(t2_threshold)
    
    principal_components_residual = list(pca_data[pca_data['%variance_cumulative'] >= variance_for_pc]['Eigenvectors'])
    principal_values_residual = list(pca_data[pca_data['%variance_cumulative'] >= variance_for_pc]['Explained_variance'])
    principal_singvalues_residual = list(pca_data[pca_data['%variance_cumulative'] >= variance_for_pc]['Singular_values'])
    
    C = np.array(principal_components_residual).T @ np.array(principal_components_residual)
    Theta1 = sum(principal_values_residual)
    Theta2 = sum([x**2 for x in principal_values_residual])
    Theta3 = sum([x**3 for x in principal_values_residual])
    
    c_alpha = norm.ppf(1-alpha)
    
    h0 = 1-((2*Theta1*Theta3)/(3*Theta2**2))
    
    Q_alpha = Theta1*(((((c_alpha*np.sqrt(2*Theta2*(h0**2)))/Theta1)+1+((Theta2*h0*(h0-1))/(Theta1**2))))**(1/h0))
    
    fi = D/t2_threshold + (np.eye(len(principal_components[0])) - (np.array(principal_components).T @ np.array(principal_components)))/Q_alpha
    g = ((len(principal_components) / t2_threshold**2) + (Theta2 / Q_alpha**2)) / ((len(principal_components)/t2_threshold) + (Theta1 / Q_alpha))
    h = ((len(principal_components)/t2_threshold) + (Theta1 / Q_alpha))**2 / ((len(principal_components) / t2_threshold**2) + (Theta2 / Q_alpha**2))

    chi_value = chi2.ppf(1-alpha, h)
    threshold_combined = g*chi_value
    
    return pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined
    

In [230]:
def calculate_stats(pca, samples_df):
    for item in pca.transform(samples_df):
        index = item.T @ D @ item
        T2.append(index)
        if index > t2_threshold:
            pred.append('Unhealthy')
        else:
            pred.append('Healthy')

After that we start calculating the indexes, below there is a function to calculate T^2 index

In [231]:
def hotelling_t2(df, pca, pca_data, variance_for_pc = 0.9, alpha = 0.05):
    principal_components = list(pca_data[pca_data['%variance_cumulative'] < variance_for_pc]['Eigenvectors'])
    print(f'# Principal Components selected: {len(principal_components)}')
    principal_values = list(pca_data[pca_data['%variance_cumulative'] < variance_for_pc]['Explained_variance'])
    D = np.array(principal_components).T @ np.linalg.inv(np.diag(principal_values)) @ np.array(principal_components)
    deg_free = len(principal_components) 
    # alpha = 0.05
    t2_threshold = chi2.ppf(1-alpha, deg_free)
    T2 = []
    pred = []
    
    try:
        for item in pca.transform(df):
            index = item.T @ D @ item
            T2.append(index)
            if index > t2_threshold:
                pred.append('Unhealthy')
            else:
                pred.append('Healthy')
    except:
        for item in np.array(df):
            index = item.T @ D @ item
            T2.append(index)
            if index > t2_threshold:
                pred.append('Unhealthy')
            else:
                pred.append('Healthy')
            
    hoteling = pd.DataFrame(zip(df.index, T2, pred), columns = ['Sample', 'T2', 'Prediction T2'])
    
    return D, principal_components, hoteling, t2_threshold
    

Here we made a modification to Q_statistic limit according to ()

In [232]:
def Q_statistic(df, pca, pca_data, variance_for_pc = 0.9, alpha = 0.05):
    principal_components_residual = list(pca_data[pca_data['%variance_cumulative'] >= variance_for_pc]['Eigenvectors'])
    principal_values_residual = list(pca_data[pca_data['%variance_cumulative'] >= variance_for_pc]['Explained_variance'])
    principal_singvalues_residual = list(pca_data[pca_data['%variance_cumulative'] >= variance_for_pc]['Singular_values'])
    
    C = np.array(principal_components_residual).T @ np.array(principal_components_residual)
    Theta1 = sum(principal_values_residual)
    Theta2 = sum([x**2 for x in principal_values_residual])
    Theta3 = sum([x**3 for x in principal_values_residual])
    
    c_alpha = norm.ppf(1-alpha)
    
    h0 = 1-((2*Theta1*Theta3)/(3*Theta2**2))
    
    Q_alpha = Theta1*(((((c_alpha*np.sqrt(2*Theta2*(h0**2)))/Theta1)+1+((Theta2*h0*(h0-1))/(Theta1**2))))**(1/h0))
    
    Q = []
    pred = []
    try:
        for item in pca.transform(df):
            index = item.T @ C @ item
            Q.append(index)
            if index > Q_alpha:
                pred.append('Unhealthy')
            else:
                pred.append('Healthy')
    except:
        for item in np.array(df):
            index = item.T @ C @ item
            Q.append(index)
            if index > Q_alpha:
                pred.append('Unhealthy')
            else:
                pred.append('Healthy')
    
    Q_statistic = pd.DataFrame(zip(df.index, Q, pred), columns = ['Sample', 'Q', 'Prediction Q'])
    
    return C, Theta1, Theta2, Q_statistic, Q_alpha
    
    

In [233]:
def combined_index(df, D, t2_threshold, principal_components, Q_alpha, Theta1, Theta2, pca, alpha = 0.05):
    fi = D/t2_threshold + (np.eye(len(principal_components[0])) - (np.array(principal_components).T @ np.array(principal_components)))/Q_alpha
    g = ((len(principal_components) / t2_threshold**2) + (Theta2 / Q_alpha**2)) / ((len(principal_components)/t2_threshold) + (Theta1 / Q_alpha))
    h = ((len(principal_components)/t2_threshold) + (Theta1 / Q_alpha))**2 / ((len(principal_components) / t2_threshold**2) + (Theta2 / Q_alpha**2))

    chi_value = chi2.ppf(1-alpha, h)
    threshold_combined = g*chi_value
    combined = []
    pred = []

    try:
        for item in pca.transform(df):
            index = item.T @ fi @ item
            combined.append(index)
            if index > threshold_combined:
                pred.append('Unhealthy')
            else:
                pred.append('Healthy')
    except:
        for item in np.array(df):
            index = item.T @ fi @ item
            combined.append(index)
            if index > threshold_combined:
                pred.append('Unhealthy')
            else:
                pred.append('Healthy')

    combined = pd.DataFrame(zip(df.index, combined, pred), columns = ['Sample', 'Combined', 'Prediction Combined']) 
    return combined

Finally we define a function to calculate the index 

In [234]:
def hiPCA(df, healthy, non_healthy, features = [], ks = False, method = 'auto', p_val = 0.001, only_nonhealthy_features = False):
    if ks:
        healthy_features, non_healthy_features = ks_test(df, healthy, non_healthy, method_ks = method, p_val = p_val)
        
    if only_nonhealthy_features:
        healthy_features = []
        if ks:
            features = healthy_features + non_healthy_features
        selected = transform_data(df[[x for x in healthy if x in df.columns]], features)
        
    else:
        if ks:
            features = healthy_features + non_healthy_features
        selected = transform_data(df[[x for x in healthy if x in df.columns]], features)
    
    # print(selected)
    pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined = calculate_pca_stats(selected)
    print(t2_threshold, Q_alpha, threshold_combined)

        
    return features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected

In [235]:
def calculate_index(data_transformed):
    T2, Q, combined = [], [], []
    pred_t2, pred_Q, pred_combined = [], [], []

    try:
        for item in pca.transform(data_transformed):
            index = item.T @ D @ item
            index2 = item.T @ C @ item
            index3 = item.T @ fi @ item
            T2.append(index)
            Q.append(index2)
            combined.append(index3)
            if index > t2_threshold:
                pred_t2.append('Unhealthy')
            else:
                pred_t2.append('Healthy')

            if index2 > Q_alpha:
                pred_Q.append('Unhealthy')
            else:
                pred_Q.append('Healthy')

            if index3 > threshold_combined:
                pred_combined.append('Unhealthy')
            else:
                pred_combined.append('Healthy') 
    except:
        for item in np.array(data_transformed):
            index = item.T @ D @ item
            index2 = item.T @ C @ item
            index3 = item.T @ fi @ item
            T2.append(index)
            Q.append(index2)
            combined.append(index3)
            if index > t2_threshold:
                pred_t2.append('Unhealthy')
            else:
                pred_t2.append('Healthy')

            if index2 > Q_alpha:
                pred_Q.append('Unhealthy')
            else:
                pred_Q.append('Healthy')

            if index3 > threshold_combined:
                pred_combined.append('Unhealthy')
            else:
                pred_combined.append('Healthy') 

    return pd.DataFrame(zip(data_transformed.index, T2, pred_t2, Q, pred_Q, combined, pred_combined), columns = ['SampleID', 'T2', 'Prediction T2', 'Q', 'Prediction Q', 'Combined Index', 'Combined Prediction'])

## Experiments

### Experiment 1

First we will calculate KS-hiPCA index using just taxonomic profile with just the unhealthy related species evaluate its result using balanced accuracy

In [332]:
obese = list(metadata[metadata['Diagnosis'] == 'Obese']['SampleID'])
healthy = list(metadata[metadata['Diagnosis'] == 'Healthy']['SampleID'])
non_healthy = list(metadata[metadata['Diagnosis'] != 'Healthy']['SampleID'])
non_healthy = [x for x in non_healthy if x not in obese]

In [333]:
taxonomy_not_obese = taxonomy_aux[[x for x in taxonomy_aux if x not in obese]]

In [334]:
taxonomy_transposed = taxonomy_not_obese.T

In [335]:
label = []
for item in taxonomy_transposed.T.columns:
    if metadata[metadata['SampleID'] == item]['Diagnosis'].iloc[0] == 'Healthy':
        label.append('Healthy')
    else:
        label.append('Unhealthy')

In [336]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(taxonomy_transposed, label, test_size=0.20, random_state=20, stratify = label)

In [337]:
features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected = hiPCA(X_train.T, healthy, non_healthy, ks = True, method = 'asymp', only_nonhealthy_features = True)

# Healthy features selected by KS: 130
# Unheatlhy features selected by KS: 17
# Principal Components selected: 10
0.95 10
18.307038053275146
18.307038053275146 4.198301915746644 1.6919901191461642


In [338]:
test_transformed = transform_data(X_test.T, features)

In [339]:
results = calculate_index(test_transformed)
# results.to_csv('../../output/hiPCA/experiment1_results.csv', index = False)

In [340]:
acc_combined = balanced_accuracy_score(results['Combined Prediction'], list(y_test))
acc_t2 = balanced_accuracy_score(results['Prediction T2'], list(y_test))
acc_q = balanced_accuracy_score(results['Prediction Q'], list(y_test))
print(f'KS - hiPCA (T^2) : {acc_t2}')
print(f'KS - hiPCA (Q) : {acc_q}')
print(f'KS - hiPCA (Combined): {acc_combined}')

KS - hiPCA (T^2) : 0.7857142857142857
KS - hiPCA (Q) : 0.6482263513513513
KS - hiPCA (Combined): 0.8151913875598087


In [341]:
train_transformed = transform_data(X_train.T, features)

In [348]:
results = calculate_index(train_transformed)
# results.to_csv('../../output/hiPCA/experiment1_results_train.csv', index = False)

### Experiment 2

Now we will use the both the unhealthy and healhty species to build the index

In [245]:
features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected = hiPCA(X_train.T, healthy, non_healthy, ks = True, method = 'asymp', only_nonhealthy_features = False)

# Healthy features selected by KS: 130
# Unheatlhy features selected by KS: 17
# Principal Components selected: 51
0.95 51
68.66929391228578
68.66929391228578 20.365473937778425 1.8419064612150686


In [246]:
test_transformed = transform_data(X_test.T, features)

In [247]:
results = calculate_index(test_transformed)
# results.to_csv('../../output/hiPCA/experiment2_results.csv', index = False)

In [248]:
balanced_accuracy_score(results['Combined Prediction'], list(y_test))

0.48514851485148514

In [249]:
acc_combined = balanced_accuracy_score(results['Combined Prediction'], list(y_test))
acc_t2 = balanced_accuracy_score(results['Prediction T2'], list(y_test))
acc_q = balanced_accuracy_score(results['Prediction Q'], list(y_test))
print(f'KS - hiPCA (T^2) : {acc_t2}')
print(f'KS - hiPCA (Q) : {acc_q}')
print(f'KS - hiPCA (Combined): {acc_combined}')

KS - hiPCA (T^2) : 0.35
KS - hiPCA (Q) : 0.48514851485148514
KS - hiPCA (Combined): 0.48514851485148514


### Experiment 3

Next, we will try using the species selected by the differential abundance test with p-value 0.05 and 0.005 respectively

In [250]:
features_ancom = pd.read_csv('../../DataSets/INDEX/hiPCA/ANCOM-BC_pvalue 0.05.txt', sep="\t")
features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected = hiPCA(X_train.T, healthy, non_healthy, ks = False, features=list(features_ancom['Species']))

# Principal Components selected: 25
0.95 25
37.65248413348277
37.65248413348277 7.917922960056595 1.7726626649772614


In [251]:
test_transformed = transform_data(X_test.T, features)

In [252]:
results = calculate_index(test_transformed)
# results.to_csv('../../output/hiPCA/experiment3_results1.csv', index = False)

In [253]:
balanced_accuracy_score(results['Combined Prediction'], list(y_test))

0.7474747474747475

In [254]:
acc_combined = balanced_accuracy_score(results['Combined Prediction'], list(y_test))
acc_t2 = balanced_accuracy_score(results['Prediction T2'], list(y_test))
acc_q = balanced_accuracy_score(results['Prediction Q'], list(y_test))
print(f'KS - hiPCA (T^2) : {acc_t2}')
print(f'KS - hiPCA (Q) : {acc_q}')
print(f'KS - hiPCA (Combined): {acc_combined}')

KS - hiPCA (T^2) : 0.6850877192982456
KS - hiPCA (Q) : 0.24
KS - hiPCA (Combined): 0.7474747474747475


In [255]:
features_ancom = pd.read_csv('../../DataSets/INDEX/hiPCA/ANCOM-BC_pvalue 0.005.txt', sep="\t")
features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected = hiPCA(X_train.T, healthy, non_healthy, ks = False, features=list(features_ancom['Species']))

# Principal Components selected: 21
0.95 21
32.670573340917315
32.670573340917315 6.70865917494215 1.7555456167236936


In [256]:
test_transformed = transform_data(X_test.T, features)

In [257]:
results = calculate_index(test_transformed)
# results.to_csv('../../output/hiPCA/experiment3_results2.csv', index = False)

In [258]:
balanced_accuracy_score(results['Combined Prediction'], list(y_test))

0.6443236714975846

In [259]:
acc_combined = balanced_accuracy_score(results['Combined Prediction'], list(y_test))
acc_t2 = balanced_accuracy_score(results['Prediction T2'], list(y_test))
acc_q = balanced_accuracy_score(results['Prediction Q'], list(y_test))
print(f'KS - hiPCA (T^2) : {acc_t2}')
print(f'KS - hiPCA (Q) : {acc_q}')
print(f'KS - hiPCA (Combined): {acc_combined}')

KS - hiPCA (T^2) : 0.45274725274725275
KS - hiPCA (Q) : 0.5807017543859649
KS - hiPCA (Combined): 0.6443236714975846


### Experiment 4

Now we are going to use the pathways to build the index, first we will build it with just the unhealthy related pathways

In [260]:
pathways = pd.read_csv('../../DataSets/CAMDA/pathways.txt', sep = '\t', index_col = 0)
pathways = pathways.T[[x for x in pathways.index if 'UNINTEGRATED' not in x]]
pathways = pathways.T

In [261]:
good_samples = []
for c in taxonomy.columns:
    if sum(taxonomy[c]) > 90:
        good_samples.append(c)
pathways_aux = pathways[good_samples]

ob = list(metadata[metadata['Diagnosis'] == 'Obese']['SampleID'])

healthy = list(metadata[metadata['Diagnosis'] == 'Healthy']['SampleID'])
non_healthy = list(metadata[metadata['Diagnosis'] != 'Healthy']['SampleID'])
non_healthy = [x for x in non_healthy if x not in ob]

pathways_healthy = pathways[[x for x in pathways_aux.columns if x in healthy]].T
pathways_nonhealthy = pathways[[x for x in pathways_aux.columns if x in non_healthy]].T

In [262]:
label = []
for item in pathways_aux.columns:
    if metadata[metadata['SampleID'] == item]['Diagnosis'].iloc[0] == 'Healthy':
        label.append('Healthy')
    else:
        label.append('Unhealthy')

X_train, X_test, y_train, y_test = train_test_split(pathways_aux.T, label, test_size=0.20, random_state=20, stratify = label)

In [263]:
features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected = hiPCA(X_train.T, healthy, non_healthy, ks = True, only_nonhealthy_features = True, method = 'asymp', p_val = 0.001)

# Healthy features selected by KS: 1016
# Unheatlhy features selected by KS: 659
# Principal Components selected: 9
0.95 9
16.918977604620448
16.918977604620448 114.95910612152693 1.7160996201302054


In [264]:
test_transformed = transform_data(X_test.T, features)

In [265]:
results = calculate_index(test_transformed)
# results.to_csv('../../output/hiPCA/experiment4_results.csv', index = False)

In [266]:
balanced_accuracy_score(results['Combined Prediction'], list(y_test))

0.8095238095238095

In [267]:
acc_combined = balanced_accuracy_score(results['Combined Prediction'], list(y_test))
acc_t2 = balanced_accuracy_score(results['Prediction T2'], list(y_test))
acc_q = balanced_accuracy_score(results['Prediction Q'], list(y_test))
print(f'KS - hiPCA (T^2) : {acc_t2}')
print(f'KS - hiPCA (Q) : {acc_q}')
print(f'KS - hiPCA (Combined): {acc_combined}')

KS - hiPCA (T^2) : 0.7921348314606742
KS - hiPCA (Q) : 0.7708333333333333
KS - hiPCA (Combined): 0.8095238095238095


### Experiment 5

Now we will fit both the unhealthy and healthy related pathways

In [268]:
features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected = hiPCA(X_train.T, healthy, non_healthy, ks = True, only_nonhealthy_features = False, method = 'asymp', p_val = 0.001)

# Healthy features selected by KS: 1016
# Unheatlhy features selected by KS: 659
# Principal Components selected: 26
0.95 26
38.885138659830055
38.885138659830055 118.03589503853675 2.7801519279115507


In [269]:
test_transformed = transform_data(X_test.T, features)

In [270]:
results = calculate_index(test_transformed)
# results.to_csv('../../output/hiPCA/experiment5_results.csv', index = False)

In [271]:
balanced_accuracy_score(results['Combined Prediction'], list(y_test))

0.7357549857549857

In [272]:
acc_combined = balanced_accuracy_score(results['Combined Prediction'], list(y_test))
acc_t2 = balanced_accuracy_score(results['Prediction T2'], list(y_test))
acc_q = balanced_accuracy_score(results['Prediction Q'], list(y_test))
print(f'KS - hiPCA (T^2) : {acc_t2}')
print(f'KS - hiPCA (Q) : {acc_q}')
print(f'KS - hiPCA (Combined): {acc_combined}')

KS - hiPCA (T^2) : 0.6480562448304383
KS - hiPCA (Q) : 0.6670020120724346
KS - hiPCA (Combined): 0.7357549857549857


### Experiment 6

In [451]:
label = []
for item in taxonomy_transposed.T.columns:
    if metadata[metadata['SampleID'] == item]['Diagnosis'].iloc[0] == 'Healthy':
        label.append('Healthy')
    else:
        label.append('Unhealthy')

In [452]:
X_train, X_test, y_train, y_test = train_test_split(taxonomy_transposed, label, test_size=0.20, random_state=20, stratify = label)

In [453]:
pathways = pd.read_csv('../../DataSets/CAMDA/pathways.txt', sep = '\t', index_col = 0)

In [454]:
pathways_train = pathways[list(X_train.index)]

In [455]:
pathways_train = pathways_train.T

In [458]:
from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split

In [460]:
clf = RandomForestClassifier(max_depth=10, min_samples_leaf=3, random_state=0)

In [461]:
clf.fit(pathways_train, y_train)

In [463]:
selected_pathways = list(pd.DataFrame(zip(pathways_train.columns, clf.feature_importances_)).sort_values(1, ascending = False)[:89][0])

In [466]:
selected = pathways_train[selected_pathways]

In [467]:
results = pd.read_csv('../../output/hiPCA/experiment1_results_train.csv')
results2 = pd.read_csv('../../output/hiPCA/experiment1_results.csv')

In [469]:
selected['hiPCA'] = list(results['Combined Index'])

In [470]:
selected['hiPCA'] = selected['hiPCA'] - 1.69 #Threshold combined

In [471]:
pathways_test = pathways.T[selected_pathways]
pathways_test = pathways_test.T[X_test.index].T
pathways_test['hiPCA'] = list(results2['Combined Index'])
pathways_test['hiPCA'] = pathways_test['hiPCA'] - 1.69

In [472]:
from sklearn.preprocessing import StandardScaler

In [473]:
scaler = StandardScaler()
for c in selected.columns:
    scaler.fit(np.array(selected[c]).reshape(-1, 1))
    selected[c] = scaler.transform(np.array(selected[c]).reshape(-1, 1))
    pathways_test[c] = scaler.transform(np.array(pathways_test[c]).reshape(-1, 1))

In [476]:
from sklearn.linear_model import LogisticRegression


In [478]:
clf = LogisticRegression(random_state=0).fit(selected, y_train)

In [481]:
importance = pd.DataFrame(zip(list(selected.columns), clf.coef_[0]), columns = ['Features', 'Coeficient']).sort_values('Coeficient', ascending  = False)

In [482]:
importance.to_csv('../../output/hiPCA/coefficients_logreg.csv', index = False)

In [485]:
pred = clf.predict(pathways_test)
balanced_accuracy_score(y_test, pred)

0.8198587127158555

In [486]:
clf = RandomForestClassifier(max_depth=10, min_samples_leaf=3, random_state=0)
clf.fit(selected, y_train)
pred = clf.predict(pathways_test)
balanced_accuracy_score(y_test, pred)

0.8498822605965464