# KS - hiPCA

In this notebook we replicate the Kolgomorov-Smirnov hiPCA index as described in (Zhu et al, 2023). We also capture all the experiments made in pursue to obtain the best result for the CAMDA 2024 challenge

First we import the necessary libraries to run this notebook

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import kstest
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.stats import chi2, norm
from sklearn.metrics import balanced_accuracy_score

Now we need to read the data

In [2]:
taxonomy = pd.read_csv('../../DataSets/CAMDA/taxonomy.txt', sep = '\t', index_col = 0)
metadata = pd.read_csv('../../DataSets/CAMDA/metadata.csv')

We select the samples which have most of the species identified

In [3]:
good_samples = []
for c in taxonomy.columns:
    if sum(taxonomy[c]) > 90:
        good_samples.append(c)
taxonomy_aux = taxonomy[good_samples]

From previous experiments we obtained better performance taking out the samples labeled as Obese, that is the reason we are not going to consider them either

In [4]:
obese = list(metadata[metadata['Diagnosis'] == 'Obese']['SampleID'])
healthy = list(metadata[metadata['Diagnosis'] == 'Healthy']['SampleID'])
non_healthy = list(metadata[metadata['Diagnosis'] != 'Healthy']['SampleID'])
non_healthy = [x for x in non_healthy if x not in obese]

Next we will define a function to perform Kolmogorov-Smirnov test to find the most important features for the PCA model

In [5]:
def ks_test(df, healthy, non_healthy, method_ks = 'auto', p_val = 0.001):
    healthy_df = df[[x for x in df.columns if x in healthy]].T
    nonhealthy_df = df[[x for x in df.columns if x in non_healthy]].T
    healthy_features = []
    nonhealthy_features = []
    for feature in list(df.index):
        if kstest(list(healthy_df[feature]), list(nonhealthy_df[feature]), alternative = 'less', method = method_ks).pvalue <= p_val:
            healthy_features.append(feature)
        if kstest(list(nonhealthy_df[feature]), list(healthy_df[feature]), alternative = 'less', method = method_ks).pvalue <= p_val:
            nonhealthy_features.append(feature)
    print(f'# Healthy features selected by KS: {len(healthy_features)}')
    print(f'# Unheatlhy features selected by KS: {len(nonhealthy_features)}')
    return healthy_features, nonhealthy_features

Now we define the data preprocessing workflow as defined in the paper

In [6]:
def custom_transform(x):
    if x <= 1:
        return np.log2(2 * x + 0.00001)
    else:
        return np.sqrt(x)

In [41]:
def transform_data(df, healthy_features, nonhealthy_features):
    scaler = StandardScaler()
    # selected = df.T[[x for x in list(set(healthy_features + nonhealthy_features)) if x in df.T.columns]]
    aux = pd.DataFrame()
    for item in list(set(healthy_features + nonhealthy_features)):
        if item in df.index:
            aux[item] = list(df.T[item])
        else:
            aux[item] = [0 for x in range(len(df.T))]
    selected = aux.applymap(custom_transform)

    for c in selected.columns:
        scaler.fit(np.array(selected[c]).reshape(-1, 1))
        selected[c] = scaler.transform(np.array(selected[c]).reshape(-1, 1))
        
    selected.index = df.T.index

    return selected

Then we define a function to perform PCA over the selected features only

In [8]:
def get_pca_data(df):
    pca = PCA()

    pca.fit(df)

    eigenvalues = pca.explained_variance_
    eigenvectors = pca.components_
    singular = pca.singular_values_
    
    pca_data = pd.DataFrame(zip(eigenvectors, eigenvalues, singular), columns = ('Eigenvectors', 'Explained_variance', 'Singular_values')).sort_values('Explained_variance', ascending = False)
    pca_data['%variance'] = pca_data['Explained_variance'] / sum(pca_data['Explained_variance'])
    pca_data = pca_data.sort_values('%variance', ascending = False)
    pca_data['%variance_cumulative'] = pca_data['%variance'].cumsum()
    
    return pca_data, pca

In [9]:
def calculate_pca_stats(df, variance_for_pc = 0.9, alpha = 0.05):
    pca = PCA()

    pca.fit(df)

    eigenvalues = pca.explained_variance_
    eigenvectors = pca.components_
    singular = pca.singular_values_
    
    pca_data = pd.DataFrame(zip(eigenvectors, eigenvalues, singular), columns = ('Eigenvectors', 'Explained_variance', 'Singular_values')).sort_values('Explained_variance', ascending = False)
    pca_data['%variance'] = pca_data['Explained_variance'] / sum(pca_data['Explained_variance'])
    pca_data = pca_data.sort_values('%variance', ascending = False)
    pca_data['%variance_cumulative'] = pca_data['%variance'].cumsum()
    
    principal_components = list(pca_data[pca_data['%variance_cumulative'] < variance_for_pc]['Eigenvectors'])
    print(f'# Principal Components selected: {len(principal_components)}')
    
    principal_values = list(pca_data[pca_data['%variance_cumulative'] < variance_for_pc]['Explained_variance'])
    D = np.array(principal_components).T @ np.linalg.inv(np.diag(principal_values)) @ np.array(principal_components)
    deg_free = len(principal_components) 
    # alpha = 0.05
    t2_threshold = chi2.ppf(1-alpha, deg_free)
    print(1-alpha, deg_free)
    print(t2_threshold)
    
    principal_components_residual = list(pca_data[pca_data['%variance_cumulative'] >= variance_for_pc]['Eigenvectors'])
    principal_values_residual = list(pca_data[pca_data['%variance_cumulative'] >= variance_for_pc]['Explained_variance'])
    principal_singvalues_residual = list(pca_data[pca_data['%variance_cumulative'] >= variance_for_pc]['Singular_values'])
    
    C = np.array(principal_components_residual).T @ np.array(principal_components_residual)
    Theta1 = sum(principal_values_residual)
    Theta2 = sum([x**2 for x in principal_values_residual])
    Theta3 = sum([x**3 for x in principal_values_residual])
    
    c_alpha = norm.ppf(1-alpha)
    
    h0 = 1-((2*Theta1*Theta3)/(3*Theta2**2))
    
    Q_alpha = Theta1*(((((c_alpha*np.sqrt(2*Theta2*(h0**2)))/Theta1)+1+((Theta2*h0*(h0-1))/(Theta1**2))))**(1/h0))
    
    fi = D/t2_threshold + (np.eye(len(principal_components[0])) - (np.array(principal_components).T @ np.array(principal_components)))/Q_alpha
    g = ((len(principal_components) / t2_threshold**2) + (Theta2 / Q_alpha**2)) / ((len(principal_components)/t2_threshold) + (Theta1 / Q_alpha))
    h = ((len(principal_components)/t2_threshold) + (Theta1 / Q_alpha))**2 / ((len(principal_components) / t2_threshold**2) + (Theta2 / Q_alpha**2))

    chi_value = chi2.ppf(1-alpha, h)
    threshold_combined = g*chi_value
    
    return pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined
    

In [10]:
def calculate_stats(pca, samples_df):
    for item in pca.transform(samples_df):
        index = item.T @ D @ item
        T2.append(index)
        if index > t2_threshold:
            pred.append('Unhealthy')
        else:
            pred.append('Healthy')

After that we start calculating the indexes, below there is a function to calculate T^2 index

In [11]:
def hotelling_t2(df, pca, pca_data, variance_for_pc = 0.9, alpha = 0.05):
    principal_components = list(pca_data[pca_data['%variance_cumulative'] < variance_for_pc]['Eigenvectors'])
    print(f'# Principal Components selected: {len(principal_components)}')
    principal_values = list(pca_data[pca_data['%variance_cumulative'] < variance_for_pc]['Explained_variance'])
    D = np.array(principal_components).T @ np.linalg.inv(np.diag(principal_values)) @ np.array(principal_components)
    deg_free = len(principal_components) 
    # alpha = 0.05
    t2_threshold = chi2.ppf(1-alpha, deg_free)
    T2 = []
    pred = []
    
    try:
        for item in pca.transform(df):
            index = item.T @ D @ item
            T2.append(index)
            if index > t2_threshold:
                pred.append('Unhealthy')
            else:
                pred.append('Healthy')
    except:
        for item in np.array(df):
            index = item.T @ D @ item
            T2.append(index)
            if index > t2_threshold:
                pred.append('Unhealthy')
            else:
                pred.append('Healthy')
            
    hoteling = pd.DataFrame(zip(df.index, T2, pred), columns = ['Sample', 'T2', 'Prediction T2'])
    
    return D, principal_components, hoteling, t2_threshold
    

Here we made a modification to Q_statistic limit according to ()

In [12]:
def Q_statistic(df, pca, pca_data, variance_for_pc = 0.9, alpha = 0.05):
    principal_components_residual = list(pca_data[pca_data['%variance_cumulative'] >= variance_for_pc]['Eigenvectors'])
    principal_values_residual = list(pca_data[pca_data['%variance_cumulative'] >= variance_for_pc]['Explained_variance'])
    principal_singvalues_residual = list(pca_data[pca_data['%variance_cumulative'] >= variance_for_pc]['Singular_values'])
    
    C = np.array(principal_components_residual).T @ np.array(principal_components_residual)
    Theta1 = sum(principal_values_residual)
    Theta2 = sum([x**2 for x in principal_values_residual])
    Theta3 = sum([x**3 for x in principal_values_residual])
    
    c_alpha = norm.ppf(1-alpha)
    
    h0 = 1-((2*Theta1*Theta3)/(3*Theta2**2))
    
    Q_alpha = Theta1*(((((c_alpha*np.sqrt(2*Theta2*(h0**2)))/Theta1)+1+((Theta2*h0*(h0-1))/(Theta1**2))))**(1/h0))
    
    Q = []
    pred = []
    try:
        for item in pca.transform(df):
            index = item.T @ C @ item
            Q.append(index)
            if index > Q_alpha:
                pred.append('Unhealthy')
            else:
                pred.append('Healthy')
    except:
        for item in np.array(df):
            index = item.T @ C @ item
            Q.append(index)
            if index > Q_alpha:
                pred.append('Unhealthy')
            else:
                pred.append('Healthy')
    
    Q_statistic = pd.DataFrame(zip(df.index, Q, pred), columns = ['Sample', 'Q', 'Prediction Q'])
    
    return C, Theta1, Theta2, Q_statistic, Q_alpha
    
    

In [13]:
def combined_index(df, D, t2_threshold, principal_components, Q_alpha, Theta1, Theta2, pca, alpha = 0.05):
    fi = D/t2_threshold + (np.eye(len(principal_components[0])) - (np.array(principal_components).T @ np.array(principal_components)))/Q_alpha
    g = ((len(principal_components) / t2_threshold**2) + (Theta2 / Q_alpha**2)) / ((len(principal_components)/t2_threshold) + (Theta1 / Q_alpha))
    h = ((len(principal_components)/t2_threshold) + (Theta1 / Q_alpha))**2 / ((len(principal_components) / t2_threshold**2) + (Theta2 / Q_alpha**2))

    chi_value = chi2.ppf(1-alpha, h)
    threshold_combined = g*chi_value
    combined = []
    pred = []
    print(threshold_combined)
    try:
        for item in pca.transform(df):
            index = item.T @ fi @ item
            combined.append(index)
            if index > threshold_combined:
                pred.append('Unhealthy')
            else:
                pred.append('Healthy')
    except:
        for item in np.array(df):
            index = item.T @ fi @ item
            combined.append(index)
            if index > threshold_combined:
                pred.append('Unhealthy')
            else:
                pred.append('Healthy')

    combined = pd.DataFrame(zip(df.index, combined, pred), columns = ['Sample', 'Combined', 'Prediction Combined']) 
    return combined

Finally we define a function to calculate the index 

In [14]:
def hiPCA(df, healthy, non_health, df_ks = [], healthy_features = [], non_healthy_features = [], ks = False, method = 'auto', p_val = 0.001, only_nonhealthy_features = False):
    if ks:
        healthy_features, non_healthy_features = ks_test(df_ks, healthy, non_healthy, method_ks = method, p_val = p_val)
        
    if only_nonhealthy_features:
        selected = transform_data(df, [], non_healthy_features)
    else:
        selected = transform_data(df, healthy_features, non_healthy_features)
    
    # print(selected)
    pca_data, pca = get_pca_data(selected.T[healthy].T)
    D, principal_components, results_hotelling, t2_threshold = hotelling_t2(selected, pca, pca_data)
    C, Theta1, Theta2, results_q, Q_alpha = Q_statistic(selected, pca, pca_data)
    hiPCA = combined_index(selected, D, t2_threshold, principal_components, Q_alpha, Theta1, Theta2, pca)
    
    return healthy_features, non_healthy_features, pd.concat([results_hotelling, results_q.drop('Sample', axis = 1),  hiPCA.drop('Sample', axis = 1)], axis=1, join='outer')

In [15]:
def hiPCA2(df, healthy, non_healthy, healthy_features = [], non_healthy_features = [], ks = False, method = 'auto', p_val = 0.001, only_nonhealthy_features = False):
    if ks:
        healthy_features, non_healthy_features = ks_test(df, healthy, non_healthy, method_ks = method, p_val = p_val)
        
    if only_nonhealthy_features:
        selected = transform_data(df, [], non_healthy_features)
    else:
        selected = transform_data(df, healthy_features, non_healthy_features)
        
    pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined = calculate_pca_stats(selected)
    print(t2_threshold, Q_alpha, threshold_combined)

In [16]:
def hiPCA3(df, healthy, non_healthy, df_ks = [], healthy_features = [], non_healthy_features = [], ks = False, method = 'auto', p_val = 0.001, only_nonhealthy_features = False):
    if ks:
        healthy_features, non_healthy_features = ks_test(df_ks, healthy, non_healthy, method_ks = method, p_val = p_val)
        
    if only_nonhealthy_features:
        selected = transform_data(df[[x for x in healthy if x in df.columns]], [], non_healthy_features)
    else:
        selected = transform_data(df[[x for x in healthy if x in df.columns]], healthy_features, non_healthy_features)
    
    # print(selected)
    pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined = calculate_pca_stats(selected)
    print(t2_threshold, Q_alpha, threshold_combined)
        
    return healthy_features, non_healthy_features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected

## Experiments

### Experiment 1

First we will calculate KS-hiPCA index to the trainning data and evaluate its result using balanced accuracy

In [18]:
metadata

Unnamed: 0,Diagnosis,Project,SampleID,GMHI,hiPCA,Shannon_entropy
0,Healthy,HMP2,SRR5946989,1.977597,0.805751,3.585040
1,Healthy,PRJNA389280,SRR5983265,1.036363,9.604431,4.516999
2,Healthy,HMP2,SRR5946777,2.079383,0.605478,5.132696
3,CD,HMP2,SRR5946822,-2.522606,3.317453,2.971195
4,Healthy,HMP2,SRR5946857,0.260267,2.921465,4.491431
...,...,...,...,...,...,...
608,CD,HMP2,SRR5946648,-0.257764,3.582903,2.348565
609,Healthy,HMP2,SRR5946925,0.663667,2.553392,4.186847
610,Healthy,PRJEB1220,ERR209694,-0.262792,14.573817,5.606539
611,CD,HMP2,SRR5946668,-3.183020,10.761874,3.802272


In [17]:
obese = list(metadata[metadata['Diagnosis'] == 'Obese']['SampleID'])
healthy = list(metadata[metadata['Diagnosis'] == 'Healthy']['SampleID'])
non_healthy = list(metadata[metadata['Diagnosis'] != 'Healthy']['SampleID'])
non_healthy = [x for x in non_healthy if x not in obese]

In [19]:
taxonomy_not_obese = taxonomy_aux[[x for x in taxonomy_aux if x not in obese]]

In [20]:
taxonomy_transposed = taxonomy_not_obese.T

In [21]:
# taxonomy_transposed['Label'] = ['Healthy' if x == 'Healthy' else 'Unhealthy' for x in list(metadata['Diagnosis'])]

In [21]:
label = []
for item in taxonomy_transposed.T.columns:
    if metadata[metadata['SampleID'] == item]['Diagnosis'].iloc[0] == 'Healthy':
        label.append('Healthy')
    else:
        label.append('Unhealthy')

In [23]:
# taxonomy_transposed = taxonomy_transposed.T[taxonomy_aux.columns].T

In [24]:
# taxonomy_transposed

### TESTING

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(taxonomy_transposed, label, test_size=0.20, random_state=20, stratify = label)

In [23]:
healthy_features, non_healthy_features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected = hiPCA3(X_train.T, healthy, non_healthy, df_ks = X_train.T, ks = True, method = 'asymp', only_nonhealthy_features = True)

# Healthy features selected by KS: 130
# Unheatlhy features selected by KS: 17
# Principal Components selected: 10
0.95 10
18.307038053275146
18.307038053275146 4.198301915746638 1.6919901191461653


In [405]:
test_transformed = transform_data(X_test.T, [], non_healthy_features)

In [328]:
healthy_features, non_healthy_features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected = hiPCA3(X_train.T, healthy, non_healthy, df_ks = X_train.T, ks = True, method = 'asymp', only_nonhealthy_features = False)

# Healthy features selected by KS: 130
# Unheatlhy features selected by KS: 17
# Principal Components selected: 51
0.95 51
68.66929391228578
68.66929391228578 20.365473937778415 1.8419064612150682


In [329]:
test_transformed = transform_data(X_test.T, healthy_features, non_healthy_features)

In [79]:
len(X_train)

401

In [406]:
train_transformed = transform_data(X_train.T, [], non_healthy_features)

In [81]:
len(train_transformed)

401

In [407]:
T2, Q, combined = [], [], []
pred_t2, pred_Q, pred_combined = [], [], []
    

for item in pca.transform(train_transformed):
    index = item.T @ D @ item
    index2 = item.T @ C @ item
    index3 = item.T @ fi @ item
    T2.append(index)
    Q.append(index2)
    combined.append(index3)
    if index > t2_threshold:
        pred_t2.append('Unhealthy')
    else:
        pred_t2.append('Healthy')
    
    if index2 > Q_alpha:
        pred_Q.append('Unhealthy')
    else:
        pred_Q.append('Healthy')
        
    if index3 > threshold_combined:
        pred_combined.append('Unhealthy')
    else:
        pred_combined.append('Healthy') 
        
results_train = pd.DataFrame(zip(train_transformed.index, T2, pred_t2, Q, pred_Q, combined, pred_combined), columns = ['SampleID', 'T2', 'Prediction T2', 'Q', 'Prediction Q', 'Combined Index', 'Combined Prediction'])

In [83]:
results_train

Unnamed: 0,SampleID,T2,Prediction T2,Q,Prediction Q,Combined Index,Combined Prediction
0,ERR209799,1.402994,Healthy,4.784865,Unhealthy,1.231402,Healthy
1,SRR5983354,6.423970,Healthy,2.792262,Healthy,1.024778,Healthy
2,SRR5946648,2.422791,Healthy,3.900467,Healthy,1.073669,Healthy
3,ERR209854,13.887102,Healthy,2.638236,Healthy,1.395270,Healthy
4,SRR5935795,2.162224,Healthy,5.528409,Unhealthy,1.452319,Healthy
...,...,...,...,...,...,...,...
396,SRR5935770,2.138255,Healthy,1.997195,Healthy,0.598797,Healthy
397,SRR5935891,12.994175,Healthy,10.881675,Unhealthy,3.335942,Unhealthy
398,SRR5946762,5.745383,Healthy,4.191088,Unhealthy,1.325299,Healthy
399,SRR5947012,5.374624,Healthy,4.183521,Unhealthy,1.303221,Healthy


In [408]:
T2, Q, combined = [], [], []
pred_t2, pred_Q, pred_combined = [], [], []
    

for item in pca.transform(test_transformed):
    index = item.T @ D @ item
    index2 = item.T @ C @ item
    index3 = item.T @ fi @ item
    T2.append(index)
    Q.append(index2)
    combined.append(index3)
    if index > t2_threshold:
        pred_t2.append('Unhealthy')
    else:
        pred_t2.append('Healthy')
    
    if index2 > Q_alpha:
        pred_Q.append('Unhealthy')
    else:
        pred_Q.append('Healthy')
        
    if index3 > threshold_combined:
        pred_combined.append('Unhealthy')
    else:
        pred_combined.append('Healthy') 
        
results = pd.DataFrame(zip(test_transformed.index, T2, pred_t2, Q, pred_Q, combined, pred_combined), columns = ['SampleID', 'T2', 'Prediction T2', 'Q', 'Prediction Q', 'Combined Index', 'Combined Prediction'])

In [29]:
# results['TRUE'] = list(y_test)

In [331]:
balanced_accuracy_score(results['Combined Prediction'], list(y_test))



0.48514851485148514

In [332]:
balanced_accuracy_score(results['Prediction T2'], list(y_test))

0.35

In [333]:
balanced_accuracy_score(results['Prediction Q'], list(y_test))



0.48514851485148514

In [25]:
healthy_features, non_healthy_features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected = hiPCA3(X_train.T, healthy, non_healthy, df_ks = X_train.T, ks = False, method = 'auto', only_nonhealthy_features = True, non_healthy_features=list(unhealthy_tax['species']))

# Principal Components selected: 23
0.95 23
35.17246162690806
35.17246162690806 7.507711665443201 1.7623337048363399


In [26]:
test_transformed = transform_data(X_test.T, [], list(unhealthy_tax['species']))

In [27]:
T2, Q, combined = [], [], []
pred_t2, pred_Q, pred_combined = [], [], []
    

for item in pca.transform(test_transformed):
    index = item.T @ D @ item
    index2 = item.T @ C @ item
    index3 = item.T @ fi @ item
    T2.append(index)
    Q.append(index2)
    combined.append(index3)
    if index > t2_threshold:
        pred_t2.append('Unhealthy')
    else:
        pred_t2.append('Healthy')
    
    if index2 > Q_alpha:
        pred_Q.append('Unhealthy')
    else:
        pred_Q.append('Healthy')
        
    if index3 > threshold_combined:
        pred_combined.append('Unhealthy')
    else:
        pred_combined.append('Healthy') 
        
results = pd.DataFrame(zip(test_transformed.index, T2, pred_t2, Q, pred_Q, combined, pred_combined), columns = ['SampleID', 'T2', 'Prediction T2', 'Q', 'Prediction Q', 'Combined Index', 'Combined Prediction'])

In [29]:
balanced_accuracy_score(results['Combined Prediction'], list(y_test))

0.6414634146341464

In [341]:
unhealthy_tax = pd.read_csv('../../DataSets/INDEX/hiPCA/ANCOM-BC_pvalue 0.005.txt', sep="\t")

In [342]:
healthy_features, non_healthy_features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected = hiPCA3(X_train.T, healthy, non_healthy, df_ks = X_train.T, ks = False, method = 'asymp', only_nonhealthy_features = True, non_healthy_features=list(unhealthy_tax['Species']))

# Principal Components selected: 21
0.95 21
32.670573340917315
32.670573340917315 6.708659174942156 1.755545616723695


In [343]:
test_transformed = transform_data(X_test.T, [], list(unhealthy_tax['Species']))

In [344]:
T2, Q, combined = [], [], []
pred_t2, pred_Q, pred_combined = [], [], []
    

for item in pca.transform(test_transformed):
    index = item.T @ D @ item
    index2 = item.T @ C @ item
    index3 = item.T @ fi @ item
    T2.append(index)
    Q.append(index2)
    combined.append(index3)
    if index > t2_threshold:
        pred_t2.append('Unhealthy')
    else:
        pred_t2.append('Healthy')
    
    if index2 > Q_alpha:
        pred_Q.append('Unhealthy')
    else:
        pred_Q.append('Healthy')
        
    if index3 > threshold_combined:
        pred_combined.append('Unhealthy')
    else:
        pred_combined.append('Healthy') 
        
results = pd.DataFrame(zip(test_transformed.index, T2, pred_t2, Q, pred_Q, combined, pred_combined), columns = ['SampleID', 'T2', 'Prediction T2', 'Q', 'Prediction Q', 'Combined Index', 'Combined Prediction'])

In [345]:
balanced_accuracy_score(results['Combined Prediction'], list(y_test))

0.4613526570048309

In [346]:
balanced_accuracy_score(results['Prediction T2'], list(y_test))

0.5747377622377623

In [347]:
balanced_accuracy_score(results['Prediction Q'], list(y_test))

0.36211340206185566

In [303]:
results['true'] = y_test

In [307]:
results[results['Combined Prediction'] == 'Healthy']

Unnamed: 0,SampleID,T2,Prediction T2,Q,Prediction Q,Combined Index,Combined Prediction,true
82,SRR5946874,17.392177,Healthy,9.280078,Unhealthy,1.644097,Healthy,Healthy


In [73]:
un2 = ['Bacteroides_fragilis',
 'Clostridium_symbiosum',
 'Enterocloster_aldenensis',
 'Enterocloster_bolteae',
 'Enterocloster_clostridioformis',
 'Erysipelatoclostridium_ramosum',
 'Escherichia_coli',
 'Flavonifractor_plautii',
 'Hungatella_hathewayi',
 'Ruminococcus_gnavus',
 'Veillonella_parvula']

In [74]:
healthy_features, non_healthy_features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected = hiPCA3(X_train.T, healthy, non_healthy, df_ks = X_train.T, ks = False, method = 'auto', only_nonhealthy_features = True, non_healthy_features=un2)

# Principal Components selected: 7
0.95 7
14.067140449340167
14.067140449340167 2.8656083142109576 1.655722273093897


In [75]:
test_transformed = transform_data(X_test.T, [], un2)

In [76]:
T2, Q, combined = [], [], []
pred_t2, pred_Q, pred_combined = [], [], []
    

for item in pca.transform(test_transformed):
    index = item.T @ D @ item
    index2 = item.T @ C @ item
    index3 = item.T @ fi @ item
    T2.append(index)
    Q.append(index2)
    combined.append(index3)
    if index > t2_threshold:
        pred_t2.append('Unhealthy')
    else:
        pred_t2.append('Healthy')
    
    if index2 > Q_alpha:
        pred_Q.append('Unhealthy')
    else:
        pred_Q.append('Healthy')
        
    if index3 > threshold_combined:
        pred_combined.append('Unhealthy')
    else:
        pred_combined.append('Healthy') 
        
results = pd.DataFrame(zip(test_transformed.index, T2, pred_t2, Q, pred_Q, combined, pred_combined), columns = ['SampleID', 'T2', 'Prediction T2', 'Q', 'Prediction Q', 'Combined Index', 'Combined Prediction'])

In [77]:
balanced_accuracy_score(results['Combined Prediction'], list(y_test))

0.7255412991178829

In [169]:
results['TRUE'] = list(y_test)

In [170]:
results.head(50)

Unnamed: 0,SampleID,T2,Prediction T2,Q,Prediction Q,Combined Index,Combined Prediction,TRUE
0,SRR5935802,23.28622,Unhealthy,15.95688,Unhealthy,5.072776,Unhealthy,Unhealthy
1,SRR5947102,21.507794,Unhealthy,5.914778,Unhealthy,2.583687,Unhealthy,Unhealthy
2,SRR5946673,36.299243,Unhealthy,18.928025,Unhealthy,6.491298,Unhealthy,Unhealthy
3,ERR209749,42.888788,Unhealthy,9.599707,Unhealthy,4.629318,Unhealthy,Unhealthy
4,SRR5935984,9.419097,Healthy,3.352874,Healthy,1.313133,Healthy,Healthy
5,SRR5946630,12.74359,Healthy,8.628717,Unhealthy,2.751391,Unhealthy,Unhealthy
6,ERR210591,12.957113,Healthy,6.588999,Unhealthy,2.277211,Unhealthy,Unhealthy
7,ERR209748,42.515783,Unhealthy,9.381326,Unhealthy,4.556926,Unhealthy,Unhealthy
8,ERR209746,13.201642,Healthy,3.950944,Healthy,1.662205,Healthy,Unhealthy
9,SRR5946772,9.782545,Healthy,8.328916,Unhealthy,2.518237,Unhealthy,Healthy


In [104]:
T2

[8.78694456157465,
 5.724143766057815,
 19.17719470489543,
 7.159069222502472,
 9.455144743808866,
 10.977173511651811,
 23.659226745766325,
 50.826464009732824,
 7.559861860432868,
 6.292424681383243,
 6.251766888621732,
 17.774537953769013,
 6.152618930107254,
 4.4871676444167115,
 21.716556714040834,
 3.6410136179915265,
 7.322730700390331,
 13.480883139621954,
 16.267657323631084,
 3.8352507750569393,
 4.829976058631994,
 36.575773653392545,
 4.041692690985719,
 3.465308929046774,
 9.206031693701282,
 2.8535279354397574,
 8.144443754481168,
 14.868969942515605,
 2.7010445979357267,
 7.056319105495712,
 2.4095489324344332,
 3.75938044514283,
 6.829728307567452,
 4.245911274475965,
 8.129956109366857,
 7.67202668569921,
 6.166378000784345,
 6.615705598372675,
 29.41203301575717,
 45.50430271422011,
 1.7722906494189137,
 1.9140410030117698,
 5.157944889477502,
 12.40263254715239,
 10.691575930171869,
 7.4778121061123075,
 7.644091028044516,
 44.068213497172074,
 8.48851667886725,
 8.7

In [93]:
y_train

ERR209298     Unhealthy
SRR5983470    Unhealthy
SRR5935805    Unhealthy
SRR5936014      Healthy
SRR5947008    Unhealthy
                ...    
SRR5936186    Unhealthy
SRR5946723      Healthy
SRR5935915      Healthy
ERR210536     Unhealthy
ERR210220       Healthy
Name: Label, Length: 422, dtype: object

In [62]:
pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected = hiPCA3(taxonomy, healthy, non_healthy, df_ks = taxonomy_aux, ks = True, method = 'asymp', only_nonhealthy_features = True)

# Healthy features selected by KS: 140
# Unheatlhy features selected by KS: 22
# Principal Components selected: 13
0.95 13
22.362032494826934
22.362032494826934 4.958097208913289 1.708343213455037


In [39]:
original_hiPCA = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']], ['Unhealthy' if x > 3.8 else 'Healthy' for x in metadata['hiPCA']])

In [40]:
ks_hiPCA = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results['Prediction Combined'])

In [41]:
t2_hiPCA = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results['Prediction T2'])

In [42]:
q_hiPCA = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results['Prediction Q'])

In [43]:
print(f'Given hiPCA: {original_hiPCA}')
print(f'KS - hiPCA (T^2) : {t2_hiPCA}')
print(f'KS - hiPCA (Q) : {q_hiPCA}')
print(f'KS - hiPCA (Combined): {ks_hiPCA}')

Given hiPCA: 0.6726198083067092
KS - hiPCA (T^2) : 0.7310330138445155
KS - hiPCA (Q) : 0.6390521831735889
KS - hiPCA (Combined): 0.6837859424920127


>Note that we did not have the threshold for the given hiPCA to classify samples, we arbitrarly chose the threshold that gave best result to make a fair comparison

Now we will use this model to try to predict the unhealthy samples

In [47]:
taxonomy_covid = pd.read_csv('../../DataSets/COVID/CAMDA_taxa.txt', sep = '\t', index_col = 0)

In [29]:
taxonomy_covid.T

SampleID,Klebsiella_pneumoniae,Leuconostoc_lactis,Enterococcus_faecium,Veillonella_parvula,Escherichia_coli,Enterococcus_faecalis,Bifidobacterium_animalis,Clostridium_SGB6177,Clostridioides_difficile,Enterococcus_raffinosus,...,GGB4591_SGB6350,Lactobacillus_acidophilus,GGB4605_SGB6376,GGB27106_SGB6188,Methanomassiliicoccaceae_unclassified_SGB380,GGB4641_SGB6420,Candidatus_Neoclostridium_roslinense,GGB9608_SGB15041,GGB4661_SGB6449,GGB3169_SGB4183
Sample1a,0.0,0.0,0.0000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Sample1b,0.0,0.0,4.1925,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Sample5a,0.0,0.0,0.0000,0.0,2.43983,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Sample5b,0.0,0.0,0.0000,0.0,3.21447,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Sample6a,0.0,0.0,0.0000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Sample95b,0.0,0.0,0.0000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Sample96b,0.0,0.0,0.0000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Sample96a,0.0,0.0,0.0000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Sample98a,0.0,0.0,0.0000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.06305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
[x for x in list(taxonomy_covid[['Sample73a', 'Sample63a']][taxonomy_covid['Sample73a'] != taxonomy_covid['Sample63a']].index) if x in non_healthy_features]

[]

In [48]:
test_transformed = transform_data(taxonomy_covid, [], non_healthy_features)

In [59]:
test_transformed.T[['Sample73a', 'Sample63a']]

Unnamed: 0,Sample73a,Sample63a
Hungatella_hathewayi,-0.3324845,-0.3324845
Ruminococcus_gnavus,-0.4305115,-0.4305115
Blautia_producta,7.105427e-15,7.105427e-15
Faecalimonas_umbilicata,-0.209325,-0.209325
Bacteroides_fragilis,-0.6838882,-0.6838882
Blautia_caecimuris,-0.1712313,-0.1712313
Flavonifractor_plautii,-0.1714876,-0.1714876
Clostridium_butyricum,-0.2108289,-0.2108289
Escherichia_coli,-0.5865085,-0.5865085
Klebsiella_pneumoniae,-0.2456241,-0.2456241


In [50]:
T2, Q, combined = [], [], []
pred_t2, pred_Q, pred_combined = [], [], []
    

for item in pca.transform(test_transformed):
    index = item.T @ D @ item
    index2 = item.T @ C @ item
    index3 = item.T @ fi @ item
    T2.append(index)
    Q.append(index2)
    combined.append(index3)
    if index > t2_threshold:
        pred_t2.append('Unhealthy')
    else:
        pred_t2.append('Healthy')
    
    if index2 > Q_alpha:
        pred_Q.append('Unhealthy')
    else:
        pred_Q.append('Healthy')
        
    if index3 > threshold_combined:
        pred_combined.append('Unhealthy')
    else:
        pred_combined.append('Healthy') 
        
results = pd.DataFrame(zip(test_transformed.index, T2, pred_t2, Q, pred_Q, combined, pred_combined), columns = ['SampleID', 'T2', 'Prediction T2', 'Q', 'Prediction Q', 'Combined Index', 'Combined Prediction'])

In [45]:
threshold_combined

1.6919901191461653

In [53]:
results.to_csv('results_covid.csv', index = False)

In [45]:
_, _, results_covid_taxonomy = hiPCA(taxonomy_covid, [], [], non_healthy_features = non_healthy_features)

ValueError: Found array with 0 sample(s) (shape=(0, 20)) while a minimum of 1 is required by PCA.

In [None]:
results_covid_taxonomy.to_csv('../../output/hiPCA/kshiPCA_covid_taxonomy.csv', index = False)

### Experiment 2

In [23]:
unhealthy_tax = pd.read_csv('../../DataSets/INDEX/hiPCA/zhu_ks92_unhealthy_species.txt', sep="\t")

In [265]:
healthy_features, non_healthy_features, results = hiPCA(taxonomy, healthy, non_healthy, non_healthy_features = list(unhealthy_tax['species']))

# Principal Components selected: 23
1.7667854515701797


In [266]:
hiPCA_zhutax = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results['Prediction Combined'])
hiPCA_zhutax_t2 = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results['Prediction T2'])
hiPCA_zhutax_q = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results['Prediction Q'])

In [267]:
print(f'Given hiPCA: {original_hiPCA}')
print(f'hiPCA with Zhu taxa (T^2): {hiPCA_zhutax_t2}')
print(f'hiPCA with Zhu taxa (Q): {hiPCA_zhutax_q}')
print(f'hiPCA with Zhu taxa (Combined): {hiPCA_zhutax}')

Given hiPCA: 0.6726198083067092
hiPCA with Zhu taxa (T^2): 0.6213152289669861
hiPCA with Zhu taxa (Q): 0.5809850905218318
hiPCA with Zhu taxa (Combined): 0.6632268370607028


In [51]:
_, _, results_covid_taxonomy = hiPCA(taxonomy_covid, [], [], non_healthy_features = list(unhealthy_tax['species']) )

ValueError: Found array with 0 sample(s) (shape=(0, 42)) while a minimum of 1 is required by PCA.

In [None]:
results_covid_taxonomy.to_csv('../../output/hiPCA/hiPCAzhutax_covid.csv', index = False)

### Experiment 3

We tried to fit differentially expressed genes with FDR 0.05

In [306]:
unhealthy_tax = pd.read_csv('../../DataSets/INDEX/hiPCA/differential_expression1.txt', sep="\t")

In [307]:
healthy_features, non_healthy_features, results = hiPCA(taxonomy, healthy, non_healthy, non_healthy_features = list(unhealthy_tax['species']))

# Principal Components selected: 10
1.689442624423493


In [308]:
hiPCA_zhutax = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results['Prediction Combined'])
hiPCA_zhutax_t2 = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results['Prediction T2'])
hiPCA_zhutax_q = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results['Prediction Q'])

In [309]:
print(f'Given hiPCA: {original_hiPCA}')
print(f'hiPCA with Zhu taxa (T^2): {hiPCA_zhutax_t2}')
print(f'hiPCA with Zhu taxa (Q): {hiPCA_zhutax_q}')
print(f'hiPCA with Zhu taxa (Combined): {hiPCA_zhutax}')

Given hiPCA: 0.6726198083067092
hiPCA with Zhu taxa (T^2): 0.7381842385516506
hiPCA with Zhu taxa (Q): 0.7076091586794462
hiPCA with Zhu taxa (Combined): 0.7345580404685836


### Experiment 4

In [310]:
unhealthy_tax = pd.read_csv('../../DataSets/INDEX/hiPCA/differential_expression2.txt', sep="\t")

In [311]:
healthy_features, non_healthy_features, results = hiPCA(taxonomy, healthy, non_healthy, non_healthy_features = list(unhealthy_tax['species']))

# Principal Components selected: 9
1.679500808042783


In [312]:
hiPCA_zhutax = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results['Prediction Combined'])
hiPCA_zhutax_t2 = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results['Prediction T2'])
hiPCA_zhutax_q = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results['Prediction Q'])

In [313]:
print(f'Given hiPCA: {original_hiPCA}')
print(f'hiPCA with Zhu taxa (T^2): {hiPCA_zhutax_t2}')
print(f'hiPCA with Zhu taxa (Q): {hiPCA_zhutax_q}')
print(f'hiPCA with Zhu taxa (Combined): {hiPCA_zhutax}')

Given hiPCA: 0.6726198083067092
hiPCA with Zhu taxa (T^2): 0.692209797657082
hiPCA with Zhu taxa (Q): 0.725457933972311
hiPCA with Zhu taxa (Combined): 0.7264323748668797


### Experiment 5

In [60]:
unhealthy_tax = pd.read_csv('../../DataSets/INDEX/hiPCA/consensus_ks_differential_expression.txt', sep="\t")

In [61]:
healthy_features, non_healthy_features, results = hiPCA(taxonomy, healthy, non_healthy, non_healthy_features = list(unhealthy_tax['species']))

# Principal Components selected: 3


In [62]:
hiPCA_zhutax = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results['Prediction Combined'])
hiPCA_zhutax_t2 = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results['Prediction T2'])
hiPCA_zhutax_q = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results['Prediction Q'])

In [63]:
print(f'Given hiPCA: {original_hiPCA}')
print(f'hiPCA with Zhu taxa (T^2): {hiPCA_zhutax_t2}')
print(f'hiPCA with Zhu taxa (Q): {hiPCA_zhutax_q}')
print(f'hiPCA with Zhu taxa (Combined): {hiPCA_zhutax}')

Given hiPCA: 0.6726198083067092
hiPCA with Zhu taxa (T^2): 0.6928434504792332
hiPCA with Zhu taxa (Q): 0.6681096911608093
hiPCA with Zhu taxa (Combined): 0.6958200212992545


### Experiment 6

In [64]:
new_taxa = pd.read_csv('../../DataSets/INDEX/hiPCA/tax_hmp2.csv', sep = '\t', index_col = 0)

In [65]:
unhealthy_tax = pd.read_csv('../../DataSets/INDEX/hiPCA/ks_camda_species.txt', sep="\t")

In [66]:
unhealthy_tax = pd.read_csv('../../DataSets/INDEX/hiPCA/zhu_ks92_unhealthy_species.txt', sep="\t")

In [68]:
healthy_features, non_healthy_features, results = hiPCA(new_taxa, [], new_taxa.columns, non_healthy_features = list(unhealthy_tax['species']), only_nonhealthy_features = True)

ValueError: Found array with 0 sample(s) (shape=(0, 63)) while a minimum of 1 is required by PCA.

In [None]:
results[results['Prediction Combined'] == 'Unhealthy']

In [None]:
hiPCA_zhutax = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results['Prediction Combined'])
hiPCA_zhutax_t2 = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results['Prediction T2'])
hiPCA_zhutax_q = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results['Prediction Q'])

### Experiment 3

Now we tried to fit pathways data to the KS - hiPCA

In [348]:
pathways = pd.read_csv('../../DataSets/CAMDA/pathways.txt', sep = '\t', index_col = 0)

In [349]:
pathways = pathways.T[[x for x in pathways.index if 'UNINTEGRATED' not in x]]
pathways = pathways.T

In [350]:
good_samples = []
for c in taxonomy.columns:
    if sum(taxonomy[c]) > 90:
        good_samples.append(c)
pathways_aux = pathways[good_samples]

In [351]:
ob = list(metadata[metadata['Diagnosis'] == 'Obese']['SampleID'])

In [352]:
healthy = list(metadata[metadata['Diagnosis'] == 'Healthy']['SampleID'])
non_healthy = list(metadata[metadata['Diagnosis'] != 'Healthy']['SampleID'])
non_healthy = [x for x in non_healthy if x not in ob]

In [353]:
pathways_healthy = pathways[[x for x in pathways_aux.columns if x in healthy]].T
pathways_nonhealthy = pathways[[x for x in pathways_aux.columns if x in non_healthy]].T

In [354]:
label = []
for item in pathways_aux.columns:
    if metadata[metadata['SampleID'] == item]['Diagnosis'].iloc[0] == 'Healthy':
        label.append('Healthy')
    else:
        label.append('Unhealthy')

In [356]:
X_train, X_test, y_train, y_test = train_test_split(pathways_aux.T, label, test_size=0.20, random_state=20, stratify = label)

In [363]:
healthy_features, non_healthy_features, pca, pca_data, D, t2_threshold, C, Q_alpha, fi, threshold_combined, selected = hiPCA3(X_train.T, healthy, non_healthy, df_ks = X_train.T, ks = True, only_nonhealthy_features = False, method = 'asymp', p_val = 0.001)

# Healthy features selected by KS: 1016
# Unheatlhy features selected by KS: 659
# Principal Components selected: 26
0.95 26
38.885138659830055
38.885138659830055 118.03589503853671 2.78015192791155


In [358]:
test_transformed = transform_data(X_test.T, [], non_healthy_features)

In [364]:
test_transformed = transform_data(X_test.T, healthy_features, non_healthy_features)

In [365]:
T2, Q, combined = [], [], []
pred_t2, pred_Q, pred_combined = [], [], []
    

for item in np.array(test_transformed):
    # print(item)
    index = item.T @ D @ item
    index2 = item.T @ C @ item
    index3 = item.T @ fi @ item
    T2.append(index)
    Q.append(index2)
    combined.append(index3)
    if index > t2_threshold:
        pred_t2.append('Unhealthy')
    else:
        pred_t2.append('Healthy')
    
    if index2 > Q_alpha:
        pred_Q.append('Unhealthy')
    else:
        pred_Q.append('Healthy')
        
    if index3 > threshold_combined:
        pred_combined.append('Unhealthy')
    else:
        pred_combined.append('Healthy') 
        
results = pd.DataFrame(zip(test_transformed.index, T2, pred_t2, Q, pred_Q, combined, pred_combined), columns = ['SampleID', 'T2', 'Prediction T2', 'Q', 'Prediction Q', 'Combined Index', 'Combined Prediction'])

In [366]:
balanced_accuracy_score(results['Combined Prediction'], list(y_test))

0.7357549857549857

In [367]:
balanced_accuracy_score(results['Prediction T2'], list(y_test))

0.6480562448304383

In [368]:
balanced_accuracy_score(results['Prediction Q'], list(y_test))

0.6670020120724346

In [297]:
ks_hiPCA_path = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results_pathways['Prediction Combined'])
ks_hiPCA_patht2 = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results_pathways['Prediction T2'])
ks_hiPCA_pathq = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results_pathways['Prediction Q'])

In [298]:
print(f'Given hiPCA: {original_hiPCA}')
print(f'hiPCA with pathways (T^2): {ks_hiPCA_patht2}')
print(f'hiPCA with pathways (Q): {ks_hiPCA_pathq}')
print(f'hiPCA with pathways (Combined): {ks_hiPCA_path}')

Given hiPCA: 0.6726198083067092
hiPCA with pathways (T^2): 0.7018690095846645
hiPCA with pathways (Q): 0.5
hiPCA with pathways (Combined): 0.5


In [277]:
pathways_covid = pd.read_csv('../../DataSets/COVID/CAMDA_pathways.txt', sep = '\t', index_col = 0)

In [278]:
_, _, results_hotelling_covid, results_q_covid, results_hiPCA_covid = hiPCA(pathways_covid, [], [], healthy_features = healthy_features, non_healthy_features = non_healthy_features)

ValueError: at least one array or dtype is required

### Experiment

In [340]:
unhealthy_tax = pd.read_csv('../../DataSets/INDEX/hiPCA/differential_expression1.txt', sep="\t")

In [341]:
healthy_features, non_healthy_features, results = hiPCA(taxonomy, healthy, non_healthy, non_healthy_features = list(unhealthy_tax['species']))

# Principal Components selected: 10
1.689442624423493


In [342]:
hiPCA_zhutax = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results['Prediction Combined'])
hiPCA_zhutax_t2 = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results['Prediction T2'])
hiPCA_zhutax_q = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in metadata[metadata['SampleID'].isin(taxonomy.columns)]['Diagnosis']],results['Prediction Q'])

In [343]:
print(f'Given hiPCA: {original_hiPCA}')
print(f'hiPCA with Zhu taxa (T^2): {hiPCA_zhutax_t2}')
print(f'hiPCA with Zhu taxa (Q): {hiPCA_zhutax_q}')
print(f'hiPCA with Zhu taxa (Combined): {hiPCA_zhutax}')

Given hiPCA: 0.6726198083067092
hiPCA with Zhu taxa (T^2): 0.7381842385516506
hiPCA with Zhu taxa (Q): 0.7076091586794462
hiPCA with Zhu taxa (Combined): 0.7345580404685836


In [344]:
unhealthy_functions = pd.read_csv('../../DataSets/INDEX/hiPCA/Function_UnHealthy_CD&OB&UC.txt', sep="\t")

In [369]:
pathways = pd.read_csv('../../DataSets/CAMDA/pathways.txt', sep = '\t', index_col = 0)

In [346]:
# pathways = pathways.T[list(unhealthy_functions['UnHealthy_CD&OB&UC'])]

In [382]:
X_train.index

Index(['SRR5946790', 'SRR5936223', 'SRR5946723', 'SRR5983412', 'SRR5946617',
       'ERR209226', 'SRR5946708', 'SRR5947008', 'SRR5946861', 'ERR209782',
       ...
       'ERR209746', 'SRR5946684', 'ERR209440', 'SRR5936014', 'SRR5983333',
       'SRR5936173', 'SRR5946989', 'SRR5947057', 'SRR5947000', 'ERR209287'],
      dtype='object', length=422)

In [409]:
pathways_train = pathways[list(X_train.index)]

In [410]:
pathways_train = pathways_train.T

In [411]:
pathways_train

# Pathway,UNINTEGRATED|g__Absiella.s__Absiella_dolichum,UNINTEGRATED|g__Acetobacter.s__Acetobacter_sp_CAG_267,UNINTEGRATED|g__Acetobacter.s__Acetobacter_sp_CAG_977,UNINTEGRATED|g__Acholeplasma.s__Acholeplasma_sp_CAG_878,UNINTEGRATED|g__Acidaminococcus.s__Acidaminococcus_intestini,UNINTEGRATED|g__Acinetobacter.s__Acinetobacter_idrijaensis,UNINTEGRATED|g__Acinetobacter.s__Acinetobacter_lwoffii,UNINTEGRATED|g__Actinomyces.s__Actinomyces_sp_HMSC035G02,UNINTEGRATED|g__Actinomyces.s__Actinomyces_sp_HPA0247,UNINTEGRATED|g__Actinomyces.s__Actinomyces_sp_ICM47,...,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_atypica,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_denticariosi,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_dispar,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_infantium,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_parvula,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_rogosae,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_seminalis,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_tobetsuensis,VALSYN-PWY: L-valine biosynthesis|g__Victivallales_unclassified.s__Victivallales_bacterium_CCUG_44730,VALSYN-PWY: L-valine biosynthesis|g__Victivallis.s__Victivallis_vadensis
SRR5946702,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
SRR5935764,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
SRR5936213,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
SRR5946708,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
SRR5936199,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR5946821,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000101,0.0,0.0,0.0,0.0,0.0
SRR5935889,0.0,0.0,0.0,0.0,0.002044,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
SRR5946660,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
SRR5947101,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [190]:
# labels_pathways = []
# for item in pathways.index:
#     if metadata[metadata['SampleID'] == item]['Diagnosis'].iloc[0] == 'Healthy':
#         labels_pathways.append('Healthy')
#     else:
#         labels_pathways.append('Unhealthy')

In [375]:
pathways_train

# Pathway,UNINTEGRATED|g__Absiella.s__Absiella_dolichum,UNINTEGRATED|g__Acetobacter.s__Acetobacter_sp_CAG_267,UNINTEGRATED|g__Acetobacter.s__Acetobacter_sp_CAG_977,UNINTEGRATED|g__Acholeplasma.s__Acholeplasma_sp_CAG_878,UNINTEGRATED|g__Acidaminococcus.s__Acidaminococcus_intestini,UNINTEGRATED|g__Acinetobacter.s__Acinetobacter_idrijaensis,UNINTEGRATED|g__Acinetobacter.s__Acinetobacter_lwoffii,UNINTEGRATED|g__Actinomyces.s__Actinomyces_sp_HMSC035G02,UNINTEGRATED|g__Actinomyces.s__Actinomyces_sp_HPA0247,UNINTEGRATED|g__Actinomyces.s__Actinomyces_sp_ICM47,...,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_atypica,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_denticariosi,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_dispar,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_infantium,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_parvula,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_rogosae,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_seminalis,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_tobetsuensis,VALSYN-PWY: L-valine biosynthesis|g__Victivallales_unclassified.s__Victivallales_bacterium_CCUG_44730,VALSYN-PWY: L-valine biosynthesis|g__Victivallis.s__Victivallis_vadensis
SRR5946790,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR5936223,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR5946723,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR5983412,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR5946617,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR5936173,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR5946989,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR5947057,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR5947000,0.0,0.0,0.0,0.0,0.00176,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [377]:
len(X_train)

422

In [376]:
# len(y_train), len(labels_pathways)

(422, 401)

In [263]:
des = pathways_train.describe()

In [266]:
des.T.sort_values('std', ascending = False)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
# Pathway,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
UNINTEGRATED|g__Bacteroides.s__Bacteroides_vulgatus,401.0,0.098312,0.129089,0.0,0.0,0.039231,0.153935,0.978464
UNINTEGRATED|g__Bacteroides.s__Bacteroides_fragilis,401.0,0.032614,0.075339,0.0,0.0,0.000000,0.026146,0.492118
UNINTEGRATED|g__Escherichia.s__Escherichia_coli,401.0,0.023515,0.071707,0.0,0.0,0.000000,0.003824,0.578823
UNINTEGRATED|g__Bacteroides.s__Bacteroides_uniformis,401.0,0.057475,0.067895,0.0,0.0,0.040042,0.091119,0.634088
UNINTEGRATED|g__Bacteroides.s__Bacteroides_dorei,401.0,0.027658,0.064966,0.0,0.0,0.000000,0.021344,0.482179
...,...,...,...,...,...,...,...,...
PWY-6385: peptidoglycan biosynthesis III (mycobacteria)|g__Gordonibacter.s__Gordonibacter_pamelaeae,401.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000
PWY-6385: peptidoglycan biosynthesis III (mycobacteria)|g__Fusobacterium.s__Fusobacterium_ulcerans,401.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000
PWY-6385: peptidoglycan biosynthesis III (mycobacteria)|g__Fusobacterium.s__Fusobacterium_sp_CAG_439,401.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000
PWY-6385: peptidoglycan biosynthesis III (mycobacteria)|g__Fusobacterium.s__Fusobacterium_mortiferum,401.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000


In [259]:
from sklearn.feature_selection import mutual_info_classif

In [260]:
info = mutual_info_classif(pathways_train, y_train)


KeyboardInterrupt



In [388]:
from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split

In [194]:
# X_train, X_test, y_train, y_test = train_test_split(pathways, label, test_size=0.20, stratify = label, random_state=21)

In [412]:
clf = RandomForestClassifier(max_depth=10, min_samples_leaf=3, random_state=0)

In [413]:
clf.fit(pathways_train, y_train)

In [414]:
from sklearn.metrics import balanced_accuracy_score
pred = clf.predict(pathways[list(X_test.index)].T)
balanced_accuracy_score(y_test, pred)

0.8504709576138147

In [415]:
selected_pathways = list(pd.DataFrame(zip(pathways_train.columns, clf.feature_importances_)).sort_values(1, ascending = False)[:39][0])

In [199]:
# results['Combined']

In [229]:
selected_pathways

['PWY-5667: CDP-diacylglycerol biosynthesis I|g__Lachnospiraceae_unclassified.s__Eubacterium_rectale',
 "PWY-6123: inosine-5'-phosphate biosynthesis I|g__Alistipes.s__Alistipes_putredinis",
 'PWY-7977: L-methionine biosynthesis IV|g__Alistipes.s__Alistipes_putredinis',
 'DTDPRHAMSYN-PWY: dTDP-&beta;-L-rhamnose biosynthesis|g__Alistipes.s__Alistipes_putredinis',
 'NONMEVIPP-PWY: methylerythritol phosphate pathway I|g__Alistipes.s__Alistipes_putredinis',
 'PWY-6609: adenine and adenosine salvage III|g__Alistipes.s__Alistipes_putredinis',
 'UNINTEGRATED|g__Alistipes.s__Alistipes_onderdonkii',
 'PWY-3841: folate transformations II (plants)|g__Alistipes.s__Alistipes_putredinis',
 'UNINTEGRATED|g__Hungatella.s__Hungatella_hathewayi',
 'PWY-7851: coenzyme A biosynthesis II (eukaryotic)|g__Alistipes.s__Alistipes_putredinis',
 'PWY-7242: D-fructuronate degradation|g__Faecalibacterium.s__Faecalibacterium_prausnitzii',
 'UNINTEGRATED|g__Faecalibacterium.s__Faecalibacterium_prausnitzii',
 'PWY-612

In [416]:
selected = pathways_train[selected_pathways]

In [417]:
selected['hiPCA'] = list(results_train['Combined Index'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected['hiPCA'] = list(results_train['Combined Index'])


In [418]:
selected['hiPCA'] = selected['hiPCA'] - threshold_combined

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected['hiPCA'] = selected['hiPCA'] - threshold_combined


In [419]:
selected

# Pathway,GALACTUROCAT-PWY: D-galacturonate degradation I|g__Faecalibacterium.s__Faecalibacterium_prausnitzii,PWY-7234: inosine-5'-phosphate biosynthesis III|g__Alistipes.s__Alistipes_putredinis,DTDPRHAMSYN-PWY: dTDP-&beta;-L-rhamnose biosynthesis|g__Alistipes.s__Alistipes_putredinis,PWY-5103: L-isoleucine biosynthesis III|g__Lachnospiraceae_unclassified.s__Eubacterium_rectale,PWY-5667: CDP-diacylglycerol biosynthesis I|g__Lachnospiraceae_unclassified.s__Eubacterium_rectale,PWY-5030: L-histidine degradation III|g__Alistipes.s__Alistipes_putredinis,TRNA-CHARGING-PWY: tRNA charging|g__Alistipes.s__Alistipes_putredinis,PWY-1042: glycolysis IV|g__Lachnospiraceae_unclassified.s__Eubacterium_rectale,PWY-7977: L-methionine biosynthesis IV|g__Alistipes.s__Alistipes_putredinis,PWY-6121: 5-aminoimidazole ribonucleotide biosynthesis I|g__Alistipes.s__Alistipes_putredinis,...,DTDPRHAMSYN-PWY: dTDP-&beta;-L-rhamnose biosynthesis|g__Faecalibacterium.s__Faecalibacterium_prausnitzii,PWY-5686: UMP biosynthesis I|g__Alistipes.s__Alistipes_putredinis,PWY-8178: pentose phosphate pathway (non-oxidative branch) II|g__Bacteroides.s__Bacteroides_fragilis,PWY-7790: UMP biosynthesis II|g__Alistipes.s__Alistipes_putredinis,PANTOSYN-PWY: superpathway of coenzyme A biosynthesis I (bacteria)|g__Alistipes.s__Alistipes_putredinis,PWY-6277: superpathway of 5-aminoimidazole ribonucleotide biosynthesis|g__Alistipes.s__Alistipes_finegoldii,PWY-7953: UDP-N-acetylmuramoyl-pentapeptide biosynthesis III (meso-diaminopimelate containing)|g__Alistipes.s__Alistipes_putredinis,NONMEVIPP-PWY: methylerythritol phosphate pathway I|g__Alistipes.s__Alistipes_putredinis,PWY-6151: S-adenosyl-L-methionine salvage I|g__Bacteroides.s__Bacteroides_fragilis,hiPCA
SRR5946702,0.000027,0.000019,0.000022,0.000007,0.000005,0.000018,0.000021,0.000006,0.000017,0.000018,...,0.000031,0.000020,0.000032,0.000017,0.000015,0.000000,0.000018,0.000019,0.000024,-0.707453
SRR5935764,0.000127,0.000000,0.000000,0.000015,0.000016,0.000000,0.000000,0.000014,0.000000,0.000000,...,0.000134,0.000000,0.000000,0.000000,0.000000,0.000013,0.000000,0.000000,0.000000,-0.982715
SRR5936213,0.000090,0.000042,0.000039,0.000000,0.000000,0.000042,0.000041,0.000000,0.000037,0.000039,...,0.000116,0.000045,0.000000,0.000038,0.000040,0.000028,0.000037,0.000034,0.000000,-0.846744
SRR5946708,0.000076,0.000015,0.000014,0.000017,0.000009,0.000014,0.000016,0.000017,0.000012,0.000015,...,0.000095,0.000017,0.000000,0.000016,0.000015,0.000004,0.000014,0.000015,0.000000,-1.029230
SRR5936199,0.000057,0.000000,0.000000,0.000010,0.000008,0.000000,0.000000,0.000011,0.000000,0.000000,...,0.000089,0.000000,0.000167,0.000000,0.000000,0.000000,0.000000,0.000000,0.000157,2.170641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR5946821,0.000000,0.000000,0.000000,0.000015,0.000008,0.000000,0.000000,0.000011,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.133573
SRR5935889,0.000015,0.000014,0.000010,0.000011,0.000009,0.000009,0.000012,0.000012,0.000010,0.000013,...,0.000029,0.000013,0.000008,0.000012,0.000011,0.000019,0.000011,0.000010,0.000007,-0.843628
SRR5946660,0.000012,0.000022,0.000022,0.000007,0.000006,0.000022,0.000025,0.000006,0.000021,0.000025,...,0.000022,0.000023,0.000000,0.000022,0.000022,0.000038,0.000020,0.000022,0.000000,-0.729817
SRR5947101,0.000006,0.000000,0.000000,0.000005,0.000004,0.000000,0.000000,0.000004,0.000000,0.000000,...,0.000007,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-1.054916


In [420]:
pathways_test = pathways.T[selected_pathways]
pathways_test = pathways_test.T[X_test.index].T
pathways_test['hiPCA'] = list(results['Combined Index'])
pathways_test['hiPCA'] = pathways_test['hiPCA'] - threshold_combined

In [421]:
pathways_test.head()

# Pathway,GALACTUROCAT-PWY: D-galacturonate degradation I|g__Faecalibacterium.s__Faecalibacterium_prausnitzii,PWY-7234: inosine-5'-phosphate biosynthesis III|g__Alistipes.s__Alistipes_putredinis,DTDPRHAMSYN-PWY: dTDP-&beta;-L-rhamnose biosynthesis|g__Alistipes.s__Alistipes_putredinis,PWY-5103: L-isoleucine biosynthesis III|g__Lachnospiraceae_unclassified.s__Eubacterium_rectale,PWY-5667: CDP-diacylglycerol biosynthesis I|g__Lachnospiraceae_unclassified.s__Eubacterium_rectale,PWY-5030: L-histidine degradation III|g__Alistipes.s__Alistipes_putredinis,TRNA-CHARGING-PWY: tRNA charging|g__Alistipes.s__Alistipes_putredinis,PWY-1042: glycolysis IV|g__Lachnospiraceae_unclassified.s__Eubacterium_rectale,PWY-7977: L-methionine biosynthesis IV|g__Alistipes.s__Alistipes_putredinis,PWY-6121: 5-aminoimidazole ribonucleotide biosynthesis I|g__Alistipes.s__Alistipes_putredinis,...,DTDPRHAMSYN-PWY: dTDP-&beta;-L-rhamnose biosynthesis|g__Faecalibacterium.s__Faecalibacterium_prausnitzii,PWY-5686: UMP biosynthesis I|g__Alistipes.s__Alistipes_putredinis,PWY-8178: pentose phosphate pathway (non-oxidative branch) II|g__Bacteroides.s__Bacteroides_fragilis,PWY-7790: UMP biosynthesis II|g__Alistipes.s__Alistipes_putredinis,PANTOSYN-PWY: superpathway of coenzyme A biosynthesis I (bacteria)|g__Alistipes.s__Alistipes_putredinis,PWY-6277: superpathway of 5-aminoimidazole ribonucleotide biosynthesis|g__Alistipes.s__Alistipes_finegoldii,PWY-7953: UDP-N-acetylmuramoyl-pentapeptide biosynthesis III (meso-diaminopimelate containing)|g__Alistipes.s__Alistipes_putredinis,NONMEVIPP-PWY: methylerythritol phosphate pathway I|g__Alistipes.s__Alistipes_putredinis,PWY-6151: S-adenosyl-L-methionine salvage I|g__Bacteroides.s__Bacteroides_fragilis,hiPCA
SRR5935802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4e-05,0.0,0.0,7.8188e-06,0.0,0.0,3.6e-05,1.331055
SRR5947102,0.0,0.0,0.0,3.92909e-06,4e-06,0.0,0.0,3e-06,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.36705
SRR5946673,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.247998
ERR209749,0.0,5e-06,5e-06,0.0,0.0,4e-06,5e-06,0.0,4e-06,4e-06,...,0.0,5e-06,0.0,5e-06,4e-06,3.7125e-07,4e-06,4e-06,0.0,2.439433
SRR5935984,0.0,2.9e-05,2.7e-05,8.53436e-07,0.0,2.5e-05,3.8e-05,0.0,2.6e-05,2.8e-05,...,0.0,3.1e-05,0.0,3.1e-05,2.9e-05,1.23427e-05,2.3e-05,2.6e-05,0.0,-0.271193


In [422]:
from sklearn.preprocessing import StandardScaler

In [423]:
scaler = StandardScaler()
for c in selected.columns:
    scaler.fit(np.array(selected[c]).reshape(-1, 1))
    selected[c] = scaler.transform(np.array(selected[c]).reshape(-1, 1))
    pathways_test[c] = scaler.transform(np.array(pathways_test[c]).reshape(-1, 1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected[c] = scaler.transform(np.array(selected[c]).reshape(-1, 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected[c] = scaler.transform(np.array(selected[c]).reshape(-1, 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected[c] = scaler.transform(np.array(selected[c]).reshape(-1, 1)

In [237]:
pathways_test.head()

# Pathway,PWY-5667: CDP-diacylglycerol biosynthesis I|g__Lachnospiraceae_unclassified.s__Eubacterium_rectale,PWY-6123: inosine-5'-phosphate biosynthesis I|g__Alistipes.s__Alistipes_putredinis,PWY-7977: L-methionine biosynthesis IV|g__Alistipes.s__Alistipes_putredinis,DTDPRHAMSYN-PWY: dTDP-&beta;-L-rhamnose biosynthesis|g__Alistipes.s__Alistipes_putredinis,NONMEVIPP-PWY: methylerythritol phosphate pathway I|g__Alistipes.s__Alistipes_putredinis,PWY-6609: adenine and adenosine salvage III|g__Alistipes.s__Alistipes_putredinis,UNINTEGRATED|g__Alistipes.s__Alistipes_onderdonkii,PWY-3841: folate transformations II (plants)|g__Alistipes.s__Alistipes_putredinis,UNINTEGRATED|g__Hungatella.s__Hungatella_hathewayi,PWY-7851: coenzyme A biosynthesis II (eukaryotic)|g__Alistipes.s__Alistipes_putredinis,...,PANTOSYN-PWY: superpathway of coenzyme A biosynthesis I (bacteria)|g__Alistipes.s__Alistipes_putredinis,GALACTUROCAT-PWY: D-galacturonate degradation I|g__Faecalibacterium.s__Faecalibacterium_prausnitzii,PWY-5695: inosine 5'-phosphate degradation|g__Alistipes.s__Alistipes_finegoldii,ILEUSYN-PWY: L-isoleucine biosynthesis I (from threonine)|g__Hungatella.s__Hungatella_hathewayi,PWY-6317: D-galactose degradation I (Leloir pathway)|g__Hungatella.s__Hungatella_hathewayi,PWY-6700: queuosine biosynthesis I (de novo)|g__Parabacteroides.s__Parabacteroides_merdae,PWY-7791: UMP biosynthesis III|g__Alistipes.s__Alistipes_putredinis,PWY-6277: superpathway of 5-aminoimidazole ribonucleotide biosynthesis|g__Flavonifractor.s__Flavonifractor_plautii,PWY-5667: CDP-diacylglycerol biosynthesis I|g__Hungatella.s__Hungatella_hathewayi,hiPCA
SRR5983264,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.707042
ERR209710,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033351,0.0,...,0.0,6e-06,0.0,3.6e-05,3.2e-05,0.0,0.0,0.0,1.9e-05,2.739968
SRR5947057,3e-06,0.0,0.0,0.0,0.0,0.0,0.001831,0.0,0.0,0.0,...,0.0,0.0,4e-06,0.0,0.0,9e-06,0.0,0.0,0.0,-0.56698
SRR5983384,1e-06,1.5e-05,1.3e-05,1.2e-05,1.2e-05,1.5e-05,0.001306,1.2e-05,0.0,1.2e-05,...,1.3e-05,3e-06,0.0,0.0,0.0,0.0,1.4e-05,0.0,0.0,-0.221122
SRR5935963,9e-06,1e-05,7e-06,8e-06,8e-06,9e-06,0.000917,9e-06,0.0,9e-06,...,9e-06,4.3e-05,4e-06,0.0,0.0,0.0,1e-05,0.0,0.0,-0.824825


In [238]:
selected.head()

# Pathway,PWY-5667: CDP-diacylglycerol biosynthesis I|g__Lachnospiraceae_unclassified.s__Eubacterium_rectale,PWY-6123: inosine-5'-phosphate biosynthesis I|g__Alistipes.s__Alistipes_putredinis,PWY-7977: L-methionine biosynthesis IV|g__Alistipes.s__Alistipes_putredinis,DTDPRHAMSYN-PWY: dTDP-&beta;-L-rhamnose biosynthesis|g__Alistipes.s__Alistipes_putredinis,NONMEVIPP-PWY: methylerythritol phosphate pathway I|g__Alistipes.s__Alistipes_putredinis,PWY-6609: adenine and adenosine salvage III|g__Alistipes.s__Alistipes_putredinis,UNINTEGRATED|g__Alistipes.s__Alistipes_onderdonkii,PWY-3841: folate transformations II (plants)|g__Alistipes.s__Alistipes_putredinis,UNINTEGRATED|g__Hungatella.s__Hungatella_hathewayi,PWY-7851: coenzyme A biosynthesis II (eukaryotic)|g__Alistipes.s__Alistipes_putredinis,...,PANTOSYN-PWY: superpathway of coenzyme A biosynthesis I (bacteria)|g__Alistipes.s__Alistipes_putredinis,GALACTUROCAT-PWY: D-galacturonate degradation I|g__Faecalibacterium.s__Faecalibacterium_prausnitzii,PWY-5695: inosine 5'-phosphate degradation|g__Alistipes.s__Alistipes_finegoldii,ILEUSYN-PWY: L-isoleucine biosynthesis I (from threonine)|g__Hungatella.s__Hungatella_hathewayi,PWY-6317: D-galactose degradation I (Leloir pathway)|g__Hungatella.s__Hungatella_hathewayi,PWY-6700: queuosine biosynthesis I (de novo)|g__Parabacteroides.s__Parabacteroides_merdae,PWY-7791: UMP biosynthesis III|g__Alistipes.s__Alistipes_putredinis,PWY-6277: superpathway of 5-aminoimidazole ribonucleotide biosynthesis|g__Flavonifractor.s__Flavonifractor_plautii,PWY-5667: CDP-diacylglycerol biosynthesis I|g__Hungatella.s__Hungatella_hathewayi,hiPCA
ERR209799,2.9e-05,1.1e-05,1e-05,9e-06,0.0,1e-05,0.0,1e-05,0.0,8e-06,...,0.0,4.1e-05,0.0,0.0,0.0,0.0,1.1e-05,0.0,0.0,-0.459553
SRR5983354,1.3e-05,2.6e-05,2.1e-05,2.5e-05,2.2e-05,3.2e-05,0.001581,2.3e-05,0.0,2e-05,...,2.2e-05,0.0,5.28564e-06,0.0,0.0,5e-06,2.4e-05,0.0,0.0,-0.666178
SRR5946648,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,7e-06,0.0,0.0,0.0,0.0,0.0,4e-06,0.0,-0.617286
ERR209854,1e-06,7e-06,6e-06,7e-06,6e-06,7e-06,0.000702,7e-06,0.0,6e-06,...,6e-06,2.5e-05,1.68532e-06,0.0,0.0,2e-06,6e-06,0.0,0.0,-0.295685
SRR5935795,1.7e-05,0.0,0.0,0.0,0.0,0.0,0.000447,0.0,0.0,0.0,...,0.0,5e-06,7.89973e-07,0.0,0.0,1.8e-05,0.0,0.0,0.0,-0.238637


In [424]:
from sklearn.linear_model import LogisticRegression


In [212]:
# X_train, X_test, y_train, y_test = train_test_split(selected, label, test_size=0.20, stratify = label, random_state=21)

In [425]:
clf = LogisticRegression(random_state=0).fit(selected, y_train)

In [426]:
clf.coef_

array([[-0.4540263 ,  0.40385164, -0.37193327, -0.03297965,  0.12524968,
         0.26508779, -0.62004166,  0.11146052,  0.50322734, -0.13548388,
         0.04915104, -0.56634493,  0.57041198, -0.53177851, -0.38861252,
        -0.30814701, -0.80635278, -0.40607177, -0.44156791,  0.15420597,
        -0.19268724,  0.57011172, -1.06843676,  0.23852674,  0.10905629,
         0.92959702,  0.38329012,  0.54321614,  0.06411004, -0.21466933,
         0.69264516, -0.62315825,  0.32069593,  0.2003836 , -1.00916063,
         0.44296006,  0.3127031 ,  0.44234899,  0.43450308,  1.73735209]])

In [427]:
list(selected.columns)

['GALACTUROCAT-PWY: D-galacturonate degradation I|g__Faecalibacterium.s__Faecalibacterium_prausnitzii',
 "PWY-7234: inosine-5'-phosphate biosynthesis III|g__Alistipes.s__Alistipes_putredinis",
 'DTDPRHAMSYN-PWY: dTDP-&beta;-L-rhamnose biosynthesis|g__Alistipes.s__Alistipes_putredinis',
 'PWY-5103: L-isoleucine biosynthesis III|g__Lachnospiraceae_unclassified.s__Eubacterium_rectale',
 'PWY-5667: CDP-diacylglycerol biosynthesis I|g__Lachnospiraceae_unclassified.s__Eubacterium_rectale',
 'PWY-5030: L-histidine degradation III|g__Alistipes.s__Alistipes_putredinis',
 'TRNA-CHARGING-PWY: tRNA charging|g__Alistipes.s__Alistipes_putredinis',
 'PWY-1042: glycolysis IV|g__Lachnospiraceae_unclassified.s__Eubacterium_rectale',
 'PWY-7977: L-methionine biosynthesis IV|g__Alistipes.s__Alistipes_putredinis',
 'PWY-6121: 5-aminoimidazole ribonucleotide biosynthesis I|g__Alistipes.s__Alistipes_putredinis',
 'PWY-6122: 5-aminoimidazole ribonucleotide biosynthesis II|g__Hungatella.s__Hungatella_hathewayi

In [428]:
importance = pd.DataFrame(zip(list(selected.columns), clf.coef_[0]), columns = ['Features', 'Coeficient']).sort_values('Coeficient', ascending  = False)

In [429]:
importance.to_csv('importance4.csv', index = False)

In [218]:
# scaler = StandardScaler()
# for c in pathways_test.columns:
#     scaler.fit(np.array(pathways_test[c]).reshape(-1, 1))
#     pathways_test[c] = scaler.transform(np.array(pathways_test[c]).reshape(-1, 1))

In [219]:
pathways_test

# Pathway,PWY-5667: CDP-diacylglycerol biosynthesis I|g__Lachnospiraceae_unclassified.s__Eubacterium_rectale,PWY-6123: inosine-5'-phosphate biosynthesis I|g__Alistipes.s__Alistipes_putredinis,PWY-7977: L-methionine biosynthesis IV|g__Alistipes.s__Alistipes_putredinis,DTDPRHAMSYN-PWY: dTDP-&beta;-L-rhamnose biosynthesis|g__Alistipes.s__Alistipes_putredinis,NONMEVIPP-PWY: methylerythritol phosphate pathway I|g__Alistipes.s__Alistipes_putredinis,PWY-6609: adenine and adenosine salvage III|g__Alistipes.s__Alistipes_putredinis,UNINTEGRATED|g__Alistipes.s__Alistipes_onderdonkii,PWY-3841: folate transformations II (plants)|g__Alistipes.s__Alistipes_putredinis,UNINTEGRATED|g__Hungatella.s__Hungatella_hathewayi,hiPCA
SRR5983264,-0.499775,-0.702289,-0.678395,-0.700928,-0.676720,-0.707826,-0.438920,-0.696985,-0.218844,-0.633211
ERR209710,-0.499775,-0.702289,-0.678395,-0.700928,-0.676720,-0.707826,-0.438920,-0.696985,1.094878,1.515895
SRR5947057,-0.197593,-0.702289,-0.678395,-0.700928,-0.676720,-0.707826,0.129545,-0.696985,-0.218844,-0.545887
SRR5983384,-0.409672,0.433781,0.476587,0.333120,0.441038,0.340195,-0.033644,0.283163,-0.218844,-0.330255
SRR5935963,0.262733,0.032212,-0.013416,-0.017728,0.029006,-0.087949,-0.154342,0.034053,-0.218844,-0.706646
...,...,...,...,...,...,...,...,...,...,...
SRR5935820,-0.358668,1.532311,1.810326,1.891903,1.735581,2.689571,0.211024,1.434491,-0.218844,-0.254922
SRR5983459,-0.286878,-0.702289,-0.678395,-0.700928,-0.676720,-0.707826,-0.438920,-0.696985,5.000349,1.870884
SRR5983346,-0.499775,1.741208,1.891563,1.576171,1.672287,1.298897,-0.438920,1.659310,-0.218844,-0.071019
ERR210519,-0.499775,-0.702289,-0.678395,-0.700928,-0.676720,-0.707826,-0.438920,-0.696985,-0.218844,-0.633211


In [430]:
pred = clf.predict(pathways_test)
balanced_accuracy_score(y_test, pred)

0.810243328100471

In [245]:
clf = RandomForestClassifier(max_depth=10, min_samples_leaf=3, random_state=0)
clf.fit(selected, y_train)
pred = clf.predict(pathways_test)
balanced_accuracy_score(y_test, pred)

0.8402668759811617

In [251]:
selected

# Pathway,PWY-5667: CDP-diacylglycerol biosynthesis I|g__Lachnospiraceae_unclassified.s__Eubacterium_rectale,PWY-6123: inosine-5'-phosphate biosynthesis I|g__Alistipes.s__Alistipes_putredinis,PWY-7977: L-methionine biosynthesis IV|g__Alistipes.s__Alistipes_putredinis,DTDPRHAMSYN-PWY: dTDP-&beta;-L-rhamnose biosynthesis|g__Alistipes.s__Alistipes_putredinis,NONMEVIPP-PWY: methylerythritol phosphate pathway I|g__Alistipes.s__Alistipes_putredinis,PWY-6609: adenine and adenosine salvage III|g__Alistipes.s__Alistipes_putredinis,UNINTEGRATED|g__Alistipes.s__Alistipes_onderdonkii,PWY-3841: folate transformations II (plants)|g__Alistipes.s__Alistipes_putredinis,UNINTEGRATED|g__Hungatella.s__Hungatella_hathewayi,PWY-7851: coenzyme A biosynthesis II (eukaryotic)|g__Alistipes.s__Alistipes_putredinis,...,PANTOSYN-PWY: superpathway of coenzyme A biosynthesis I (bacteria)|g__Alistipes.s__Alistipes_putredinis,GALACTUROCAT-PWY: D-galacturonate degradation I|g__Faecalibacterium.s__Faecalibacterium_prausnitzii,PWY-5695: inosine 5'-phosphate degradation|g__Alistipes.s__Alistipes_finegoldii,ILEUSYN-PWY: L-isoleucine biosynthesis I (from threonine)|g__Hungatella.s__Hungatella_hathewayi,PWY-6317: D-galactose degradation I (Leloir pathway)|g__Hungatella.s__Hungatella_hathewayi,PWY-6700: queuosine biosynthesis I (de novo)|g__Parabacteroides.s__Parabacteroides_merdae,PWY-7791: UMP biosynthesis III|g__Alistipes.s__Alistipes_putredinis,PWY-6277: superpathway of 5-aminoimidazole ribonucleotide biosynthesis|g__Flavonifractor.s__Flavonifractor_plautii,PWY-5667: CDP-diacylglycerol biosynthesis I|g__Hungatella.s__Hungatella_hathewayi,hiPCA
ERR209799,0.000029,0.000011,0.000010,0.000009,0.000000,0.000010,0.000000,0.000010,0.000000,0.000008,...,0.000000,0.000041,0.000000e+00,0.000000,0.0,0.000000,0.000011,0.000000,0.000000,-0.459553
SRR5983354,0.000013,0.000026,0.000021,0.000025,0.000022,0.000032,0.001581,0.000023,0.000000,0.000020,...,0.000022,0.000000,5.285640e-06,0.000000,0.0,0.000005,0.000024,0.000000,0.000000,-0.666178
SRR5946648,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000007,0.000000e+00,0.000000,0.0,0.000000,0.000000,0.000004,0.000000,-0.617286
ERR209854,0.000001,0.000007,0.000006,0.000007,0.000006,0.000007,0.000702,0.000007,0.000000,0.000006,...,0.000006,0.000025,1.685320e-06,0.000000,0.0,0.000002,0.000006,0.000000,0.000000,-0.295685
SRR5935795,0.000017,0.000000,0.000000,0.000000,0.000000,0.000000,0.000447,0.000000,0.000000,0.000000,...,0.000000,0.000005,7.899730e-07,0.000000,0.0,0.000018,0.000000,0.000000,0.000000,-0.238637
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR5935770,0.000030,0.000000,0.000000,0.000000,0.000000,0.000000,0.003341,0.000000,0.000000,0.000000,...,0.000000,0.000077,5.657960e-06,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,-1.092159
SRR5935891,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000e+00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,1.644987
SRR5946762,0.000002,0.000000,0.000000,0.000000,0.000000,0.000000,0.000762,0.000000,0.000000,0.000000,...,0.000000,0.000006,5.760000e-06,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,-0.365656
SRR5947012,0.000004,0.000037,0.000028,0.000030,0.000030,0.000038,0.001963,0.000035,0.000000,0.000027,...,0.000029,0.000010,3.882740e-06,0.000000,0.0,0.000000,0.000037,0.000000,0.000000,-0.387735


In [252]:
list(pd.DataFrame(zip(selected.columns, clf.feature_importances_)).sort_values(1, ascending = False)[::][0])

['hiPCA',
 'UNINTEGRATED|g__Alistipes.s__Alistipes_putredinis',
 'PWY-5130: 2-oxobutanoate degradation I|g__Alistipes.s__Alistipes_putredinis',
 "PWY-6124: inosine-5'-phosphate biosynthesis II|g__Alistipes.s__Alistipes_putredinis",
 'PWY-6609: adenine and adenosine salvage III|g__Alistipes.s__Alistipes_putredinis',
 'UNINTEGRATED|g__Alistipes.s__Alistipes_onderdonkii',
 'DTDPRHAMSYN-PWY: dTDP-&beta;-L-rhamnose biosynthesis|g__Alistipes.s__Alistipes_putredinis',
 'UNINTEGRATED|g__Hungatella.s__Hungatella_hathewayi',
 'UNINTEGRATED|g__Faecalibacterium.s__Faecalibacterium_prausnitzii',
 'PWY-7851: coenzyme A biosynthesis II (eukaryotic)|g__Alistipes.s__Alistipes_putredinis',
 'TRNA-CHARGING-PWY: tRNA charging|g__Alistipes.s__Alistipes_putredinis',
 'PWY-3841: folate transformations II (plants)|g__Alistipes.s__Alistipes_putredinis',
 'PANTOSYN-PWY: superpathway of coenzyme A biosynthesis I (bacteria)|g__Alistipes.s__Alistipes_putredinis',
 'PWY-5667: CDP-diacylglycerol biosynthesis I|g__La

In [221]:
predp = clf.predict_proba(X_test)

Feature names unseen at fit time:
- Abiotrophia_sp_HMSC24B09
- Achromobacter_SGB14223
- Acidaminococcus_SGB42794
- Acidaminococcus_SGB5735
- Acidaminococcus_fermentans
- ...
Feature names seen at fit time, yet now missing:
- DTDPRHAMSYN-PWY: dTDP-&beta;-L-rhamnose biosynthesis|g__Alistipes.s__Alistipes_putredinis
- NONMEVIPP-PWY: methylerythritol phosphate pathway I|g__Alistipes.s__Alistipes_putredinis
- PWY-3841: folate transformations II (plants)|g__Alistipes.s__Alistipes_putredinis
- PWY-5667: CDP-diacylglycerol biosynthesis I|g__Lachnospiraceae_unclassified.s__Eubacterium_rectale
- PWY-6123: inosine-5'-phosphate biosynthesis I|g__Alistipes.s__Alistipes_putredinis
- ...



ValueError: X has 2115 features, but LogisticRegression is expecting 10 features as input.

In [None]:
p = []
for item in X_test.index:
    p.append(metadata[metadata['SampleID'] == item]['Diagnosis'].iloc[0])

In [None]:
pd.DataFrame(zip(X_test.index, [x[1] for x in predp], pred, p), columns = ['SampleID', 'DE-hiPCA_Pathways-LogReg', 'Prediction', 'Diagnosis']).to_csv('results.csv', index = False)

In [None]:
unhealthy_functions