# GMHI development notebook

In [1]:
#Librerias que se utilizarán
import pandas as pd
import numpy as np
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import pickle

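Function to calculate, for every species $m$, $f_H=\frac{p_{H,m}}{p_{N,m}}$, $f_N=\frac{p_{N,m}}{p_{H,m}}$, $d_H=p_{H,m}-p_{N,m}$ and $d_N=p_{N,m}-p_{H,m}$, where $p_{H,m}$ and $p_{N,m}$ are the prevalences of species $m$ in the healthy and non-healthy samples, respectively.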

In [2]:
def _prevalence(abundance, lower):
    """Fraction of columns (samples) in which each row (species) is non-zero.

    A species that is present in no sample of the group (including the case of
    a group with no samples at all) gets `lower` instead of 0 so that later
    ratios never divide by zero.
    """
    n_samples = abundance.shape[1]
    if n_samples == 0:
        return pd.Series(lower, index=abundance.index, dtype='float64')
    # Same presence test as an element-wise `!= 0` count per species.
    present = (abundance != 0).sum(axis=1)
    return (present / n_samples).where(present != 0, lower).astype('float64')


def get_fH_fN_dH_dN(meta, tax, lower=1e-05):
    """Compute the prevalence-based metrics f_H, f_N, d_H and d_N per species.

    Parameters
    ----------
    meta : pandas.DataFrame
        Metadata with a 'SampleID' column and a 'Diagnosis' column; samples
        whose diagnosis equals 'Healthy' form the healthy group, every other
        sample forms the non-healthy group.
    tax : pandas.DataFrame
        Abundance table with species as rows and sample IDs as columns.
    lower : float, optional
        Value used in place of a prevalence of 0 to avoid division by zero.

    Returns
    -------
    pandas.DataFrame
        Indexed by species (``tax.index``) with float columns
        'f_H' = P_H / P_N, 'f_N' = P_N / P_H, 'd_H' = P_H - P_N and
        'd_N' = P_N - P_H, where P_H and P_N are the prevalences of the
        species in the healthy and non-healthy samples.
    """
    is_healthy = meta['Diagnosis'] == 'Healthy'
    healthy_id = meta.loc[is_healthy, 'SampleID']
    no_healthy_id = meta.loc[~is_healthy, 'SampleID']

    # Prevalence of every species in each group, computed for all species at
    # once instead of one Python-level iteration per species.
    PH = _prevalence(tax[healthy_id], lower)
    PN = _prevalence(tax[no_healthy_id], lower)

    return pd.DataFrame(
        {'f_H': PH / PN, 'f_N': PN / PH, 'd_H': PH - PN, 'd_N': PN - PH},
        index=tax.index,
        columns=['f_H', 'f_N', 'd_H', 'd_N'],
    )

In [3]:
def get_MH_MN(metrics,theta_f,theta_d):
    """Select the species that pass both thresholds for each group.

    Parameters
    ----------
    metrics : pandas.DataFrame
        Per-species table with columns 'f_H', 'f_N', 'd_H' and 'd_N'.
    theta_f : float
        Minimum value required for the ratio columns ('f_H' / 'f_N').
    theta_d : float
        Minimum value required for the difference columns ('d_H' / 'd_N').

    Returns
    -------
    tuple of set
        ``(MH, MN)``: index labels with f_H >= theta_f and d_H >= theta_d,
        and index labels with f_N >= theta_f and d_N >= theta_d.
    """
    def labels_reaching(column, threshold):
        # Index labels of the rows whose `column` value is at least `threshold`.
        selected_rows = metrics[metrics[column] >= threshold]
        return set(selected_rows.index)

    MH = labels_reaching('f_H', theta_f).intersection(labels_reaching('d_H', theta_d))
    MN = labels_reaching('f_N', theta_f).intersection(labels_reaching('d_N', theta_d))
    return MH, MN

This function calculates $\Psi_{{M_H},i}=\frac{R_{{M_H},i}}{|M_H|} \sum_{j\in I_{M_H}}|n_{j,i}\ln(n_{j,i})|$ or $\Psi_{{M_N},i}$ for sample $i$.
Here: 
- $R_{{M_H},i}$ is the species richness of $M_H$ in sample $i$.
- $|M_H|$ is the size of the set $M_H$.
- $I_{M_H}$ is the index set of $M_H$.
- $n_{j,i}$ is the relative abundance of species $j$ in sample $i$.

$\Psi_{{M_N},i}$ for sample $i$ is calculated analogously.



In [4]:
def get_Psi(set_M, sample, floor=1e-05):
    """Compute Psi for one sample with respect to a species set M_H or M_N.

    Parameters
    ----------
    set_M : iterable
        Species labels of the set (M_H or M_N); converted to a ``set``.
    sample : pandas.Series
        Relative abundance of every species in the sample, indexed by species.
    floor : float, optional
        Value returned instead of 0 so that the ratio Psi_MH / Psi_MN taken by
        the caller never divides by zero.

    Returns
    -------
    float
        ``(k / |M|) / |M| * |sum(n * ln(n))|`` where k is the number of species
        of M present (non-zero) in the sample and n their abundances, or
        `floor` when that value is 0 or when M is empty.

    NOTE(review): the richness term is already divided by |M| and the product
    is divided by |M| once more, so the scale is k / |M|**2, whereas the
    formula in the accompanying markdown reads R / |M|. The existing scaling is
    kept unchanged here; confirm which one is intended.
    """
    set_M = set(set_M)
    # An empty set would produce 0/0 (NaN plus a RuntimeWarning); no species of
    # M can be present, so the result is the floor, as for any other zero Psi.
    if not set_M:
        return floor

    # Species of M that are actually present in this sample.
    present = sample[sample != 0]
    M_in_sample = set(present.index) & set_M
    if not M_in_sample:
        return floor

    # Fraction of M found in the sample.
    R_M_sample = len(M_in_sample) / len(set_M)

    # Abundances of the present species of M and their sum of n * ln(n).
    n = present[list(M_in_sample)]
    sum_nlnn = np.sum(n * np.log(n))

    Psi = (R_M_sample / len(set_M)) * np.absolute(sum_nlnn)

    # E.g. a single species with abundance exactly 1 gives ln(1) = 0.
    if Psi == 0:
        Psi = floor
    return Psi

In [5]:
def get_all_GMHI(tax,MH,MN):
    """Compute the GMHI value of every sample in an abundance table.

    Parameters
    ----------
    tax : pandas.DataFrame
        Abundance table with species as rows and samples as columns.
    MH, MN : set
        Species sets passed to ``get_Psi`` for the numerator and the
        denominator, respectively.

    Returns
    -------
    pandas.Series
        Float series named 'GMHI', indexed by ``tax.columns``, holding
        ``log10(Psi_MH / Psi_MN)`` for each sample.
    """
    def gmhi_of(sample):
        # Both Psi values are computed from the same abundance column.
        abundances = tax[sample]
        Psi_MH = get_Psi(MH, abundances)
        Psi_MN = get_Psi(MN, abundances)
        return np.log10(np.divide(Psi_MH, Psi_MN))

    samples = tax.columns
    return pd.Series(
        [gmhi_of(sample) for sample in samples],
        index=samples,
        name='GMHI',
        dtype='float64',
    )

In [6]:
def get_accuracy(GMHI,meta):
    """Balanced accuracy of the sign of GMHI as a Healthy/Unhealthy classifier.

    Parameters
    ----------
    GMHI : iterable of float
        One GMHI value per sample; a value below 0 is read as 'Unhealthy',
        anything else as 'Healthy'.
    meta : pandas.DataFrame
        Metadata with a 'Diagnosis' column; 'Healthy' stays 'Healthy' and any
        other value becomes 'Unhealthy'.

    Returns
    -------
    float
        Result of ``balanced_accuracy_score(true_labels, predicted_labels)``.

    Labels and predictions are paired by position, so `GMHI` and `meta` must
    list the samples in the same order.
    """
    true_labels = [
        'Unhealthy' if diagnosis != 'Healthy' else 'Healthy'
        for diagnosis in meta['Diagnosis']
    ]
    predicted_labels = [
        'Unhealthy' if score < 0 else 'Healthy'
        for score in list(GMHI)
    ]
    return balanced_accuracy_score(true_labels, predicted_labels)


In [7]:
# Load the CAMDA metadata and the taxonomy table (its first column becomes the
# row index), then compute the per-species prevalence metrics on all samples.
meta = pd.read_csv(
    '../../DataSets/CAMDA/metadata.txt',
    sep="\t",
)
tax = pd.read_csv(
    '../../DataSets/CAMDA/taxonomy.txt',
    sep='\t',
    index_col=0,
)
metrics = get_fH_fN_dH_dN(meta, tax)

In [8]:
# Thresholds used by get_MH_MN: theta_f is the minimum prevalence ratio
# (f_H / f_N) and theta_d the minimum prevalence difference (d_H / d_N).
theta_f, theta_d = 1.4, 0.1
MH, MN = get_MH_MN(metrics, theta_f, theta_d)

In [9]:
# Reference species lists read from the GMHI files. The first three characters
# of every row label are dropped (presumably a taxonomic-rank prefix such as
# 's__' - TODO confirm against the files) before the labels are collected.
MH_test = pd.read_csv('../../DataSets/INDEX/GMHI/MH_species.txt', sep="\t", index_col=0)
MN_test = pd.read_csv('../../DataSets/INDEX/GMHI/MN_species.txt', sep='\t', index_col=0)
MH_estandar = {label[3:] for label in MH_test.index}
MN_estandar = {label[3:] for label in MN_test.index}

In [10]:
# Union of the reference species sets with the sets derived from this dataset.
H = MH_estandar | MH
N = MN_estandar | MN

In [11]:
# GMHI of every sample using the reference species sets, and its balanced
# accuracy against the metadata diagnoses.
GMHI = get_all_GMHI(tax, MH_estandar, MN_estandar)
accuracy = get_accuracy(GMHI, meta)

print('El accuracy obtenido es de', accuracy * 100, '%')
GMHI

El accuracy obtenido es de 68.33173588924389 %


SRR5946989    2.657701
SRR5983265    1.119042
SRR5946777    2.811782
SRR5946822   -1.875445
SRR5946857    0.986891
                ...   
SRR5946648    1.016711
SRR5946925    1.256304
ERR209694     0.898682
SRR5946668   -2.545446
ERR209312     0.000000
Name: GMHI, Length: 613, dtype: float64

In [12]:
# GMHI of every sample using the species sets derived from this dataset
# (MH, MN), and its balanced accuracy against the metadata diagnoses.
GMHI = get_all_GMHI(tax, MH, MN)
accuracy = get_accuracy(GMHI, meta)

print('El accuracy obtenido es de', accuracy * 100, '%')
GMHI

El accuracy obtenido es de 68.27742279020235 %


SRR5946989    3.216072
SRR5983265    2.537224
SRR5946777    3.225344
SRR5946822   -0.628897
SRR5946857    1.887152
                ...   
SRR5946648    1.051364
SRR5946925    2.571253
ERR209694     0.427815
SRR5946668   -0.086489
ERR209312     1.907716
Name: GMHI, Length: 613, dtype: float64

In [13]:
# GMHI of every sample using the merged species sets (H, N), and its balanced
# accuracy against the metadata diagnoses.
GMHI = get_all_GMHI(tax, H, N)
accuracy = get_accuracy(GMHI, meta)

print('El accuracy obtenido es de', accuracy * 100, '%')
GMHI

El accuracy obtenido es de 63.85410010649627 %


SRR5946989    3.200351
SRR5983265    2.403045
SRR5946777    3.074671
SRR5946822    0.085752
SRR5946857    2.091053
                ...   
SRR5946648    1.407655
SRR5946925    2.075274
ERR209694     1.045994
SRR5946668    0.484435
ERR209312     2.674407
Name: GMHI, Length: 613, dtype: float64

In [14]:
# Diagnosis of every taxonomy sample, in the column order of `tax`.
# One indexed lookup replaces a full scan of `meta` per sample; as before, the
# first metadata row of a SampleID is the one used. A sample that is missing
# from `meta` raises KeyError.
diagnosis_by_sample = (
    meta.drop_duplicates(subset='SampleID', keep='first')
        .set_index('SampleID')['Diagnosis']
)
labels = pd.DataFrame(diagnosis_by_sample.loc[list(tax.columns)].to_list())

In [15]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)

# Row position of every sample inside `meta` (first occurrence wins), so that
# the metadata of a fold is selected by SampleID rather than by assuming that
# `meta` rows follow the column order of `tax`.
meta_position = pd.Series(np.arange(len(meta)), index=meta['SampleID'].to_numpy())
meta_position = meta_position[~meta_position.index.duplicated(keep='first')]

tax_by_sample = tax.T

evaluations = []
# Loop over each fold
for fold, (train_index, test_index) in enumerate(skf.split(tax_by_sample, labels)):
    X_train, X_test = tax_by_sample.iloc[train_index], tax_by_sample.iloc[test_index]
    # Metadata rows in exactly the sample order of X_train / X_test.
    y_train = meta.iloc[meta_position.loc[list(X_train.index)].to_numpy()]
    y_test = meta.iloc[meta_position.loc[list(X_test.index)].to_numpy()]

    print(f"Fold {fold + 1}")

    # Species sets are derived from the training samples only and then used to
    # score the held-out samples.
    metrics = get_fH_fN_dH_dN(y_train, X_train.T)
    MH, MN = get_MH_MN(metrics, theta_f, theta_d)
    GMHI = get_all_GMHI(X_test.T, MH, MN)

    accuracy = get_accuracy(GMHI, y_test)
    print(f'Balanced Accuracy: {accuracy}')
    evaluations.append(accuracy)

          

Fold 1
Balanced Accuracy: 0.7420634920634921
Fold 2
Balanced Accuracy: 0.6341269841269841
Fold 3
Balanced Accuracy: 0.6932539682539682
Fold 4
Balanced Accuracy: 0.6838709677419355
Fold 5
Balanced Accuracy: 0.667741935483871


In [16]:
pathways = pd.read_csv('../../DataSets/CAMDA/pathways.txt', sep = '\t', index_col = 0)

# Diagnosis of every pathway sample, in the column order of `pathways`.
# One indexed lookup replaces a full scan of `meta` per sample; as before, the
# first metadata row of a SampleID is the one used. A sample that is missing
# from `meta` raises KeyError.
diagnosis_by_sample = (
    meta.drop_duplicates(subset='SampleID', keep='first')
        .set_index('SampleID')['Diagnosis']
)
labels = pd.DataFrame(diagnosis_by_sample.loc[list(pathways.columns)].to_list())

In [17]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)

# Row position of every sample inside `meta` (first occurrence wins), so that
# the metadata of a fold is selected by SampleID rather than by assuming that
# `meta` rows follow the column order of `pathways`.
meta_position = pd.Series(np.arange(len(meta)), index=meta['SampleID'].to_numpy())
meta_position = meta_position[~meta_position.index.duplicated(keep='first')]

pathways_by_sample = pathways.T

evaluations = []
# Loop over each fold
for fold, (train_index, test_index) in enumerate(skf.split(pathways_by_sample, labels)):
    X_train, X_test = pathways_by_sample.iloc[train_index], pathways_by_sample.iloc[test_index]
    # Metadata rows in exactly the sample order of X_train / X_test.
    y_train = meta.iloc[meta_position.loc[list(X_train.index)].to_numpy()]
    y_test = meta.iloc[meta_position.loc[list(X_test.index)].to_numpy()]

    print(f"Fold {fold + 1}")

    # Feature sets are derived from the training samples only and then used to
    # score the held-out samples.
    metrics = get_fH_fN_dH_dN(y_train, X_train.T)
    MH, MN = get_MH_MN(metrics, theta_f, theta_d)
    GMHI = get_all_GMHI(X_test.T, MH, MN)

    accuracy = get_accuracy(GMHI, y_test)
    print(f'Balanced Accuracy: {accuracy}')
    evaluations.append(accuracy)

Fold 1
Balanced Accuracy: 0.6607142857142857
Fold 2
Balanced Accuracy: 0.6591269841269841
Fold 3
Balanced Accuracy: 0.7011904761904761
Fold 4
Balanced Accuracy: 0.6596774193548387
Fold 5
Balanced Accuracy: 0.6502688172043011


In [18]:
# Mean of the per-fold balanced accuracies currently stored in `evaluations`
# (filled by the cross-validation loop that ran last).
sum(evaluations) / len(evaluations)

0.6661955965181772

In [19]:
# Display the pathway table with samples as rows and pathway features as
# columns; column labels containing 'UNINTEGRATED' are split off further below.
pathways.T

# Pathway,UNINTEGRATED|g__Absiella.s__Absiella_dolichum,UNINTEGRATED|g__Acetobacter.s__Acetobacter_sp_CAG_267,UNINTEGRATED|g__Acetobacter.s__Acetobacter_sp_CAG_977,UNINTEGRATED|g__Acholeplasma.s__Acholeplasma_sp_CAG_878,UNINTEGRATED|g__Acidaminococcus.s__Acidaminococcus_intestini,UNINTEGRATED|g__Acinetobacter.s__Acinetobacter_idrijaensis,UNINTEGRATED|g__Acinetobacter.s__Acinetobacter_lwoffii,UNINTEGRATED|g__Actinomyces.s__Actinomyces_sp_HMSC035G02,UNINTEGRATED|g__Actinomyces.s__Actinomyces_sp_HPA0247,UNINTEGRATED|g__Actinomyces.s__Actinomyces_sp_ICM47,...,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_atypica,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_denticariosi,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_dispar,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_infantium,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_parvula,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_rogosae,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_seminalis,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_tobetsuensis,VALSYN-PWY: L-valine biosynthesis|g__Victivallales_unclassified.s__Victivallales_bacterium_CCUG_44730,VALSYN-PWY: L-valine biosynthesis|g__Victivallis.s__Victivallis_vadensis
SRR5946989,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
SRR5983265,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
SRR5946777,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
SRR5946822,0.0,0.0,0.0,0.0,0.02683,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
SRR5946857,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR5946648,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
SRR5946925,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
ERR209694,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
SRR5946668,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.000002,0.0,0.0,0.0,0.000075,0.000002,0.0,0.0,0.0,0.0


In [20]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)

# Row position of every sample inside `meta` (first occurrence wins), so that
# the metadata of a fold is selected by SampleID rather than by position.
meta_position = pd.Series(np.arange(len(meta)), index=meta['SampleID'].to_numpy())
meta_position = meta_position[~meta_position.index.duplicated(keep='first')]

pathways_by_sample = pathways.T
tax_by_sample = tax.T

# Column split of the pathway table, taken from the pathway table itself
# (previously read from an `X_train` variable left over from an earlier cell).
unintegrated_cols = [c for c in pathways_by_sample.columns if 'UNINTEGRATED' in c]
integrated_cols = [c for c in pathways_by_sample.columns if 'UNINTEGRATED' not in c]

evaluations = []
# Loop over each fold
for fold, (train_index, test_index) in enumerate(skf.split(pathways_by_sample, labels)):
    X_train_pathways = pathways_by_sample.iloc[train_index]
    X_test_pathways = pathways_by_sample.iloc[test_index]
    train_ids, test_ids = list(X_train_pathways.index), list(X_test_pathways.index)

    # Taxonomy and metadata rows of the very same samples, selected by ID so
    # that the three tables do not need to share one sample order.
    X_train_tax, X_test_tax = tax_by_sample.loc[train_ids], tax_by_sample.loc[test_ids]
    y_train = meta.iloc[meta_position.loc[train_ids].to_numpy()]
    y_test = meta.iloc[meta_position.loc[test_ids].to_numpy()]

    print(f"Fold {fold + 1}")

    # One GMHI feature per data view; the marker sets of each view are derived
    # from the training samples only.
    views = {
        'GMHI_unintegrated': (X_train_pathways[unintegrated_cols], X_test_pathways[unintegrated_cols]),
        'GMHI_integrated': (X_train_pathways[integrated_cols], X_test_pathways[integrated_cols]),
        'GMHI_taxonomy': (X_train_tax, X_test_tax),
    }
    train_features, test_features = {}, {}
    for feature_name, (view_train, view_test) in views.items():
        metrics = get_fH_fN_dH_dN(y_train, view_train.T)
        MH_view, MN_view = get_MH_MN(metrics, theta_f, theta_d)
        train_features[feature_name] = get_all_GMHI(view_train.T, MH_view, MN_view).to_numpy()
        test_features[feature_name] = get_all_GMHI(view_test.T, MH_view, MN_view).to_numpy()

    new_data_train = pd.DataFrame(train_features, index=X_train_tax.index)
    new_data_test = pd.DataFrame(test_features, index=X_test_tax.index)

    clf = RandomForestClassifier(max_depth=5, min_samples_leaf = 3, random_state=0)
    clf.fit(new_data_train, ['Unhealthy' if x != 'Healthy' else 'Healthy' for x in y_train['Diagnosis']])

    pred = clf.predict(new_data_test)

    accuracy = balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in y_test['Diagnosis']], pred)
    print(accuracy)
    # Keep the per-fold score; `evaluations` was previously left empty here.
    evaluations.append(accuracy)

Fold 1
0.8043650793650794
Fold 2
0.8353174603174602
Fold 3
0.803968253968254
Fold 4
0.7787634408602151
Fold 5
0.7696236559139785


# Final Model RF-GMHI

In [21]:
# Split the sample-by-pathway table by whether the pathway label contains
# 'UNINTEGRATED'.
unintegrated = pathways.T.loc[:, [name for name in pathways.index if 'UNINTEGRATED' in name]]
integrated = pathways.T.loc[:, [name for name in pathways.index if 'UNINTEGRATED' not in name]]

In [22]:
def _save_marker_sets(healthy, unhealthy, path):
    """Write both marker sets to `path` as columns 'Healthy' and 'Unhealthy'.

    Each set is written completely and in sorted order; the shorter column is
    padded with empty cells, so readers must drop blanks. (Pairing the two sets
    with zip() kept only as many entries as the smaller set has, and which ones
    survived depended on set iteration order.)
    """
    pd.DataFrame({
        'Healthy': pd.Series(sorted(healthy), dtype='object'),
        'Unhealthy': pd.Series(sorted(unhealthy), dtype='object'),
    }).to_csv(path, index = False)


# Marker sets and GMHI per data view, computed on all samples.
metrics = get_fH_fN_dH_dN(meta, unintegrated.T)
MH_un, MN_un = get_MH_MN(metrics, theta_f, theta_d)
_save_marker_sets(MH_un, MN_un, 'model_data/pathways_unintegrated.csv')
GMHI_train_un = get_all_GMHI(unintegrated.T, MH_un, MN_un)

metrics = get_fH_fN_dH_dN(meta, integrated.T)
MH_in, MN_in = get_MH_MN(metrics, theta_f, theta_d)
_save_marker_sets(MH_in, MN_in, 'model_data/pathways_integrated.csv')
GMHI_train_in = get_all_GMHI(integrated.T, MH_in, MN_in)

metrics = get_fH_fN_dH_dN(meta, tax)
MH_tax, MN_tax = get_MH_MN(metrics, theta_f, theta_d)
_save_marker_sets(MH_tax, MN_tax, 'model_data/taxonomy.csv')
GMHI_train_tax = get_all_GMHI(tax, MH_tax, MN_tax)

# The three GMHI series are combined by sample ID (each is indexed by sample)
# and put in the column order of `tax`, instead of being paired by position.
new_data_train = pd.DataFrame({
    'GMHI_unintegrated': GMHI_train_un,
    'GMHI_integrated': GMHI_train_in,
    'GMHI_taxonomy': GMHI_train_tax,
}).loc[list(tax.columns)]
new_data_train.index = tax.columns
print(new_data_train.head(3))

# Training labels looked up by sample ID so they match the feature rows.
diagnosis_by_sample = (
    meta.drop_duplicates(subset='SampleID', keep='first')
        .set_index('SampleID')['Diagnosis']
)
train_labels = [
    'Unhealthy' if x != 'Healthy' else 'Healthy'
    for x in diagnosis_by_sample.loc[list(new_data_train.index)]
]

clf = RandomForestClassifier(max_depth=5, min_samples_leaf = 3, random_state=0)
clf.fit(new_data_train, train_labels)
with open('model_data/rf_gmhi.pkl', 'wb') as f:
    pickle.dump(clf, f)


            GMHI_unintegrated  GMHI_integrated  GMHI_taxonomy
SRR5946989           2.956522         0.376066       3.216072
SRR5983265           2.546234         4.130309       2.537224
SRR5946777           3.154197         0.550126       3.225344


## COVID data

In [23]:
# Load the COVID metadata, taxonomy and pathway tables; the first column of
# each file becomes the row index.
meta_covid = pd.read_csv(
    '../../DataSets/COVID/CAMDA_metadata.txt',
    sep="\t",
    index_col=0,
)
tax_covid = pd.read_csv(
    '../../DataSets/COVID/CAMDA_taxa.txt',
    sep='\t',
    index_col=0,
)
pathways_covid = pd.read_csv(
    '../../DataSets/COVID/CAMDA_pathways.txt',
    sep='\t',
    index_col=0,
)

In [24]:
# Split the COVID sample-by-pathway table by whether the pathway label
# contains 'UNINTEGRATED'.
unintegrated_covid = pathways_covid.T.loc[:, [name for name in pathways_covid.index if 'UNINTEGRATED' in name]]
integrated_covid = pathways_covid.T.loc[:, [name for name in pathways_covid.index if 'UNINTEGRATED' not in name]]

In [25]:
# GMHI of the COVID samples for each data view, using the marker sets obtained
# when the final model was fitted.
# NOTE(review): a GMHI of exactly 0 is what get_all_GMHI returns when both Psi
# values fall back to the same floor, i.e. no marker of either set has a
# non-zero abundance in the sample - check the feature names if a whole column
# comes out as 0.
GMHI_covid_un = get_all_GMHI(unintegrated_covid.T, MH_un, MN_un)
GMHI_covid_in = get_all_GMHI(integrated_covid.T, MH_in, MN_in)
GMHI_covid_tax = get_all_GMHI(tax_covid, MH_tax, MN_tax)

In [26]:
# Feature table for the COVID samples, with the same three columns the
# classifier was fitted on; the series are paired by position.
covid_gmhi = pd.DataFrame(
    zip(GMHI_covid_un, GMHI_covid_in, GMHI_covid_tax),
    columns = ['GMHI_unintegrated', 'GMHI_integrated', 'GMHI_taxonomy'],
)

In [27]:
# Label the rows with the sample IDs of the COVID taxonomy table (assigned by
# position; assumes the rows are already in that order - TODO confirm).
covid_gmhi.index = tax_covid.columns

In [28]:
# Preview the first rows of the COVID feature table.
covid_gmhi.head()

Unnamed: 0,GMHI_unintegrated,GMHI_integrated,GMHI_taxonomy
Sample1a,2.000261,0.0,0.389201
Sample1b,2.24355,0.0,1.368506
Sample5a,-0.559261,0.0,0.837585
Sample5b,-0.516629,0.0,0.741994
Sample6a,1.405457,0.0,2.651577


In [29]:
# Predict Healthy/Unhealthy for every COVID sample with the fitted random forest.
clf.predict(covid_gmhi)

array(['Unhealthy', 'Healthy', 'Unhealthy', 'Unhealthy', 'Healthy',
       'Unhealthy', 'Healthy', 'Healthy', 'Unhealthy', 'Unhealthy',
       'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Healthy',
       'Healthy', 'Healthy', 'Healthy', 'Unhealthy', 'Unhealthy',
       'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy',
       'Unhealthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy',
       'Unhealthy', 'Unhealthy', 'Healthy', 'Healthy', 'Healthy',
       'Healthy', 'Healthy', 'Unhealthy', 'Healthy', 'Healthy',
       'Unhealthy', 'Unhealthy', 'Healthy', 'Healthy', 'Healthy',
       'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Unhealthy', 'Healthy',
       'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy',
       'Healthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy',
       'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy',
       'Healthy', 'Unhealthy'], dtype='<U9')