# GMHI development notebook

In [1]:
#Librerias que se utilizarán
import pandas as pd
import numpy as np
from sklearn.metrics import balanced_accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import pickle

In [2]:
import warnings
warnings.filterwarnings("ignore")

Function to calculate: $f_H=\frac{p_{H,m}}{p_{N,m}}$, $f_N=\frac{p_{N,m}}{p_{H,m}}$ ,$d_H=p_{H,m}-p_{N,m}$ y $d_N=p_{N,m}-p_{H,m}$

In [3]:
def get_fH_fN_dH_dN(meta,tax):
    ######### Recibe los data frames de los metadatos y la taxonomia.
    
    #Se obtienen los id's de las muestras saludables identificadas en los metadatos y después 
    #observamos la taxonomia de las muestras saludables
    healthy_id = meta[meta['category']=='healthy']['sample']
    tax_healthy = tax[healthy_id]
    
    #Se obtienen los id's de muestras no saludables y despues se observa la taxonmia de estas muestras
    no_healthy_id = meta[meta['category']!='healthy']['sample']
    tax_no_healthy = tax[no_healthy_id]
    
    #Se obtienen todas las especies de todas las muestras
    species = tax.index
    
    #Definimos lower para establecer una cota y evitar divisiones entre 0
    lower=1e-05
    
    #Se crea un Data Frame que tendrá las metricas como columnas y a las especies como index
    metrics=pd.DataFrame(index=species,columns=['f_H','f_N','d_H','d_N'])
    
    #Este ciclo obtiene para cada especie m las prevalencias en las muestras saludables p_H y no saludables P_N
    #Posteriormente se  agregan f_H,f_N, d_H y d_N al data frame metric
    for specie in species:
        
        #Se localiza la especie en todas las muestras healthy y se obtiene su presencia absoluta
        specie_in_H=tax_healthy.loc[specie,:]
        abs_pres_H=len(specie_in_H[specie_in_H!=0])
        
        #Se localiza la especie en todas las muestras no-healthy y se obtiene su presencia absoluta
        specie_in_N=tax_no_healthy.loc[specie,:]
        abs_pres_N=len(specie_in_N[specie_in_N!=0])
        
        #Se obtiene PH y PN de la especie, tomando en cuenta que si el resultado es 0, entonces se intercambia por la cota 1e-05
        PH=np.divide(abs_pres_H,len(specie_in_H),out=np.asanyarray(lower),where=(abs_pres_H!=0))
        PN=np.divide(abs_pres_N,len(specie_in_N),out=np.asanyarray(lower),where=(abs_pres_N!=0))
        metrics.loc[specie,:]=[np.divide(PH,PN),np.divide(PN,PH),PH-PN,PN-PH]
    return metrics

######### Regresa un DataFrame en el que para cada especie se obtienen sus metricas f_H,f_N,d_H y d_N

In [4]:
def get_MH_MN(metrics,theta_f,theta_d):
    ######### Recibe el conjunto de metricas para cada especie y los parámetros de comparación
    
    
    #Se obtienen las especies beneficiosas que son mayores a los parametros theta_f y theta_d
    health_species_pass_theta_f=set(metrics[metrics['f_H']>=theta_f].index)
    health_species_pass_theta_d=set(metrics[metrics['d_H']>=theta_d].index)
    
    #Se obtienen las especies dañinas que son mayores a los parametros theta_f y theta_d
    no_health_species_pass_theta_f=set(metrics[metrics['f_N']>=theta_f].index)
    no_health_species_pass_theta_d=set(metrics[metrics['d_N']>=theta_d].index)
    
    #Se definen los conjuntos de las especies beneficiosas y dañinas que superan ambos parámetros
    MH=health_species_pass_theta_f & health_species_pass_theta_d
    MN=no_health_species_pass_theta_f & no_health_species_pass_theta_d
    
    # print('|MH|=', len(MH) )
    # print('|MN|=', len(MN))        
    return MH,MN

######### Regresa los conjuntos de especies identificadas beneficiosas MH y dañinas MN, de acuerdo a los parámetros

This function calculates $\Psi_{{M_H},i}=\frac{R_{{M_H},i}}{|M_H|} \sum_{j\in I_{M_H}}|n_{j,i}\ln(n_{j,i})|$ or $\Psi_{{M_N},i}$ for sample $i$.
Here: 
- $R_{{M_H},i}$ is the species richness of $M_H$ in sample $i$.</li>
- $|M_H|$ is the size of the set $M_H$.</li>
- $I_{M_H}$ is the index set of $M_H$.</li>
- $n_{j,i}$ is the relative abundance of species $j$ in sample $i$.</li>

$\Psi_{{M_N},i}$ for sample $i$ is calculated analogously.



In [5]:
def get_Psi(set_M,sample):
    ######### Recibe el conjunto M_H o M_N y la muestra con la presencia relativa de cada especie
    
    
    #M_in_sample es el conjunto M_H o M_N intersección las especies presentes en la muestra i
    M_in_sample=set(sample[sample!=0].index) & set_M
    
    #Se calcula la R_M
    R_M_sample=np.divide(len(M_in_sample),len(set_M))
    
    #Se obtiene el array n, que contiene las abundanicas relativas de las especies presentes de M en la muestra i
    #Posteriormente se calcula el logaritmo y la suma
    n=sample[sample!=0][list(M_in_sample)]
    log_n=np.log(n)
    sum_nlnn=np.sum(n*log_n)
    
    #Finalmente se recupera Psi para la muestra i y el conjunto M
    Psi=np.divide(R_M_sample,len(set_M))*np.absolute(sum_nlnn)
    
    #Se evita que el caso Psi sea igual a 0 para evitar división entre 0 en la siguiente función. 
    if Psi==0:
        Psi=1e-05
    return Psi

######### Regresa el número Psi asociado a la muestra i y  al conjunto M_H o M_N.   

In [6]:
def get_all_GMHI(tax,MH,MN):
    ######### Se ingresa la taxonomia, el conjunto de especies MH y MN.
    

    #Se crea la variable GMHI, una serie de pandas que tiene como indice el nombre de la muestra y como información su indice GMHI.
    #Esta serie se llenará con un ciclo for, que recorre todas las especies
    samples=tax.columns 
    GMHI=pd.Series(index=samples,name='GMHI',dtype='float64')
    for sample in samples:
        
        #Se obtiene Psi_MH y Psi_MN con la función get_Psi
        Psi_MH=get_Psi(MH,tax[sample])
        Psi_MN=get_Psi(MN,tax[sample])
        
        #Se hace el cociente y se evalua en el logaritmo base 10. Posteriormente se agrega la información a la serie GMHI
        GMHI_sample=np.log10(np.divide(Psi_MH,Psi_MN))
        GMHI[sample]=GMHI_sample
        
    return GMHI 

######### Se regresa la serie con el índice GMHI de cada muestra

In [7]:
def get_accuracy(GMHI,meta):    
    return balanced_accuracy_score(['Unhealthy' if x != 'healthy' else 'Healthy' for x in meta['category']], ['Unhealthy' if x < 0 else 'Healthy' for x in list(GMHI)])


In [8]:
def get_accuracy2(GMHI,meta):    
    return f1_score([0 if x != 'healthy' else 1 for x in meta['category']], [0 if x < 0 else 1 for x in list(GMHI)], pos_label = 0)


In [9]:
meta = pd.read_csv('../../DataSets/CAMDA_2025/metadata.txt', sep="\t")
tax=pd.read_csv('../../DataSets/CAMDA_2025/taxa.txt',sep='\t',index_col=0)
metrics=get_fH_fN_dH_dN(meta,tax)

In [35]:
theta_f = 1.4
# theta_d = 0.1 # Esto lo tengo que relajar porque pone en mucha desventaja especies poco abundantes
theta_d = 0.0001
MH,MN=get_MH_MN(metrics,theta_f,theta_d)

In [11]:
MH_test=pd.read_csv('../../DataSets/INDEX/GMHI/MH_species.txt', sep="\t", index_col=0)
MN_test=pd.read_csv('../../DataSets/INDEX/GMHI/MN_species.txt', sep='\t',index_col=0)
MH_estandar=set()
for i in MH_test.index:
    MH_estandar.add(i[3:])
MN_estandar=set()    
for i in MN_test.index:
    MN_estandar.add(i[3:])

In [12]:
H=MH_estandar.union(MH)
N=MN_estandar.union(MN)

In [13]:
GMHI=get_all_GMHI(tax,MH_estandar,MN_estandar)
    
accuracy=get_accuracy(GMHI,meta)

print('El accuracy obtenido es de', accuracy*100 ,'%')
GMHI

El accuracy obtenido es de 48.98468319169913 %


train_0      -2.499385
train_1       1.327050
train_2       0.911783
train_3       0.527261
train_4       2.318180
                ...   
train_4393   -1.240998
train_4394   -1.763727
train_4395    0.293090
train_4396   -3.290298
train_4397   -2.388021
Name: GMHI, Length: 4398, dtype: float64

In [14]:
#Se calcula el GMHI de cada muestra y también se obtiene el data frame de los metadatos
GMHI=get_all_GMHI(tax,MH,MN)
    
accuracy=get_accuracy(GMHI,meta)

print('El accuracy obtenido es de', accuracy*100 ,'%')
GMHI

El accuracy obtenido es de 61.23903517246176 %


train_0       0.752712
train_1       0.116071
train_2       0.911509
train_3       0.468087
train_4       1.142087
                ...   
train_4393   -0.113986
train_4394    0.985035
train_4395    2.086687
train_4396    0.652616
train_4397   -0.489720
Name: GMHI, Length: 4398, dtype: float64

In [15]:
#Se calcula el GMHI de cada muestra y también se obtiene el data frame de los metadatos
GMHI=get_all_GMHI(tax,H,N)
    
accuracy=get_accuracy(GMHI,meta)

print('El accuracy obtenido es de', accuracy*100 ,'%')
GMHI

El accuracy obtenido es de 59.30296543036224 %


train_0      -0.387869
train_1      -1.267850
train_2      -0.096878
train_3      -0.388359
train_4       1.159667
                ...   
train_4393    0.834604
train_4394   -0.993917
train_4395    0.241520
train_4396   -1.054785
train_4397   -0.688152
Name: GMHI, Length: 4398, dtype: float64

In [116]:
# meta = meta[meta['category'].isin(['healthy', 'CRC', 'adenoma', 'IBD', 'UC', 'CD', 'acute_diarrhoea', 'few_polyps'])]
# meta = meta[meta['category'].isin(["healthy", "ACVD", "CAD","HF","CAD:T2D","HF:CAD","HF:CAD:T2D"])]
meta = meta[meta['category'].isin(["healthy", "PD","BD","schizophrenia", "IGT:MS", "ME/CFS"])]
# metabolic_conditions = [
#         "T2D",
#         "IGT",
#         "metabolic_syndrome",
#         "hypercholesterolemia",
#         "adenoma;hypercholesterolemia",
#         "T2D:adenoma",
#         "CRC:hypercholesterolemia",
#         "healthy"
#     ]


# meta = meta[meta['category'].isin(metabolic_conditions)]
meta.reset_index(drop=True, inplace=True)
data = tax[list(meta['sample'])]

In [16]:
data = tax

In [17]:
labels = []
for sample in data.T.index:
    labels.append(meta[meta['sample'] == sample]['category'].iloc[0])
labels = ['Healthy' if x == 'healthy' else 'Unhealthy' for x in labels]
labels = pd.DataFrame(labels)

In [18]:
meta

Unnamed: 0,category,cohort,category_binary,GMHI,hiPCA,Shannon_Entropy,Shannon_Entropy_on_Functions,sample
0,healthy,BedarfJR_2017,1,-3.040711,-2.230508,2.712028,5.090256,train_0
1,healthy,BedarfJR_2017,1,2.815237,-0.255368,2.843911,4.680161,train_1
2,healthy,BedarfJR_2017,1,0.900492,-0.746855,3.213241,4.994824,train_2
3,healthy,BedarfJR_2017,1,-0.464491,-1.179001,3.359700,5.218929,train_3
4,healthy,BedarfJR_2017,1,1.615869,-0.672061,3.397204,5.096859,train_4
...,...,...,...,...,...,...,...,...
4393,schizophrenia,ZhuF_2020,0,-1.353887,-0.436142,1.267090,4.773954,train_4393
4394,schizophrenia,ZhuF_2020,0,-3.017436,-1.864214,2.581850,5.102139,train_4394
4395,schizophrenia,ZhuF_2020,0,-0.545163,-1.790557,2.539637,5.117396,train_4395
4396,schizophrenia,ZhuF_2020,0,-3.686342,-1.776473,2.405504,5.228193,train_4396


In [19]:
data

Unnamed: 0_level_0,train_0,train_1,train_2,train_3,train_4,train_5,train_6,train_7,train_8,train_9,...,train_4388,train_4389,train_4390,train_4391,train_4392,train_4393,train_4394,train_4395,train_4396,train_4397
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bacteroides_vulgatus,11.66601,2.73563,12.44949,10.04475,7.68169,7.41403,2.10612,0.43836,0.67724,2.27049,...,16.16002,84.97335,5.81158,0.18158,10.80066,0.12498,7.38215,9.87256,34.63441,11.67750
Roseburia_faecis,0.00000,0.00000,0.00000,0.24374,4.10589,1.37947,0.78747,3.13218,10.80664,1.06868,...,0.66276,0.00827,0.03672,4.94094,0.84947,0.29030,0.02743,0.01955,0.00000,0.00964
Ruminococcus_gnavus,0.59774,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.60347,0.04162,0.30747,0.10444,0.00000,0.00172,0.29646,2.59479,0.60770
Roseburia_intestinalis,17.31798,0.18250,5.60477,3.53356,0.50953,5.28531,0.00000,5.61191,0.26558,0.00160,...,1.17488,0.00565,0.23012,0.92168,0.08458,0.28876,1.18431,0.00395,0.00000,5.36433
Blautia_wexlerae,0.00000,0.03673,0.03077,0.00000,0.00914,0.26341,0.00000,0.00000,0.90261,0.00000,...,0.04031,0.30255,0.02733,0.02141,0.06950,0.00489,0.04333,0.03602,0.00318,0.04264
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Geobacillus_stearothermophilus,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
Corynebacterium_falsenii,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
Mycobacterium_avium,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
Oligella_ureolytica,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.01006,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000


In [36]:
metrics=get_fH_fN_dH_dN(meta, data)

In [37]:
MH,MN=get_MH_MN(metrics,theta_f,theta_d)

In [38]:
MH

{'Acinetobacter_parvus',
 'Aerococcus_urinaeequi',
 'Aerococcus_viridans',
 'Aeromonas_caviae',
 'Bacillus_cereus_group',
 'Bacillus_sp_FJAT_27916',
 'Bacteroides_caecimuris',
 'Bacteroides_faecichinchillae',
 'Bacteroides_intestinalis',
 'Bacteroides_oleiciplenus',
 'Bacteroides_pectinophilus',
 'Bacteroides_sartorii',
 'Bacteroides_sp_OM05_12',
 'Bacteroidetes_bacterium_oral_taxon_272',
 'Bifidobacterium_kashiwanohense',
 'Bifidobacterium_ruminantium',
 'Bosea_vaviloviae',
 'Candidatus_Gastranaerophilales_bacterium_HUM_9',
 'Candidatus_Stoquefichus_sp_KLE1796',
 'Carnobacterium_maltaromaticum',
 'Citrobacter_koseri',
 'Citrobacter_portucalensis',
 'Clostridium_neonatale',
 'Clostridium_sp_CAG_299',
 'Clostridium_sp_chh4_2',
 'Comamonas_kerstersii',
 'Coprobacter_sp',
 'Corynebacterium_casei',
 'Corynebacterium_glutamicum',
 'Corynebacterium_variabile',
 'Desulfovibrio_fairfieldensis',
 'Dialister_sp_CAG_357',
 'Dysgonomonas_gadei',
 'Dysgonomonas_sp_37_18',
 'Elizabethkingia_anopheli

In [39]:
MN

{'Abiotrophia_defectiva',
 'Abiotrophia_sp_HMSC24B09',
 'Acidaminococcus_fermentans',
 'Actinobaculum_sp_oral_taxon_183',
 'Actinomyces_cardiffensis',
 'Actinomyces_graevenitzii',
 'Actinomyces_hongkongensis',
 'Actinomyces_johnsonii',
 'Actinomyces_massiliensis',
 'Actinomyces_naeslundii',
 'Actinomyces_oris',
 'Actinomyces_sp_HMSC035G02',
 'Actinomyces_sp_HPA0247',
 'Actinomyces_sp_S6_Spd3',
 'Actinomyces_sp_oral_taxon_180',
 'Actinomyces_sp_oral_taxon_181',
 'Actinomyces_sp_oral_taxon_414',
 'Actinomyces_sp_oral_taxon_897',
 'Actinomyces_turicensis',
 'Actinomyces_viscosus',
 'Aeriscardovia_aeriphila',
 'Aeromonas_allosaccharophila',
 'Aeromonas_hydrophila',
 'Aggregatibacter_segnis',
 'Aggregatibacter_sp_oral_taxon_458',
 'Alistipes_sp_An66',
 'Alistipes_sp_CHKCI003',
 'Alloprevotella_rava',
 'Alloprevotella_tannerae',
 'Alloscardovia_omnicolens',
 'Anaerococcus_vaginalis',
 'Anaeroglobus_geminatus',
 'Arcobacter_butzleri',
 'Atlantibacter_hermannii',
 'Atopobium_deltae',
 'Atopobi

In [119]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

evaluations = []
indexes = []
# Loop over each fold
for fold, (train_index, test_index) in enumerate(skf.split(data.T, labels)):
    # model_name = f'camda_pathways_fold{fold+1}'
    X_train, X_test = data.T.iloc[train_index], data.T.iloc[test_index]
    y_train, y_test = meta.iloc[train_index], meta.iloc[test_index]

    print(f"Fold {fold + 1}")
    
    metrics=get_fH_fN_dH_dN(y_train,X_train.T)
    # print(metrics)
    MH,MN=get_MH_MN(metrics,theta_f,theta_d)
    # print(MH)
    # print('.......')
    # print(MN)
    GMHI=get_all_GMHI(X_test.T,MH,MN)
    indexes.append(GMHI)
    
    accuracy=get_accuracy2(GMHI,y_test)
    # healthy_sub = [x for x in X_train.index if x in healthy]
    # nonhealthy_sub = [x for x in X_train.index if x in non_healthy]
    print(f'F1_score: {accuracy}')
    evaluations.append(accuracy)

          

Fold 1
F1_score: 0.15527950310559005
Fold 2
F1_score: 0.16
Fold 3
F1_score: 0.16
Fold 4
F1_score: 0.13855421686746988
Fold 5
F1_score: 0.1373134328358209


In [120]:
GMHI_tax = pd.concat(indexes)

In [121]:
GMHI_tax.to_csv('GMHI_2025_tax_PC.csv')

In [65]:
GMHI_tax

train_5       1.027441
train_72      0.602550
train_81      0.134685
train_200     1.052856
train_203     1.344975
                ...   
train_4370    3.145617
train_4373    4.684258
train_4378    1.890247
train_4380    1.766152
train_4390    0.762429
Name: GMHI, Length: 2412, dtype: float64

In [122]:
pathways = pd.read_csv('../../DataSets/CAMDA_2025/pathways.txt', sep = '\t', index_col = 0)

In [123]:
# meta = meta[meta['category'].isin(['healthy', 'CRC', 'adenoma', 'IBD', 'UC', 'CD', 'acute_diarrhoea', 'few_polyps'])]
# meta.reset_index(drop=True, inplace=True)
data = pathways[list(meta['sample'])]

In [28]:
# data = pathways

In [124]:

labels = []
for sample in data.T.index:
    labels.append(meta[meta['sample'] == sample]['category'].iloc[0])

labels = pd.DataFrame(labels)

In [125]:
data

Unnamed: 0_level_0,train_0,train_1,train_2,train_3,train_4,train_5,train_6,train_7,train_8,train_9,...,train_4388,train_4389,train_4390,train_4391,train_4392,train_4393,train_4394,train_4395,train_4396,train_4397
sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PWY-5686:_UMP_biosynthesis,0.000058,0.000059,0.000179,0.000146,0.000147,0.000210,0.000191,0.000164,0.000046,0.000559,...,0.000438,0.000417,0.000700,0.000462,0.000599,0.000790,0.000571,0.000503,0.000463,0.000514
PWY-6151:_S-adenosyl-L-methionine_cycle_I,0.000040,0.000057,0.000158,0.000113,0.000128,0.000175,0.000139,0.000159,0.000051,0.000457,...,0.000375,0.000373,0.000687,0.000455,0.000644,0.000764,0.000357,0.000450,0.000290,0.000407
PWY-7221:_guanosine_ribonucleotides_de_novo_biosynthesis,0.000062,0.000062,0.000182,0.000154,0.000155,0.000196,0.000200,0.000157,0.000052,0.000581,...,0.000425,0.000506,0.000667,0.000454,0.000597,0.000772,0.000543,0.000515,0.000494,0.000474
PWY-7219:_adenosine_ribonucleotides_de_novo_biosynthesis,0.000062,0.000085,0.000180,0.000139,0.000158,0.000191,0.000197,0.000152,0.000048,0.000591,...,0.000436,0.000551,0.000709,0.000427,0.000593,0.000746,0.000505,0.000469,0.000429,0.000421
ILEUSYN-PWY:_L-isoleucine_biosynthesis_I_(from_threonine),0.000062,0.000045,0.000188,0.000165,0.000144,0.000214,0.000203,0.000151,0.000049,0.000489,...,0.000469,0.000421,0.000667,0.000466,0.000558,0.000808,0.000500,0.000480,0.000459,0.000521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PWY-5655:_L-tryptophan_degradation_IX,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
PWY-7255:_ergothioneine_biosynthesis_I_(bacteria),0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
PWY-5109:_2-methylbutanoate_biosynthesis,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
PWY-7626:_bacilysin_biosynthesis,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [126]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

evaluations = []
gmhi = []
# Loop over each fold
for fold, (train_index, test_index) in enumerate(skf.split(data.T, labels)):
    # model_name = f'camda_pathways_fold{fold+1}'
    X_train, X_test = data.T.iloc[train_index], data.T.iloc[test_index]
    y_train, y_test = meta.iloc[train_index], meta.iloc[test_index]

    print(f"Fold {fold + 1}")
    
    metrics=get_fH_fN_dH_dN(y_train,X_train.T)
    MH,MN=get_MH_MN(metrics,theta_f,theta_d)
    GMHI=get_all_GMHI(X_test.T,MH,MN)
    gmhi.append(GMHI)
    accuracy=get_accuracy2(GMHI,y_test)
    # healthy_sub = [x for x in X_train.index if x in healthy]
    # nonhealthy_sub = [x for x in X_train.index if x in non_healthy]
    print(f'F1_score: {accuracy}')
    evaluations.append(accuracy)

Fold 1
F1_score: 0.07650273224043715
Fold 2
F1_score: 0.1386138613861386
Fold 3
F1_score: 0.13872832369942195
Fold 4
F1_score: 0.14207650273224043
Fold 5
F1_score: 0.12903225806451613


In [127]:
pd.concat(gmhi).to_csv('GMHI_2025_pathways_PC.csv')

In [44]:
sum(evaluations) / len(evaluations)

0.5387895508769157

In [19]:
pathways.T

# Pathway,UNINTEGRATED|g__Absiella.s__Absiella_dolichum,UNINTEGRATED|g__Acetobacter.s__Acetobacter_sp_CAG_267,UNINTEGRATED|g__Acetobacter.s__Acetobacter_sp_CAG_977,UNINTEGRATED|g__Acholeplasma.s__Acholeplasma_sp_CAG_878,UNINTEGRATED|g__Acidaminococcus.s__Acidaminococcus_intestini,UNINTEGRATED|g__Acinetobacter.s__Acinetobacter_idrijaensis,UNINTEGRATED|g__Acinetobacter.s__Acinetobacter_lwoffii,UNINTEGRATED|g__Actinomyces.s__Actinomyces_sp_HMSC035G02,UNINTEGRATED|g__Actinomyces.s__Actinomyces_sp_HPA0247,UNINTEGRATED|g__Actinomyces.s__Actinomyces_sp_ICM47,...,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_atypica,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_denticariosi,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_dispar,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_infantium,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_parvula,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_rogosae,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_seminalis,VALSYN-PWY: L-valine biosynthesis|g__Veillonella.s__Veillonella_tobetsuensis,VALSYN-PWY: L-valine biosynthesis|g__Victivallales_unclassified.s__Victivallales_bacterium_CCUG_44730,VALSYN-PWY: L-valine biosynthesis|g__Victivallis.s__Victivallis_vadensis
SRR5946989,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
SRR5983265,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
SRR5946777,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
SRR5946822,0.0,0.0,0.0,0.0,0.02683,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
SRR5946857,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR5946648,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
SRR5946925,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
ERR209694,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
SRR5946668,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.000002,0.0,0.0,0.0,0.000075,0.000002,0.0,0.0,0.0,0.0


In [20]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)

evaluations = []
# Loop over each fold
for fold, (train_index, test_index) in enumerate(skf.split(pathways.T, labels)):
    # model_name = f'camda_pathways_fold{fold+1}'
    X_train_pathways, X_test_pathways = pathways.T.iloc[train_index], pathways.T.iloc[test_index]
    X_train_tax, X_test_tax = tax.T.iloc[train_index], tax.T.iloc[test_index]

    X_train_un = X_train_pathways[[x for x in X_train.columns if 'UNINTEGRATED' in x]]
    X_train_in = X_train_pathways[[x for x in X_train.columns if 'UNINTEGRATED' not in x]]

    X_test_un = X_test_pathways[[x for x in X_test_pathways.columns if 'UNINTEGRATED' in x]]
    X_test_in = X_test_pathways[[x for x in X_test_pathways.columns if 'UNINTEGRATED' not in x]]
    
    y_train, y_test = meta.iloc[train_index], meta.iloc[test_index]

    print(f"Fold {fold + 1}")

    # print(X_train_tax)
    
    metrics=get_fH_fN_dH_dN(y_train,X_train_un.T)
    MH_un,MN_un=get_MH_MN(metrics,theta_f,theta_d)
    GMHI_train_un=get_all_GMHI(X_train_un.T,MH_un,MN_un)
    GMHI_test_un=get_all_GMHI(X_test_un.T,MH_un,MN_un)
    
    metrics=get_fH_fN_dH_dN(y_train,X_train_in.T)
    MH_in,MN_in=get_MH_MN(metrics,theta_f,theta_d)
    GMHI_train_in=get_all_GMHI(X_train_in.T,MH_in,MN_in)
    GMHI_test_in=get_all_GMHI(X_test_in.T,MH_in,MN_in)
    
    metrics=get_fH_fN_dH_dN(y_train,X_train_tax.T)
    MH_tax,MN_tax=get_MH_MN(metrics,theta_f,theta_d)
    GMHI_train_tax=get_all_GMHI(X_train_tax.T,MH_tax,MN_tax)
    GMHI_test_tax=get_all_GMHI(X_test_tax.T,MH_tax,MN_tax)

    new_data_train = pd.DataFrame(zip(GMHI_train_un, GMHI_train_in, GMHI_train_tax), columns = ['GMHI_unintegrated', 'GMHI_integrated', 'GMHI_taxonomy'])
    new_data_train.index = X_train_tax.index

    new_data_test = pd.DataFrame(zip(GMHI_test_un, GMHI_test_in, GMHI_test_tax), columns = ['GMHI_unintegrated', 'GMHI_integrated', 'GMHI_taxonomy'])
    new_data_test.index = X_test_tax.index
    # print(new_data_test.head(3))

    clf = RandomForestClassifier(max_depth=5, min_samples_leaf = 3, random_state=0)
    # clf = DecisionTreeClassifier(max_depth=5, min_samples_leaf = 3, random_state=0)
    clf.fit(new_data_train, ['Unhealthy' if x != 'Healthy' else 'Healthy' for x in y_train['Diagnosis']])

    pred = clf.predict(new_data_test)

    print(balanced_accuracy_score(['Unhealthy' if x != 'Healthy' else 'Healthy' for x in y_test['Diagnosis']], pred))


    

    # print(GMHI_un)
    
    # accuracy=get_accuracy(GMHI,y_test)
    # # healthy_sub = [x for x in X_train.index if x in healthy]
    # # nonhealthy_sub = [x for x in X_train.index if x in non_healthy]
    # print(f'Balanced Accuracy: {accuracy}')
    # evaluations.append(accuracy)

Fold 1
0.8043650793650794
Fold 2
0.8353174603174602
Fold 3
0.803968253968254
Fold 4
0.7787634408602151
Fold 5
0.7696236559139785


# Final Model RF-GMHI

In [21]:
unintegrated = pathways.T[[x for x in pathways.index if 'UNINTEGRATED' in x]]
integrated = pathways.T[[x for x in pathways.index if 'UNINTEGRATED' not in x]]

In [22]:
metrics=get_fH_fN_dH_dN(meta,unintegrated.T)
MH_un,MN_un=get_MH_MN(metrics,theta_f,theta_d)
# print(MH_un,MN_un)
pd.DataFrame(zip(MH_un,MN_un), columns = ['Healthy', 'Unhealthy']).to_csv('model_data/pathways_unintegrated.csv', index = False)
GMHI_train_un=get_all_GMHI(unintegrated.T,MH_un,MN_un)
# GMHI_test_un=get_all_GMHI(X_test_un.T,MH_un,MN_un)
    
metrics=get_fH_fN_dH_dN(meta,integrated.T)
MH_in,MN_in=get_MH_MN(metrics,theta_f,theta_d)
pd.DataFrame(zip(MH_in,MN_in), columns = ['Healthy', 'Unhealthy']).to_csv('model_data/pathways_integrated.csv', index = False)
GMHI_train_in=get_all_GMHI(integrated.T,MH_in,MN_in)
# GMHI_test_in=get_all_GMHI(X_test_in.T,MH_in,MN_in)
    
metrics=get_fH_fN_dH_dN(meta,tax)
MH_tax,MN_tax=get_MH_MN(metrics,theta_f,theta_d)
pd.DataFrame(zip(MH_tax,MN_tax), columns = ['Healthy', 'Unhealthy']).to_csv('model_data/taxonomy.csv', index = False)
GMHI_train_tax=get_all_GMHI(tax,MH_tax,MN_tax)
# GMHI_test_tax=get_all_GMHI(X_test_tax.T,MH_tax,MN_tax)

new_data_train = pd.DataFrame(zip(GMHI_train_un, GMHI_train_in, GMHI_train_tax), columns = ['GMHI_unintegrated', 'GMHI_integrated', 'GMHI_taxonomy'])

new_data_train.index = tax.columns
print(new_data_train.head(3))

clf = RandomForestClassifier(max_depth=5, min_samples_leaf = 3, random_state=0)
clf.fit(new_data_train, ['Unhealthy' if x != 'Healthy' else 'Healthy' for x in meta['Diagnosis']])
with open('model_data/rf_gmhi.pkl', 'wb') as f:
    pickle.dump(clf, f)


            GMHI_unintegrated  GMHI_integrated  GMHI_taxonomy
SRR5946989           2.956522         0.376066       3.216072
SRR5983265           2.546234         4.130309       2.537224
SRR5946777           3.154197         0.550126       3.225344


## COVID data

In [23]:
meta_covid=pd.read_csv('../../DataSets/COVID/CAMDA_metadata.txt', sep="\t",index_col=0)
tax_covid=pd.read_csv('../../DataSets/COVID/CAMDA_taxa.txt', sep='\t',index_col=0)
pathways_covid=pd.read_csv('../../DataSets/COVID/CAMDA_pathways.txt', sep='\t',index_col=0)

In [24]:
unintegrated_covid = pathways_covid.T[[x for x in pathways_covid.index if 'UNINTEGRATED' in x]]
integrated_covid = pathways_covid.T[[x for x in pathways_covid.index if 'UNINTEGRATED' not in x]]

In [25]:
GMHI_covid_un=get_all_GMHI(unintegrated_covid.T,MH_un,MN_un)
GMHI_covid_in=get_all_GMHI(integrated_covid.T,MH_in,MN_in)
GMHI_covid_tax=get_all_GMHI(tax_covid,MH_tax,MN_tax)

In [26]:
covid_gmhi = pd.DataFrame(zip(GMHI_covid_un, GMHI_covid_in, GMHI_covid_tax), columns = ['GMHI_unintegrated', 'GMHI_integrated', 'GMHI_taxonomy'])

In [27]:
covid_gmhi.index = tax_covid.columns

In [28]:
covid_gmhi.head()

Unnamed: 0,GMHI_unintegrated,GMHI_integrated,GMHI_taxonomy
Sample1a,2.000261,0.0,0.389201
Sample1b,2.24355,0.0,1.368506
Sample5a,-0.559261,0.0,0.837585
Sample5b,-0.516629,0.0,0.741994
Sample6a,1.405457,0.0,2.651577


In [29]:
clf.predict(covid_gmhi)

array(['Unhealthy', 'Healthy', 'Unhealthy', 'Unhealthy', 'Healthy',
       'Unhealthy', 'Healthy', 'Healthy', 'Unhealthy', 'Unhealthy',
       'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Healthy',
       'Healthy', 'Healthy', 'Healthy', 'Unhealthy', 'Unhealthy',
       'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy',
       'Unhealthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy',
       'Unhealthy', 'Unhealthy', 'Healthy', 'Healthy', 'Healthy',
       'Healthy', 'Healthy', 'Unhealthy', 'Healthy', 'Healthy',
       'Unhealthy', 'Unhealthy', 'Healthy', 'Healthy', 'Healthy',
       'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Unhealthy', 'Healthy',
       'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy',
       'Healthy', 'Unhealthy', 'Unhealthy', 'Unhealthy', 'Unhealthy',
       'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy', 'Healthy',
       'Healthy', 'Unhealthy'], dtype='<U9')