# Build datasets for Reservoir Computing

## Exchange metabolites list

In [1]:
# Survey the medium and the exchange reactions of the selected cobra models

DIRECTORY = './Dataset_input/'

from Library.Import import *
from Library.Build_Dataset import get_objective
import cobra

# Both dictionaries map a reaction id to the comma-separated names of the
# models in which that reaction was found.
MEDIA, EXCHANGE = {}, {}
species = [
    'e_coli_core',  # 0
    'iEK1008',      # 1
    'iIT341',       # 2
    'iJN1463',      # 3
    'iML1515',      # 4
    'iMM904',       # 5
    'iPC815',       # 6
    'iYO844',       # 7
    'iYS1720',      # 8
    'iYS854',       # 9
    'iCN718',       # 10
    'iNF517',       # 11
    'iML1515EXP',   # 12
]
for i in [11]:
    name = species[i]
    cobrafile = f'{DIRECTORY}{name}_duplicated.xml'
    model = cobra.io.read_sbml_model(cobrafile)

    # Register every reaction listed in the model medium
    for r in model.medium.keys():
        MEDIA[r] = f'{MEDIA[r]},{name}' if r in MEDIA else name

    # Register (and count) every reaction whose id contains '_e'
    nEXCHANGE = 0
    for reaction in model.reactions:
        r = reaction.id
        if '_e' in r:
            nEXCHANGE += 1
            EXCHANGE[r] = f'{EXCHANGE[r]},{name}' if r in EXCHANGE else name

    # Optimize the model and report the flux through its objective reaction
    objective = get_objective(model)
    solution = model.optimize()
    print(f'{name} Medium-size: {len(model.medium.keys())} Exchange reactions: {nEXCHANGE} Objective: {objective} {solution.fluxes[objective]}')

# Dump both tables, one reaction per line
for r, found_in in MEDIA.items():
    print(f'Medium {r} {found_in}')
for r, found_in in EXCHANGE.items():
    print(f'Exchange {r} {found_in}')


iNF517 Medium-size: 99 Exchange reactions: 204 Objective: BIOMASS_LLA 0.04263460544337314
Medium EX_2aeppn_e_i iNF517
Medium EX_2h3mb_e_i iNF517
Medium EX_2h3mp_e_i iNF517
Medium EX_2hxic__L_e_i iNF517
Medium EX_2mba_e_i iNF517
Medium EX_2mbald_e_i iNF517
Medium EX_2mpa_e_i iNF517
Medium EX_3mba_e_i iNF517
Medium EX_3mbal_e_i iNF517
Medium EX_4abut_e_i iNF517
Medium EX_4abz_e_i iNF517
Medium EX_acald_e_i iNF517
Medium EX_acgala_e_i iNF517
Medium EX_acgam_e_i iNF517
Medium EX_acmana_e_i iNF517
Medium EX_actn__R_e_i iNF517
Medium EX_ade_e_i iNF517
Medium EX_akg_e_i iNF517
Medium EX_ala__D_e_i iNF517
Medium EX_ala__L_e_i iNF517
Medium EX_arg__L_e_i iNF517
Medium EX_asn__L_e_i iNF517
Medium EX_asp__L_e_i iNF517
Medium EX_btd_RR_e_i iNF517
Medium EX_bzal_e_i iNF517
Medium EX_cellb_e_i iNF517
Medium EX_ch4s_e_i iNF517
Medium EX_chol_e_i iNF517
Medium EX_cit_e_i iNF517
Medium EX_co2_e_i iNF517
Medium EX_cys__L_e_i iNF517
Medium EX_dha_e_i iNF517
Medium EX_diact_e_i iNF517
Medium EX_drib_e_i i

In [2]:
# Get media
# Minimal medium: metabolites present in both the initial strain and the duplicated strain
# Variable medium: metabolites present in the duplicated strain but NOT in the initial strain

DIRECTORY = './Dataset_input/'

from Library.Import import *
from Library.Build_Dataset import get_objective
import cobra

MEDIA, EXCHANGE = {}, {}
species = [
    'e_coli_core',  # 0
    'iEK1008',      # 1
    'iIT341',       # 2
    'iJN1463',      # 3
    'iML1515',      # 4
    'iMM904',       # 5
    'iPC815',       # 6
    'iYO844',       # 7
    'iYS1720',      # 8
    'iYS854',       # 9
    'iCN718',       # 10
    'iNF517',       # 11
    'iML1515EXP',   # 12
]

# First pass: list the medium reactions of the initial model
for i in [11]:
    name = species[i]
    cobrafile = f'{DIRECTORY}{name}.xml'  # initial model file
    model = cobra.io.read_sbml_model(cobrafile)
    for r in model.medium:
        print(name, r)

# Second pass: list the medium reactions of the duplicated model together
# with their medium values, then print the optimized objective value
for i in [11]:
    name = species[i]
    cobrafile = f'{DIRECTORY}{name}_duplicated.xml'  # duplicated model file
    model = cobra.io.read_sbml_model(cobrafile)
    for r in model.medium.keys():
        print(name, r, model.medium[r])
    solution = model.optimize()
    print(solution.objective_value)

iNF517 EX_4abz_e
iNF517 EX_ade_e
iNF517 EX_ala__L_e
iNF517 EX_arg__L_e
iNF517 EX_asp__L_e
iNF517 EX_co2_e
iNF517 EX_cys__L_e
iNF517 EX_fe2_e
iNF517 EX_fe3_e
iNF517 EX_glc__D_e
iNF517 EX_glu__L_e
iNF517 EX_gly_e
iNF517 EX_gua_e
iNF517 EX_h2o_e
iNF517 EX_h_e
iNF517 EX_his__L_e
iNF517 EX_ile__L_e
iNF517 EX_ins_e
iNF517 EX_leu__L_e
iNF517 EX_lys__L_e
iNF517 EX_met__L_e
iNF517 EX_mn2_e
iNF517 EX_nac_e
iNF517 EX_nh4_e
iNF517 EX_orot_e
iNF517 EX_phe__L_e
iNF517 EX_pi_e
iNF517 EX_pnto__R_e
iNF517 EX_ribflv_e
iNF517 EX_ser__L_e
iNF517 EX_thm_e
iNF517 EX_thr__L_e
iNF517 EX_thymd_e
iNF517 EX_ura_e
iNF517 EX_val__L_e
iNF517 EX_xan_e
iNF517 EX_zn2_e
iNF517 EX_2aeppn_e_i 1e-300
iNF517 EX_2h3mb_e_i 1e-300
iNF517 EX_2h3mp_e_i 1e-300
iNF517 EX_2hxic__L_e_i 1e-300
iNF517 EX_2mba_e_i 1e-300
iNF517 EX_2mbald_e_i 1e-300
iNF517 EX_2mpa_e_i 1e-300
iNF517 EX_3mba_e_i 1e-300
iNF517 EX_3mbal_e_i 1e-300
iNF517 EX_4abut_e_i 1e-300
iNF517 EX_4abz_e_i 0.00999
iNF517 EX_acald_e_i 1e-300
iNF517 EX_acgala_e_i 1e-300
i

## Generating training sets

### From experimental data set

In [3]:
# Build a training set from an experimental dataset

from Library.Import import *
from Library.Build_Dataset import TrainingSet, get_index_from_id
DIRECTORY = './Dataset_input/Covid/'

# ---- User settings ----
seed = 10
np.random.seed(seed)             # seed for random number generator
cobraname = 'Covid_duplicated'   # name of the model
mediumname = 'Covid_GR'          # name of the medium file
method = 'EXP'                   # FBA, pFBA or EXP
reduce = False                   # Set at True if you want to reduce the model
# ---- End of user settings ----

# Build the training set from the model and medium files
cobrafile = DIRECTORY + cobraname
mediumfile = DIRECTORY + mediumname
parameter = TrainingSet(
    cobraname=cobrafile,
    mediumname=mediumfile,
    method=method,
    verbose=False,
)

# The training set is saved under the same path stem as the medium file
trainingfile = mediumfile
parameter.save(trainingfile, reduce=reduce)

# Sanity check: reload what was just saved and print it
parameter = TrainingSet()
parameter.load(trainingfile)
print(trainingfile)
parameter.printout()

SystemExit: ./Dataset_input/Covid/Covid_GR file not found

### From FBA simulated data

In [4]:
# Build an FBA-simulated training set for the selected species

from Library.Import import *
from Library.Build_Dataset import TrainingSet, get_objective, get_minmed_varmed_ko
import matplotlib.pyplot as plt
import cobra

DIRECTORY = './Dataset_input/'

from Library.Utilities import *
from Library.Build_Dataset import TrainingSet

seed = 10
np.random.seed(seed)
size = 10000  # number of samples requested when generating the training set
ratmed = 28   # max nbr of variable media turned on biolog 127 Paul 28

species = [
    'e_coli_core',  # 0
    'iEK1008',      # 1
    'iIT341',       # 2
    'iJN1463',      # 3
    'iML1515',      # 4
    'iMM904',       # 5
    'iPC815',       # 6
    'iYO844',       # 7
    'iYS1720',      # 8
    'iYS854',       # 9
    'iCN718',       # 10
    'iNF517',       # 11
    'iML1515EXP',   # 12
]

for i in [11]:
    name = species[i]
    # Names containing 'EXP' use the experimental method, all others FBA
    if 'EXP' in name:
        method = 'EXP'
    else:
        method = 'FBA'

    # Load the duplicated model together with its medium description
    cobrafile = f'{DIRECTORY}{name}_duplicated'
    mediumfile = f'{DIRECTORY}{name}'
    parameter = TrainingSet(
        cobraname=cobrafile,
        mediumname=mediumfile,
        method=method,
        ratmed=ratmed,
        verbose=True,
    )

    # For FBA, start from empty X/Y arrays and generate the samples
    if method == 'FBA':
        parameter.X = np.empty((0, 0))
        parameter.Y = np.empty((0, 0))
        parameter.size = 0
        # The three returned values are not used further in this cell
        minmed, varmed, ko = get_minmed_varmed_ko(parameter.medium)
        parameter.get(sample_size=size, verbose=False)

    # Save the training set
    trainingfile = f'{DIRECTORY}{name}_train'
    parameter.save(trainingfile, reduce=False)

    # Sanity check: reload what was just saved and print it
    parameter = TrainingSet()
    parameter.load(trainingfile)
    parameter.printout()


medium: {'EX_4abz_e_i': (1.0, 6.5), 'EX_nac_e_i': (1.0, 6.5), 'EX_orot_e_i': (1.0, 6.5), 'EX_pnto__R_e_i': (1.0, 6.5), 'EX_ribflv_e_i': (1.0, 6.5), 'EX_thm_e_i': (1.0, 6.5), 'EX_xan_e_i': (1.0, 6.5), 'EX_2aeppn_e_i': (1.0, 6.5), 'EX_2h3mb_e_i': (1.0, 6.5), 'EX_2h3mp_e_i': (1.0, 6.5), 'EX_2hxic__L_e_i': (1.0, 6.5), 'EX_2mba_e_i': (1.0, 6.5), 'EX_2mbald_e_i': (1.0, 6.5), 'EX_2mpa_e_i': (1.0, 6.5), 'EX_3mba_e_i': (1.0, 6.5), 'EX_3mbal_e_i': (1.0, 6.5), 'EX_4abut_e_i': (1.0, 6.5), 'EX_acald_e_i': (1.0, 6.5), 'EX_acgala_e_i': (1.0, 6.5), 'EX_acgam_e_i': (1.0, 6.5), 'EX_acmana_e_i': (1.0, 6.5), 'EX_actn__R_e_i': (1.0, 6.5), 'EX_akg_e_i': (1.0, 6.5), 'EX_ala__D_e_i': (1.0, 6.5), 'EX_btd_RR_e_i': (1.0, 6.5), 'EX_bzal_e_i': (1.0, 6.5), 'EX_cellb_e_i': (1.0, 6.5), 'EX_ch4s_e_i': (1.0, 6.5), 'EX_chol_e_i': (1.0, 6.5), 'EX_cit_e_i': (1.0, 6.5), 'EX_dha_e_i': (1.0, 6.5), 'EX_diact_e_i': (1.0, 6.5), 'EX_drib_e_i': (1.0, 6.5), 'EX_fol_e_i': (1.0, 6.5), 'EX_fru_e_i': (1.0, 6.5), 'EX_gal_e_i': (1.0, 6.

KeyboardInterrupt: 

# Entropy plots

In [5]:
from Library.Import import *
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.decomposition import PCA

def getY(filename):
    """
    Load a saved training set and extract its Y arrays.
    Parameters:
    filename: Path of the training set, passed as-is to TrainingSet.load.
    Returns:
    Y_all, Y_obj (np.array): A copy of the Y array as loaded, and the Y
    array held by the training set after its measures have been filtered
    down to the model objective.
    """
    from Library.Build_Dataset import TrainingSet, get_objective

    training = TrainingSet()
    training.load(filename)

    # Copy Y now, before filter_measure is applied to the training set
    Y_all = np.copy(training.Y)

    objective = get_objective(training.model)
    training.filter_measure(measure=[objective])

    return Y_all, training.Y

def compute_shannon_entropy(cluster_counts):
    """
    Computes the Shannon entropy (in bits) of the cluster distribution.
    Parameters:
    cluster_counts (pd.Series or array-like): Counts of elements in each cluster.
    Returns:
    float: Shannon entropy of the distribution. Empty bins (count 0) do not
    contribute, and an empty or all-zero input yields 0.0.
    """
    counts = np.asarray(cluster_counts, dtype=float)
    total_elements = counts.sum()
    if total_elements <= 0:
        # No element at all: there is no distribution to measure
        return np.float64(0.0)

    # Drop empty bins: by convention 0 * log2(0) = 0, whereas evaluating it
    # numerically gives nan and would poison the whole sum.
    probabilities = counts[counts > 0] / total_elements
    entropy = -np.sum(probabilities * np.log2(probabilities))

    # Adding 0.0 turns the -0.0 obtained for a single cluster into +0.0,
    # so the value never prints as "-0.0000".
    return entropy + 0.0

def cluster(Y, distance_threshold):
    """
    Clusters the rows of Y with Ward hierarchical clustering cut at a distance threshold.

    When Y has more than one column, each row is first reduced to a single
    value: the row sum divided by the average number of non-zero entries per
    row. When Y has a single column it is clustered as is.

    Parameters:
    Y (np.array): 2-D array of Y values (one row per sample).
    distance_threshold (float): The distance threshold for clustering.

    Returns:
    tuple: (size, clusters, distance_threshold, num_clusters, entropy) where
        size is the average number of non-zero values per row (or the number
        of columns when Y has at most one column),
        clusters holds the cluster label of each row of Y,
        distance_threshold is the threshold that was passed in, unchanged,
        num_clusters is the number of distinct clusters,
        entropy is the Shannon entropy of the cluster sizes.
    """
    if Y.shape[1] > 1:
        # Average number of non-null values per row
        size = np.mean(np.count_nonzero(Y, axis=1))
        row_sums = np.sum(Y, axis=1).reshape(-1, 1)
        if size > 0:
            # Reduce each row to one value (a 1-component PCA was the
            # alternative previously tried here)
            Y_reduced = row_sums / size
        else:
            # size == 0 means every entry of Y is zero; dividing would give
            # NaN, which linkage rejects. All rows then reduce to 0.
            Y_reduced = np.zeros(row_sums.shape, dtype=float)
        print(Y_reduced.shape)
    else:
        size = Y.shape[1]
        Y_reduced = Y

    Z = linkage(Y_reduced, method='ward')
    clusters = fcluster(Z, distance_threshold, criterion='distance')

    # Number of rows per cluster label, ordered by label
    cluster_counts = pd.Series(clusters).value_counts().sort_index()

    num_clusters = len(cluster_counts)
    entropy = compute_shannon_entropy(cluster_counts)

    return size, clusters, distance_threshold, num_clusters, entropy

def plot_clusters(Y, clusters, file_path, distance_threshold, adjusted_threshold, color):
    """
    Prints cluster statistics and draws a bar plot of the cluster sizes.
    Parameters:
    Y (np.array): 2-D array of Y values (one row per sample).
    clusters (np.array): Cluster labels for each row of Y.
    file_path (str): Label used in the printed header and in the plot title.
    distance_threshold (float): The distance threshold for clustering.
    adjusted_threshold (float): The adjusted distance threshold for clustering
        (only printed, and only when Y has more than one column).
    color (str): Color for the bars in the plot.
    """
    print(f"File: {file_path} size: {Y.shape}")
    if Y.shape[1] > 1:
        print(f"Threshold (Precision): {distance_threshold:.2f} Adjusted Threshold: {adjusted_threshold:.2f}")
    else:
        print(f"Threshold (Precision): {distance_threshold:.2f}")

    df = pd.DataFrame(Y)
    df['Cluster'] = clusters
    cluster_counts = df['Cluster'].value_counts().sort_index()

    num_clusters = len(cluster_counts)
    entropy = compute_shannon_entropy(cluster_counts)

    print(f"Number of clusters: {num_clusters}")
    print(f"Shannon entropy: {entropy:.4f}")

    # For each cluster, compute the mean vector
    cluster_means = df.groupby('Cluster').mean()

    # Bars are positioned at the cluster labels themselves
    bins = cluster_means.index
    dis = cluster_counts

    plt.figure(figsize=(12, 8))
    if Y.shape[1] == 1:
        # Single-column Y: label each bar with the mean Y value of its cluster
        tick_labels = [f'{mean[0]:.2f}' for mean in cluster_means.values]
    else:
        tick_labels = [f'Cluster {i}' for i in bins]

    plt.bar(bins, dis, color=color, tick_label=tick_labels)
    plt.xlabel('Average Y values for clusters' if Y.shape[1] == 1 else 'Clusters')
    plt.ylabel('Number of elements in cluster')
    plt.title(f"Cluster {file_path}")

    # Adjust y-axis to log scale
    plt.yscale('log')

    # Adjust tick labels to avoid overlap
    plt.xticks(rotation=90)

    # Limit the number of x-tick labels displayed
    if num_clusters > 20:
        step = max(1, num_clusters // 20)
        visible_ticks = np.arange(0, num_clusters, step)
        # visible_ticks are positions into bins/tick_labels, not x coordinates:
        # each tick must sit at the x position of its bar (the cluster label),
        # otherwise every label ends up next to the wrong bar.
        plt.gca().set_xticks(bins[visible_ticks])
        plt.gca().set_xticklabels([tick_labels[i] for i in visible_ticks])

    plt.tight_layout()
    plt.show()

from Library.Utilities import *
from Library.Build_Dataset import TrainingSet, get_objective
import matplotlib.pyplot as plt
DIRECTORY = './Dataset_input/'

seed = 1
np.random.seed(seed)
species = [
    'e_coli_core',  # 0
    'iEK1008',      # 1
    'iIT341',       # 2
    'iJN1463',      # 3
    'iML1515',      # 4
    'iMM904',       # 5
    'iPC815',       # 6
    'iYO844',       # 7
    'iYS1720',      # 8
    'iYS854',       # 9
    'iCN718',       # 10
    'iNF517',       # 11
    'iML1515EXP',   # 12
]
precision = 0.01   # distance threshold handed to cluster()
results = []       # one summary tuple per species
plotting = False   # set to True to draw the cluster bar plots

# Process the first 12 entries of the species list (indices 0 to 11)
for i in range(12):
    name = species[i]
    print(name)

    # Load the Y arrays of the saved training set
    filename = f'{name}_train'
    trainingfile = f'{DIRECTORY}{filename}'
    Y_all, Y_obj = getY(trainingfile)

    # Cluster the objective-only Y values
    (size_obj, clusters_obj, adjusted_threshold_obj,
     num_clusters_obj, entropy_obj) = cluster(Y_obj, precision)
    if plotting:
        plot_clusters(Y_obj, clusters_obj, f'{name}_Y_obj', precision, adjusted_threshold_obj, color='cyan')

    # Cluster the full Y array
    (size_all, clusters_all, adjusted_threshold_all,
     num_clusters_all, entropy_all) = cluster(Y_all, precision)
    if plotting:
        plot_clusters(Y_all, clusters_all, f'{name}_Y_all', precision, adjusted_threshold_all, color='orange')

    results.append((
        name,
        size_obj, adjusted_threshold_obj, num_clusters_obj, entropy_obj,
        size_all, adjusted_threshold_all, num_clusters_all, entropy_all,
    ))

# Print a CSV-style summary once every species has been processed
print("\nSummary:")
print("File Name,Size(obj),Threshold(Y_obj),Clusters(Y_obj),Entropy(Y_obj),Size(all),Threshold(Y_all),Clusters(Y_all),Entropy(Y_all)")
for result in results:
    (name,
     size_obj, adjusted_threshold_obj, num_clusters_obj, entropy_obj,
     size_all, adjusted_threshold_all, num_clusters_all, entropy_all) = result
    print(f"{name},{size_obj},{adjusted_threshold_obj:.2f},{num_clusters_obj},{entropy_obj:.4f},{size_all},{adjusted_threshold_all:.2f},{num_clusters_all},{entropy_all:.4f}")


e_coli_core
(10000, 1)
iEK1008
(10000, 1)
iIT341
(10000, 1)
iJN1463
(10000, 1)
iML1515
(500, 1)
iMM904
(10000, 1)
iPC815
(10000, 1)
iYO844
(10000, 1)
iYS1720
(10000, 1)
iYS854
(10000, 1)
iCN718
(10000, 1)
iNF517
(10000, 1)

Summary:
File Name,Size(obj),Threshold(Y_obj),Clusters(Y_obj),Entropy(Y_obj),Size(all),Threshold(Y_all),Clusters(Y_all),Entropy(Y_all)
e_coli_core,1,0.01,8,2.7113,51.9413,0.01,53,3.7451
iEK1008,1,0.01,122,6.5623,473.5062,0.01,370,8.2311
iIT341,1,0.01,140,5.2601,308.2361,0.01,303,7.1050
iJN1463,1,0.01,176,7.0302,607.6669,0.01,277,7.8844
iML1515,1,0.01,101,6.4724,448.9,0.01,272,7.9030
iMM904,1,0.01,216,7.4888,288.9085,0.01,693,9.3287
iPC815,1,0.01,294,7.8259,377.0682,0.01,551,8.8676
iYO844,1,0.01,180,7.2270,331.4583,0.01,588,8.6993
iYS1720,1,0.01,364,8.2992,430.9575,0.01,1318,9.9860
iYS854,1,0.01,607,8.9376,541.0579,0.01,867,9.5249
iCN718,1,0.01,110,6.3612,304.7148,0.01,1059,9.5954
iNF517,1,0.01,220,7.5784,374.5864,0.01,397,8.2097
