# Test dataset model generation
This notebook tests the dataset generation originally implemented in the Build_dataset notebook. It gives one example for each case: simulated data and experimental data.

### Simulated data

In [1]:
import os
import numpy as np
from metabolicDataset import MetabolicDataset

# Build a simulated training set from the e_coli_core inputs, save it under
# Dataset_model, then reload the saved file and print its summary.

# Seed NumPy's global RNG before generating data.
np.random.seed(seed=10) 

# Input files are looked up under <DIRECTORY>/Dataset_input.
DIRECTORY = "./"
cobra_name =  'e_coli_core_duplicated'  
medium_name = 'e_coli_core'
cobra_file = os.path.join(DIRECTORY,"Dataset_input",cobra_name)
medium_file = os.path.join(DIRECTORY,"Dataset_input",medium_name)

# Run cobra
# NOTE(review): the original author's note (marked with "?") says that leaving
# `objective` and `measure` as empty lists makes the SBML model's default
# objective reaction both the objective and the measure (Y). The recorded
# output of this cell lists 154 measured reactions, so confirm this against
# MetabolicDataset before relying on it.
parameter = MetabolicDataset(cobra_name=cobra_file, 
                             medium_name=medium_file, 
                             medium_bound='UB',  # alternative value noted by the author: 'EB'
                             method='pFBA',
                             objective=[],
                             measure=[])

size  = 50
parameter.get_simulated_data(sample_size=size)

# Saving file
# The file name is <medium_name>_<medium_bound>_<size>.
# NOTE(review): the recorded output shows "e_coli_core_EB_50" / "medium bound: EB",
# which does not match medium_bound='UB' above — the output looks stale.
training_name = medium_name+'_'+parameter.medium_bound+'_'+str(size)
training_file = os.path.join(DIRECTORY,"Dataset_model",training_name)
parameter.save(training_file, reduce=False)  # reduce=False is passed; the recorded output reports "reduced model: False"

# Load
# Re-create the dataset from the file just saved and print its summary.
parameter = MetabolicDataset(training_file=training_file)
parameter.printout()

model file name: ./Dataset_model/e_coli_core_EB_50
reduced model: False
medium file name: ./Dataset_input/e_coli_core
medium bound: EB
list of reactions in objective: ['BIOMASS_Ecoli_core_w_GAM']
method: pFBA
training size: 50
list of medium reactions: 20
list of medium levels: 20
list of medium values: 20
ratio of variable medium turned on: 0.5
list of measured reactions: 154
Stoichiometric matrix (72, 154)
Boundary matrix from reactions to medium: (20, 154)
Measurement matrix from reaction to measures: (154, 154)
Reaction to metabolite matrix: (72, 154)
Metabolite to reaction matrix: (154, 72)
Training set X: (50, 20)
Training set Y: (50, 154)
S_int matrix (67, 154)
S_ext matrix (154, 154)
Q matrix (154, 67)
P matrix (154, 154)
b_int vector (50, 67)
b_ext vector (154,)
Sb matrix (154, 72)
c vector (154,)


In [2]:
import os
import time
import numpy as np
import tensorflow as tf
from aMNWtModel import AMNWtModel
from tools import printout

# Create, train and evaluate an AMN_Wt model on the simulated E. coli core
# training set, then evaluate it on the held-out test split.
DIRECTORY = './'
SAVE_RESERVOIR = False  # when True, the trained model is saved under <DIRECTORY>/Reservoir/

# Seed both NumPy and TensorFlow with the same value.
seed = 10
np.random.seed(seed=seed)  
tf.random.set_seed(seed)

# FBA simulated training set for E. coli core, with upper bound (UB) or exact bound (EB).
# NOTE(review): the original author reports this does not work on Apple M1 chips;
# the cause was not understood at the time of writing.
# NOTE(review): the recorded output of this cell refers to "e_coli_core_EB_50",
# which does not match train_name below — the output looks stale.
train_name = 'e_coli_core_UB_50'  # name of the file under Dataset_model/ to train on
objective = ['BIOMASS_Ecoli_core_w_GAM']
reservoir_name = train_name + "_AMN_Wt"
training_file = os.path.join(DIRECTORY,'Dataset_model/',train_name)

print("---------------------------------------- model ----------------------------------------")

model = AMNWtModel(training_file = training_file, 
                   objective=objective,  
                   model_type='AMN_Wt', 
                   timestep =4,
                   n_hidden = 1,
                   hidden_dim = 50,
                   scaler=True,
                   train_rate=1e-2,
                   epochs=10, 
                   xfold=5,
                   verbose=True,
                   batch_size=7)

# Split off a test set (test_size=0.1); it is read back below as model.X_test / model.Y_test.
model.train_test_split(test_size=0.1, random_state=seed)
model.printout()

print("---------------------------------------- train and evaluate ----------------------------------------")
start_time = time.time()
# Only the second returned value (the statistics object) is used.
_, stats, _ = model.train_evaluate(verbose=False)
# `reservoir` is a second name for the same object as `model`, not a copy.
reservoir = model
delta_time = time.time() - start_time

print("---------------------------------------- printing cross-validation results ----------------------------------------")
stats.printout(reservoir_name, delta_time)


print("---------------------------------------- evaluate model on test set ----------------------------------------")
if SAVE_RESERVOIR:
    reservoir_file = os.path.join(DIRECTORY,'Reservoir/',reservoir_name)
    reservoir.save(reservoir_file)

reservoir.printout()

start_time = time.time()

# TODO(investigate, flagged by the original author): because `reservoir` and
# `model` are the same object, the next line overwrites model.X / model.Y with
# the test split. Confirm whether evaluate_model needs this or whether passing
# X, Y explicitly is enough.
reservoir.X, reservoir.Y = model.X_test, model.Y_test
X, Y = reservoir.model_input(model.X_test, model.Y_test, verbose=False)
pred, obj, loss = reservoir.evaluate_model(X, Y, verbose=False)
delta_time = time.time() - start_time
printout('Test set', delta_time, obj, loss)

---------------------------------------- model ----------------------------------------
number of reactions:  154 154
number of metabolites:  72
filtered measurements size:  1
training file: ./Dataset_model/e_coli_core_EB_50
model type: AMN_Wt
model scaler: 1.0
model input dim: 0
model output dim: 0
model medium bound: EB
timestep: 4
training set size (50, 20) (50, 1)
nbr hidden layer: 1
hidden layer size: 50
activation function: relu
training epochs: 10
training regression: True
training learn rate: 0.01
training droP_out: 0.25
training batch size: 7
training validation iter: 0
training xfold: 5
training early stopping: False
---------------------------------------- train and evaluate ----------------------------------------
Instructions for updating:
Use `tf.linalg.matmul` instead


2023-07-19 16:44:07.031974: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-19 16:44:07.033448: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2023-07-19 16:44:07.290248: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


train = -29.24 test = -48.17 loss-train = 0.025292 loss-test = 0.024604
train = -27.05 test = -17.57 loss-train = 0.038747 loss-test = 0.038635
train = -33.82 test = -54.88 loss-train = 0.007286 loss-test = 0.007257
train = -43.07 test = -28.59 loss-train = 0.004031 loss-test = 0.004059
train = -26.02 test = -62.95 loss-train = 0.017256 loss-test = 0.017014
---------------------------------------- printing cross-validation results ----------------------------------------
Stats for e_coli_core_EB_50_AMN_Wt CPU-time 18.8442
R2 = -31.8397 (+/- 6.2234) Constraint = 0.0185 (+/- 0.0126)
Q2 = -42.4299 (+/- 16.8426) Constraint = 0.0183 (+/- 0.0125)
---------------------------------------- evaluate model on test set ----------------------------------------
training file: ./Dataset_model/e_coli_core_EB_50
model type: AMN_Wt
model scaler: 10.000000000007269
model input dim: 4
model output dim: 0
model medium bound: EB
timestep: 4
training set size (50, 20) (50, 1)
nbr hidden layer: 1
hidden layer

### Experimental data

In [3]:
import os
import numpy as np
from metabolicDataset import MetabolicDataset

# Build a training set from the experimental iML1515 inputs, save it under
# Dataset_model, then reload the saved file and print its summary.

# Seed NumPy's global RNG.
np.random.seed(seed=10)

# Input files are looked up under <DIRECTORY>/Dataset_input.
DIRECTORY = "./"
cobra_name = 'iML1515_EXP'  # reduced iML1515 model (per the original author)
medium_name = 'iML1515_EXP'
cobra_file = os.path.join(DIRECTORY, "Dataset_input", cobra_name)
medium_file = os.path.join(DIRECTORY, "Dataset_input", medium_name)

# Get data
parameter = MetabolicDataset(cobra_name=cobra_file,
                             medium_name=medium_file,
                             medium_bound='UB',
                             medium_size=38,
                             method='EXP',
                             verbose=False)

# Saving file
# The file name is <medium_name>_<medium_bound>.
training_name = medium_name + '_' + parameter.medium_bound
training_file = os.path.join(DIRECTORY, "Dataset_model", training_name)
parameter.save(training_file, reduce=False)  # reduce=False is passed; the recorded output reports "reduced model: False"

# Verifying
# Reload from the saved file. The keyword form matches the reload in the
# simulated-data cell and does not depend on the constructor's parameter order.
parameter = MetabolicDataset(training_file=training_file)
parameter.printout()

model file name: ./Dataset_model/iML1515_EXP_UB
reduced model: False
medium file name: ./Dataset_input/iML1515_EXP
medium bound: UB
list of reactions in objective: ['BIOMASS_Ec_iML1515_core_75p37M']
method: EXP
training size: 110
list of medium reactions: 38
list of medium levels: 0
list of medium values: 0
ratio of variable medium turned on: 0
list of measured reactions: 543
Stoichiometric matrix (1080, 543)
Boundary matrix from reactions to medium: (38, 543)
Measurement matrix from reaction to measures: (543, 543)
Reaction to metabolite matrix: (1080, 543)
Metabolite to reaction matrix: (543, 1080)
Training set X: (110, 38)
Training set Y: (110, 1)
S_int matrix (478, 543)
S_ext matrix (543, 2703)
Q matrix (543, 478)
P matrix (543, 543)
b_int vector (478,)
b_ext vector (110, 2703)
Sb matrix (543, 1080)
c vector (543,)


In [4]:
import os
import time
import numpy as np
import tensorflow as tf
from aMNWtModel import AMNWtModel
from tools import printout

# Create, train and evaluate an AMN_Wt model on the experimental iML1515
# training set ('iML1515_EXP_UB'), then evaluate it on the held-out test split.
# NOTE(review): apart from train_name and objective, this cell repeats the
# simulated-data training cell; consider factoring the shared steps into a function.
DIRECTORY = './'
SAVE_RESERVOIR = False  # when True, the trained model is saved under <DIRECTORY>/Reservoir/

# Seed both NumPy and TensorFlow with the same value.
seed = 10
np.random.seed(seed=seed)  
tf.random.set_seed(seed)

# Experimental training set for iML1515 with upper bound (UB) medium.
# (The earlier comments here were copied from the simulated E. coli core cell
# and did not describe this data set.)
# NOTE(review): the original author reports this does not work on Apple M1 chips;
# the cause was not understood at the time of writing.
train_name = 'iML1515_EXP_UB'  # name of the file under Dataset_model/ to train on
objective = ['BIOMASS_Ec_iML1515_core_75p37M']
reservoir_name = train_name + "_AMN_Wt"
training_file = os.path.join(DIRECTORY,'Dataset_model/',train_name)

print("---------------------------------------- model ----------------------------------------")

model = AMNWtModel(training_file = training_file, 
                   objective=objective,  
                   model_type='AMN_Wt', 
                   timestep =4,
                   n_hidden = 1,
                   hidden_dim = 50,
                   scaler=True,
                   train_rate=1e-2,
                   epochs=10, 
                   xfold=5,
                   verbose=True,
                   batch_size=7)

# Split off a test set (test_size=0.1); it is read back below as model.X_test / model.Y_test.
model.train_test_split(test_size=0.1, random_state=seed)
model.printout()

print("---------------------------------------- train and evaluate ----------------------------------------")
start_time = time.time()
# Only the second returned value (the statistics object) is used.
_, stats, _ = model.train_evaluate(verbose=False)
# `reservoir` is a second name for the same object as `model`, not a copy.
reservoir = model
delta_time = time.time() - start_time

print("---------------------------------------- printing cross-validation results ----------------------------------------")
stats.printout(reservoir_name, delta_time)


print("---------------------------------------- evaluate model on test set ----------------------------------------")
if SAVE_RESERVOIR:
    reservoir_file = os.path.join(DIRECTORY,'Reservoir/',reservoir_name)
    reservoir.save(reservoir_file)

reservoir.printout()

start_time = time.time()

# TODO(investigate, flagged by the original author): because `reservoir` and
# `model` are the same object, the next line overwrites model.X / model.Y with
# the test split. Confirm whether evaluate_model needs this or whether passing
# X, Y explicitly is enough.
reservoir.X, reservoir.Y = model.X_test, model.Y_test
X, Y = reservoir.model_input(model.X_test, model.Y_test, verbose=False)
pred, obj, loss = reservoir.evaluate_model(X, Y, verbose=False)
delta_time = time.time() - start_time
printout('Test set', delta_time, obj, loss)

---------------------------------------- model ----------------------------------------
number of reactions:  543 1
number of metabolites:  1080
filtered measurements size:  1
training file: ./Dataset_model/iML1515_EXP_UB
model type: AMN_Wt
model scaler: 1.0
model input dim: 0
model output dim: 0
model medium bound: UB
timestep: 4
training set size (110, 38) (110, 1)
nbr hidden layer: 1
hidden layer size: 50
activation function: relu
training epochs: 10
training regression: True
training learn rate: 0.01
training droP_out: 0.25
training batch size: 7
training validation iter: 0
training xfold: 5
training early stopping: False
---------------------------------------- train and evaluate ----------------------------------------
train = -518.07 test = -346.42 loss-train = 0.044226 loss-test = 0.044243
train = -0.09 test = -0.13 loss-train = 0.002123 loss-test = 0.002107
train = -0.05 test = -0.01 loss-train = 0.000436 loss-test = 0.000444
train = -0.01 test = -0.10 loss-train = 0.000334 lo

In [5]:
# Cobra utilities and stoichiometric derived matrices
def get_index_from_id(name,L):
    """Return the position of the first item in ``L`` whose ``id`` equals ``name``.

    Args:
        name: Identifier to look for; compared with ``==`` against each
            item's ``id`` attribute.
        L: Iterable of objects exposing an ``id`` attribute.

    Returns:
        The zero-based position of the first matching item, or ``-1`` when
        no item matches (including when ``L`` is empty).
    """
    # enumerate() avoids index arithmetic and also works on iterables that do
    # not support len() or integer indexing.
    for i, item in enumerate(L):
        if item.id == name:
            return i
    return -1


In [24]:
# Print the position of every reaction whose id is "OMPDC" in the model's
# reaction list. The discarded leading expression `parameter.model.reactions[0]`
# was removed: it was not the last line of the cell, so it displayed nothing.
for i, reaction in enumerate(parameter.model.reactions):
    if reaction.id == "OMPDC":
        print(i)



1
-1


In [25]:
# Print the reverse-variable id of the reaction whose id is "OMPDC".
# The original line `print(r.)` had lost its attribute name and did not parse.
# NOTE(review): `reverse_id` was chosen because the recorded output
# "OMPDC_reverse_45ba1" matches the format of cobra's Reaction.reverse_id — confirm.
for r in parameter.model.reactions:
    if r.id == "OMPDC":
        print(r.reverse_id)

OMPDC_reverse_45ba1


In [9]:
def get_index_from_id(name,L):
    """Locate ``name`` among the ``id`` attributes of the items in ``L``.

    ``L`` is indexed by integer position from 0 to ``len(L) - 1``. The result
    is the first position whose item has ``id == name``, or ``-1`` if there is
    no such position.
    """
    positions = range(len(L))
    # The generator is lazy, so the scan stops at the first match.
    return next((idx for idx in positions if L[idx].id == name), -1)

In [10]:
# Inputs for the lookup comparison: the medium entry at index 5 and the
# model's reaction list.
name = parameter.medium[5]
L = parameter.model.reactions

In [11]:
# Position of `name` in `L` according to the notebook's get_index_from_id helper.
get_index_from_id(name,L)

137

In [24]:
# Same lookup through the container's own index() method; the recorded
# output (137) equals the one recorded for the get_index_from_id call.
L.index(name)

137

In [None]:
# NOTE(review): this cell was never executed (In [None]). It references `self`,
# `X` and `Y`, none of which are assigned in any visible cell, so it cannot run
# at notebook top level as written — it looks like an excerpt from a method body.
# For each medium reaction id (in order), column i of X is set to column j of Y,
# where j is the index of that id in self.model.reactions.
i = 0
for rid in self.medium:
    j = get_index_from_id(rid, self.model.reactions)
    X[:,i] = Y[:,j] 
    i += 1


In [31]:
# List every medium entry: its position on one line, its value on the next.
for position, reaction_id in enumerate(parameter.medium):
    print(position, reaction_id, sep="\n")

0
EX_pi_e_i
1
EX_co2_e_i
2
EX_fe3_e_i
3
EX_h_e_i
4
EX_mn2_e_i
5
EX_fe2_e_i
6
EX_zn2_e_i
7
EX_mg2_e_i
8
EX_ca2_e_i
9
EX_ni2_e_i
10
EX_cu2_e_i
11
EX_sel_e_i
12
EX_cobalt2_e_i
13
EX_h2o_e_i
14
EX_mobd_e_i
15
EX_so4_e_i
16
EX_nh4_e_i
17
EX_k_e_i
18
EX_na1_e_i
19
EX_cl_e_i
20
EX_o2_e_i
21
EX_tungs_e_i
22
EX_slnt_e_i
23
EX_glyc_e_i
24
EX_ala__L_e_i
25
EX_pro__L_e_i
26
EX_thr__L_e_i
27
EX_gly_e_i
28
EX_rib__D_e_i
29
EX_malt_e_i
30
EX_melib_e_i
31
EX_tre_e_i
32
EX_fru_e_i
33
EX_gal_e_i
34
EX_ac_e_i
35
EX_lac__D_e_i
36
EX_succ_e_i
37
EX_pyr_e_i


In [28]:
# Display the medium reaction ids stored on the dataset (the recorded output is
# an array of 38 strings).
parameter.medium

array(['EX_pi_e_i', 'EX_co2_e_i', 'EX_fe3_e_i', 'EX_h_e_i', 'EX_mn2_e_i',
       'EX_fe2_e_i', 'EX_zn2_e_i', 'EX_mg2_e_i', 'EX_ca2_e_i',
       'EX_ni2_e_i', 'EX_cu2_e_i', 'EX_sel_e_i', 'EX_cobalt2_e_i',
       'EX_h2o_e_i', 'EX_mobd_e_i', 'EX_so4_e_i', 'EX_nh4_e_i',
       'EX_k_e_i', 'EX_na1_e_i', 'EX_cl_e_i', 'EX_o2_e_i', 'EX_tungs_e_i',
       'EX_slnt_e_i', 'EX_glyc_e_i', 'EX_ala__L_e_i', 'EX_pro__L_e_i',
       'EX_thr__L_e_i', 'EX_gly_e_i', 'EX_rib__D_e_i', 'EX_malt_e_i',
       'EX_melib_e_i', 'EX_tre_e_i', 'EX_fru_e_i', 'EX_gal_e_i',
       'EX_ac_e_i', 'EX_lac__D_e_i', 'EX_succ_e_i', 'EX_pyr_e_i'],
      dtype='<U14')