# Test dataset model generation
This notebook tests the dataset generation originally implemented in the Build_dataset notebook. One example is given for each situation: simulated data and experimental data.

### Simulated data

In [1]:
import os
import numpy as np
# from simulatedDataset import SimulatedDataset
from simulatedDataset import SimulatedDataset
from metabolicDataset import MetabolicDataset

# Seed NumPy's global RNG before the dataset is generated.
np.random.seed(10)

# Input files: the COBRA model and the medium description share one folder.
cobra_dir = medium_dir = "./Dataset_input"
medium_file, cobra_file = 'e_coli_core.csv', 'e_coli_core_duplicated.xml'

# Generate the dataset using cobra.
parameter = SimulatedDataset(
    input_cobra_file=os.path.join(cobra_dir, cobra_file),
    medium_file=os.path.join(medium_dir, medium_file),  # input medium
    medium_bound='UB',  # alternative noted by the author: 'EB'
    method='pFBA',
    measure=[],
    sample_size=50,
)

# Save the dataset into an npz file and the cobra model in the given directory.
dataset_dir, dataset_name = "./Dataset", 'e_coli_core_UB_50'
parameter.save(dataset_dir=dataset_dir, dataset_name=dataset_name, reduce=False)

# Reload the file that was just written and print its content.
parameter = MetabolicDataset(dataset_file=os.path.join(dataset_dir, f"{dataset_name}.npz"))
parameter.printout()

reactions : ['PFK' 'PFL' 'PGI_for' 'PGK_for' 'PGL' 'ACALD_for' 'AKGt2r_i' 'PGM_for'
 'PIt2r_i' 'ALCD2x_for' 'ACALDt_i' 'ACKr_for' 'PPC' 'ACONTa_for'
 'ACONTb_for' 'ATPM' 'PPCK' 'ACt2r_i' 'PPS' 'ADK1_for' 'AKGDH' 'ATPS4r_i'
 'PTAr_for' 'PYK' 'BIOMASS_Ecoli_core_w_GAM' 'PYRt2_i' 'CO2t_i' 'RPE_for'
 'CS' 'RPI_for' 'SUCCt2_2' 'CYTBD' 'D_LACt2_i' 'ENO_for' 'SUCCt3'
 'ETOHt2r_i' 'SUCDi' 'SUCOAS_for' 'TALA_for' 'THD2' 'TKT1_for' 'TKT2_for'
 'TPI_for' 'EX_ac_e_o' 'EX_acald_e_o' 'EX_akg_e_o' 'EX_co2_e_o'
 'EX_etoh_e_o' 'EX_for_e_o' 'EX_fru_e_o' 'EX_fum_e_o' 'EX_glc__D_e_o'
 'EX_gln__L_e_o' 'EX_glu__L_e_o' 'EX_h_e_o' 'EX_h2o_e_o' 'EX_lac__D_e_o'
 'EX_mal__L_e_o' 'EX_nh4_e_o' 'EX_o2_e_o' 'EX_pi_e_o' 'EX_pyr_e_o'
 'EX_succ_e_o' 'FBA_for' 'FBP' 'FORt2' 'FRD7' 'FRUpts2' 'FUM_for'
 'FUMt2_2' 'G6PDH2r_for' 'GAPD_for' 'GLCpts' 'GLNS' 'GLNabc' 'GLUDy_for'
 'GLUN' 'GLUSy' 'GLUt2r_i' 'GND' 'H2Ot_i' 'ICDHyr_for' 'ICL' 'LDH_D_for'
 'MALS' 'MALt2_2' 'MDH_for' 'ME1' 'ME2' 'NADH16' 'NADTRHD' 'NH4t_i'
 'O2t_i' 

In [2]:
import time
import numpy as np
import tensorflow as tf
from aMNWtModel import AMNWtModel
from tools import printout

# Build an AMNWtModel from the saved e_coli_core dataset, run train_evaluate on it,
# then evaluate the model on the split held out by train_test_split.

# Use one seed for NumPy, TensorFlow and the train/test split.
seed = 10
np.random.seed(seed=seed)  
tf.random.set_seed(seed)

print("---------------------------------------- model ----------------------------------------")
model = AMNWtModel(dataset_file="./Dataset/e_coli_core_UB_50.npz", 
                   objective=['BIOMASS_Ecoli_core_w_GAM'],
                   timestep=4,
                   n_hidden=1,
                   hidden_dim=50,
                   scaler=True,
                   train_rate=1e-2,
                   epochs=10, 
                   xfold=5,
                   verbose=True,
                   batch_size=7)

# Hold out 10% of the samples; they are read back below as model.X_test / model.Y_test.
model.train_test_split(test_size=0.1, random_state=seed)
model.printout()

print("---------------------------------------- train and evaluate ----------------------------------------")
start_time = time.time()
# Only the second returned value (the statistics object) is used afterwards.
_, stats, _ = model.train_evaluate(verbose=False)
# `reservoir` is an alias of `model` (same object), not a copy.
reservoir = model
delta_time = time.time() - start_time

print("---------------------------------------- printing cross-validation results ----------------------------------------")
# Label shown in the statistics printout.
reservoir_name = "e_coli_core_UB_50_AMN_Wt"
stats.printout(reservoir_name, delta_time)

# reservoir.save("./Reservoir/e_coli_core_UB_50_AMN_Wt")
# reservoir.printout()

print("---------------------------------------- evaluate model on test set ----------------------------------------")

start_time = time.time()
# TODO(investigate): because `reservoir` is `model`, the next line overwrites
# model.X / model.Y with the test split, and model_input is then called with those
# same arrays. Confirm whether evaluate_model really needs the attribute assignment.
reservoir.X, reservoir.Y = model.X_test, model.Y_test
X, Y = reservoir.model_input(model.X_test, model.Y_test, verbose=False)
pred, obj, loss = reservoir.evaluate_model(X, Y, verbose=False)
delta_time = time.time() - start_time
printout('Test set', delta_time, obj, loss)

---------------------------------------- model ----------------------------------------
number of metabolites:  72
filtered measurements size:  1
dataset file: ./Dataset/e_coli_core_UB_50.npz
model type: AMNWt
model scaler: 1.0
model medium bound: UB
timestep: 4
training set size (50, 20) (50, 1)
nbr hidden layer: 1
hidden layer size: 50
activation function: relu
training epochs: 10
training regression: True
training learn rate: 0.01
training droP_out: 0.25
training batch size: 7
training validation iter: 0
training xfold: 5
training early stopping: False
---------------------------------------- train and evaluate ----------------------------------------
Instructions for updating:
Use `tf.linalg.matmul` instead


2023-08-04 16:04:00.440999: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-04 16:04:00.442354: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2023-08-04 16:04:00.713950: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


train = -25.47 test = -41.32 loss-train = 0.028367 loss-test = 0.027396
train = -34.05 test = -21.94 loss-train = 0.023299 loss-test = 0.024118
train = -25.56 test = -42.41 loss-train = 0.023264 loss-test = 0.022864
train = -27.96 test = -18.93 loss-train = 0.034178 loss-test = 0.034799
train = -27.78 test = -66.81 loss-train = 0.017664 loss-test = 0.018178
---------------------------------------- printing cross-validation results ----------------------------------------
Stats for e_coli_core_UB_50_AMN_Wt CPU-time 18.3661
R2 = -28.1636 (+/- 3.1255) Constraint = 0.0254 (+/- 0.0056)
Q2 = -38.2816 (+/- 17.2118) Constraint = 0.0255 (+/- 0.0055)
---------------------------------------- evaluate model on test set ----------------------------------------
Stats for Test set CPU-time 0.0501
R2 = -47.8442 Constraint = 0.0338


### Experimental data

In [3]:
import os
import numpy as np
from metabolicDataset import MetabolicDataset
from experimentalDataset import ExperimentalDataset

# Seed NumPy's global RNG before the dataset is built.
np.random.seed(10)

# Generate the dataset using a cobra model and experimental data.
# Both input files are read from the same folder.
cobra_dir = medium_dir = "./Dataset_input"
medium_file, cobra_file = 'iML1515_EXP.csv', 'iML1515_EXP.xml'
parameter = ExperimentalDataset(
    input_cobra_file=os.path.join(cobra_dir, cobra_file),
    medium_file=os.path.join(medium_dir, medium_file),
    medium_bound='UB',
    # medium_size=38,
    method='EXP',
    verbose=False,
)

# Save the dataset into an npz file and the cobra model in the given directory.
dataset_dir, dataset_name = "./Dataset", 'iML1515_EXP_UB'
parameter.save(dataset_dir=dataset_dir, dataset_name=dataset_name)

# Reload the file that was just written.
parameter = MetabolicDataset(dataset_file=os.path.join(dataset_dir, f"{dataset_name}.npz"))
# parameter.printout()

In [4]:
import time
import numpy as np
import tensorflow as tf
from aMNWtModel import AMNWtModel
from tools import printout

# Build an AMNWtModel from the saved iML1515 experimental dataset, run train_evaluate
# on it, then evaluate the model on the split held out by train_test_split.

# Use one seed for NumPy, TensorFlow and the train/test split.
seed = 10
np.random.seed(seed=seed)
tf.random.set_seed(seed)

print("---------------------------------------- model ----------------------------------------")
model = AMNWtModel(dataset_file="./Dataset/iML1515_EXP_UB.npz",
                   objective=['BIOMASS_Ec_iML1515_core_75p37M'],
                   timestep=4,
                   n_hidden=1,
                   hidden_dim=50,
                   scaler=True,
                   train_rate=1e-2,
                   epochs=10,
                   xfold=5,
                   verbose=True,
                   batch_size=7)

# Hold out 10% of the samples; they are read back below as model.X_test / model.Y_test.
model.train_test_split(test_size=0.1, random_state=seed)
model.printout()

print("---------------------------------------- train and evaluate ----------------------------------------")
start_time = time.time()
# Only the second returned value (the statistics object) is used afterwards.
_, stats, _ = model.train_evaluate(verbose=False)
# `reservoir` is an alias of `model` (same object), not a copy.
reservoir = model
delta_time = time.time() - start_time

print("---------------------------------------- printing cross-validation results ----------------------------------------")
# Display label for the statistics printout. It was previously a file path containing
# ".npz"; it now follows the "<dataset>_AMN_Wt" pattern used for the e_coli_core run.
reservoir_name = "iML1515_EXP_UB_AMN_Wt"
stats.printout(reservoir_name, delta_time)

# reservoir.save("./Reservoir/iML1515_EXP_UB_AMN_Wt")
# reservoir.printout()

print("---------------------------------------- evaluate model on test set ----------------------------------------")

start_time = time.time()
# TODO(investigate): because `reservoir` is `model`, the next line overwrites
# model.X / model.Y with the test split, and model_input is then called with those
# same arrays. Confirm whether evaluate_model really needs the attribute assignment.
reservoir.X, reservoir.Y = model.X_test, model.Y_test
X, Y = reservoir.model_input(model.X_test, model.Y_test, verbose=False)
pred, obj, loss = reservoir.evaluate_model(X, Y, verbose=False)
delta_time = time.time() - start_time
printout('Test set', delta_time, obj, loss)



---------------------------------------- model ----------------------------------------
number of metabolites:  1080
filtered measurements size:  1
dataset file: ./Dataset/iML1515_EXP_UB.npz
model type: AMNWt
model scaler: 1.0
model medium bound: UB
timestep: 4
training set size (110, 38) (110, 1)
nbr hidden layer: 1
hidden layer size: 50
activation function: relu
training epochs: 10
training regression: True
training learn rate: 0.01
training droP_out: 0.25
training batch size: 7
training validation iter: 0
training xfold: 5
training early stopping: False
---------------------------------------- train and evaluate ----------------------------------------
train = -518.07 test = -346.42 loss-train = 0.044226 loss-test = 0.044243
train = -0.09 test = -0.13 loss-train = 0.002123 loss-test = 0.002107
train = -0.05 test = -0.01 loss-train = 0.000436 loss-test = 0.000444
train = -0.01 test = -0.10 loss-train = 0.000334 loss-test = 0.000333
train = -1.48 test = -2.09 loss-train = 0.029051 los

### Simulated with experimental dataset medium variation

In [7]:
# `os` is needed for os.path.join below; import it here so this cell does not rely on
# an earlier cell having imported it.
import os

import numpy as np
from simulatedDataset import SimulatedDataset
from metabolicDataset import MetabolicDataset

np.random.seed(seed=10)

# Get X from experimental data set
# Input paths are declared once and reused in the call below.
cobra_file = './Dataset_input/IJN1463_duplicated'  # stem only; '.xml' is appended below
exp_file  = './Dataset_input/IJN1463_EXP.csv'
medium_file = './Dataset_input/IJN1463_10.csv'


parameter = SimulatedDataset(input_cobra_file=cobra_file + '.xml',
                             experimental_file=exp_file,
                             medium_file=medium_file,
                             medium_bound='UB',
                            #  medium_size=196,
                             method='EXP',
                             verbose=False)


# Save the generated dataset under ./Dataset with reduce=True.
dataset_dir = "./Dataset"
dataset_name = 'IJN1463_10_UB'
parameter.save(dataset_dir=dataset_dir,
               dataset_name=dataset_name,
               reduce=True)

# Load dataset
parameter = MetabolicDataset(dataset_file=os.path.join(dataset_dir,dataset_name)+'.npz')
parameter.printout()

sample: 0
primal objectif = ['BIOMASS_KT2440_WT3'] EXP 0.9394817780932792
sample: 1
primal objectif = ['BIOMASS_KT2440_WT3'] EXP 0.6647854389588707
sample: 2
primal objectif = ['BIOMASS_KT2440_WT3'] EXP 0.9394817780932543
sample: 3
primal objectif = ['BIOMASS_KT2440_WT3'] EXP 0.664785438958875
sample: 4
primal objectif = ['BIOMASS_KT2440_WT3'] EXP 0.9394817780932936
sample: 5
primal objectif = ['BIOMASS_KT2440_WT3'] EXP 0.9394817780932936
sample: 6
primal objectif = ['BIOMASS_KT2440_WT3'] EXP 0.6647854389588747
sample: 7
primal objectif = ['BIOMASS_KT2440_WT3'] EXP 0.9394817780932774
sample: 8
primal objectif = ['BIOMASS_KT2440_WT3'] EXP 0.9394817780932774
sample: 9
primal objectif = ['BIOMASS_KT2440_WT3'] EXP 0.9394817780932774
sample: 10
primal objectif = ['BIOMASS_KT2440_WT3'] EXP 0.9394817780932774
sample: 11
primal objectif = ['BIOMASS_KT2440_WT3'] EXP 0.6647854389588722
sample: 12
primal objectif = ['BIOMASS_KT2440_WT3'] EXP 0.9394817780932814
sample: 13
primal objectif = ['BIOMA



primal objectif = ['BIOMASS_KT2440_WT3'] EXP -0.005056460474296988
sample: 1
primal objectif = ['BIOMASS_KT2440_WT3'] EXP -0.005056460474296988
sample: 2
primal objectif = ['BIOMASS_KT2440_WT3'] EXP -0.005056460474296988
sample: 3
primal objectif = ['BIOMASS_KT2440_WT3'] EXP -0.005056460474296988
sample: 4
primal objectif = ['BIOMASS_KT2440_WT3'] EXP -0.005056460474296988
sample: 5
primal objectif = ['BIOMASS_KT2440_WT3'] EXP -0.005056460474296988
sample: 6
primal objectif = ['BIOMASS_KT2440_WT3'] EXP -0.005056460474296988
sample: 7
primal objectif = ['BIOMASS_KT2440_WT3'] EXP -0.005056460474296988
sample: 8
primal objectif = ['BIOMASS_KT2440_WT3'] EXP -0.005056460474296988
sample: 9
primal objectif = ['BIOMASS_KT2440_WT3'] EXP -0.005056460474296988
sample: 10
primal objectif = ['BIOMASS_KT2440_WT3'] EXP -0.005056460474296988
sample: 11
primal objectif = ['BIOMASS_KT2440_WT3'] EXP -0.005056460474296988
sample: 12
primal objectif = ['BIOMASS_KT2440_WT3'] EXP -0.005056460474296988
sample

### Test reduce

In [2]:
import os
import numpy as np
from simulatedDataset import SimulatedDataset
from metabolicDataset import MetabolicDataset

# Seed NumPy's global RNG before the dataset is generated.
np.random.seed(10)

# Generate the dataset using cobra; both input files are read from the same folder.
cobra_dir = medium_dir = "./Dataset_input"
medium_file, cobra_file = 'e_coli_core.csv', 'e_coli_core_duplicated.xml'
parameter = SimulatedDataset(
    input_cobra_file=os.path.join(cobra_dir, cobra_file),
    medium_file=os.path.join(medium_dir, medium_file),  # input medium
    medium_bound='UB',  # alternative noted by the author: 'EB'
    method='pFBA',
    measure=[],
    sample_size=50,
)

# Save the dataset into an npz file and the cobra model in the given directory,
# this time with reduce=True and into a separate output folder.
dataset_dir, dataset_name = "./Dataset_test_reduce", 'e_coli_core_UB_50'
parameter.save(dataset_dir=dataset_dir, dataset_name=dataset_name, reduce=True)

# Reload the file that was just written and print its content.
parameter = MetabolicDataset(dataset_file=os.path.join(dataset_dir, f"{dataset_name}.npz"))
parameter.printout()


reduced numbers of metabolites and reactions: 70 104
reactions : ['PFK' 'PFL' 'PGI_for' 'PGL' 'ACALD_for' 'AKGt2r_i' 'PIt2r_i' 'ALCD2x_for'
 'ACALDt_i' 'PPC' 'ACONTa_for' 'ACONTb_for' 'ATPM' 'AKGDH' 'ATPS4r_i'
 'PTAr_for' 'PYK' 'BIOMASS_Ecoli_core_w_GAM' 'PYRt2_i' 'CO2t_i' 'RPE_for'
 'CS' 'SUCCt2_2' 'CYTBD' 'D_LACt2_i' 'ENO_for' 'ETOHt2r_i' 'SUCDi'
 'TALA_for' 'THD2' 'TKT1_for' 'TKT2_for' 'TPI_for' 'EX_ac_e_o'
 'EX_co2_e_o' 'EX_etoh_e_o' 'EX_for_e_o' 'EX_glu__L_e_o' 'EX_h_e_o'
 'EX_h2o_e_o' 'EX_nh4_e_o' 'FBA_for' 'FRUpts2' 'FUM_for' 'FUMt2_2'
 'G6PDH2r_for' 'GAPD_for' 'GLCpts' 'GLNS' 'GLNabc' 'GLUDy_for' 'GLUN'
 'GLUSy' 'GLUt2r_i' 'GND' 'H2Ot_i' 'ICDHyr_for' 'LDH_D_for' 'MALt2_2'
 'MDH_for' 'ME2' 'NADH16' 'NH4t_i' 'O2t_i' 'PDH' 'PGK_rev' 'ACALD_rev'
 'PGM_rev' 'ALCD2x_rev' 'ACKr_rev' 'ACt2r_o' 'CO2t_o' 'RPE_rev' 'RPI_rev'
 'ETOHt2r_o' 'SUCOAS_rev' 'TALA_rev' 'TKT1_rev' 'TKT2_rev' 'EX_ac_e_i'
 'EX_acald_e_i' 'EX_akg_e_i' 'EX_co2_e_i' 'EX_etoh_e_i' 'EX_for_e_i'
 'EX_fru_e_i' 'EX_fum_e_i'

In [7]:
# Display how many reactions the currently loaded `parameter` dataset holds.
len(parameter.reactions)

104

In [6]:
# Scratch check: `not None` evaluates to True, so the message is always printed.
if not None:
    print("youpi !")

youpi !
