# Published data - Cupriavidus necator
Data collection
- Batch culture samples from two time points during exponential phase

In [1]:
from BFAIR.mfa.INCA import INCA_script
import pandas as pd
import numpy as np
import time
import ast
import matlab.engine
import sys
#import escher
import dotenv
from BFAIR.mfa.INCA import INCA_reimport
from BFAIR.parsers import modelReactions_file_parser, atomMapping_reactions2_file_parser, atom_mapping_metabolites_file_parser

DEBUG:optlang.util:Gurobi python bindings not available.
DEBUG:optlang.util:GLPK python bindings found at /Users/s143838/.virtualenvs/bfair-testing/lib/python3.10/site-packages/swiglpk
DEBUG:optlang.util:Mosek python bindings not available.
DEBUG:optlang.util:CPLEX python bindings not available.
DEBUG:optlang.util:OSQP python bindings not available.
DEBUG:optlang.util:COINOR_CBC python bindings not available.
DEBUG:optlang.util:Scipy linprog function found at /Users/s143838/.virtualenvs/bfair-testing/lib/python3.10/site-packages/scipy/optimize/__init__.py


In [2]:
# import environment variables
INCA_base_directory = dotenv.get_key(dotenv.find_dotenv(), "INCA_base_directory")

#### Import using parsers

In [3]:
imported_reactions = modelReactions_file_parser(
    'Literature data/Cupriavidus necator  Alagesan 2017/reactions_2nd.xlsx',
    'Cupriavidus_necator2017',
    reaction_id_col_name="Reaction ID",
    equation_col_name="Equations (Carbon atom transition)",
)
imported_atom_mapping = atomMapping_reactions2_file_parser(
    'Literature data/Cupriavidus necator  Alagesan 2017/reactions_2nd.xlsx',
    'Cupriavidus_necator2017',
    reaction_id_col_name="Reaction ID",
    equation_col_name="Equations (Carbon atom transition)",
)
imported_atoms_metabolites = atom_mapping_metabolites_file_parser(
    'Literature data/Cupriavidus necator  Alagesan 2017/reactions_2nd.xlsx',
    'Cupriavidus_necator2017',
    reaction_id_col_name="Reaction ID",
    equation_col_name="Equations (Carbon atom transition)",
)

DEBUG:BFAIR.parsers.data_import_parsers:Duplicates in reactants: [('PYR', 1.0, 'abc'), ('PYR', 1.0, 'def')]
DEBUG:BFAIR.parsers.data_import_parsers:Duplicates: ['PYR']
DEBUG:BFAIR.parsers.data_import_parsers:Duplicate mappings: ['abc', 'def']
DEBUG:BFAIR.parsers.data_import_parsers:Duplicates in reactants: [('PYR', 1.0, 'abc'), ('PYR', 1.0, 'def')]
DEBUG:BFAIR.parsers.data_import_parsers:Duplicates: ['PYR']
DEBUG:BFAIR.parsers.data_import_parsers:Duplicate mappings: ['abc', 'def']


#### Manual set up of additional information

In [4]:
tracer_info = pd.DataFrame.from_dict({
    'experiment_id': [
        'D-[1-13C]fructose', '[1,2-13C]glycerol', '[1,2-13C]glycerolandCO2',
    ],
    'met_id': ['F6P.ext', 'GLY.ext', 'GLY.ext'],
    'met_name': [
        'D-[1-13C]fructose', '[1,2-13C]glycerol', '[1,2-13C]glycerolandCO2',
    ],
    'met_atompositions': [
        '{1}', '{1,2}', '{1,2}',
    ],
    'met_elements': [
        '{C}', '{C,C}', '{C,C}',
    ],
    'ratio': [
        '1', '1', '1',
    ]    
}, orient='columns')

In [5]:
measured_fluxes_info = pd.DataFrame.from_dict({
    'experiment_id': [
        'D-[1-13C]fructose', '[1,2-13C]glycerol', '[1,2-13C]glycerolandCO2',
    ],
    'model_id': ['Cupriavidus_necator2017' for i in range(3)],
    'rxn_id': [
        'ex_1', 'ex_2', 'ex_2',
        ],
    'flux_average': ['1' for i in range(3)],
    'flux_stdev': ['0.01' for i in range(3)],
    'flux_lb': ['0.99' for i in range(3)],
    'flux_ub': ['1.01' for i in range(3)],  
}, orient='columns')

#### This one should be its own parser

In [6]:
experimental_mdvs = pd.read_csv('Literature data/Cupriavidus necator  Alagesan 2017/MDVs.csv', sep=';', decimal=',')

In [7]:
experiment_ids = []
met_ids = []
fragment_ids = []
time_points = []
intensity_normalized_averages = []
intensity_normalized_stdevs = []
met_atompositions = []
met_elements = []
sample_name_abbreviation = []

experiments = [experimental_mdvs.columns[2], experimental_mdvs.columns[4], experimental_mdvs.columns[6]]
for experiment in experiments:
    fragments = list(set([row[1] for i, row in experimental_mdvs[experimental_mdvs.columns[:2]].iterrows()]))
    for fragment in fragments:
        experiment_ids.append(experiment)
        fragment_ids.append(fragment)
        mdv_list = experimental_mdvs[experimental_mdvs[experimental_mdvs.columns[1]] == fragment][experiment].to_list()
        stdev_list = experimental_mdvs[experimental_mdvs[experimental_mdvs.columns[1]] == fragment]['stdev_' + experiment].to_list()
        mdv = '{'
        stdev = '{'
        atompos = '{'
        frag_element = '{'
        for i, mdv_value in enumerate(mdv_list):
            if i == 0:
                mdv += str(mdv_value)
                stdev += str(stdev_list[i])
            else:
                mdv += ',' + str(mdv_value)
                stdev += ',' + str(stdev_list[i])
                if i == 1:
                    atompos += str(i-1)
                    frag_element += 'C'
                else:
                    atompos += ',' + str(i-1)
                    frag_element += ',' + 'C'
        mdv += '}'
        stdev += '}'
        atompos += '}'
        frag_element += '}'

        met_ids.append(list(set(experimental_mdvs[experimental_mdvs[experimental_mdvs.columns[1]] == fragment]['Met'].to_list()))[0])
        time_points.append('0')
        intensity_normalized_averages.append(mdv)
        intensity_normalized_stdevs.append(stdev)
        sample_name_abbreviation.append('')
        met_atompositions.append(atompos)
        met_elements.append(frag_element)

mdvs = pd.DataFrame.from_dict({
    'experiment_id': experiment_ids,
    'met_id': met_ids,
    'fragment_id': fragment_ids,
    'time_point': time_points,
    'met_atompositions': met_atompositions,
    'met_elements': met_elements,
    'sample_name_abbreviation': sample_name_abbreviation,
    'intensity_normalized_average': intensity_normalized_averages,
    'intensity_normalized_stdev': intensity_normalized_stdevs,  
}, orient='columns')

In [8]:
mdvs

Unnamed: 0,experiment_id,met_id,fragment_id,time_point,met_atompositions,met_elements,sample_name_abbreviation,intensity_normalized_average,intensity_normalized_stdev
0,D-[1-13C]fructose,PHE,Phenylalanine336,0,"{0,1,2,3,4,5,6,7,8}","{C,C,C,C,C,C,C,C,C}",,"{0.8209,0.1435,0.0327,0.0028,0,0,0,0,0,0}","{0.0256,0.0182,0.0082,0.001,0.001,0.001,0.001,..."
1,D-[1-13C]fructose,LEU,Leucine274,0,"{0,1,2,3,4}","{C,C,C,C,C}",,"{0.9364,0.0620,0.0013,0.0002,0.00001,0}","{0.0068,0.0067,0.0002,0.001,0.001,0.001}"
2,D-[1-13C]fructose,ALA,Alanine232,0,"{0,1}","{C,C}",,"{0.9759,0.0230,0.0011}","{0.0027,0.0035,0.0009}"
3,D-[1-13C]fructose,ALA,Alanine260,0,"{0,1,2}","{C,C,C}",,"{0.3509,0.6407,0.0078,0.0006}","{0.0135,0.0138,0.0003,0.0006}"
4,D-[1-13C]fructose,MET,Methionine320,0,"{0,1,2,3,4}","{C,C,C,C,C}",,"{0.7391,0.2339,0.0239,0.0029,0,0}","{0.0183,0.0158,0.0044,0.001,0.001,0.001}"
...,...,...,...,...,...,...,...,...,...
64,"[1,2-13C]glycerolandCO2",PHE,Phenylalanine308,0,"{0,1,2,3,4,5,6,7}","{C,C,C,C,C,C,C,C}",,"{0.0279,0.0491,0.1184,0.2083,0.2584,0.2121,0.1...","{0.0061,0.0130,0.0245,0.0237,0.0085,0.0303,0.0..."
65,"[1,2-13C]glycerolandCO2",HIS,Histidine440,0,"{0,1,2,3,4,5}","{C,C,C,C,C,C}",,"{0.0634,0.1595,0.2636,0.2882,0.1752,0.0473,0.0...","{0.0149,0.0277,0.0223,0.0203,0.0333,0.0118,0.0..."
66,"[1,2-13C]glycerolandCO2",VAL,Valine288,0,"{0,1,2,3,4}","{C,C,C,C,C}",,"{0.0652,0.1620,0.2968,0.3150,0.1526,0.0083}","{0.0132,0.0281,0.0182,0.0292,0.0299,0.0003}"
67,"[1,2-13C]glycerolandCO2",GL,Glycine246,0,"{0,1}","{C,C}",,"{0.3019,0.6202,0.0778}","{0.0322,0.0331,0.0263}"


In [9]:
# measured fragments/MS data, tracers and measured fluxes should be limited to one experiment

atomMappingReactions_data_I = imported_atom_mapping
modelReaction_data_I = imported_reactions
atomMappingMetabolite_data_I = imported_atoms_metabolites
measuredFluxes_data_I = measured_fluxes_info
experimentalMS_data_I = mdvs
tracer_I = tracer_info

In [10]:
INCA_script_test2 = INCA_script()
# The files need to be limited by model id and mapping id, I picked "ecoli_RL2013_02" here
atomMappingReactions_data_I = INCA_script_test2.limit_to_one_model(atomMappingReactions_data_I, 'mapping_id', 'Cupriavidus_necator2017')
modelReaction_data_I = INCA_script_test2.limit_to_one_model(modelReaction_data_I, 'model_id', 'Cupriavidus_necator2017')
atomMappingMetabolite_data_I = INCA_script_test2.limit_to_one_model(atomMappingMetabolite_data_I, 'mapping_id', 'Cupriavidus_necator2017')
measuredFluxes_data_I = INCA_script_test2.limit_to_one_model(measuredFluxes_data_I, 'model_id', 'Cupriavidus_necator2017')

# Limiting fluxes, fragments and tracers to one experiment
measuredFluxes_data_I = INCA_script_test2.limit_to_one_experiment(measuredFluxes_data_I, 'experiment_id', '[1,2-13C]glycerol')
experimentalMS_data_I = INCA_script_test2.limit_to_one_experiment(experimentalMS_data_I, 'experiment_id', '[1,2-13C]glycerol')
tracer_I = INCA_script_test2.limit_to_one_experiment(tracer_I, 'experiment_id', '[1,2-13C]glycerol')

In [11]:
script = INCA_script_test2.script_generator(
    modelReaction_data_I,
    atomMappingReactions_data_I,
    atomMappingMetabolite_data_I,
    measuredFluxes_data_I,
    experimentalMS_data_I,
    tracer_I
)

script_folder = %pwd
matlab_script = "c_necator_lit"
runner_script = matlab_script + "_runner"
INCA_script_test2.save_INCA_script(script, matlab_script)
runner = INCA_script_test2.runner_script_generator('C_necator', 10)
INCA_script_test2.save_runner_script(runner=runner, scriptname=matlab_script)

There is no stoichiometry given for: R72


In [12]:
# Comment out to avoid rerunning INCA
#INCA_script_test2.run_INCA_in_MATLAB(INCA_base_directory, script_folder, matlab_script, runner_script)

In [13]:
filename = 'C_necator.mat'
simulation_info = experimentalMS_data_I
simulation_id = '[1,2-13C]glycerol'

In [14]:
experimentalMS_data_I.experiment_id.unique()

array(['[1,2-13C]glycerol'], dtype=object)

In [15]:
reimport_data = INCA_reimport()

In [16]:
# Succession of functions
info = reimport_data.extract_file_info(filename)
parallel, non_stationary = reimport_data.det_simulation_type(simulation_info)
m, f = reimport_data.data_extraction(filename)
model_info = reimport_data.extract_model_info(m)
simulationParameters = reimport_data.extract_sim_params(simulation_id, info, m, filename)
fittedData = reimport_data.extract_base_stats(f, simulation_id, info)
f_mnt_info = reimport_data.get_fit_info(f)
fittedMeasuredFluxes, fittedMeasuredFragments = reimport_data.sort_fit_info(f_mnt_info, simulation_info, fittedData)
f_mnt_res_info = reimport_data.get_residuals_info(f, simulation_info)
fittedMeasuredFluxResiduals, fittedMeasuredFragmentResiduals = reimport_data.sort_residual_info(f_mnt_res_info, simulation_info, fittedData)
f_par_info = reimport_data.get_fitted_parameters(f, simulation_info)
fittedFluxes, fittedFragments = reimport_data.sort_parameter_info(f_par_info, simulation_info, fittedData)

DEBUG:BFAIR.mfa.INCA.INCA_reimport:Recived error:
Traceback (most recent call last):
  File "/Users/s143838/.virtualenvs/bfair-testing/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3803, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 165, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 2263, in pandas._libs.hashtable.Int64HashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 2273, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 0

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/s143838/projects/AutoFlow-OmicsDataHandling/BFAIR/mfa/INCA/INCA_reimport.py", line 543, in sort_residual_info
    "sample_name_abbreviation": simulation_info[
  File "/Users/s143838/.virtualenvs/bfair-testing/lib/py

No fluxes found
No MS data found
No MS data found
No MS data found
No MS data found
No MS data found
No MS data found
No MS data found
No MS data found
No MS data found
No MS data found
No MS data found
No MS data found
No MS data found
No MS data found
No MS data found
No MS data found
No MS data found
No MS data found
No MS data found
No MS data found
No MS data found
No MS data found
No MS data found
Fragment list: ['Alanine232', '0', '0', '[1,2-13C]glycerol']
Fragment list: ['Alanine260', '0', '0', '[1,2-13C]glycerol']
Fragment list: ['Asparticacid302', '0', '0', '[1,2-13C]glycerol']
Fragment list: ['Asparticacid390', '0', '0', '[1,2-13C]glycerol']
Fragment list: ['Asparticacid418', '0', '0', '[1,2-13C]glycerol']
Fragment list: ['Glutamicacid330', '0', '0', '[1,2-13C]glycerol']
Fragment list: ['Glutamicacid432', '0', '0', '[1,2-13C]glycerol']
Fragment list: ['Glycine218', '0', '0', '[1,2-13C]glycerol']
Fragment list: ['Glycine246', '0', '0', '[1,2-13C]glycerol']
Fragment list: ['Hi

In [21]:
simulation_info

Unnamed: 0,experiment_id,met_id,fragment_id,time_point,met_atompositions,met_elements,sample_name_abbreviation,intensity_normalized_average,intensity_normalized_stdev
23,"[1,2-13C]glycerol",PHE,Phenylalanine336,0,"{0,1,2,3,4,5,6,7,8}","{C,C,C,C,C,C,C,C,C}",,"{0.0004,0,0.0027,0.0171,0.0821,0.2639,0.3745,0...","{0.001,0.001,0.0003,0.0017,0.0027,0.0026,0.005..."
24,"[1,2-13C]glycerol",LEU,Leucine274,0,"{0,1,2,3,4}","{C,C,C,C,C}",,"{0.0022,0.0248,0.1678,0.3708,0.3302,0.1041}","{0.0001,0.0011,0.0012,0.0010,0.0012,0.0005}"
25,"[1,2-13C]glycerol",ALA,Alanine232,0,"{0,1}","{C,C}",,"{0.0495,0.5002,0.4504}","{0.0022,0.0009,0.0023}"
26,"[1,2-13C]glycerol",ALA,Alanine260,0,"{0,1,2}","{C,C,C}",,"{0.0221,0.1317,0.7290,0.117145}","{0.0013,0.0056,0.0131,0.0070}"
27,"[1,2-13C]glycerol",MET,Methionine320,0,"{0,1,2,3,4}","{C,C,C,C,C}",,"{0.0066,0.0418,0.2062,0.3885,0.2921,0.0647}","{0.0002,0.0018,0.0012,0.0032,0.0024,0.0032}"
28,"[1,2-13C]glycerol",PHE,Phenylalanine302,0,"{0,1}","{C,C}",,"{0.0428,0.4737,0.4834}","{0.0011,0.004,0.0038}"
29,"[1,2-13C]glycerol",GLU,Glutamicacid432,0,"{0,1,2,3,4}","{C,C,C,C,C}",,"{0.0037,0.0279,0.1568,0.3639,0.3418,0.1059}","{0.0004,0.0003,0.0017,0.0013,0.0019,0.0013}"
30,"[1,2-13C]glycerol",ILE,Isoleucine274,0,"{0,1,2,3,4}","{C,C,C,C,C}",,"{0.0039,0.0279,0.1532,0.3613,0.3435,0.1103}","{0.0001,0.0006,0.0024,0.0011,0.0008,0.0035}"
31,"[1,2-13C]glycerol",ASP,Asparticacid418,0,"{0,1,2,3}","{C,C,C,C}",,"{0.0130,0.0827,0.3460,0.4339,0.1243}","{0.0007,0.0018,0.0031,0.0033,0.0014}"
32,"[1,2-13C]glycerol",ASP,Asparticacid302,0,"{0,1}","{C,C}",,"{0.0879,0.4883,0.4238}","{0.002,0.0007,0.0021}"


The issue is that the parser for the parameter info relies on very specific naming of the fragments, i.e. it relies on the id being split and hardcoded indexing to select different features. 

I happens because the `len(fragment_list)` parameter is < 5. But in general this data does not contain the compound equation for the fragments. The best solution would probably to remake the data model for how the information, which is currently stored in the rxn_id, could be more properly handled.

The `rxn_id` is parsed directly from the matlab object in `get_fitted_parameters()`. Thus, its is matlab/INCA which creates these strange ids.

The fragment data does not contain fragment formular, I think this is the reason why it fails.

The msdata() has the .more attribute, this appears to contain the atoms of all non-labelled atoms, this could for example be all non carbon atoms in the molecule or the fomular for the derivatized compound minus the carbon atoms that originate from the amino acid.


## Refactoring sort_... functions
These functions are hard to understand because they do many thing at the same time. Their main purpose seems to be converting a dictionary into a pd.Dataframe(). This can be done much simpler using pd.DataFrame.from_dict(). Then it only remain to add a bit of extra information, such as simulation_id and simulation_dateAndTime.

In [18]:
test_fittedFluxes = pd.DataFrame.from_dict(f_par_info).query('par_type == "Net flux"').copy()

print(test_fittedFluxes.columns)
print(fittedFluxes.columns)

Index(['rxn_id', 'flux', 'flux_stdev', 'par_type', 'flux_lb', 'flux_ub',
       'flux_units', 'fit_alf', 'fit_chi2s', 'fit_cor', 'fit_cov', 'free'],
      dtype='object')
Index(['simulation_id', 'simulation_dateAndTime', 'rxn_id', 'flux',
       'flux_stdev', 'flux_lb', 'flux_ub', 'flux_units', 'fit_alf',
       'fit_chi2s', 'fit_cor', 'fit_cov', 'free', 'used_', 'comment_'],
      dtype='object')


We see that it is only the 'simulation_id', 'simulation_dateAndTime' that are missing.

Expected output columns fittedMeasuredFragmentsResiduals (Found in the MFA_INCA_data_reimport.ipynb):

['simulation_id', MISSING
'simulation_dateAndTime', MISSING
'experiment_id',
'sample_name_abbreviation', MISSING
'time_point', 
'fragment_id',MISSING
'fragment_mass', MISSING
'res_data', 
'res_fit', 
'res_peak', 
'res_stdev',
'res_val',
'res_msens', MISSING - but hardcoded to None
'res_esens', MISSING - but hardcoded to None
'used_', MISSING - but hardcoded to True
'comment_']MISSING - but hardcoded to None

The reaction ID contain the fragment id, this is due to the specific id schema for the fragments. Thus it should be safe to take the fragment ID from the reation ID.

In [27]:
pd.DataFrame.from_dict(f_mnt_res_info)

Unnamed: 0,res_val,res_fit,expt_type,rxn_id,res_stdev,time_point,experiment_id,res_data,res_peak
0,-0.065921,0.999341,Flux,ex_2,0.01,0,"[1,2-13C]glycerol",1.0,
1,7.39227,0.065756,MS,"Alanine232_0_0_[1,2-13C]glycerol",0.0022,0,"[1,2-13C]glycerol",0.049495,M0
2,-6.37479,0.013814,MS,"Alanine260_0_0_[1,2-13C]glycerol",0.0013,0,"[1,2-13C]glycerol",0.022101,M0
3,-3.160252,0.081579,MS,"Asparticacid302_0_0_[1,2-13C]glycerol",0.002,0,"[1,2-13C]glycerol",0.0879,M0
4,-6.636464,0.025966,MS,"Asparticacid390_0_0_[1,2-13C]glycerol",0.001,0,"[1,2-13C]glycerol",0.032603,M0
5,-7.053291,0.005947,MS,"Asparticacid418_0_0_[1,2-13C]glycerol",0.001,0,"[1,2-13C]glycerol",0.013001,M0
6,-0.767888,0.010532,MS,"Glutamicacid330_0_0_[1,2-13C]glycerol",0.001,0,"[1,2-13C]glycerol",0.0113,M0
7,-1.72698,0.001973,MS,"Glutamicacid432_0_0_[1,2-13C]glycerol",0.001,0,"[1,2-13C]glycerol",0.0037,M0
8,50.711864,0.366238,MS,"Glycine218_0_0_[1,2-13C]glycerol",0.005201,0,"[1,2-13C]glycerol",0.10251,M0
9,15.972849,0.08314,MS,"Glycine246_0_0_[1,2-13C]glycerol",0.0025,0,"[1,2-13C]glycerol",0.043204,M0


In [30]:
fittedData

Unnamed: 0,fitted_echi2,fitted_alf,fitted_chi2,fitted_dof,simulation_id,simulation_dateAndTime,used_,comment_
0,33.161786,0.05,13780.709741,51,"[1,2-13C]glycerol",2023-01-05 16:36:09,True,
