# Published data - Cupriavidus necator
Data collection
- Batch culture samples from two time points during exponential phase

In [1]:
from BFAIR.mfa.INCA import INCA_script
import pandas as pd
import numpy as np
import time
import ast
import matlab.engine
import sys
#import escher
import dotenv
from BFAIR.mfa.INCA import INCA_reimport
from BFAIR.parsers import modelReactions_file_parser, atomMapping_reactions2_file_parser, atom_mapping_metabolites_file_parser

DEBUG:optlang.util:Gurobi python bindings not available.
DEBUG:optlang.util:GLPK python bindings found at /Users/s143838/.virtualenvs/bfair-testing/lib/python3.10/site-packages/swiglpk
DEBUG:optlang.util:Mosek python bindings not available.
DEBUG:optlang.util:CPLEX python bindings not available.
DEBUG:optlang.util:OSQP python bindings not available.
DEBUG:optlang.util:COINOR_CBC python bindings not available.
DEBUG:optlang.util:Scipy linprog function found at /Users/s143838/.virtualenvs/bfair-testing/lib/python3.10/site-packages/scipy/optimize/__init__.py


In [2]:
# import environment variables
INCA_base_directory = dotenv.get_key(dotenv.find_dotenv(), "INCA_base_directory")

#### Import using parsers

In [3]:
imported_reactions = modelReactions_file_parser(
    'Literature data/Cupriavidus necator  Alagesan 2017/reactions_2nd.xlsx',
    'Cupriavidus_necator2017',
    reaction_id_col_name="Reaction ID",
    equation_col_name="Equations (Carbon atom transition)",
)
imported_atom_mapping = atomMapping_reactions2_file_parser(
    'Literature data/Cupriavidus necator  Alagesan 2017/reactions_2nd.xlsx',
    'Cupriavidus_necator2017',
    reaction_id_col_name="Reaction ID",
    equation_col_name="Equations (Carbon atom transition)",
)
imported_atoms_metabolites = atom_mapping_metabolites_file_parser(
    'Literature data/Cupriavidus necator  Alagesan 2017/reactions_2nd.xlsx',
    'Cupriavidus_necator2017',
    reaction_id_col_name="Reaction ID",
    equation_col_name="Equations (Carbon atom transition)",
)

DEBUG:BFAIR.parsers.data_import_parsers:Duplicates in reactants: [('PYR', 1.0, 'abc'), ('PYR', 1.0, 'def')]
DEBUG:BFAIR.parsers.data_import_parsers:Duplicates: ['PYR']
DEBUG:BFAIR.parsers.data_import_parsers:Duplicate mappings: ['abc', 'def']
DEBUG:BFAIR.parsers.data_import_parsers:Duplicates in reactants: [('PYR', 1.0, 'abc'), ('PYR', 1.0, 'def')]
DEBUG:BFAIR.parsers.data_import_parsers:Duplicates: ['PYR']
DEBUG:BFAIR.parsers.data_import_parsers:Duplicate mappings: ['abc', 'def']


#### Manual set up of additional information

In [4]:
tracer_info = pd.DataFrame.from_dict({
    'experiment_id': [
        'D-[1-13C]fructose', '[1,2-13C]glycerol', '[1,2-13C]glycerolandCO2',
    ],
    'met_id': ['F6P.ext', 'GLY.ext', 'GLY.ext'],
    'met_name': [
        'D-[1-13C]fructose', '[1,2-13C]glycerol', '[1,2-13C]glycerolandCO2',
    ],
    'met_atompositions': [
        '{1}', '{1,2}', '{1,2}',
    ],
    'met_elements': [
        '{C}', '{C,C}', '{C,C}',
    ],
    'ratio': [
        '1', '1', '1',
    ]    
}, orient='columns')

In [5]:
measured_fluxes_info = pd.DataFrame.from_dict({
    'experiment_id': [
        'D-[1-13C]fructose', '[1,2-13C]glycerol', '[1,2-13C]glycerolandCO2',
    ],
    'model_id': ['Cupriavidus_necator2017' for i in range(3)],
    'rxn_id': [
        'ex_1', 'ex_2', 'ex_2',
        ],
    'flux_average': ['1' for i in range(3)],
    'flux_stdev': ['0.01' for i in range(3)],
    'flux_lb': ['0.99' for i in range(3)],
    'flux_ub': ['1.01' for i in range(3)],  
}, orient='columns')

#### This one should be its own parser

In [6]:
experimental_mdvs = pd.read_csv('Literature data/Cupriavidus necator  Alagesan 2017/MDVs.csv', sep=';', decimal=',')

In [7]:
experiment_ids = []
met_ids = []
fragment_ids = []
time_points = []
intensity_normalized_averages = []
intensity_normalized_stdevs = []
met_atompositions = []
met_elements = []
sample_name_abbreviation = []

experiments = [experimental_mdvs.columns[2], experimental_mdvs.columns[4], experimental_mdvs.columns[6]]
for experiment in experiments:
    fragments = list(set([row[1] for i, row in experimental_mdvs[experimental_mdvs.columns[:2]].iterrows()]))
    for fragment in fragments:
        experiment_ids.append(experiment)
        fragment_ids.append(fragment)
        mdv_list = experimental_mdvs[experimental_mdvs[experimental_mdvs.columns[1]] == fragment][experiment].to_list()
        stdev_list = experimental_mdvs[experimental_mdvs[experimental_mdvs.columns[1]] == fragment]['stdev_' + experiment].to_list()
        mdv = '{'
        stdev = '{'
        atompos = '{'
        frag_element = '{'
        for i, mdv_value in enumerate(mdv_list):
            if i == 0:
                mdv += str(mdv_value)
                stdev += str(stdev_list[i])
            else:
                mdv += ',' + str(mdv_value)
                stdev += ',' + str(stdev_list[i])
                if i == 1:
                    atompos += str(i-1)
                    frag_element += 'C'
                else:
                    atompos += ',' + str(i-1)
                    frag_element += ',' + 'C'
        mdv += '}'
        stdev += '}'
        atompos += '}'
        frag_element += '}'

        met_ids.append(list(set(experimental_mdvs[experimental_mdvs[experimental_mdvs.columns[1]] == fragment]['Met'].to_list()))[0])
        time_points.append('0')
        intensity_normalized_averages.append(mdv)
        intensity_normalized_stdevs.append(stdev)
        sample_name_abbreviation.append('')
        met_atompositions.append(atompos)
        met_elements.append(frag_element)

mdvs = pd.DataFrame.from_dict({
    'experiment_id': experiment_ids,
    'met_id': met_ids,
    'fragment_id': fragment_ids,
    'time_point': time_points,
    'met_atompositions': met_atompositions,
    'met_elements': met_elements,
    'sample_name_abbreviation': sample_name_abbreviation,
    'intensity_normalized_average': intensity_normalized_averages,
    'intensity_normalized_stdev': intensity_normalized_stdevs,  
}, orient='columns')

In [8]:
mdvs

Unnamed: 0,experiment_id,met_id,fragment_id,time_point,met_atompositions,met_elements,sample_name_abbreviation,intensity_normalized_average,intensity_normalized_stdev
0,D-[1-13C]fructose,ILE,Isoleucine274,0,"{0,1,2,3,4}","{C,C,C,C,C}",,"{0.8210975,0.1654325,0.01305,0.0003375,0,0}","{0.0120,0.0083,0.0040,0.001,0.001,0.001}"
1,D-[1-13C]fructose,THR,Threonine376,0,"{0,1,2}","{C,C,C}",,"{0.8401,0.1544,0.0055,0}","{0.0089,0.0098,0.0016,0.001}"
2,D-[1-13C]fructose,PHE,Phenylalanine336,0,"{0,1,2,3,4,5,6,7,8}","{C,C,C,C,C,C,C,C,C}",,"{0.8209,0.1435,0.0327,0.0028,0,0,0,0,0,0}","{0.0256,0.0182,0.0082,0.001,0.001,0.001,0.001,..."
3,D-[1-13C]fructose,HIS,Histidine338,0,"{0,1,2,3,4}","{C,C,C,C,C}",,"{0.4626,0.4672,0.0427,0.0255,0.0008,0.0009}","{0.0091,0.0173,0.0119,0.0030,0.001,nan}"
4,D-[1-13C]fructose,VAL,Valine260,0,"{0,1,2,3}","{C,C,C,C}",,"{0.9513,0.0436,0.0049,0,0.00003}","{0.0058,0.0057,0.0005,0.001,0.001}"
...,...,...,...,...,...,...,...,...,...
64,"[1,2-13C]glycerolandCO2",GL,Glycine246,0,"{0,1}","{C,C}",,"{0.3019,0.6202,0.0778}","{0.0322,0.0331,0.0263}"
65,"[1,2-13C]glycerolandCO2",ALA,Alanine232,0,"{0,1}","{C,C}",,"{0.2128,0.4596,0.3275}","{0.0341,0.0103,0.0239}"
66,"[1,2-13C]glycerolandCO2",PHE,Phenylalanine308,0,"{0,1,2,3,4,5,6,7}","{C,C,C,C,C,C,C,C}",,"{0.0279,0.0491,0.1184,0.2083,0.2584,0.2121,0.1...","{0.0061,0.0130,0.0245,0.0237,0.0085,0.0303,0.0..."
67,"[1,2-13C]glycerolandCO2",GL,Glycine218,0,{0},{C},,"{0.3111,0.6889}","{0.0365,0.0365}"


In [9]:
# measured fragments/MS data, tracers and measured fluxes should be limited to one experiment

atomMappingReactions_data_I = imported_atom_mapping
modelReaction_data_I = imported_reactions
atomMappingMetabolite_data_I = imported_atoms_metabolites
measuredFluxes_data_I = measured_fluxes_info
experimentalMS_data_I = mdvs
tracer_I = tracer_info

In [10]:
INCA_script_test2 = INCA_script()
# The files need to be limited by model id and mapping id, I picked "ecoli_RL2013_02" here
atomMappingReactions_data_I = INCA_script_test2.limit_to_one_model(atomMappingReactions_data_I, 'mapping_id', 'Cupriavidus_necator2017')
modelReaction_data_I = INCA_script_test2.limit_to_one_model(modelReaction_data_I, 'model_id', 'Cupriavidus_necator2017')
atomMappingMetabolite_data_I = INCA_script_test2.limit_to_one_model(atomMappingMetabolite_data_I, 'mapping_id', 'Cupriavidus_necator2017')
measuredFluxes_data_I = INCA_script_test2.limit_to_one_model(measuredFluxes_data_I, 'model_id', 'Cupriavidus_necator2017')

# Limiting fluxes, fragments and tracers to one experiment
measuredFluxes_data_I = INCA_script_test2.limit_to_one_experiment(measuredFluxes_data_I, 'experiment_id', '[1,2-13C]glycerol')
experimentalMS_data_I = INCA_script_test2.limit_to_one_experiment(experimentalMS_data_I, 'experiment_id', '[1,2-13C]glycerol')
tracer_I = INCA_script_test2.limit_to_one_experiment(tracer_I, 'experiment_id', '[1,2-13C]glycerol')

In [11]:
script = INCA_script_test2.script_generator(
    modelReaction_data_I,
    atomMappingReactions_data_I,
    atomMappingMetabolite_data_I,
    measuredFluxes_data_I,
    experimentalMS_data_I,
    tracer_I
)

script_folder = %pwd
matlab_script = "c_necator_lit"
runner_script = matlab_script + "_runner"
INCA_script_test2.save_INCA_script(script, matlab_script)
runner = INCA_script_test2.runner_script_generator('C_necator', 100, True)
INCA_script_test2.save_runner_script(runner=runner, scriptname=matlab_script)

There is no stoichiometry given for: R72


In [12]:
# Comment out to avoid rerunning INCA
INCA_script_test2.run_INCA_in_MATLAB(INCA_base_directory, script_folder, matlab_script, runner_script)


                                         Directional 
 Iteration      Residual     Step-size    derivative        Lambda
     0       1.67571e+07
     1        5.6792e+06           0.5     -7.38e+06      0.936766
     2       5.66816e+06       0.00159     -3.48e+06      0.936766
     3       5.66741e+06      0.000113     -3.32e+06      0.936766
     4       5.66103e+06       0.00103     -3.09e+06      0.936766
     5       5.65824e+06      0.000484     -2.88e+06      0.936766
     6       5.65707e+06      0.000221     -2.66e+06      0.936766
     7       5.64994e+06       0.00146     -2.43e+06      0.936766
     8       5.64878e+06      0.000263      -2.2e+06      0.936766
     9       5.64807e+06      0.000179     -1.98e+06      0.936766
    10       5.64039e+06       0.00218     -1.76e+06      0.936766
    11       5.62837e+06       0.00392     -1.53e+06      0.936766
    12       5.62005e+06       0.00319      -1.3e+06      0.936766
    13       5.61764e+06       0.00112     -1.08e

In [13]:
filename = 'C_necator.mat'
simulation_info = experimentalMS_data_I
simulation_id = '[1,2-13C]glycerol'

In [14]:
experimentalMS_data_I.experiment_id.unique()

array(['[1,2-13C]glycerol'], dtype=object)

In [15]:
from BFAIR.mfa.INCA.INCA_results import INCA_results

In [16]:
res = INCA_results("C_necator.mat")

In [17]:
for i in res.matlab_obj['s']:
    print(i.keys())

dict_keys(['atom', 'expt', 'id', 'more', 'sens', 'time', 'type', 'val', 'base'])
dict_keys(['atom', 'expt', 'id', 'more', 'sens', 'time', 'type', 'val', 'base'])
dict_keys(['atom', 'expt', 'id', 'more', 'sens', 'time', 'type', 'val', 'base'])
dict_keys(['atom', 'expt', 'id', 'more', 'sens', 'time', 'type', 'val', 'base'])
dict_keys(['atom', 'expt', 'id', 'more', 'sens', 'time', 'type', 'val', 'base'])
dict_keys(['atom', 'expt', 'id', 'more', 'sens', 'time', 'type', 'val', 'base'])
dict_keys(['atom', 'expt', 'id', 'more', 'sens', 'time', 'type', 'val', 'base'])
dict_keys(['atom', 'expt', 'id', 'more', 'sens', 'time', 'type', 'val', 'base'])
dict_keys(['atom', 'expt', 'id', 'more', 'sens', 'time', 'type', 'val', 'base'])
dict_keys(['atom', 'expt', 'id', 'more', 'sens', 'time', 'type', 'val', 'base'])
dict_keys(['atom', 'expt', 'id', 'more', 'sens', 'time', 'type', 'val', 'base'])
dict_keys(['atom', 'expt', 'id', 'more', 'sens', 'time', 'type', 'val', 'base'])
dict_keys(['atom', 'expt', '

In [18]:
res.measurements_and_fit_overview

Unnamed: 0,cont,expt,id,sres,type,base
0,"[0.8423400785475454, 0.9999999999999999, 0.999...","[1,2-13C]glycerol",ex_2,1.972152e-27,Flux,{'id': []}
1,"[1.0622183164411536e-05, 2.8529591363128463e-3...","[1,2-13C]glycerol",Alanine232,286888.8,MS,{'id': []}
2,"[9.823202595689777e-09, 1.625638177619044e-31,...","[1,2-13C]glycerol",Alanine260,3923.773,MS,{'id': []}
3,"[7.524020814738302e-07, 2.6276651322606852e-33...","[1,2-13C]glycerol",Asparticacid302,276835.7,MS,{'id': []}
4,"[5.8520577347948237e-08, 5.933789587178909e-31...","[1,2-13C]glycerol",Asparticacid390,79482.63,MS,{'id': []}
5,"[7.135089522974173e-07, 1.2959793723299909e-33...","[1,2-13C]glycerol",Asparticacid418,39708.61,MS,{'id': []}
6,"[0.08655403482714412, 9.165126602221567e-29, 9...","[1,2-13C]glycerol",Glutamicacid330,148167.7,MS,{'id': []}
7,"[0.010721885620381658, 1.1381780844607772e-29,...","[1,2-13C]glycerol",Glutamicacid432,126626.5,MS,{'id': []}
8,"[1.6310293260808075e-05, 3.3150780095148925e-3...","[1,2-13C]glycerol",Glycine218,29704.9,MS,{'id': []}
9,"[2.0299455399345797e-05, 1.1714445022726048e-3...","[1,2-13C]glycerol",Glycine246,32108.09,MS,{'id': []}


Now that we have loaded the inca results, we can investigate how results. The first step is to investigate the diagnostics. Here we want to investigate a few factors:
1. Did the fit pass the Goodness-of-fit test
2. Are the residuals normally distributed
3. Are the any measurements that appears to be outliers

 

In [19]:
res.get_goodness_of_fit()

Fit accepted: False
Confidence level: 0.05
Chi-square value (SSR): 1991036.5606258186
Expected chi-square range: [33.16178637 72.61599227]


In [20]:
res.test_normality_of_residuals()

Residuals are normally distributed: False on a 0.05 significance level


In [21]:
import BFAIR.mfa.visualization.diagnositics as diagnostics

diagnostics.plot_norm_probplot(res, interactive=True)

DEBUG:matplotlib:matplotlib data path: /Users/s143838/.virtualenvs/bfair-testing/lib/python3.10/site-packages/matplotlib/mpl-data
DEBUG:matplotlib:CONFIGDIR=/Users/s143838/.matplotlib
DEBUG:matplotlib:interactive is False
DEBUG:matplotlib:platform is darwin
DEBUG:matplotlib:CACHEDIR=/Users/s143838/.matplotlib
DEBUG:matplotlib.font_manager:Using fontManager instance from /Users/s143838/.matplotlib/fontlist-v330.json
  for col_name, dtype in df.dtypes.iteritems():


In [22]:
res.measurements_and_fit_detailed['weighted residual'].transform(lambda x: x**2).sum()

1991036.5606258186

In [23]:
res.fitted_parameters['eqn']

0     FRU.ext -> F6P
1     GLY.ext -> GLY
2        GLY -> DHAP
3         3PG -> G3P
4         3PG -> PEP
           ...      
92                []
93                []
94                []
95                []
96                []
Name: eqn, Length: 97, dtype: object

The issue is that the parser for the parameter info relies on very specific naming of the fragments, i.e. it relies on the id being split and hardcoded indexing to select different features. 

I happens because the `len(fragment_list)` parameter is < 5. But in general this data does not contain the compound equation for the fragments. The best solution would probably to remake the data model for how the information, which is currently stored in the rxn_id, could be more properly handled.

The `rxn_id` is parsed directly from the matlab object in `get_fitted_parameters()`. Thus, its is matlab/INCA which creates these strange ids.

The fragment data does not contain fragment formular, I think this is the reason why it fails.

The msdata() has the .more attribute, this appears to contain the atoms of all non-labelled atoms, this could for example be all non carbon atoms in the molecule or the fomular for the derivatized compound minus the carbon atoms that originate from the amino acid.


## Refactoring sort_... functions
These functions are hard to understand because they do many thing at the same time. Their main purpose seems to be converting a dictionary into a pd.Dataframe(). This can be done much simpler using pd.DataFrame.from_dict(). Then it only remain to add a bit of extra information, such as simulation_id and simulation_dateAndTime.

We see that it is only the 'simulation_id', 'simulation_dateAndTime' that are missing.

Expected output columns fittedMeasuredFragmentsResiduals (Found in the MFA_INCA_data_reimport.ipynb):

['simulation_id', MISSING
'simulation_dateAndTime', MISSING
'experiment_id',
'sample_name_abbreviation', MISSING
'time_point', 
'fragment_id',MISSING
'fragment_mass', MISSING
'res_data', 
'res_fit', 
'res_peak', 
'res_stdev',
'res_val',
'res_msens', MISSING - but hardcoded to None
'res_esens', MISSING - but hardcoded to None
'used_', MISSING - but hardcoded to True
'comment_']MISSING - but hardcoded to None

The reaction ID contain the fragment id, this is due to the specific id schema for the fragments. Thus it should be safe to take the fragment ID from the reation ID.