# Use case of the INCA Parser: Isotopically non-stationary 13C-MFA


In [1]:
import pandas as pd
import numpy as np
import dotenv
import ast
import pandera as pa
import BFAIR.mfa.INCA.dataschemas as dataschemas
import BFAIR.mfa.INCA.INCAScript_writing as INCAScript_writing
from BFAIR.mfa.INCA.run_inca import run_inca
from BFAIR.mfa.visualization.schema_overview import present_schema_overview
import BFAIR.mfa.utils.merge_reversible_reactions as mrr
import pathlib

In [2]:
# Read the `INCA_base_directory` entry from the .env file located by `dotenv.find_dotenv()`
INCA_base_directory = dotenv.get_key(dotenv.find_dotenv(), "INCA_base_directory")
# Folder (relative to the current working directory) holding the literature data set
data_dir = pathlib.Path('./Literature data/Astrocytes and neural stem cells/')
# Display whether the data folder can be found before any file is read from it
data_dir.exists()

True

First, we will load and process the reactions data. We will start by refreshing the requirements for the dataframe. 

In [3]:
present_schema_overview(dataschemas.ReactionsSchema)

Unnamed: 0,column name,dtype,required,nullable,description
0,rxn_id,str,True,False,The unique id of the reaction
1,rxn_eqn,str,True,False,"The reaction equation with atom map. Allowed reaction arrows: ->, <->."


We will now read in the data and rename the columns to fit the requirements from `ReactionsSchema`.

In [4]:
# Open the model workbook and read the reaction table from sheet 'Sheet1'
xl = pd.ExcelFile(data_dir / 'Model.xlsx')
reactions = xl.parse('Sheet1')
# Map the spreadsheet headers onto the column names required by ReactionsSchema
reactions = reactions.rename(
    columns={
        'Reaction ID': 'rxn_id',
        'Equations (Carbon atom transition)': 'rxn_eqn',
    }
)
reactions.head()

Unnamed: 0,rxn_id,rxn_eqn
0,R1,G6P (abcdef) <-> F6P (abcdef)
1,R2,F6P (abcdef) -> FBP (abcdef)
2,R3,FBP (abcdef) <-> DHAP (cba) + GAP (def)
3,R4,DHAP (abc) <-> GAP (abc)
4,R5,GAP (abc) <-> 3PG (abc)


We are now ready to validate the data. This is most easily done by wrapping the validation in a try-except statement, because it produces a much nicer output in case the validation does not pass.

In [5]:
# Validate the reactions table; on failure print a compact report instead of a long traceback.
# SchemaError is referenced through its public location, `pandera.errors`, rather than
# through the incidental `pandera.schemas.errors` attribute path.
try:
    dataschemas.ReactionsSchema.validate(reactions)
except pa.errors.SchemaError as e:
    print(type(e))
    print(e)

The data passes the validation and we can move on to measurements of the exchange fluxes.

In [6]:
present_schema_overview(dataschemas.FluxMeasurementsSchema)

Unnamed: 0,column name,dtype,required,nullable,description
0,experiment_id,str,True,False,"ID of the experiment. Must be a valid MATLAB variable name, legal characters are a-z, A-Z, 0-9, and the underscore character."
1,rxn_id,str,True,False,The unique id of the reaction
2,flux,float64,True,False,Measured/estimated rate typically in mmol/gDW/h
3,flux_std_error,float64,True,False,Standard error of the measured/estimated rate


In [7]:
# Load the exchange flux table exactly as stored in the CSV file; it is reshaped in a later cell
flux_measurements_raw = pd.read_csv(data_dir / 'exchange_fluxes.csv')
flux_measurements_raw.head()

Unnamed: 0,Metabolite,Neural Stem Cells,Astrocytes
0,Glucose,-256.3 ± 1.3,-175.7 ± 17.9
1,Lactate,237.8 ± 10.1,348.6 ± 9.6
2,Aspartate,-2.6 ± 0.7,-11.9 ± 5.8
3,Glutamate,-1.8 ± 0.4,-10.5 ± 3.4
4,Serine,-2.8 ± 1.4,-4.1 ± 1.7


In [8]:
# Reshape to long format: one row per (metabolite, cell type) pair
flux_long = flux_measurements_raw.melt(
    id_vars=['Metabolite'],
    value_vars=['Astrocytes', 'Neural Stem Cells'],
    var_name='Cell type',
    value_name='flux',
)
# Turn literal 'nan' strings in the flux column into real missing values
flux_measurements = flux_long.replace({'flux': {'nan': np.nan}})

# Each entry reads "<flux> ± <standard error>"; split it into two numeric columns
flux_and_error = flux_measurements['flux'].str.split('±', expand=True).astype(float)
flux_measurements[['flux', 'flux_std_error']] = flux_and_error
# Experiment ids are the cell type names with spaces replaced by underscores
flux_measurements['experiment_id'] = flux_measurements['Cell type'].str.replace(' ', '_')
flux_measurements.head()

Unnamed: 0,Metabolite,Cell type,flux,flux_std_error,experiment_id
0,Glucose,Astrocytes,-175.7,17.9,Astrocytes
1,Lactate,Astrocytes,348.6,9.6,Astrocytes
2,Aspartate,Astrocytes,-11.9,5.8,Astrocytes
3,Glutamate,Astrocytes,-10.5,3.4,Astrocytes
4,Serine,Astrocytes,-4.1,1.7,Astrocytes


The raw data identifies each exchange reaction by its metabolite. The INCA Parser requires the reaction id (`rxn_id`) to attach a flux measurement to a reaction. Next, we make a map from metabolite to `rxn_id`.

In [9]:
# Lookup table from the metabolite name used in the raw flux data to the id of the
# corresponding exchange reaction, written as (metabolite, rxn_id) pairs.
exchange_reactions_metabolite_map = dict([
    ('CO2', 'R44'),
    ('Glucose', 'R45'),
    ('Lactate', 'R46'),
    ('Alanine', 'R47'),
    ('Pyruvate', 'R48'),
    ('Citrate', 'R49'),
    ('Glutamine', 'R50'),
    ('Glutamate', 'R51'),
    ('Aspartate', 'R52'),
    ('Asparagine', 'R53'),
    ('Serine', 'R54'),
    ('Glycine', 'R55'),
    ('Proline', 'R56'),
    ('Valine', 'R57'),
    ('Isoleucine', 'R58'),
    ('Leucine', 'R59'),
    ('Threonine', 'R60'),
    ('Phenylalanine', 'R61'),
    ('Tyrosine', 'R62'),
    ('Methionine', 'R63'),
    ('Lysine', 'R64'),
    ('Histidine', 'R65'),
    ('Arginine', 'R66'),
])

We can use this map to create a new column with the reaction ids.

In [10]:
# Look up the exchange reaction id for every metabolite; metabolites that are
# missing from the map end up as NaN in `rxn_id`
flux_measurements['rxn_id'] = flux_measurements['Metabolite'].map(exchange_reactions_metabolite_map)
flux_measurements.head()

Unnamed: 0,Metabolite,Cell type,flux,flux_std_error,experiment_id,rxn_id
0,Glucose,Astrocytes,-175.7,17.9,Astrocytes,R45
1,Lactate,Astrocytes,348.6,9.6,Astrocytes,R46
2,Aspartate,Astrocytes,-11.9,5.8,Astrocytes,R52
3,Glutamate,Astrocytes,-10.5,3.4,Astrocytes,R51
4,Serine,Astrocytes,-4.1,1.7,Astrocytes,R54


In [11]:
# Validate the flux measurements; on failure print a compact report instead of a long traceback.
# SchemaError is referenced through its public location, `pandera.errors`, rather than
# through the incidental `pandera.schemas.errors` attribute path.
try:
    dataschemas.FluxMeasurementsSchema.validate(flux_measurements)
except pa.errors.SchemaError as e:
    print(type(e))
    print(e)

<class 'pandera.errors.SchemaError'>
non-nullable series 'flux' contains null values:
6    NaN
12   NaN
13   NaN
14   NaN
17   NaN
29   NaN
30   NaN
31   NaN
39   NaN
40   NaN
Name: flux, dtype: float64


Oops, the validation failed because the INCA Parser does not allow null values in the measured fluxes. In the supplementary documentation the authors write: "NaN represents undetermined transport rates". We interpret this as meaning that these rates were not measured; thus we don't know anything about the rates and they could take any value. Therefore, we will simply remove the measurements with NaN.

An alternative interpretation is that the rates were measured but the values were below the detection limit. In that case the NaNs should be replaced by small numbers.

In [12]:
# Keep only the rows with a measured flux. The result is assigned to the name
# instead of using `inplace=True`, so the existing frame object is not mutated.
flux_measurements = flux_measurements.dropna(subset=['flux'])

# Re-run the validation; SchemaError is referenced through its public location,
# `pandera.errors`, rather than the incidental `pandera.schemas.errors` path.
try:
    dataschemas.FluxMeasurementsSchema.validate(flux_measurements)
except pa.errors.SchemaError as e:
    print(type(e))
    print(e)

Now, the flux measurements dataframe passes the validation and we can import the MS measurements.

In [13]:
# Load the mass distribution vector (MDV) table exactly as stored in the CSV file;
# it is reshaped into tidy format in a later cell
ms_measurements_raw = pd.read_csv(data_dir / 'MDVs.csv')
ms_measurements_raw.head()

Unnamed: 0.1,Unnamed: 0,N_0.33 h,N_3 h,N_12 h,N_24 h,A_0.33 h,A_3 h,A_12 h,A_24 h
0,3PG (M0),0.639,0.787,0.604,0.564,0.586,0.568,0.667,0.582
1,3PG (M1),0.361,0.213,0.396,0.436,0.414,0.432,0.333,0.418
2,PEP (M0),0.695,0.771,0.618,0.578,0.608,0.639,0.754,0.674
3,PEP (M1),0.305,0.229,0.382,0.422,0.392,0.361,0.246,0.326
4,Lac (M0),0.906,0.882,0.891,0.844,0.992,0.986,0.968,0.824


We see that the header contains information about the cell type and the sampling time. We will start by processing the data into tidy format.

In [27]:
# Reshape to long format: one row per (metabolite isotopologue, cell type/time point) pair.
# `var_name` is passed as a scalar; recent pandas versions reject a list here.
ms_measurements = (ms_measurements_raw
    .melt(id_vars=['Unnamed: 0'], var_name='cell_time', value_name='intensity')
)
# Column headers look like "N_0.33 h": cell type code, underscore, sampling time
ms_measurements[['cell_type', 'time']] = ms_measurements['cell_time'].str.split('_', expand=True)

# Use the same experiment ids as the flux measurements
ms_measurements['experiment_id'] = ms_measurements['cell_type'].map({'N': 'Neural_Stem_Cells', 'A': 'Astrocytes'})
# Strip the " h" unit literally (regex=False) and convert the time to a number
ms_measurements['time'] = ms_measurements['time'].str.replace(' h', '', regex=False).astype(float)
# Row labels look like "3PG (M0)": metabolite id, space, mass isotopomer label
ms_measurements[['met_id', 'mass_isotope']] = ms_measurements['Unnamed: 0'].str.split(' ', expand=True)
ms_measurements['ms_id'] = ms_measurements['met_id']
# Raw string avoids the invalid "\d" escape; expand=False yields a Series for the single group
ms_measurements['mass_isotope'] = ms_measurements['mass_isotope'].str.extract(r'(\d+)', expand=False).astype(int)
# Every row is labelled as replicate 1
ms_measurements['measurement_replicate'] = 1
ms_measurements.head()

Unnamed: 0.1,Unnamed: 0,cell_time,intensity,cell_type,time,experiment_id,met_id,mass_isotope,ms_id,measurement_replicate
0,3PG (M0),N_0.33 h,0.639,N,0.33,Neural_Stem_Cells,3PG,0,3PG,1
1,3PG (M1),N_0.33 h,0.361,N,0.33,Neural_Stem_Cells,3PG,1,3PG,1
2,PEP (M0),N_0.33 h,0.695,N,0.33,Neural_Stem_Cells,PEP,0,PEP,1
3,PEP (M1),N_0.33 h,0.305,N,0.33,Neural_Stem_Cells,PEP,1,PEP,1
4,Lac (M0),N_0.33 h,0.906,N,0.33,Neural_Stem_Cells,Lac,0,Lac,1


In [28]:
ms_measurements['met_id'].unique()

array(['3PG', 'PEP', 'Lac', 'Ala', 'Cit', 'Suc', 'Fum', 'Mal', 'Asp',
       'Glu', 'Gln'], dtype=object)

In [29]:
# Molecular formula for each measured metabolite, keyed by the `met_id` used in
# `ms_measurements`. Every formula starts with the carbon count ("C<n>"): later
# cells strip that part to obtain the unlabelled atoms and read the number
# after the leading "C" to build the list of labelled atom ids.
molecular_formulars = {
    'PEP': 'C3H2O6P',
    '3PG': 'C3H4O7P',
    'Lac': 'C3H5O3',
    'Ala': 'C3H7NO2',
    'Cit': 'C6H5O7',
    'Suc': 'C4H4O4',
    'Fum': 'C4H2O4',
    'Mal': 'C4H4O5',
    'Asp': 'C4H6NO4',
    'Glu': 'C5H8NO4',
    'Gln': 'C5H10N2O3'
}

In [30]:
# Look up the molecular formula of each row's metabolite ...
formula_per_row = ms_measurements['met_id'].map(molecular_formulars)
# ... and remove the carbon part ("C<n>") so that only the unlabelled atoms remain
ms_measurements['unlabelled_atoms'] = formula_per_row.str.replace(r'(C\d+)', '', regex=True)
ms_measurements

Unnamed: 0.1,Unnamed: 0,cell_time,intensity,cell_type,time,experiment_id,met_id,mass_isotope,ms_id,measurement_replicate,unlabelled_atoms
0,3PG (M0),N_0.33 h,0.639,N,0.33,Neural_Stem_Cells,3PG,0,3PG,1,H4O7P
1,3PG (M1),N_0.33 h,0.361,N,0.33,Neural_Stem_Cells,3PG,1,3PG,1,H4O7P
2,PEP (M0),N_0.33 h,0.695,N,0.33,Neural_Stem_Cells,PEP,0,PEP,1,H2O6P
3,PEP (M1),N_0.33 h,0.305,N,0.33,Neural_Stem_Cells,PEP,1,PEP,1,H2O6P
4,Lac (M0),N_0.33 h,0.906,N,0.33,Neural_Stem_Cells,Lac,0,Lac,1,H5O3
...,...,...,...,...,...,...,...,...,...,...,...
299,Glu (M3),A_24 h,0.001,A,24.00,Astrocytes,Glu,3,Glu,1,H8NO4
300,Gln (M0),A_24 h,0.716,A,24.00,Astrocytes,Gln,0,Gln,1,H10N2O3
301,Gln (M1),A_24 h,0.240,A,24.00,Astrocytes,Gln,1,Gln,1,H10N2O3
302,Gln (M2),A_24 h,0.043,A,24.00,Astrocytes,Gln,2,Gln,1,H10N2O3


In [31]:
def create_labelled_atoms_list(molecular_formular):
    """Return the carbon atom ids ``[1, ..., n]`` for a molecular formula.

    The formula must start with carbon, e.g. ``'C3H4O7P'``. The carbon count may
    have any number of digits (``'C10H16'`` gives ten ids) and may be omitted
    when there is a single carbon atom (``'CH4'`` gives ``[1]``).

    Parameters
    ----------
    molecular_formular : str
        Molecular formula with C as the first element.

    Returns
    -------
    list of int
        Consecutive integers from 1 to the number of carbon atoms.

    Raises
    ------
    ValueError
        If the formula does not start with the element C.
    """
    remainder = molecular_formular[1:]
    # Leading run of ASCII digits that directly follows the "C"
    count_digits = remainder[:len(remainder) - len(remainder.lstrip('0123456789'))]
    # "C" directly followed by a lowercase letter is another element (Cl, Ca, ...)
    is_other_element = not count_digits and remainder[:1].islower()
    if not molecular_formular.startswith('C') or is_other_element:
        raise ValueError(
            f"Molecular formula must start with carbon (C), got {molecular_formular!r}"
        )
    # A missing count means exactly one carbon atom
    carbon_atoms = int(count_digits) if count_digits else 1
    return list(range(1, carbon_atoms + 1))

# Look up each metabolite's molecular formula and turn it into the list of carbon atom ids
ms_measurements['labelled_atom_ids'] = ms_measurements['met_id'].map(molecular_formulars).map(create_labelled_atoms_list)

In [32]:
# The standard error of each intensity is set to a fixed fraction (5 %) of the intensity itself.
# NOTE(review): an intensity of 0 would get a standard error of 0 — confirm this is acceptable downstream.
relative_ms_measurement_error = 0.05
ms_measurements['intensity_std_error'] = ms_measurements['intensity'] * relative_ms_measurement_error

In [33]:
# dropping unnecessary columns and rearrange order of columns
ms_measurements = ms_measurements[[
    'experiment_id', 'met_id', 'ms_id', 'time', 'labelled_atom_ids', 'unlabelled_atoms', 'measurement_replicate' ,
    'mass_isotope', 'intensity', 'intensity_std_error'
]]

In [34]:
# Validate the MS measurements; on failure print a compact report instead of a long traceback.
# SchemaError is referenced through its public location, `pandera.errors`, rather than
# through the incidental `pandera.schemas.errors` attribute path.
try:
    dataschemas.MSMeasurementsSchema.validate(ms_measurements)
except pa.errors.SchemaError as e:
    print(type(e))
    print(e)