# Processing biomass composition data

Import different packages

In [1]:
from libchebipy._chebi_entity import ChebiEntity
import pandas as pd
import numpy as np
import copy
import re
from DataProcessingFuncs import *
import os

Import element details (MW etc) and generate empty dataframes for population

In [3]:

# Periodic-table data, downloaded on 24/01/2022 from
# "https://gist.github.com/GoodmanSciences/c2dd862cd38f21b0ad36b8f96b4bf1ee#file-periodic-table-of-elements-csv"
elements_df = pd.read_csv("../data/Elements.csv", index_col="Symbol")

# Lookup table: element symbol -> value of the "AtomicMass" column
mw_dict = elements_df["AtomicMass"].to_dict()

# Element order handed to array_to_formula when formulas are written as text
elemental_order = 'CHNOSP'
# Two capture groups: an element symbol and its (possibly empty) count
element_re = re.compile("([A-Z][a-z]?)([0-9.]+[0-9.]?|(?=[A-Z])?)")

In [4]:
# Empty accumulator tables; later cells extend them with pd.concat.

# Components whose formula is the same at every sampling point
AllStaticComponents = pd.DataFrame(
    columns=["Name", "Average Formula"],
)

# Macrocomponent totals (value and SD) per measurement method
AllMeasurements = pd.DataFrame(
    columns=["Name", "Value", "SD", "Method", "Macrocomponent"],
)

# Per-sample-point composition of each macrocomponent
VariableStoichMeasurements = pd.DataFrame(
    columns=[
        "Cultivation Group",
        "Sample Point",
        "Name",
        "Family",
        "Macrocomponent",
        "Value",
    ],
)

Import ChEBI data for the different molecules/metabolites used and then create a metabolite object for each one, including metabolite formulas etc.

Code below used to search chebi for model ID and download information including names, formulas, MW etc. Results of paper use the details listed in ChebiData.csv, as Chebi may be updated.

In [5]:
# Query ChEBI for every model metabolite and store the returned names next to
# the model IDs. The table written at the end of this cell is a snapshot of the
# current ChEBI content.
mols = pd.read_csv("../data/ChebiModelIDs.csv", dtype=str, index_col=0)
AllMets = {}
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
for name, i_d in mols["ChEBI"].items():
    try:
        # Preferred route: build the metabolite from its ChEBI ID
        met = metabolite(chebi_id=i_d)
    except Exception:
        # Lookup failed: fall back to the name and formula listed in the CSV.
        # `Exception` (not a bare except) so Ctrl-C still interrupts the loop.
        met = metabolite(name=name, formula=mols.loc[name, "Formula"])

    # ChEBI names sometimes contain the unicode minus (U+2212), which causes
    # problems in Python and most programming languages; use an ASCII hyphen
    met.name = met.name.replace(u"\u2212", "-")
    AllMets[met.name] = met

# Record the ChEBI name for every row that has a ChEBI ID
mols.reset_index(inplace=True)
mask = mols.ChEBI.isna()
mols.loc[~mask,"ChEBI Name"] = mols.loc[~mask,"ChEBI"].apply(lambda x: metabolite(x).name)
mols.set_index("Name", inplace=True)
mols.to_csv("../results/dataframes/biomass/UpdatedChebiNew.csv")

In [6]:
# Frozen ChEBI data used for the published results
mols = pd.read_csv("../data/ChebiData.csv", dtype=str, index_col=0)

AllMets = {}
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
for name, i_d in mols["ChEBI"].items():
    try:
        # Preferred route: build the metabolite from its ChEBI ID
        met = metabolite(chebi_id=i_d)
    except Exception:
        # Lookup failed: fall back to the name and formula listed in the CSV
        met = metabolite(name=name, formula=mols.loc[name,"Formula"])
    # ChEBI names sometimes contain the unicode minus (U+2212); use an ASCII hyphen
    met.name = met.name.replace(u"\u2212", "-")
    AllMets[met.name] = met
    mols.loc[name,"ChEBI Name"] = met.name

# Glycogen has no formula; it is a polysaccharide of glucose, so use the
# polymer notation with a glucose-minus-water repeat unit
mols.loc["glycogen","Formula"] = "H2O(C6H10O5)n"

mols.loc[:,"Average Formula"] = mols["Formula"]
# As in Gamisans, we consider glycogen and beta-glucan as 6 carbon molecules (glucose residues without water),
# mannan is considered as a 6 carbon mannose polymer and chitin as a single N-acetylglucosamine residue.
# The repeat unit is whatever sits inside the parentheses of the formula.
# expand=False always yields a Series, even when the frame has a single row
wo_ps = mols.loc[:,"Formula"].str.extract(r'\(([^()]+)\)', expand=False)

mask = wo_ps.notnull()

# For lipids this column is overwritten later, once an average fatty acyl (R)
# has been generated. This is a restrictive approach to lipid composition.
mols.loc[mask,"Average Formula"] = wo_ps[mask]

# Core formula = average formula with the R unit (and everything after it) removed
mols.loc[:,"Core Formula"] = mols["Average Formula"].str.replace(r'R(.*)',"", regex=True)
mols.loc[:,"Core Formula Mass"] = mols["Core Formula"].apply(lambda x: metabolite(formula=x).mass)

# Lipids: count the R groups in each formula
lipids_df = mols[mols["Macrocomponent"] == "Lipids"].copy()

# Text following "R" in the formula: NaN when there is no R, "" for a bare R
lipids_df.loc[:,"Number of Rs"] = lipids_df.loc[:,"Average Formula"].str.extract(r'R(.*)', expand=False)
lipids_df.loc[:,"Number of Rs"] = lipids_df.loc[:,"Number of Rs"].replace({np.nan:0,"":1})

## 2. Process all "static" compositions

Static components = same for all sampling points

- a) Fatty acids - average fatty acid
    - Fatty acid distribution from grillitsch et al as in carnicer et al
- b) Lipids
    - Generate a representative formula for each lipid from its core formula and the average fatty acid
- c) DNA 
    - Nucleotide compositions
- d) RNA 
    - Nucleotide composition

- e) Cofactors 
    - using the cofactor composition from Saccharomyces
- f) Sphingolipids
    - Static representation due to different FA distribution (palmitoyl co-A derived). As in Gamisans (et al) used from Grillitsch 

#### 2. a) Calculate average Fatty Acid

In [7]:
# Fatty acid distribution: read as text, then cast the mass fractions to float
FA_df = pd.read_csv("../data/biomass/static/FattyAcids.csv", dtype=str)
FA_df = FA_df.astype({"g/g FA": float})

# The last row ("Others") is imported but not considered. Take an explicit copy
# so the column assignments below do not act on a slice of the original frame
# (avoids SettingWithCopyWarning).
FA_df = FA_df.iloc[:-1, :].copy()

FA_df.loc[:,"Formula"] = FA_df["Name"].apply(lambda x: AllMets[x].formula)
FA_df.loc[:,"Mass"] = FA_df["Name"].apply(lambda x: AllMets[x].mass)

# In-place helper call; the "mmol/mmol FA" column read below is expected to be
# added by it
fractions_calculator(FA_df, "g/g FA", "Mass", "FA", value_type="mass_fraction")


# Remove the acyl group (COO): it is already included in the core formula of
# every compound that carries an R group for the average fatty acid
coo_array = elements_dict_to_array(metabolite(formula="COO").elements())
coo_array = coo_array.reshape((-1,1)) # convert to column vector

# Mole-fraction weighted average of the fatty acid formulas
formulas = FA_df.loc[:, "Formula"].to_dict()
mol_fractions = FA_df.loc[:, "mmol/mmol FA"].values
FA_av_formula_array = weighted_average_formula(formulas, mol_fractions, return_type="array")
FA_av_formula_array = FA_av_formula_array - coo_array
# Text form is rounded to 1 decimal; the unrounded array is kept for later use
FA_av_formula = array_to_formula(FA_av_formula_array.round(1), elemental_order=elemental_order)

# NOTE(review): the macrocomponent label here is "Lipid", while the lipid rows
# of `mols` are selected elsewhere with "Lipids" - confirm this is intended
mols.loc["Average Fatty Acid (R)",["Family", "Macrocomponent", "Average Formula"]] = ["Fatty Acids", "Lipid", FA_av_formula]


In [8]:
# Bring the metabolite dictionary in line with `mols`: existing entries get the
# row name and average formula, missing ones are created
for met_name, mol_row in mols.iterrows():
    avg_formula = mol_row["Average Formula"]
    known = AllMets.get(met_name)
    if not known:
        AllMets[met_name] = metabolite(name=met_name, formula=avg_formula)
        continue
    known.name = met_name
    known.formula = avg_formula

#### 2. b) Calculate a representative formula for each lipid, by adding x R groups to the core formula, using the average fatty acid (R) calculated above

In [9]:
# Number of fatty-acyl (R) groups per lipid
R_quantity = lipids_df.loc[:,"Number of Rs"].astype(int).to_numpy()
# Product of the average R group and the R counts reshaped to a row vector
# (assumes FA_av_formula_array is an elements x 1 column vector - TODO confirm)
R_groups_formula = FA_av_formula_array.dot(R_quantity.reshape((1,-1)))

# Lipid names as keys, core formulas (without R) as values
core_lipid_formulas = lipids_df.loc[:, "Core Formula"].to_dict()
core_lipids = [metabolite(name=k, formula=v) for k, v in core_lipid_formulas.items()]

# One column vector of elemental counts per core lipid
cl_arrays = [elements_dict_to_array(lip.elements()).reshape((-1,1)) for lip in core_lipids]

# Matrix with elements as rows and lipids as columns
cl_matrix = np.concatenate(cl_arrays, axis=1)

# Core formula plus the R-group contribution gives the representative formula
lipid_av_formula_mx = cl_matrix + R_groups_formula
# Convert each lipid (one row of the transposed matrix) back to a text formula
av_lipid_formulas = [array_to_formula(row, elemental_order=elemental_order) for row in lipid_av_formula_mx.T.round(2)]
lipids_df.loc[:, "Average Formula"] = av_lipid_formulas
# Copy the updated lipid formulas into the mols dataframe
mols.update(lipids_df.loc[:,["Average Formula"]])

# Recalculate masses from the updated formulas in both dataframes. The mols
# column is computed once; the original cell ran the same line twice.
lipids_df["Average Mass"] = lipids_df["Average Formula"].apply(lambda x: metabolite(formula=x).mass)
mols["Average Mass"] = mols["Average Formula"].apply(lambda x: metabolite(formula=x).mass)

AllStaticComponents = pd.concat([AllStaticComponents,
                                 lipids_df.reset_index().loc[:,["Name","Family",
                                                                "Macrocomponent",
                                                                "Average Formula"]]])
                                                

#### 2. c) DNA

In [10]:
# Nucleotide composition table; read as text, molar fractions cast to float
nts_df = pd.read_csv(
    "../data/biomass/static/Nucleotides.csv", index_col=0, dtype=str
).astype({"moles/moles Family": float})

# DNA nucleotides (first column dropped) joined with their formula and mass
dna_nucleotides = nts_df[nts_df.Family == "DNA"].iloc[:, 1:]
dna_formulas = mols.loc[mols.Family == "DNA", ["Average Formula", "Average Mass"]]
DNA_df = dna_nucleotides.merge(dna_formulas, on="Name")

# In-place helper call on DNA_df
fractions_calculator(
    DNA_df,
    "moles/moles Family",
    "Average Mass",
    "DNA",
    value_type="molar_fraction",
)

# Molar-fraction weighted average formula of the DNA nucleotides
formulas = DNA_df["Average Formula"].to_dict()
mol_fractions = DNA_df["moles/moles Family"].values
DNA_av_formula = weighted_average_formula(formulas, mol_fractions)

# Append DNA as a single-row frame
dna_row = pd.Series(
    {
        "Name": "DNA",
        "Family": "DNA",
        "Macrocomponent": "DNA",
        "Average Formula": DNA_av_formula,
    }
).to_frame().T
AllStaticComponents = pd.concat([AllStaticComponents, dna_row], ignore_index=True)


#### 2. d) RNA

In [11]:
# Split the RNA table into the class-level composition (rows whose Family is
# "RNA") and the per-class nucleotide composition (all other RNA rows)
RNA_df = nts_df[nts_df.Macrocomponent == "RNA"].copy()
RNA_class_comp = RNA_df[RNA_df.Family == "RNA"].copy()
RNA_df.drop(RNA_class_comp.index, inplace=True)
RNA_df = pd.merge(RNA_df,
                  mols[mols.Family == "RNA"].loc[:,["Average Formula","Average Mass"]],
                  on="Name", how="left")

# Weight each nucleotide's fraction within its RNA class by the fraction of
# that class in total RNA
RNA_types = RNA_class_comp.index.to_list()
for RNA_class in RNA_types:

    x_mask = RNA_df.Family == RNA_class
    RNA_df.loc[x_mask, "moles/moles Family"] = RNA_df.loc[x_mask, "moles/moles Family"] * \
    RNA_class_comp.loc[RNA_class, "moles/moles Family"]

# Sum the weighted fractions per nucleotide. numeric_only=True keeps the
# pandas 1.x behaviour; pandas >= 2.0 would otherwise also "sum" the text
# columns and create duplicate columns in the concat below.
# NOTE(review): "Average Mass" is numeric and is therefore summed over the RNA
# classes too - confirm that is acceptable for fractions_calculator.
av_RNA = RNA_df.reset_index().groupby("Name").sum(numeric_only=True)
RNA_average = pd.concat([av_RNA,
                         RNA_df.loc[:,["CheBI", "Macrocomponent","Average Formula"]
                                   ].drop_duplicates()],
                        axis=1)

RNA_average["Family"] = 'RNA'

# In-place helper call on RNA_average
fractions_calculator(RNA_average, "moles/moles Family", "Average Mass", "RNA",
                                     value_type="molar_fraction")


# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
RNA_df = pd.concat([RNA_df, RNA_average])

# Molar-fraction weighted average formula of total RNA
formulas = RNA_average.loc[:, "Average Formula"].to_dict()
mol_fractions = RNA_average.loc[:, "moles/moles Family"].values
RNA_av_formula = weighted_average_formula(formulas, mol_fractions)

AllStaticComponents = pd.concat([AllStaticComponents,
                                 pd.Series({"Name":"RNA",
                                            "Family":"RNA",
                                            "Macrocomponent":"RNA",
                                            "Average Formula":RNA_av_formula}).to_frame().T], ignore_index=True)

#### 2. f) Sphingolipids Representation
The FA chain distribution is quite different in the sphingolipids, so we use the same distribution as published previously, from Grillitsch.

In [12]:
# Sphingolipid species with their formula and mass from the metabolite dictionary
SphLip = pd.read_csv(f"../data/biomass/static/Sphingolipids.csv",index_col=0)
SphLip["Formula"] = [AllMets[species].formula for species in SphLip.index]
SphLip["Mass"] = [AllMets[species].mass for species in SphLip.index]

# One averaged formula per sphingolipid family; the rows are collected first
# and appended to the static components in a single concat
family_rows = []
for family, group in SphLip.groupby("Family"):
    mol_fractions = group["mol/mol Family"].to_numpy()
    formulas = group["Formula"].to_dict()
    family_av_formula = weighted_average_formula(formulas, mol_fractions)

    family_record = pd.Series({"Name":family,
                               "Family":"Sphingolipids",
                               "Macrocomponent":"Lipids",
                               "Average Formula":family_av_formula})
    family_rows.append(family_record.to_frame().T)

if family_rows:
    AllStaticComponents = pd.concat([AllStaticComponents] + family_rows,
                                    ignore_index=True)
    

### Update dictionary entries

In [13]:

        
# Bring the metabolite dictionary in line with the static components: existing
# entries get the component's name and average formula, missing ones are created
for _, component in AllStaticComponents.iterrows():
    comp_name = component["Name"]
    comp_formula = component["Average Formula"]
    known = AllMets.get(comp_name)
    if not known:
        AllMets[comp_name] = metabolite(name=comp_name, formula=comp_formula)
        continue
    known.name = comp_name
    known.formula = comp_formula
        

## 3. Incorporate variable measurements for the different sample points 

Variable components
- Carbohydrates
    - Glycogen content
    - Trehalose content
    - Total carbohydrate
- Lipids
    - Lipid distribution
    - Sum of lipid distribution
- Protein
    - Amino acid content
    - Total amino acids
    - Total protein - Biuret
- DNA
    - Total DNA - 
- RNA
    - Total RNA -

### Import the measured biomass compositions

In [14]:
# Sample metadata, indexed by the first CSV column
sample_info = pd.read_csv(f"../data/SampleInfo.csv", index_col=0)
# Index label -> "Chemostat Length"; later cells look cultivations up in this
# mapping to fill their "Cultivation Group" column
cul_grp_dict = sample_info.loc[:, "Chemostat Length"].to_dict()

#### 3. a) Amino acid composition and the sum of amino acids.

Amino acid data is all present as nmol/mg CDW (converted to mmol/g CDW during processing)

### Import and do initial processing on Amino Acid data
- Also includes calculation of the mean and SD of the total protein content from the AA data

In [15]:
# Amino acid measurements, one row per sample; data is in nmol/mg CDW
AA_df = pd.read_csv(f"../data/biomass/measured/AAComp.csv")
cols = list(AA_df.columns)
# Keep the first column name as is and truncate every amino acid column name to
# its first three characters (the three letter code)
cols = cols[:1] + [x[:3] for x in cols[1:]]
AA_df.columns = cols
AA_df.set_index("Sample", inplace=True)

# Arbitrary split of Asx between Asp and Asn: half to each
AA_df["Asp"] = AA_df.loc[:,"Asx"]/2
AA_df["Asn"] = AA_df.loc[:,"Asx"]/2
# The lysozyme row is a percentage, so it must not be halved: restore the Asx value
AA_df.loc["WFR Lysozyme on yeast (%)",["Asp", "Asn"]] = AA_df.loc["WFR Lysozyme on yeast (%)","Asx"]

# Same treatment for Glu: remember the lysozyme percentage before halving,
# then split the measured value equally between Gln and Glu
lysrec = AA_df.loc["WFR Lysozyme on yeast (%)","Glu"]
AA_df["Gln"] = AA_df.loc[:,"Glu"]/2
AA_df["Glu"] = AA_df.loc[:,"Glu"]/2
# Want to represent tryptophan but acid hydrolysis affects residue recovery, so use tyrosine value
AA_df["Trp"] = AA_df["Tyr"]



# Restore the un-halved lysozyme percentage for Glu and Gln
AA_df.loc["WFR Lysozyme on yeast (%)",["Glu", "Gln"]] = lysrec

AA_df.drop(columns="Asx", inplace=True)

# Map the three letter codes ("Alternative Name" in mols) to the metabolite names
rename_dict = mols[mols.Family == "Amino Acid"]["Alternative Name"].to_dict()
rename_dict = {v:k for k,v in rename_dict.items()}

# AA_df.set_index("Sample", inplace=True)
AA_df.rename(columns=rename_dict, inplace=True)
# Pull the lysozyme row out of the table; it is only used for the adjusted values
lysozyme = AA_df.loc["WFR Lysozyme on yeast (%)",:]
AA_df.drop("WFR Lysozyme on yeast (%)", inplace=True)
AA_df.reset_index(inplace=True)

# Sample names are "<Cultivation>_<Sample Point>"
AA_df[["Cultivation","Sample Point"]] = AA_df.Sample.str.split("_", expand=True)
AA_df["Cultivation Group"] = AA_df["Cultivation"].apply(lambda x: cul_grp_dict.get(x))

AA_df.drop(columns="Sample", inplace=True)
AA_df.set_index(["Cultivation Group", "Sample Point", "Cultivation"], inplace=True)
# Convert to mmoles/gCDW. Originally in nmoles/mg
AA_df = AA_df * 1e-3

# Second data set: every amino acid divided by its lysozyme percentage / 100
# NOTE(review): presumably a recovery correction - confirm
AA_adjusted = AA_df.copy() # Can adjust for lysozyme by uncommenting the AA_df comment
AA_adjusted = AA_adjusted / (lysozyme/100)


# Reconciliation measurements: total protein as the sum of amino acid masses

# Masses divided by 1000 so that mmol/g CDW times mass gives g/g CDW
# (assumes "Average Mass" is in g/mol - TODO confirm)
AA_mws = mols.loc[AA_df.columns, "Average Mass"].values/1000
for i, df in enumerate([AA_df, AA_adjusted]):
    
    # Sum over amino acids per sample, then mean and SD per sampling point
    temp = (df * AA_mws).sum(axis=1).to_frame()
    temp = temp.groupby(["Cultivation Group", "Sample Point"]).agg([("Value", "mean"), ("SD", "std")])
    temp = temp.droplevel(0, axis=1)

    temp.reset_index(inplace=True)

    
    temp.set_index(["Cultivation Group", "Sample Point"], inplace=True)
    # R3 is filled in as the mean of the SS and R6 sampling points of the SC group
    temp.loc[("SC", "R3"), "Value"] = temp.loc[("SC", ["SS","R6"]), "Value"].mean()
    # Add a higher SD value, relative SD of 10% to the R3 value due to being an average of the two neighbouring sampling points
    temp.loc[("SC", "R3"), "SD"] = temp.loc[("SC", "R3"), "Value"] * 0.1
    
    temp.reset_index(inplace=True)
    # i == 0 is the raw data, i == 1 the lysozyme-adjusted data
    temp["Name"] = ["Protein-AA_Sum", "Protein-AA_Sum_Adjusted"][i]
    temp["Method"] = ["AA_Sum", "AA_Sum_Adjusted"][i]
    temp["Macrocomponent"] = "Protein"


    AllMeasurements = pd.concat([AllMeasurements,
                                            temp], ignore_index=True)

#### 3. b) Representative formula for protein as well as the stoichiometry for each condition
    - Using the non adjusted amino acid composition

In [16]:
# Mean and SD of every amino acid per (Cultivation Group, Sample Point);
# stack(0) moves the amino acid names from the columns into the row index
AA_grouped = AA_df.groupby(["Cultivation Group", "Sample Point"]).agg(["mean","std"]).stack(0).sort_index()

AA_grouped.reset_index(inplace=True)

# The stacked (unnamed) level comes back from reset_index as "level_2"
AA_grouped.rename(columns={"mean":"mmol/g CDW", "level_2":"Name"}, inplace=True)

# No AA measured for the R3 sampling point: use the mean of SS and R6 (SC group)
R3_distribution = AA_grouped.set_index(["Cultivation Group", "Sample Point", "Name"]
                                      )["mmol/g CDW"].unstack().loc[("SC", ["SS", "R6"]),:].mean()
R3_distribution.name = "mmol/g CDW"
R3_distribution = R3_distribution.to_frame().reset_index()
R3_distribution["Cultivation Group"] = "SC"
R3_distribution["Sample Point"] = "R3"

AA_grouped = pd.concat([AA_grouped, R3_distribution], ignore_index=True)

AA_grouped["Average Formula"] = AA_grouped.Name.apply(lambda x: AllMets[x].formula)
AA_grouped["Average Mass"] = AA_grouped.Name.apply(lambda x: AllMets[x].mass)

AA_grouped.set_index(["Cultivation Group", "Sample Point", "Name"], inplace=True)

# Run the fraction helper separately for every sampling point. The helper is
# expected to return a frame containing the "g/g Protein" column used below.
# (The unused `all_sp_formulas` list of the original cell has been dropped.)
all_groups = [
    fractions_calculator(group, "mmol/g CDW", "Average Mass", "Protein",
                         value_type="mmol_gcdw_fraction", inplace=False)
    for _, group in AA_grouped.groupby(["Cultivation Group", "Sample Point"])
]

AA_grouped = pd.concat(all_groups)
AA_grouped["Family"] = "Amino Acids"
AA_grouped["Macrocomponent"] = "Protein"

# Keep only the columns of the shared stoichiometry table
temp = AA_grouped.reset_index()
temp.rename(columns={"g/g Protein":"Value"}, inplace=True)
temp = temp.loc[:, ["Cultivation Group", "Sample Point", "Name", "Value",
                    "Family", "Macrocomponent"]]

VariableStoichMeasurements = pd.concat([VariableStoichMeasurements,
                                        temp], ignore_index=True)




#### 3. c) Representative lipid for each sampling point

In [17]:
# Lipid class abbreviation -> metabolite name used throughout the notebook
lipid_name_dict = dict(
    DG="1,2-diglyceride",
    TG="triglyceride",
    Ergosterol="ergosterol",
    Cer="Ceramide",
    LPC="lysophosphatidylcholine",
    PC="phosphatidylcholine",
    PE="phosphatidylethanolamine",
    PG="phosphatidylglycerol",
    PI="1-phosphatidyl-1D-myo-inositol(1-)",
    PS="phosphatidyl-L-serine(1-)",
    ZE="zymosterol ester",
    EE="ergosteryl ester",
    HexCer="Glucosylceramide",
)

Import the results from the manual calculation of ceramide and sphingolipid amounts and their ratios to inositol-containing sphingolipids

In [18]:

# Manually calculated ceramide amounts and the static sphingolipid ratios
ManualCer = pd.read_csv(f"../data/biomass/measured/ManualCer.csv", index_col=0)
SphingoRatios = pd.read_csv(f"../data/biomass/static/SphingoRatios.csv", index_col=0)

# Index with Family as the outer level so each family can be selected as a
# block, and translate the abbreviations to metabolite names
ManualCer = (
    ManualCer
    .set_index(["Family","Sample","Sample Point","Cultivation Group"])
    .rename(index=lipid_name_dict)
)

# Ceramide : glucosylceramide ratio per sample
ceramide_block = ManualCer.loc["Ceramide"]
hexcer_block = ManualCer.loc["Glucosylceramide"]
CerHexCerRatios = ceramide_block/hexcer_block

# Re-index with Family as the innermost level
ManualCer = ManualCer.reset_index().set_index(
    ["Sample","Sample Point","Cultivation Group", "Family"]
)

In [19]:
# Lipidomics concentrations; keep only the "mean" rows
TotLip = pd.read_csv(f"../data/biomass/measured/LipidConc.csv", index_col=[0])
# Some sample names use the letter O instead of the digit 0
TotLip.Sample = TotLip.Sample.str.replace('CO',"C0", regex=False)
TotLip = TotLip[TotLip.Type == "mean"].drop(columns="Type")
TotLip[["Cultivation","Sample Point"]] = TotLip.Sample.str.split("_", expand=True)
TotLip["Cultivation Group"] = TotLip["Cultivation"].apply(lambda x: cul_grp_dict.get(x))

# Mean/SD per sampling point over the numeric (lipid) columns only; pandas >= 2.0
# raises on the text columns instead of silently dropping them
SamplePointTotals = TotLip.groupby(["Cultivation Group","Sample Point"])[
    list(TotLip.select_dtypes("number").columns)].agg(["mean", "std"])
SamplePointTotals = SamplePointTotals.stack(0).T.stack()

# Drop cultivation column and create variable:value columns of lipid:g/g %
TotLip = TotLip.drop(columns="Cultivation").melt(id_vars=["Sample", "Sample Point", "Cultivation Group"])
# Split the lipid names into Family and Lipid at the first space.
# `n` must be passed by keyword: positional use is rejected by pandas >= 2.0
TotLip[["Family", "Lipid"]] = TotLip.variable.str.split(" ", n=1, expand=True)

# Sum the species of each family per sample. numeric_only=True keeps the
# pandas 1.x result (only the value column) on pandas >= 2.0.
TotLipFamily = TotLip.groupby(["Sample", "Sample Point", "Cultivation Group", "Family"]).sum(numeric_only=True)
TotLipFamily.rename(index=lipid_name_dict, columns={"value":"Value"}, inplace=True)

# Replace the ceramide values of the selected sample points / groups with the
# manually calculated ones
TotLipFamily.loc[(slice(None), ["SS", "C0.1"], ["SC", "C0.1"], "Ceramide"),"Value"] = ManualCer.loc[(slice(None), ["SS", "C0.1"], ["SC", "C0.1"], "Ceramide"),"Value"]

# Glucosylceramide estimated from ceramide and the Cer/HexCer ratio
temp_index = TotLipFamily.loc[(slice(None), slice(None), slice(None), "Ceramide"),:].droplevel(-1).index
HexCer = TotLipFamily.loc[(slice(None), slice(None), slice(None), "Ceramide"),:] / CerHexCerRatios.loc[temp_index, :]
HexCer = HexCer.droplevel(-1)


# Scale the static sphingolipid ratios by the HexCer value of each sample
SphingoValues = pd.DataFrame(HexCer.values * SphingoRatios.values.T,
                             columns=SphingoRatios.index,
                             index=HexCer.index)

SphingoValues = SphingoValues.stack().reset_index()
SphingoValues.rename(columns={0: "Value"}, inplace=True) # SHOULD CHANGE IT TO g/g and only change it back later
TotLipFamily = pd.concat([TotLipFamily.reset_index(),
                          SphingoValues], ignore_index=True)

TotLipFamily.rename(columns={"Family":"Name"}, inplace=True)



#### 3. d) Sum of lipid classes
The sum of the lipids from the lipidomic analysis has been demonstrated to be a reasonable approximation of the total lipid content. We included the extra "assumed" sphingolipids as an addition to the total lipid calculation. These values are small and make little relative difference.

In [20]:
# Total lipid per sample. numeric_only=True keeps the pandas 1.x result on
# pandas >= 2.0, where the text column "Name" would otherwise be concatenated
# and break the mean/std aggregation below.
SumLip = TotLipFamily.groupby(["Sample", "Cultivation Group", "Sample Point"]).sum(numeric_only=True)

# Mean and SD of the per-sample totals for every sampling point
SumLip = SumLip.groupby(["Cultivation Group", "Sample Point"]).agg(["mean", "std"])

# Drop the outer column level, leaving "mean" and "std"
temp = SumLip.droplevel(0, axis=1)

temp.rename(columns={"mean":"Value", "std":"SD"}, inplace=True)
# No measurement for R3, so take an average between SS and R6
temp.loc[("SC", "R3"),"Value"] = temp.loc[("SC", ["SS","R6"]),"Value"].mean()
# Add a higher SD value, relative SD of 10% to the R3 value due to being an average of the two neighbouring sampling points
temp.loc[("SC", "R3"),"SD"] = temp.loc[("SC", "R3"),"Value"] * 0.10
temp.reset_index(inplace=True)

temp["Name"] = "Lipid"
temp["Method"] = "Sum of Lipid"
temp["Macrocomponent"] = "Lipid"
AllMeasurements = pd.concat([AllMeasurements,
                                        temp], ignore_index=True)


In [21]:
# One row per sample, one column per lipid family (pivot_table averages duplicates)
TotLipFamily_pivot = TotLipFamily.pivot_table(values='Value', index=["Sample", "Sample Point", "Cultivation Group"], columns='Name')

# Mean and SD of every lipid family per sampling point; stack(0) moves the
# family names from the columns into the row index
TLF_grouped = TotLipFamily_pivot.groupby(["Cultivation Group", "Sample Point"]).agg(["mean","std"]).stack(0).sort_index()
TLF_grouped.reset_index(inplace=True)

TLF_grouped.rename(columns={"mean":"g/g CDW", "level_2":"Name"}, inplace=True)

# No measurement for the R3 sampling point: use the mean of SS and R6 (SC group)
R3_distribution = TLF_grouped.set_index(["Cultivation Group", "Sample Point", "Name"])["g/g CDW"].unstack().loc[("SC", ["SS", "R6"]),:].mean()
R3_distribution.name = "g/g CDW"
R3_distribution = R3_distribution.to_frame().reset_index()
R3_distribution["Cultivation Group"] = "SC"
R3_distribution["Sample Point"] = "R3"
TLF_grouped = pd.concat([TLF_grouped, R3_distribution], ignore_index=True)

# Formula and mass of every lipid family from the metabolite dictionary
TLF_grouped["Average Formula"] = TLF_grouped.Name.apply(lambda x: AllMets[x].formula)
TLF_grouped["Average Mass"] = TLF_grouped.Name.apply(lambda x: AllMets[x].mass)

TLF_grouped.set_index(["Cultivation Group", "Sample Point", "Name"], inplace=True)


# Run the fraction helper separately for every sampling point
all_groups = []

for name, group in TLF_grouped.groupby(["Cultivation Group", "Sample Point"]):
    
    # inplace=False: the helper returns the processed group, which is expected
    # to contain the "g/g Lipid" column renamed below
    group = fractions_calculator(group, "g/g CDW", "Average Mass", "Lipid", 
                         value_type="mass_fraction", inplace=False)
    
    all_groups.append(group)
    
TLF_grouped = pd.concat(all_groups)
TLF_grouped["Family"] = "Lipid"
TLF_grouped["Macrocomponent"] = "Lipid"

temp = TLF_grouped.reset_index()



# Keep only the columns of the shared stoichiometry table
temp.rename(columns={"g/g Lipid":"Value"}, inplace=True)
temp = temp.loc[:, ["Cultivation Group", "Sample Point", "Name", "Value", 
                    "Family", "Macrocomponent"]]

VariableStoichMeasurements = pd.concat([VariableStoichMeasurements,
                                        temp], ignore_index=True)



#### 3. e) Calculate carbohydrate composition
Measurements of trehalose and glycogen are combined with the total carbohydrate content, assuming a chitin-glucan complex as 22% of the total cell dry weight and calculating accordingly.

Import of carbohydrate data and initial processing

In [22]:
Carb_df = pd.read_csv("../data/biomass/measured/CarbComp.csv")

sample_info = pd.read_csv("../data/SampleInfo.csv", index_col=0)
cul_grp_dict = sample_info["Chemostat Length"].to_dict()

# Sample names are "<Cultivation>_<Sample Point>"
Carb_df[["Cultivation","Sample Point"]] = Carb_df.Sample.str.split("_", expand=True)

Carb_df["Cultivation Group"] = Carb_df["Cultivation"].apply(lambda x: cul_grp_dict.get(x))


Carb_df.drop(columns="Cultivation", inplace=True)

# Chitin-glucan complex (CGC): the "CGC" row holds the fraction that is
# multiplied with each component's g/g Family value to give its g/g DCW
CGC_df = pd.read_csv("../data/biomass/static/CGCComp.csv", index_col=0)
CGC_prop = CGC_df.loc["CGC","g/g Family"]
CGC_df = CGC_df.drop("CGC")
CGC_df.loc[:,"g/g DCW"] = CGC_df.loc[:,"g/g Family"]* CGC_prop
CGC_g_g = CGC_df.T.loc["g/g DCW",:]

# Keep the "Mean" rows. The drop is chained (not in place) so it does not
# operate on a filtered slice of the original frame (SettingWithCopyWarning).
Carb_df = Carb_df[Carb_df["Variable"] == "Mean"].drop(columns=["Sample", "Variable"])
# Add one constant column per CGC component
Carb_df = Carb_df.assign(**CGC_g_g.to_dict())
Carb_df.set_index(["Cultivation Group", "Sample Point"], inplace=True)
# Carbohydrate not accounted for by any of the other columns
other = Carb_df.Total - Carb_df[Carb_df.columns.drop("Total")].sum(axis=1)
# Assume "other" is glucan and add it to the glucan column
Carb_df["(1->4)-beta-D-glucan"] = other + Carb_df["(1->4)-beta-D-glucan"]
Carb_df.rename(columns={"Glycogen":"glycogen",
                             "Trehalose":"alpha,alpha-trehalose"},
                   inplace=True)

# No total Carb values for some
Carb_df.dropna(inplace=True)

Grouping and calculating stoichiometric ratios etc

In [23]:
# Mean and SD of every carbohydrate column per sampling point
Carb_df_grouped = Carb_df.groupby(["Cultivation Group", "Sample Point"]).agg(["mean","std"])

# Total carbohydrate goes to the macrocomponent measurements
temp = (
    Carb_df_grouped.loc[:, ["Total"]]
    .droplevel(0, axis=1)
    .reset_index()
    .rename(columns={"mean":"Value", "std":"SD"})
    .assign(Name="Carbohydrate", Method="Measured", Macrocomponent="Carbohydrate")
)
AllMeasurements = pd.concat([AllMeasurements, temp], ignore_index=True)

# The individual carbohydrates become rows: one per (sampling point, name)
Carb_df_grouped = (
    Carb_df_grouped
    .drop(columns="Total")
    .stack(0)
    .reset_index()
    .rename(columns={"mean":"g/g CDW", "level_2":"Name"})
)

Carb_df_grouped["Average Formula"] = Carb_df_grouped["Name"].apply(lambda x: AllMets[x].formula)
Carb_df_grouped["Average Mass"] = Carb_df_grouped["Name"].apply(lambda x: AllMets[x].mass)
Carb_df_grouped = Carb_df_grouped.set_index(["Cultivation Group", "Sample Point", "Name"])

# Run the fraction helper separately for every sampling point
all_groups = []
for _key, sample_group in Carb_df_grouped.groupby(["Cultivation Group", "Sample Point"]):
    all_groups.append(
        fractions_calculator(sample_group, "g/g CDW", "Average Mass", "Carbohydrate",
                             value_type="mass_fraction", inplace=False)
    )

Carb_df_grouped = pd.concat(all_groups)
Carb_df_grouped["Family"] = "Carbohydrate"
Carb_df_grouped["Macrocomponent"] = "Carbohydrate"

# Keep only the columns of the shared stoichiometry table
temp = Carb_df_grouped.reset_index().rename(columns={"g/g Carbohydrate":"Value"})
temp = temp[["Cultivation Group", "Sample Point", "Name", "Value",
             "Family", "Macrocomponent"]]
VariableStoichMeasurements = pd.concat([VariableStoichMeasurements, temp],
                                       ignore_index=True)


## 5. Importing macrocomponent measurements
- Protein
    - Total protein - Biuret and BSA method
- DNA
    - Total content
- RNA 
    - Total content
    

#### 5. a) Total protein content

In [24]:
TotalProt = pd.read_csv(f"../data/biomass/measured/ProteinComp.csv")

# Sample names are "<Cultivation>_<Sample Point>"
TotalProt[["Cultivation","Sample Point"]] = TotalProt.Sample.str.split("_", expand=True)

TotalProt["Cultivation Group"] = TotalProt["Cultivation"].apply(lambda x: cul_grp_dict.get(x))
# Keep the "Mean" rows and drop the helper columns in one chained step. The
# original cell also had a `drop` whose result was discarded (a no-op) and an
# in-place drop on the filtered slice (SettingWithCopyWarning).
TotalProt = TotalProt[TotalProt["Variable"] == "Mean"].drop(
    columns=["Variable", "Cultivation", "Sample"])
# Mean and SD per sampling point and measurement method
TotalProt = TotalProt.groupby(["Cultivation Group", "Sample Point", "Method"]).agg(["mean", "std"])
TotalProt = TotalProt.droplevel(0,axis=1)
TotalProt.reset_index(inplace=True)
TotalProt["Name"] = "Protein-" + TotalProt["Method"]
TotalProt["Macrocomponent"] = "Protein"

TotalProt.rename(columns={"mean":"Value", "std":"SD"}, inplace=True)
AllMeasurements = pd.concat([AllMeasurements,
                                        TotalProt])

#### 5. b) Total DNA

In [25]:
DNAComp = pd.read_csv(f"../data/biomass/measured/DNAComp.csv")
# Explicit copy: columns are added below, which would otherwise act on a
# filtered slice of the original frame (SettingWithCopyWarning)
DNAComp = DNAComp[DNAComp["Variable"] == "Mean"].copy()
# Sample names are "<Cultivation>_<Sample Point>"
DNAComp[["Cultivation","Sample Point"]] = DNAComp.Sample.str.split("_", expand=True)

DNAComp["Cultivation Group"] = DNAComp["Cultivation"].apply(lambda x: cul_grp_dict.get(x))
DNAComp.drop(columns=['Sample', 'Variable', 'Cultivation'], inplace=True)
# Mean and SD per sampling point
DNAComp = DNAComp.groupby(["Cultivation Group", "Sample Point"]).agg(["mean", "std"])

# Drop the outer column level, leaving "mean" and "std"
DNAComp = DNAComp.droplevel(0, axis=1)

DNAComp.reset_index(inplace=True)
# (the original also mapped "level_2", a column that does not exist here)
DNAComp.rename(columns={"mean":"Value", "std":"SD"}, inplace=True)
DNAComp["Name"] = "DNA"
DNAComp["Macrocomponent"] = "DNA"
DNAComp["Method"] = "Total DNA"

AllMeasurements = pd.concat([AllMeasurements,
                                        DNAComp])

#### 5. c) Total RNA

In [26]:
RNAComp = pd.read_csv(f"../data/biomass/measured/RNAComp.csv")
# Explicit copy: columns are added below, which would otherwise act on a
# filtered slice of the original frame (SettingWithCopyWarning)
RNAComp = RNAComp[RNAComp["Variable"] == "Mean"].copy()
# Sample names are "<Cultivation>_<Sample Point>"
RNAComp[["Cultivation","Sample Point"]] = RNAComp.Sample.str.split("_", expand=True)

RNAComp["Cultivation Group"] = RNAComp["Cultivation"].apply(lambda x: cul_grp_dict.get(x))
RNAComp.drop(columns=['Sample', 'Variable', 'Cultivation'], inplace=True)
# Mean and SD per sampling point
RNAComp = RNAComp.groupby(["Cultivation Group", "Sample Point"]).agg(["mean", "std"])

# Drop the outer column level, leaving "mean" and "std"
RNAComp = RNAComp.droplevel(0, axis=1)

RNAComp.reset_index(inplace=True)
# (the original also mapped "level_2", a column that does not exist here)
RNAComp.rename(columns={"mean":"Value", "std":"SD"}, inplace=True)
RNAComp["Name"] = "RNA"
RNAComp["Macrocomponent"] = "RNA"
RNAComp["Method"] = "Total RNA"

AllMeasurements = pd.concat([AllMeasurements,
                                        RNAComp])

## 6. Bring all data together and export


In [27]:
# Replace the two sampling columns by one "<Cultivation Group>_<Sample Point>" label
cultivation_label = AllMeasurements["Cultivation Group"] + "_" + AllMeasurements["Sample Point"]
AllMeasurements = (
    AllMeasurements
    .assign(Cultivation=cultivation_label)
    .drop(columns=["Cultivation Group", "Sample Point"])
)

# Of all protein measurements only the amino-acid sum is kept, renamed "Protein"
protein = AllMeasurements[AllMeasurements.Name == "Protein-AA_Sum"].copy()
protein["Name"] = "Protein"

not_protein = AllMeasurements.Macrocomponent != "Protein"
AllMeasurements = pd.concat([AllMeasurements[not_protein], protein]).sort_index()

# Long format: one row per (Cultivation, Name, Value|SD)
MeasurementsLong = (
    AllMeasurements
    .reset_index()
    .melt(id_vars=["Cultivation", "Name"],
          value_vars=["Value","SD"],
          value_name="g/g CDW")
    .set_index(["Cultivation", "Name", "variable"])
)
MeasurementsLong.to_csv("../results/dataframes/biomass/MeasurementsLong.csv")

VariableStoichMeasurements.to_csv("../results/dataframes/biomass/VariableStoichMeasurements.csv")

# Add the static components that are not yet listed in mols
AllStaticComponents = AllStaticComponents.set_index("Name")
new_components = AllStaticComponents[~AllStaticComponents.index.isin(mols.index)]
mols = pd.concat([mols, new_components]).drop(columns=["Core Formula", "Core Formula Mass"])

# Masses come from the metabolite dictionary, looked up by row label
mols["Average Mass"] = mols.apply(lambda mol_row: AllMets[mol_row.name].mass, axis=1)

# Model IDs for the extra metabolites
emm = pd.read_csv(f"../data/ExtraMetsModel.csv", index_col=0)
mols.loc[emm.index,"Model ID"] = emm["Model ID"]
mols.to_csv("../results/dataframes/biomass/AllMolswFormula.csv")