In [1]:
import pandas as pd
import numpy as np
import re
import pickle

In [95]:
data_folder = 'data/230623_Kinetics_DA/'


def _drop_empty_rows_and_cols(table):
    """Return `table` without the rows and columns that are entirely NaN.

    `axis` is passed by keyword because pandas >= 2.0 no longer accepts it
    positionally in `DataFrame.dropna`.
    """
    return table.dropna(axis=0, how='all').dropna(axis=1, how='all')


# get all relevant files: source plates, mixing tables, data
buffer_sp = _drop_empty_rows_and_cols(pd.read_excel(data_folder + 'buffers-sp.xlsx', engine='openpyxl'))
# NOTE(review): plasmid_sp is loaded but not used anywhere in the visible cells — confirm whether
# the genex columns below should be classified against it instead of buffer_sp
plasmid_sp = _drop_empty_rows_and_cols(pd.read_excel(data_folder + 'plasmids_sp.xlsx', engine='openpyxl'))
genex_mt = _drop_empty_rows_and_cols(pd.read_csv(data_folder + 'genex-mt.csv'))
buffers_mt = _drop_empty_rows_and_cols(pd.read_csv(data_folder + 'buffers-mt.csv'))
data = _drop_empty_rows_and_cols(pd.read_excel(data_folder + 'output.xlsx', engine='openpyxl'))
# TODO: how do we handle values with no standard curve

# get species and reaction columns
# Every mixing-table column after the first (the row label) is classified by whether its
# header appears in the first column of the buffer source plate.
known_species = buffer_sp.iloc[:,0].values
species_index_b = [addition in known_species for addition in buffers_mt.columns[1:]]
reactions_index_b = [not is_species for is_species in species_index_b]
species_index_g = [addition in known_species for addition in genex_mt.columns[1:]]
plasmids_index_g = [not is_species for is_species in species_index_g]

In [96]:
# get standards and parse equations
# (axis is passed by keyword: pandas >= 2.0 rejects positional arguments to dropna)
standards = pd.read_excel('data/LCMS_Standards.xlsx', engine='openpyxl').dropna(axis=0, how='all').dropna(axis=1, how='all')

# get measurement columns
# a data column is only recognized as a measurement if its label contains a label from the
# first column of the standards table
standard_labels = standards.iloc[:,0].values
measurement_index = [np.any([slabel in dlabel for slabel in standard_labels]) for dlabel in data.columns[1:]]

# Standard curves are written as "Y = <slope>X + <intercept>" or "Y = <slope>X - <intercept>"
# (any '*' is stripped first). Coefficients may be integers, decimals or use an exponent.
_COEFFICIENT = r'(\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)'
_STANDARD_CURVE = re.compile(r'Y = ' + _COEFFICIENT + r'X ([+-]) ' + _COEFFICIENT)


def _make_peak_converter(slope, y_intercept):
    """Return a function that inverts the standard curve Y = slope*X + y_intercept.

    A factory is used so that each converter keeps its own slope and intercept. A lambda
    created directly inside the loop would look the loop variables up when called, and
    every metabolite would end up using the coefficients of the last row.
    """
    def convert(peak_area):
        return (peak_area - y_intercept) / slope  # what are the units?
    return convert


peak_to_concentration = {}
for _, standard in standards.iterrows():
    equation = standard['Equation']
    match = _STANDARD_CURVE.search(equation.replace('*',''))
    if match is None:
        # Fail loudly instead of silently reusing the previous row's coefficients.
        raise ValueError(
            'Could not parse standard curve {!r} for metabolite {!r}'.format(equation, standard['Metabolite'])
        )

    slope = float(match.group(1))
    y_intercept = float(match.group(3))
    if match.group(2) == '-':
        y_intercept = -y_intercept

    peak_to_concentration[standard['Metabolite']] = _make_peak_converter(slope, y_intercept)

In [97]:
# get information for simulation for each sample
# assuming the format for kinetic data is the same

# average measurements across replicates and convert to concentrations
data_entries = []
for i in range(len(data)): # for every data point
    # replicate columns share the label before the first '_' and are averaged together
    d = data.iloc[:,1:].iloc[i,measurement_index].groupby(lambda x: x.split('_')[0]).mean().to_dict()
    d = {k:peak_to_concentration[k](v) for k,v in d.items()}
    # positional access: the index of `data` can have gaps after empty rows were dropped,
    # so label-based `data[col][i]` would not line up with `data.iloc[i]`
    d['sample'] = data['Unnamed: 0'].iloc[i]
    d['time'] = data['Time'].iloc[i]*60*60 # NOTE(review): presumably hours -> seconds; confirm the unit of 'Time'
    data_entries.append(d)
data_to_fit = pd.DataFrame(data_entries).set_index('sample')

metadata = {}
metadata['dilution_factor'] = data.groupby('Unnamed: 0')['Dilution'].apply(lambda x: np.unique(x)[0]).to_dict()
metadata['timepoints'] = data_to_fit.groupby('sample')['time'].apply(lambda x: np.sort(x)).to_dict()
metadata['measurement_labels'] = list(data_to_fit.columns)[:-1] # 'time' is the last key added to each entry, so it is the last column
metadata['sample_labels'] = list(data_to_fit.index.unique())

measurements = {}
enzyme_concentrations = {}
species_concentrations = {}

for sample in data.iloc[:,0].unique(): # for every sample name
    sample_mix = buffers_mt[buffers_mt['Unnamed: 0'] == sample].iloc[:,1:] # mixing-table row(s) for this sample; the first one is used
    species = sample_mix.iloc[0,species_index_b] # get which species were added
    reactions = sample_mix.iloc[0, reactions_index_b] # get which txtl reactions were added
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
    for reaction_label, reaction_amount in reactions.items(): # for every reaction, get which plasmids and cofactors were added
        if reaction_amount > 0:
            plasmids = genex_mt[genex_mt['Unnamed: 0'] == reaction_label].iloc[:,1:].iloc[0, plasmids_index_g]
            # NOTE(review): a sample with several reactions > 0 keeps only the last one, and a sample
            # with none gets no entry at all — confirm this is intended
            enzyme_concentrations[sample] =  plasmids/plasmids.sum()*reaction_amount/200 # still need to convert to molar based on txtl capacity, and make sure the dilution math is right
            # also need to get the cofactors added during genex, dilute them, and add them to the biosyn concentrations. eh, maybe not
            # need to fix the labeling across mixing tables (biotin vs Biotin)
    species_concentrations[sample] = species # need to make sure they all have the same units, and add any cofactors added during genex
    sample_rows = data_to_fit.loc[sample]
    if isinstance(sample_rows, pd.DataFrame):
        # several rows for this sample: kinetic data, ordered by time
        measurements[sample] = sample_rows.sort_values(by='time').drop(columns='time').to_numpy()
    else:
        # a single row comes back as a Series: just one timepoint
        measurements[sample] = sample_rows.drop('time').to_numpy()


In [83]:
# need to convert all labels into kegg ids and EC numbers for the model

# converting species labels to kegg ids can be done with equilibrator_api
from equilibrator_api import ComponentContribution
cc = ComponentContribution()

# Reuse previously resolved ids if a cache exists; otherwise start from scratch.
# NOTE(security): pickle.load can execute arbitrary code — only load a cache file you created yourself.
try:
    with open('src/fitting/speciess_kegg.pkl', 'rb') as f:
        speciess_kegg = pickle.load(f)
except (OSError, EOFError, pickle.UnpicklingError):
    speciess_kegg = {}

for species in buffer_sp.iloc[:,0].values:
    if species not in speciess_kegg.keys():
        # '' marks a species with no usable kegg id, whether the search failed or the compound
        # simply has no kegg identifier; every species therefore ends up with an entry
        kegg_id = ''
        try: # the inhibitor string may be too off, or has no kegg id
            for i in cc.search_compound(species).identifiers:
                if i.registry.namespace == 'kegg':
                    kegg_id = i.accession
                    break # if there are multiple kegg ids, just take the first one which is usually C#### instead of D/G####
        except Exception:
            # some of these will fail, maybe this is something we run a priori, curate, and then just read a file
            pass
        speciess_kegg[species] = kegg_id

with open('src/fitting/speciess_kegg.pkl', 'wb') as f:
    pickle.dump(speciess_kegg, f, pickle.HIGHEST_PROTOCOL)

In [98]:
# enzyme name to kegg converter -> enzymes_kegg
# info to convert enzyme labels to EC should be in the file Maggie inputs to FRENDA-BRENDA (?) 
# for k,v in enzyme_concentrations.items():
#     enzyme_concentrations[k] = v.rename({k:enzymes_kegg[k] for k in v.index})

# NOTE: this cell relabels species_concentrations and metadata['measurement_labels'] in place,
# so it must be run exactly once after the cells that build them
for k,v in species_concentrations.items():
    species_concentrations[k] = v.rename({k:speciess_kegg[k] for k in v.index})

# initial concentrations per sample: species followed by enzymes
# (pd.concat replaces Series.append, which was removed in pandas 2.0)
init_concentrations = {sample:pd.concat([species_concentrations[sample], enzyme_concentrations[sample]]) for sample in metadata['sample_labels']}

metadata['measurement_labels'] = [speciess_kegg[l] for l in metadata['measurement_labels']]

simulation_ready = {'init_concentrations':init_concentrations, 
                    'metadata':metadata,
                    'measurements':measurements}

with open(data_folder + 'simulation_ready.pkl', 'wb') as f:
    pickle.dump(simulation_ready, f, pickle.HIGHEST_PROTOCOL)