In [1]:
import cobra
import GEMS
import pandas as pd
import numpy as np
import os
import re


import pickle
from etcpy import etc
from etcpy import tempDep
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt

In [8]:
def sample_data_uncertainty_with_constraint_increasing_topt(inpt,columns=None):
    '''
    Draw one random sample of the enzyme thermal parameters, moving Topt towards Tm.

    inpt: either the parameter dataframe itself, or a tuple (params, seed). When a
          tuple is given, the global numpy random state is re-seeded with
          seed + current unix time before sampling, so repeated calls with the same
          seed still give different draws.

    params is a dataframe with following columns:
    Tm:  melting temperature. Given in K

    T90, temperature at which 90% of an enzyme is denatured. This is not mandatory. If missing, protein length will be used
         to calculate denaturation curve. Given in K

    dCpt,dCpt_std: the heat capacity difference between transition state and ground state in enzyme catalyzed reaction. Given
                   in J/mol/K

    Topt: the optimal temperature at which the specific activity is maximized. Given in K

    topt_source: tag of the Topt value; rows tagged 'BullShit' get a wider Topt spread.

    TmTag: tag of the Tm value; rows tagged 'Mean' get a wider Tm spread.

    Length, protein length. This will not be used in this function, but it will be used later in the calculation of thermal
            parameters.

    xx_std, corresponding uncertainty given by standard deviation. Only dCpt_std is read here; Topt_std and Tm_std
            are carried through unchanged.

    columns: a list of columns to be sampled, could be any combination of ['Tm','dCpt','Topt'].
             If it is None, then sample all three columns

    Sampling rules:
      Topt ~ Normal(midpoint between Topt and Tm, |Tm - Topt| * f), f = 0.2 for 'BullShit' rows and 0.1 otherwise
      dCpt ~ Normal(dCpt, dCpt_std)
      Tm   ~ Normal(Tm, 4) for 'Mean' rows and Normal(Tm, 2) otherwise

    All distributions are built from the values in the input dataframe, so the order of `columns` does not
    change which distribution a column is drawn from.

    The script will return a new dataframe with the same columns but with randomly sampled data.
    The input dataframe is not modified.
    '''
    if isinstance(inpt, tuple):
        # Imported here because `time` is not part of the notebook's import cell.
        import time
        params,seed = inpt
        # np.random.seed only accepts values in [0, 2**32 - 1]; wrap to stay in range.
        np.random.seed((seed+int(time.time())) % 2**32)
    else:
        params = inpt

    sampled_params = params.copy()
    if columns is None: columns = ['Topt', 'dCpt', 'Tm']

    for col in columns:
        if col == 'Topt':
            for ind in sampled_params.index:
                topt = params.loc[ind, col]
                gap = params.loc[ind, 'Tm'] - topt
                # Mean sits halfway between the tabulated Topt and Tm.
                M = gap*0.5 + topt
                spread = 0.2 if params.loc[ind, 'topt_source']=='BullShit' else 0.1
                # abs() for every row: a negative scale (Tm < Topt) makes np.random.normal raise.
                SD = abs(gap*spread)
                sampled_params.loc[ind, 'Topt'] = np.random.normal(M,SD)

        if col == 'dCpt':
            lst = [np.random.normal(params.loc[ind,col],params.loc[ind,col+'_std']) for ind in sampled_params.index]
            sampled_params[col] = lst

        if col == 'Tm':
            for ind in sampled_params.index:
                # Fixed spreads in K, selected by the TmTag of the row.
                SD = 4 if params.loc[ind, 'TmTag']=='Mean' else 2
                sampled_params.loc[ind, 'Tm'] = np.random.normal(params.loc[ind,col],SD)
    return sampled_params

In [4]:
# Load the data file via the project helper; only the first element of the
# returned object is used below.
# NOTE(review): presumably the experimental growth measurements — confirm against GEMS.load_exp_data.
data = GEMS.load_exp_data('../data/ExpGrowth.csv')
path = '../'
# Per-enzyme parameter table; the first CSV column becomes the row index.
params = pd.read_csv(os.path.join(path,'data/model_enzyme_params_new_tagged.csv'),index_col=0)
# NOTE(review): presumably derives the thermal parameters from `params` — confirm against etcpy.etc.
df = etc.calculate_thermal_params(params)

# First loaded dataset, re-indexed by its 'Ts' column with the index name cleared.
dfae_batch = data[0].set_index('Ts').rename_axis(None)
# Last expression of the cell: display the parameter table.
params

Unnamed: 0,Topt,Topt_std,Length,Tm,Tm_std,T90,dCpt,dCpt_std,topt_source,TmTag
P77467,306.15,10,262,328.717325,7.588735,,-4000,1000,predicted,Mean
P76077,311.15,10,309,328.717325,7.588735,,-4000,1000,predicted,Mean
P76078,310.15,10,95,328.717325,7.588735,,-4000,1000,BullShit,Mean
P76079,309.15,10,248,328.717325,7.588735,,-4000,1000,predicted,Mean
P76081,310.15,10,356,328.717325,7.588735,,-4000,1000,BullShit,Mean
...,...,...,...,...,...,...,...,...,...,...
P0AG80,313.15,10,438,329.166303,4.070000,330.45,-4000,1000,predicted,Exp
P77718,307.15,10,482,319.125751,4.070000,320.15,-4000,1000,predicted,Exp
P0A6D5,308.15,10,288,328.762604,7.710000,,-4000,1000,predicted,Mean
standard,308.15,10,272,328.762604,7.710000,,-4000,1000,BullShit,Mean


In [9]:
# Draw one random parameter set from the loaded table. `columns` is left at its
# default (None), so Topt, dCpt and Tm are all resampled. No seed is set in this
# cell, so the displayed values are not expected to repeat between runs.
newParams = sample_data_uncertainty_with_constraint_increasing_topt(params)
newParams

317.43366249999997 2.256732500000004
319.93366249999997 1.756732500000004
318.93366249999997 1.956732500000004
316.91119133999996 1.3522382680000022
319.8168700673775 2.5333740134755036
317.24458538174747 1.8189170763495044
317.46302862193147 1.8626057243863046
318.394957511224 1.4489915022448032
317.806226670815 1.9312453341630034
317.201842415 2.210368483000002
315.015354095 1.5730708190000031
317.5957987598185 2.2891597519637004
322.988141145 1.7676282290000018
326.86723066 3.143446132000003
317.167059945 1.003411989
320.91167466 2.1523349320000023
319.93366249999997 1.756732500000004
320.43366249999997 1.656732500000004
316.43366249999997 2.456732500000004
318.93366249999997 1.956732500000004
316.498591724823 1.8697183449646049
316.095381395 1.3890762789999997
317.43366249999997 2.256732500000004
318.43366249999997 2.056732500000004
322.3360706054035 3.2372141210807004
317.16538214499997 2.0030764290000036
312.43366249999997 3.256732500000004
319.43366249999997 1.8567325000000041
3

Unnamed: 0,Topt,Topt_std,Length,Tm,Tm_std,T90,dCpt,dCpt_std,topt_source,TmTag
P77467,314.940008,10,262,329.664261,7.588735,,-2411.347056,1000,predicted,Mean
P76077,316.654232,10,309,329.529076,7.588735,,-2613.946099,1000,predicted,Mean
P76078,324.326343,10,95,330.418974,7.588735,,-4665.212841,1000,BullShit,Mean
P76079,318.909001,10,248,338.441477,7.588735,,-4494.029941,1000,predicted,Mean
P76081,319.364474,10,356,329.333683,7.588735,,-2795.187059,1000,BullShit,Mean
...,...,...,...,...,...,...,...,...,...,...
P0AG80,318.388046,10,438,329.484452,4.070000,330.45,-4408.794280,1000,predicted,Exp
P77718,313.920770,10,482,317.038108,4.070000,320.15,-5293.721071,1000,predicted,Exp
P0A6D5,322.317607,10,288,333.349632,7.710000,,-4832.433672,1000,predicted,Mean
standard,325.274131,10,272,323.698866,7.710000,,-3438.560130,1000,BullShit,Mean
