In [9]:
import os
from dotenv import load_dotenv, find_dotenv
from os.path import join, dirname, basename, exists, isdir

### Load environmental variables from the project root directory ###
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)

# now you can get the variables using their names

# Check whether a network drive has been specified
DATABASE = os.environ.get("NETWORK_URL")
if DATABASE == 'None':
    pass
else:
    pass
    #mount network drive here

# set up directory paths
CURRENT_DIR = os.getcwd()
PROJ = dirname(dotenv_path) # project root directory

DATA = join(PROJ, 'data') #data directory
RAW_EXTERNAL = join(DATA, 'raw_external') # external data raw directory
RAW_INTERNAL = join(DATA, 'raw_internal') # internal data raw directory
INTERMEDIATE = join(DATA, 'intermediate') # intermediate data directory
FINAL = join(DATA, 'final') # final data directory

RESULTS = join(PROJ, 'results') # output directory
FIGURES = join(RESULTS, 'figures') # figure output directory
PICTURES = join(RESULTS, 'pictures') # picture output directory


# make folders specific for certain data
folder_name = ''
if folder_name != '':
    #make folders if they don't exist
    if not exists(join(RAW_EXTERNAL, folder_name)):
        os.makedirs(join(RAW_EXTERNAL, folder_name))

    if not exists(join(INTERMEDIATE, folder_name)):
        os.makedirs(join(INTERMEDIATE, folder_name))

    if not exists(join(FINAL, folder_name)):
        os.makedirs(join(FINAL, folder_name))


print('Standard variables loaded, you are good to go!')

Standard variables loaded, you are good to go!


In [10]:
import pandas as pd
import re

# Extract of supplementary table 5:
data = pd.read_csv(f"{RAW_INTERNAL}/proteomics/protein_values.csv", index_col=0)

# # Molecular weights:
MW = data["Molecular weight (Da)"] # Da = g/mol
MW = MW / 1000 # kDa = g/mmol
print(MW)

# start and end columns
start_column = data.columns.get_loc("Glucose")
end_column = data.columns.get_loc("Fructose")
start_column_uncertainty = data.columns.get_loc("Glucose.2")
end_column_uncertainty = data.columns.get_loc("Fructose.2")
# rename columns from .2 to uncertainty
rename_dict = {col_name:re.sub(r".2$", "_uncertainty", col_name) for col_name in \
 data.columns[start_column_uncertainty:end_column_uncertainty+1]}
data = data.rename(columns=rename_dict)
# list of indices of data needed
list_of_indices = list(range(start_column,end_column+1))
list_of_indices.extend(list(range(start_column_uncertainty,end_column_uncertainty+1)))
data = data.iloc[:, list_of_indices]

# get cell volumes
cell_volumes = pd.read_csv(f"{RAW_INTERNAL}/proteomics/growth_conditions.csv", index_col=0)
cell_volumes = cell_volumes["Single cell volume [fl]1"]
# remove the first two rows of LB
cell_volumes = cell_volumes.loc[~cell_volumes.index.duplicated(keep='first')]
# rename the number 3 in there
cell_volumes = cell_volumes.rename({'Osmotic-stress glucose3':'Osmotic-stress glucose_uncertainty'}, axis='index')




Uniprot Accession
P0A8T7    155.045008
P0A8V2    150.520276
P36683     93.420946
P15254    141.295898
P09831    163.176315
             ...    
P36667     31.022762
P0AC78     40.912094
P76164      8.702816
P38506     28.130364
Q46810     21.481914
Name: Molecular weight (Da), Length: 2359, dtype: float64


# 2.-Convert-Units:
-------
First of all, note that the variation values come as coefficients of variation (%), so let's transform them to the same units as the mean values (molecules/cell):

In [11]:
for (col_name, d) in data.iteritems():
    if col_name.endswith("_uncertainty"):
        mean_name = col_name.replace("_uncertainty", "")
        data[col_name] = data[col_name] / 100 * data[mean_name]

print(data.columns)
print(cell_volumes.index)

Index(['Glucose', 'LB', 'Glycerol + AA', 'Acetate', 'Fumarate', 'Glucosamine',
       'Glycerol', 'Pyruvate', 'Chemostat µ=0.5', 'Chemostat µ=0.35',
       'Chemostat µ=0.20', 'Chemostat µ=0.12', 'Stationary phase 1 day',
       'Stationary phase 3 days', 'Osmotic-stress glucose', '42°C glucose',
       'pH6 glucose', 'Xylose', 'Mannose', 'Galactose ', 'Succinate',
       'Fructose', 'Glucose_uncertainty', 'LB_uncertainty',
       'Glycerol + AA_uncertainty', 'Acetate_uncertainty',
       'Fumarate_uncertainty', 'Glucosamine_uncertainty',
       'Glycerol_uncertainty', 'Pyruvate_uncertainty',
       'Chemostat µ=0.5_uncertainty', 'Chemostat µ=0.35_uncertainty',
       'Chemostat µ=0.20_uncertainty', 'Chemostat µ=0.12_uncertainty',
       'Stationary phase 1 day_uncertainty',
       'Stationary phase 3 days_uncertainty',
       'Osmotic-stress glucose_uncertainty', '42°C glucose_uncertainty',
       'pH6 glucose_uncertainty', 'Xylose_uncertainty', 'Mannose_uncertainty',
       'Galactos

In [12]:
# Convert values to mmol/cell:
data = data / 6.022e+23 * 1000

# convert names
cell_volumes.index = [re.sub(r'\W+', '', i).lower() for i in cell_volumes.index]
data.columns = [re.sub(r'\W+', '', i).lower() for i in data.columns]


# Iterate through the dataset and divide by the corresponding cell volume, to get mmol/fL:
for (col_name, d) in data.iteritems():
    chemo_name = col_name.replace("_uncertainty", "").replace("_mean", "")
    try:
        data[col_name] = data[col_name] / cell_volumes.loc[chemo_name]#["cell_volume"]
    except:
        print(chemo_name)

# Finally, convert to mmol/gDW:
water_content = 0.3
cell_density = 1.105e-12
data = data / cell_density / water_content

data.head()

osmoticstressglucose
osmoticstressglucose


Unnamed: 0_level_0,glucose,lb,glycerolaa,acetate,fumarate,glucosamine,glycerol,pyruvate,chemostatµ05,chemostatµ035,...,stationaryphase1day_uncertainty,stationaryphase3days_uncertainty,osmoticstressglucose_uncertainty,42cglucose_uncertainty,ph6glucose_uncertainty,xylose_uncertainty,mannose_uncertainty,galactose_uncertainty,succinate_uncertainty,fructose_uncertainty
Uniprot Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P0A8T7,5e-06,8.365155e-06,5.889506e-06,5e-06,5e-06,6e-06,5e-06,5e-06,9e-06,8e-06,...,3.671994e-07,7.212293e-07,2.354648e-06,9.284339e-07,3.505039e-07,4.088697e-07,1.0422e-06,1.170506e-06,8.827952e-07,7.101495e-07
P0A8V2,7e-06,1.037821e-05,6.79981e-06,6e-06,6e-06,8e-06,7e-06,7e-06,1e-05,9e-06,...,1.074028e-06,7.795029e-07,2.849277e-06,8.697451e-07,4.08774e-07,3.401644e-07,1.062043e-06,1.262408e-06,1.017933e-06,6.928404e-07
P36683,1.3e-05,1.938325e-05,2.295116e-05,5e-05,3.8e-05,2.6e-05,2e-05,3.3e-05,2.9e-05,4.2e-05,...,1.147968e-07,2.376355e-07,2.340789e-06,7.78926e-07,3.552836e-07,2.618844e-06,9.90353e-07,6.040891e-07,1.597785e-06,4.830152e-07
P15254,4e-06,9.586533e-07,3.059195e-06,3e-06,4e-06,4e-06,4e-06,5e-06,4e-06,4e-06,...,1.35632e-07,9.250583e-08,8.563102e-07,3.605895e-07,5.197757e-08,2.434886e-07,4.083478e-08,2.736588e-07,1.099418e-07,2.524476e-07
P09831,5e-06,7.052699e-07,8.527555e-07,3e-06,3e-06,4e-06,4e-06,3e-06,4e-06,4e-06,...,1.255828e-07,2.280289e-08,1.186897e-06,8.708223e-07,3.649823e-07,8.985256e-07,6.986262e-07,7.085368e-07,3.223333e-07,5.817333e-07


In [13]:
# save the whole thing
data.to_csv(f"{INTERMEDIATE}/proteomics_concentrations.csv")
