In [69]:
import os
from dotenv import load_dotenv, find_dotenv
from os.path import join, dirname, basename, exists, isdir

### Load environmental variables from the project root directory ###
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)

# now you can get the variables using their names

# Check whether a network drive has been specified
DATABASE = os.environ.get("NETWORK_URL")
if DATABASE == 'None':
    pass
else:
    pass
    #mount network drive here

# set up directory paths
CURRENT_DIR = os.getcwd()
PROJ = dirname(dotenv_path) # project root directory

DATA = join(PROJ, 'data') #data directory
RAW_EXTERNAL = join(DATA, 'raw_external') # external data raw directory
RAW_INTERNAL = join(DATA, 'raw_internal') # internal data raw directory
INTERMEDIATE = join(DATA, 'intermediate') # intermediate data directory
FINAL = join(DATA, 'final') # final data directory

RESULTS = join(PROJ, 'results') # output directory
FIGURES = join(RESULTS, 'figures') # figure output directory
PICTURES = join(RESULTS, 'pictures') # picture output directory


# make folders specific for certain data
folder_name = ''
if folder_name != '':
    #make folders if they don't exist
    if not exists(join(RAW_EXTERNAL, folder_name)):
        os.makedirs(join(RAW_EXTERNAL, folder_name))

    if not exists(join(INTERMEDIATE, folder_name)):
        os.makedirs(join(INTERMEDIATE, folder_name))

    if not exists(join(FINAL, folder_name)):
        os.makedirs(join(FINAL, folder_name))


print('Standard variables loaded, you are good to go!')

Standard variables loaded, you are good to go!


In [70]:
import pandas as pd
import re

# Extract of supplementary table 5:
data = pd.read_csv(f"{RAW_INTERNAL}/proteomics/protein_values.csv", index_col=0)
data_counts = pd.read_csv(f"{RAW_EXTERNAL}/raw_proteomics_all.csv", index_col=0)
weight_data = pd.read_csv(f"{RAW_INTERNAL}/proteomics/protein_values.csv", index_col=0)


data_counts = data_counts.drop(data_counts.index[0])

# remove original index and duplicates
data_counts.index = data_counts['UP']



# # Molecular weights:
MW = data["Molecular weight (Da)"] # Da = g/mol
MW = MW / 1000 # kDa = g/mmol
print(MW)



Uniprot Accession
P0A8T7    155.045008
P0A8V2    150.520276
P36683     93.420946
P15254    141.295898
P09831    163.176315
             ...    
P36667     31.022762
P0AC78     40.912094
P76164      8.702816
P38506     28.130364
Q46810     21.481914
Name: Molecular weight (Da), Length: 2359, dtype: float64


In [71]:
data_counts.head()

Unnamed: 0_level_0,Frist_Gene,Description,Peptides.used.for.quantitation,Confidence.score,UP,LB,LB.1,LB.2,glucose,glucose.1,...,mannose.2,galactose,galactose.1,galactose.2,succinate,succinate.1,succinate.2,fructose,fructose.1,fructose.2
UP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P0CE47,tufA,Elongation factor Tu 1 OS=Escherichia coli (st...,57.0,4565.07,P0CE47,398685.786674634,482028.355944304,443852.263239685,264507.587848866,250623.919700076,...,199386.604396731,145252.048613904,140284.546835391,136227.933319329,198158.172792286,195882.718691441,184072.062628029,245574.634683969,227392.663852649,233115.927926071
P0ACF0,hupA,DNA-binding protein HU-alpha OS=Escherichia co...,5.0,342.65,P0ACF0,283738.081206682,235127.603013499,278632.630250941,143835.71420576,128990.787701723,...,123301.370229059,91754.0099919115,104087.335390467,91216.7172453987,122431.936747179,121273.467958263,112148.173231466,125311.632564919,116903.081417265,118655.586936874
P0A853,tnaA,Tryptophanase OS=Escherichia coli (strain K12)...,31.0,1872.16,P0A853,153240.411082344,37589.708558399,44378.6128015376,610.220399493163,616.556129857228,...,1721.73823989213,1391.63135865292,1264.0615397766,1434.08367376889,1601.86520198587,1815.38805570844,1725.51848420745,738.416851763985,699.660066170104,651.494347600294
P0A910,ompA,Outer membrane protein A OS=Escherichia coli (...,29.0,2046.35,P0A910,146530.058925354,152491.132663327,145394.412658922,110799.422532295,119679.886373329,...,75848.3150470031,65969.0358350591,73019.9918729732,80202.8037896066,103760.705176563,82902.0652642184,89206.4088465786,90636.2748067698,91351.6019364097,96585.2450542378
P02359,rpsG,30S ribosomal protein S7 OS=Escherichia coli (...,17.0,1287.47,P02359,136334.014120813,142022.329518216,132224.843664576,54021.6761924568,57520.4000908413,...,35065.4535108461,20266.0312994271,18446.7888592354,19366.5481715565,34068.6127489098,39256.3960971002,27320.6033535062,39133.5947749083,46643.2403608802,44051.5175210565


In [72]:
# start and end columns
start_column = data_counts.columns.get_loc("LB")
end_column = data_counts.columns.get_loc("fructose.2")

# rename columns from .2 to uncertainty
# rename_dict = {col_name:re.sub(r".2$", "_uncertainty", col_name) for col_name in \
#  data.columns[start_column_uncertainty:end_column_uncertainty+1]}
# data = data.rename(columns=rename_dict)
# list of indices of data needed
list_of_indices = list(range(start_column,end_column+1))
data_counts = data_counts.iloc[:, list_of_indices]

# get cell volumes
cell_volumes = pd.read_csv(f"{RAW_INTERNAL}/proteomics/growth_conditions.csv", index_col=0)
cell_volumes = cell_volumes["Single cell volume [fl]1"]
# remove the first two rows of LB
cell_volumes = cell_volumes.loc[~cell_volumes.index.duplicated(keep='first')]
# rename the number 3 in there
cell_volumes = cell_volumes.rename({'Osmotic-stress glucose3':'Osmotic-stress glucose_uncertainty'}, axis='index')


In [73]:

# get Molecular weights 
weights = data_counts
data_counts['Molecular weights'] = weight_data['Molecular weight (Da)'].drop_duplicates()[weights.index]


In [8]:
# Convert values to mmol/cell:
data_counts = data_counts.apply(lambda x: pd.to_numeric(x, errors='coerce'))
data_counts = data_counts.astype(float) / 6.022e+23 * 1000

# convert names
cell_volumes.index = [re.sub(r'\W+', '', i).lower() for i in cell_volumes.index]
data_counts.columns = [re.sub(r'\W+', '', i).lower() for i in data_counts.columns]


# Iterate through the dataset and divide by the corresponding cell volume, to get mmol/fL:
for (col_name, d) in data_counts.iteritems():
    chemo_name = re.sub(r'2$', '', re.sub(r'1$', '', col_name))
    try:
        data_counts[col_name] = data_counts[col_name] / cell_volumes.loc[chemo_name]#["cell_volume"]
    except:
        print(chemo_name)

# Finally, convert to mmol/gDW:
water_content = 0.3
cell_density = 1.105e-12
data_counts = data_counts / cell_density / water_content



chemostatµ01
chemostatµ01
stationary1day
stationary1day
stationary1day
stationary3days
stationary3days
stationary3days
glucose3
glucose4
glucose5
50mmnacl
50mmnacl
50mmnacl
42c
42c
42c
ph6
ph6
ph6


In [9]:
data_counts.to_csv(f"{INTERMEDIATE}/proteomics/proteomics_concentrations.csv")

In [74]:
for i in range(0,len(weights.columns)-1):
    try:
        weights.iloc[:,i] = pd.to_numeric(weights.iloc[:,i], errors = 'coerce') * weights.iloc[:,-1]
    except:
        print(i)

weights.to_csv(f"{INTERMEDIATE}/proteomics/proteomics_masses.csv")

UP
P0CE47    1.091560e+10
P0ACF0    1.264339e+09
P0A853    3.233116e+07
P0A910    4.472026e+09
P02359    1.076531e+09
              ...     
P05704    1.129396e+04
P76398    4.923147e+02
P21517    7.761316e+04
P77783    3.688062e+02
P00936    3.124257e+05
Length: 2058, dtype: float64