In [22]:
import os
from dotenv import load_dotenv, find_dotenv
from os.path import join, dirname, basename, exists, isdir

### Load environmental variables from the project root directory ###
# find_dotenv() walks up the directory tree until it finds a .env file;
# load_dotenv() then exposes that file's entries through os.environ.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# Optional network drive, taken from the NETWORK_URL entry.
DATABASE = os.environ.get("NETWORK_URL")
if DATABASE != 'None':
    pass
    #mount network drive here

# Directory paths, all anchored at the folder that contains the .env file.
CURRENT_DIR = os.getcwd()
PROJ = dirname(dotenv_path)  # project root directory

# data directory and its raw / intermediate / final sub-directories
DATA = join(PROJ, 'data')
RAW_EXTERNAL, RAW_INTERNAL, INTERMEDIATE, FINAL = (
    join(DATA, sub_dir)
    for sub_dir in ('raw_external', 'raw_internal', 'intermediate', 'final')
)

# output directory with separate folders for figures and pictures
RESULTS = join(PROJ, 'results')
FIGURES, PICTURES = join(RESULTS, 'figures'), join(RESULTS, 'pictures')


# Optionally create a dataset-specific folder inside the external-raw,
# intermediate and final data directories (skipped while the name is empty).
folder_name = ''
if folder_name:
    for parent_dir in (RAW_EXTERNAL, INTERMEDIATE, FINAL):
        target_dir = join(parent_dir, folder_name)
        if not exists(target_dir):
            os.makedirs(target_dir)


print('Standard variables loaded, you are good to go!')

Standard variables loaded, you are good to go!


In [23]:
import pandas as pd
import re
# Load the acetate usage table; the first CSV column becomes the index.
data = pd.read_csv(f"{INTERMEDIATE}/proteomics/acetate_usages.csv", index_col=0)

# Read the growth-condition table and keep only the single-cell volume column.
growth_conditions = pd.read_csv(f"{RAW_INTERNAL}/proteomics/growth_conditions.csv", index_col=0)
cell_volumes = growth_conditions["Single cell volume [fl]1"]

# Some condition labels occur more than once; keep the first row of each.
repeated_label = cell_volumes.index.duplicated(keep='first')
cell_volumes = cell_volumes.loc[~repeated_label]

# Replace the trailing "3" of this label with an explicit "_uncertainty" suffix.
cell_volumes = cell_volumes.rename(
    index={'Osmotic-stress glucose3': 'Osmotic-stress glucose_uncertainty'}
)

# Normalise every label: strip all non-word characters and lower-case the rest.
rename_dict = dict(
    (label, re.sub(r'\W+', '', label).lower()) for label in cell_volumes.index
)
cell_volumes = cell_volumes.rename(index=rename_dict)


In [24]:
# Display the cleaned series: de-duplicated, with normalised condition labels.
cell_volumes

Growth condition
lb                                  4.29
glycerolaa                          3.83
acetate                             2.30
fumarate                            2.54
galactose                           2.21
glucose                             2.84
glucosamine                         2.62
glycerol                            2.64
pyruvate                            2.50
succinate                           2.58
fructose                            2.96
mannose                             2.64
xylose                              2.79
osmoticstressglucose_uncertainty    2.79
42cglucose                          2.98
ph6glucose                          2.93
stationaryphase1day                 1.60
stationaryphase3days                1.60
chemostatµ012                       1.90
chemostatµ020                       2.08
chemostatµ035                       2.40
chemostatµ05                        2.69
Name: Single cell volume [fl]1, dtype: float64

In [25]:


# Convert the usage table into molecule counts in three steps:
#   1. scale by the single-cell volume of the growth condition,
#   2. scale by cell density and water content,
#   3. multiply by Avogadro's number and divide by 1000.
# NOTE(review): the original comments describe the intermediate units as
# "mmol/fL" and "mmol/gDW"; the units of `water_content` and `cell_density`
# are not stated anywhere in this notebook -- confirm before reuse.
water_content = 0.3
cell_density = 1.105e-12
AVOGADRO = 6.022e+23  # Avogadro's number

# Every column belongs to the same growth condition, so the volume is looked
# up once and applied to the whole frame.  A missing condition raises KeyError
# here on purpose: the previous bare `except` only printed the name and let
# unscaled values flow into the exported counts.
chemo_name = "acetate"
condition_volume = cell_volumes.loc[chemo_name]
data = data * condition_volume

data = data * cell_density * water_content

# convert into counts (the division by 1000 presumably turns mmol into mol
# -- TODO confirm)
data = data * AVOGADRO / 1000


In [26]:
# Raw proteomics table; the row labelled 0 is discarded before building the
# lookup from the "UP" column to the "Frist_Gene" column.
data_for_UP2genes = pd.read_csv(f"{RAW_EXTERNAL}/raw_proteomics_all.csv").drop(index=[0])
UP2gene = {
    up_id: gene_name
    for up_id, gene_name in zip(data_for_UP2genes["UP"], data_for_UP2genes["Frist_Gene"])
}

In [27]:
# translate index uniprot names to gene names and remove enzymes not included in the data 
data = data.loc[list(set(data.index).intersection(UP2gene.keys()))]
data.index = [UP2gene[i] for i in list(set(data.index).intersection(UP2gene.keys()))]
#data["pyr_1_original"] = 

In [28]:
# Index the raw proteomics table by gene name so it can be looked up with the
# (gene-named) index of `data`.
data_for_UP2genes.index = data_for_UP2genes['Frist_Gene']

# Both tables may contain repeated gene names; keep the first row of each.
raw_is_repeat = data_for_UP2genes.index.duplicated(keep='first')
data_for_UP2genes = data_for_UP2genes.loc[~raw_is_repeat]
data_is_repeat = data.index.duplicated(keep='first')
data = data.loc[~data_is_repeat]

# Copy the three raw acetate columns next to the computed values.
for new_column, raw_column in (
    ('ac_original', 'acetate'),
    ('ac_original_1', 'acetate.1'),
    ('ac_original_2', 'acetate.2'),
):
    data[new_column] = data_for_UP2genes.loc[data.index, [raw_column]]


data

Unnamed: 0,pyr_1,pyr_2,pyr_3,pyr_original,pyr_original_1,pyr_original_2
fruB,0.000000,0.000000,0.000000,135.640649876311,134.115358651721,190.698118029366
yidA,0.000000,0.000000,0.000000,169.874860452754,163.030636516562,154.619350286062
wzzB,0.000000,0.000000,0.000000,1107.70800185475,1123.9709840312,1089.71224648648
pdxK,0.000000,0.000000,0.000000,120.739951251735,151.701598573648,119.40983162795
acnA,339091.828617,324000.315521,179672.666517,789.610101001916,765.036028540011,785.354857015051
...,...,...,...,...,...,...
gutQ,5.419646,5.415451,3.900609,67.9806661287923,60.7404801316662,59.1434971256975
agp,0.000000,0.000000,0.000000,908.761562746676,836.462307502063,1237.89732953236
curA,0.000000,0.000000,0.000000,480.238188848693,501.380206619107,404.650152086062
yeiR,0.000000,0.000000,0.000000,20.7105430910652,24.6931893158493,23.9921762304017


In [30]:
# Write the acetate counts next to their input file.  The target used to be
# "pyruvate_usages_counts.csv", which does not match the acetate data handled
# in this notebook and would overwrite the pyruvate results.
data.to_csv(f"{INTERMEDIATE}/proteomics/acetate_usages_counts.csv")
