In [1]:
import os
from dotenv import load_dotenv, find_dotenv
from os.path import join, dirname, basename, exists, isdir

### Load environmental variables from the project root directory ###
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)

# now you can get the variables using their names

# Check whether a network drive has been specified
DATABASE = os.environ.get("NETWORK_URL")
if DATABASE == 'None':
    pass
else:
    pass
    #mount network drive here

# set up directory pathsa
CURRENT_DIR = os.getcwd()
PROJ = dirname(dotenv_path) # project root directory

DATA = join(PROJ, 'data') #data directory
RAW_EXTERNAL = join(DATA, 'raw_external') # external data raw directory
RAW_INTERNAL = join(DATA, 'raw_internal') # internal data raw directory
INTERMEDIATE = join(DATA, 'intermediate') # intermediate data directory
FINAL = join(DATA, 'final') # final data directory

RESULTS = join(PROJ, 'results') # output directory
FIGURES = join(RESULTS, 'figures') # figure output directory
PICTURES = join(RESULTS, 'pictures') # picture output directory


# make folders specific for certain data
folder_name = ''
if folder_name != '':
    #make folders if they don't exist
    if not exists(join(RAW_EXTERNAL, folder_name)):
        os.makedirs(join(RAW_EXTERNAL, folder_name))

    if not exists(join(INTERMEDIATE, folder_name)):
        os.makedirs(join(INTERMEDIATE, folder_name))

    if not exists(join(FINAL, folder_name)):
        os.makedirs(join(FINAL, folder_name))


print('Standard variables loaded, you are good to go!')

Standard variables loaded, you are good to go!


In [2]:
import pandas as pd
import re
# import data 
data = pd.read_csv(f"{INTERMEDIATE}/proteomics/acetate_usages.csv", index_col=0)

data.head()

# get cell volumes
cell_volumes = pd.read_csv(f"{RAW_INTERNAL}/proteomics/growth_conditions.csv", index_col=0)
cell_volumes = cell_volumes["Single cell volume [fl]1"]
# remove the first two rows of LB
cell_volumes = cell_volumes.loc[~cell_volumes.index.duplicated(keep='first')]
# rename the number 3 in there
cell_volumes = cell_volumes.rename({'Osmotic-stress glucose3':'Osmotic-stress glucose_uncertainty'}, axis='index')
rename_dict = {i:re.sub(r'\W+', '', i).lower() for i in cell_volumes.index}
cell_volumes = cell_volumes.rename(rename_dict, axis='index')


In [3]:
data.head()

Unnamed: 0,ac_1,ac_2,ac_3
O32583,0.0,0.0,0.0
P00350,0.0,0.0,0.0
P00363,0.0,0.0,0.0
P00370,0.0,0.0,0.0
P00393,0.0,0.0,0.0


In [4]:


# Finally, convert to mmol/gDW:
water_content = 0.3
cell_density = 1.105e-12



# Iterate through the dataset and multiply by the corresponding cell volume, to get mmol/fL:
for (col_name, d) in data.iteritems():
    chemo_name = "acetate"
    try:
        data[col_name] = data[col_name] * cell_volumes.loc[chemo_name]
    except:
        print(chemo_name)
        
        
data = data * cell_density * water_content
        

# convert into counts
data = data * 6.022e+23 / 1000


In [5]:
data_for_UP2genes = pd.read_csv(f"{RAW_EXTERNAL}/raw_proteomics_all.csv")
data_for_UP2genes = data_for_UP2genes.drop([0])
UP2gene = dict(zip(data_for_UP2genes["UP"], data_for_UP2genes["Frist_Gene"]))

In [6]:
# translate index uniprot names to gene names and remove enzymes not included in the data 
data = data.loc[list(set(data.index).intersection(UP2gene.keys()))]
data.index = [UP2gene[i] for i in list(set(data.index).intersection(UP2gene.keys()))]
#data["pyr_1_original"] = 

In [7]:
data_for_UP2genes.index = data_for_UP2genes['Frist_Gene']

data_for_UP2genes = data_for_UP2genes.loc[~data_for_UP2genes.index.duplicated(keep='first')]
data = data.loc[~data.index.duplicated(keep='first')]
data['ac_original'] = data_for_UP2genes.loc[data.index, ['acetate']]
data['ac_original_1'] = data_for_UP2genes.loc[data.index, ['acetate.1']]
data['ac_original_2'] = data_for_UP2genes.loc[data.index, ['acetate.2']]


data

Unnamed: 0,ac_1,ac_2,ac_3,ac_original,ac_original_1,ac_original_2
chbB,0.000000,0.000000,0.000000,778.52347098222,722.842713843346,643.763656524197
yghU,0.000000,0.000000,0.000000,995.133432621134,1040.64573696801,927.386633178478
glpA,0.000000,0.000000,0.000000,15.3096198203812,12.5791009044582,16.8251266013508
rffG,0.000000,0.000000,0.000000,17.116024889242,13.1699483088472,14.6052208502821
acnB,38842.730483,38842.730483,38842.730483,22426.9657144047,25951.6240916731,22843.9674488163
...,...,...,...,...,...,...
lpxD,0.000000,0.000000,0.000000,36.3124976737935,48.5688880622485,48.0793517713787
ggt,0.000000,0.000000,0.000000,182.562525556641,135.380378182929,152.896653173984
aceE,0.000000,0.000000,0.000000,2292.42314647829,2736.26880518675,2445.225832105
yqaB,0.000000,0.000000,0.000000,52.5929585078648,64.8631209087853,31.5812142132445


In [8]:
data.to_csv(f"{INTERMEDIATE}/proteomics/acetate_usages_counts.csv")
