In [1]:
import os
from dotenv import load_dotenv, find_dotenv
from os.path import join, dirname, basename, exists, isdir

### Load environment variables from the project root directory ###
# Locate the .env file by walking up the directory tree until one is found.
# find_dotenv() returns an empty string when no .env file exists; in that case
# PROJ below is '' and every derived path is relative to the working directory.
dotenv_path = find_dotenv()

# Load the entries of the .env file as environment variables.
load_dotenv(dotenv_path)

# Check whether a network drive has been specified.
# os.environ.get() returns None (not the string 'None') for an unset variable,
# so treat an unset variable, an empty value and the literal 'None' alike as
# "no network drive configured".
DATABASE = os.environ.get("NETWORK_URL")
if DATABASE not in (None, '', 'None'):
    # TODO: mount the network drive here
    pass

# Set up directory paths
CURRENT_DIR = os.getcwd()
PROJ = dirname(dotenv_path)  # project root directory (the folder holding .env)

DATA = join(PROJ, 'data')  # data directory
RAW_EXTERNAL = join(DATA, 'raw_external')  # external data raw directory
RAW_INTERNAL = join(DATA, 'raw_internal')  # internal data raw directory
INTERMEDIATE = join(DATA, 'intermediate')  # intermediate data directory
FINAL = join(DATA, 'final')  # final data directory

RESULTS = join(PROJ, 'results')  # output directory
FIGURES = join(RESULTS, 'figures')  # figure output directory
PICTURES = join(RESULTS, 'pictures')  # picture output directory


# Optionally create a data-specific sub-folder in each data stage.
# Leave folder_name empty to skip folder creation.
folder_name = ''
if folder_name != '':
    # exist_ok=True makes this safe to re-run and avoids the race between
    # checking for the folder and creating it.
    for stage_dir in (RAW_EXTERNAL, INTERMEDIATE, FINAL):
        os.makedirs(join(stage_dir, folder_name), exist_ok=True)


print('Standard variables loaded, you are good to go!')

Standard variables loaded, you are good to go!


In [8]:
import pandas as pd
import numpy as np
import os
import re


# --- Load the usage tables ---------------------------------------------------
# Every file in FINAL/abs_usages_gecko is read with its "Unnamed: 0" column as
# index and the tables are joined side by side (axis=1) in a single concat.
# The listing is sorted so the column order does not depend on the file system.
usage_dir = f"{FINAL}/abs_usages_gecko"
folder = sorted(os.listdir(usage_dir))

usage_frames = [
    pd.read_csv(f"{usage_dir}/{file}", index_col="Unnamed: 0")
    for file in folder
]
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(usage_frames, axis=1) if usage_frames else pd.DataFrame()


# --- Load the proteomics counts ----------------------------------------------
data_counts = pd.read_csv(f"{RAW_EXTERNAL}/raw_proteomics_all.csv", index_col="UP")
# Skip the first two rows.
# NOTE(review): presumably extra header/annotation rows -- confirm against the file.
# .copy() gives an independent frame so the in-place cleaning below is
# well-defined (no SettingWithCopyWarning / silently lost writes).
data_counts = data_counts.iloc[2:, :].copy()


# --- Multiply usages by counts, column by column -----------------------------
# Columns are matched by name between `df` and `data_counts`.
# `errors` collects the names of columns that could not be combined, either
# because `df` has no column of that name or because the values are not numeric.
# NOTE(review): columns[4:] still includes 'Confidence.score' -- confirm the offset.
errors = []
products = {}
for condition in data_counts.columns[4:]:
    # Blank cells (a single space) count as zero. A single .loc call with a
    # boolean mask replaces the chained assignment, which is not guaranteed to
    # write back into data_counts.
    data_counts.loc[data_counts[condition] == " ", condition] = 0

    if condition not in df.columns:
        errors.append(condition)
        continue

    try:
        products[condition] = (
            pd.to_numeric(df[condition]) * pd.to_numeric(data_counts[condition])
        )
    except (ValueError, TypeError):
        # Non-numeric content (ValueError) or an ambiguous, duplicated column
        # name in `df` (TypeError). Anything else is a real bug and propagates.
        errors.append(condition)

# Keep only rows with a value in every combined column, rounded to whole numbers.
counts_usages = pd.DataFrame(products).dropna().round()

In [9]:
# Show the column names collected in `errors` while building counts_usages,
# i.e. the columns of data_counts that could not be combined with the usages.
errors

['Confidence.score',
 'LB',
 'LB.1',
 'LB.2',
 'glycerol + AA',
 'glycerol + AA.1',
 'glycerol + AA.2',
 'fumarate',
 'fumarate.1',
 'fumarate.2',
 'glucosamine',
 'glucosamine.1',
 'glucosamine.2',
 'glycerol',
 'glycerol.1',
 'glycerol.2',
 'chemostat µ=0.5',
 'chemostat µ=0.5.1',
 'chemostat µ=0.5.2',
 'chemostat µ=0.35',
 'chemostat µ=0.35.1',
 'chemostat µ=0.35.2',
 'chemostat µ=0.20',
 'chemostat µ=0.20.1',
 'chemostat µ=0.20.2',
 'chemostat µ=0.12',
 'chemostat µ=0.12.1',
 'chemostat µ=0.12.2',
 'stationary 1 day',
 'stationary 1 day.1',
 'stationary 1 day.2',
 'stationary 3 days',
 'stationary 3 days.1',
 'stationary 3 days.2',
 'glucose.3',
 'glucose.4',
 'glucose.5',
 '50 mM NaCl',
 '50 mM NaCl.1',
 '50 mM NaCl.2',
 '42°C',
 '42°C.1',
 '42°C.2',
 'pH 6',
 'pH 6.1',
 'pH 6.2',
 'xylose',
 'xylose.1',
 'xylose.2',
 'mannose',
 'mannose.1',
 'mannose.2',
 'galactose',
 'galactose.1',
 'galactose.2',
 'succinate',
 'succinate.1',
 'succinate.2',
 'fructose',
 'fructose.1',
 'fruc

In [7]:
# Preview the first rows of the table loaded from raw_proteomics_all.csv.
data_counts.head()

Unnamed: 0_level_0,Accession,Frist_Gene,Description,Peptides.used.for.quantitation,Confidence.score,LB,LB.1,LB.2,glucose,glucose.1,...,mannose.2,galactose,galactose.1,galactose.2,succinate,succinate.1,succinate.2,fructose,fructose.1,fructose.2
UP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P0ACF0,sp|P0ACF0|DBHA_ECOLI,hupA,DNA-binding protein HU-alpha OS=Escherichia co...,5.0,342.65,283738.081206682,235127.603013499,278632.630250941,143835.71420576,128990.787701723,...,123301.370229059,91754.0099919115,104087.335390467,91216.7172453987,122431.936747179,121273.467958263,112148.173231466,125311.632564919,116903.081417265,118655.586936874
P0A853,sp|P0A853|TNAA_ECOLI,tnaA,Tryptophanase OS=Escherichia coli (strain K12)...,31.0,1872.16,153240.411082344,37589.708558399,44378.6128015376,610.220399493163,616.556129857228,...,1721.73823989213,1391.63135865292,1264.0615397766,1434.08367376889,1601.86520198587,1815.38805570844,1725.51848420745,738.416851763985,699.660066170104,651.494347600294
P0A910,sp|P0A910|OMPA_ECOLI,ompA,Outer membrane protein A OS=Escherichia coli (...,29.0,2046.35,146530.058925354,152491.132663327,145394.412658922,110799.422532295,119679.886373329,...,75848.3150470031,65969.0358350591,73019.9918729732,80202.8037896066,103760.705176563,82902.0652642184,89206.4088465786,90636.2748067698,91351.6019364097,96585.2450542378
P02359,sp|P02359|RS7_ECOLI,rpsG,30S ribosomal protein S7 OS=Escherichia coli (...,17.0,1287.47,136334.014120813,142022.329518216,132224.843664576,54021.6761924568,57520.4000908413,...,35065.4535108461,20266.0312994271,18446.7888592354,19366.5481715565,34068.6127489098,39256.3960971002,27320.6033535062,39133.5947749083,46643.2403608802,44051.5175210565
P0A7K2,sp|P0A7K2|RL7_ECOLI,rplL,50S ribosomal protein L7/L12 OS=Escherichia co...,8.0,542.05,112716.845385665,145618.842924432,145688.592087552,65231.5520465074,57713.5527380154,...,36841.2723536272,27375.4061514013,22721.2220347675,23952.2377541095,36653.7606084462,35374.7738456648,30568.5610639047,52381.0509278605,48723.5470146048,46758.9932676408


In [121]:
# Translate the index of counts_usages to gene names.
# dictydict maps each index label of data_counts to its 'Frist_Gene' entry
# (column name spelled as in the source file).
dictydict = data_counts['Frist_Gene'].to_dict()
# .get(label, label) keeps a label unchanged when it has no mapping. This
# avoids a KeyError for unmapped IDs and makes the cell safe to re-run after
# the index has already been translated.
counts_usages.index = [dictydict.get(label, label) for label in counts_usages.index]

In [125]:
# Save the counts.
# Create the output folder first: to_csv does not create missing directories
# and would otherwise fail on a fresh checkout.
go_analysis_dir = f"{FINAL}/usages_go_analysis"
os.makedirs(go_analysis_dir, exist_ok=True)
counts_usages.to_csv(f"{go_analysis_dir}/usages_combined_gecko.csv")

# Okay, what's next?
- change names from UP to gene name
- export data
- do analysis in R 
- plots, all of the plots!!!!


