In [2]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn import linear_model, datasets
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [4]:

# proteomics data 
proteomics_data_raw = pd.read_csv("../data/proteomics/protein_values.csv")

# getting only the values for the second dataset
proteomics_data_raw_raw = pd.read_csv("../data/proteomics/protein_values.csv")
proteomics_data_raw = proteomics_data_raw[proteomics_data_raw["Dataset"] == 2]

# cell size data 
cell_size_data_raw = pd.read_csv("../data/proteomics/cell_volumes.csv")
cell_size_data = cell_size_data_raw[["Growth condition" ,"Total cell volume calculated (fl)1"]]

# cell growth data 
cell_growth_data_raw = pd.read_csv("../data/proteomics/growth_conditions.csv")

# molecular weight (g/mmol)
mw = proteomics_data_raw["Molecular weight (Da)"] / 1000

# rename proteomics data 
proteomics_data_raw.columns = [re.sub(r'\W+', '', "_".join(i.split()).lower()) for i in proteomics_data_raw.columns]


# get additional information stored 
proteomics_data_additional_info = proteomics_data_raw.iloc[:, 0:5]
negative_slice = list(map(lambda x: -1*x,list(range(1,4))))
proteomics_data_additional_info = pd.concat([proteomics_data_additional_info, proteomics_data_raw.iloc[:, negative_slice]], axis=1)

# get main information out of df
start = proteomics_data_raw.columns.get_loc("glucose")
end = proteomics_data_raw.columns.get_loc("fructose")
proteomics_data = proteomics_data_raw.iloc[:, start:(end+1)]
error_data = proteomics_data_raw.iloc[:, (start+22):(end+23)]



FileNotFoundError: [Errno 2] File b'../data/proteomics/protein_values.csv' does not exist: b'../data/proteomics/protein_values.csv'

In [6]:
proteomics_data_raw_raw['Dataset'].unique()

array([2, 1])

In [7]:
# replace names in errors and convert to n_molecules/cell
replace_names = dict(zip(error_data.columns, [re.sub(r'.1$', '', i) for i in error_data.columns]))
error_data.rename(columns=replace_names, inplace=True)
error_data = proteomics_data / 100 * error_data
replace_names = dict(zip(error_data.columns, [re.sub(r'$', '_uncertainty', i) for i in error_data.columns]))
error_data.rename(columns=replace_names, inplace=True)

In [8]:
# protein counts to moles
proteomics_data = proteomics_data / 6.022e+23 * 1000
water_content = 0.3
cell_density = 1.105e-12
proteomics_data = proteomics_data / cell_density / water_content
proteomics_data_additional_info

Unnamed: 0,uniprot_accession,description,gene,peptidesusedforquantitation,confidencescore,unnamed_78,annotated_functional_cog_class,annotated_functional_cog_group_description
0,P0A8T7,DNA-directed RNA polymerase subunit beta' OS=E...,rpoC,91,6045.53,,INFORMATION STORAGE AND PROCESSING,Transcription
1,P0A8V2,DNA-directed RNA polymerase subunit beta OS=Es...,rpoB,89,5061.29,,INFORMATION STORAGE AND PROCESSING,Transcription
2,P36683,Aconitate hydratase 2 OS=Escherichia coli (str...,acnB,67,4505.67,,METABOLISM,Energy production and conversion
3,P15254,Phosphoribosylformylglycinamidine synthase OS=...,purL,65,4277.71,,METABOLISM,Nucleotide transport and metabolism
4,P09831,Glutamate synthase [NADPH] large chain OS=Esch...,gltB,64,4111.74,,METABOLISM,Amino acid transport and metabolism
...,...,...,...,...,...,...,...,...
2352,P0A890,Sulfurtransferase tusA OS=Escherichia coli (st...,tusA,1,125.85,,,
2353,P45531,Protein tusC OS=Escherichia coli (strain K12) ...,tusC,1,46.56,,,
2355,P0AC78,Undecaprenyl-phosphate alpha-N-acetylglucosami...,wecA,1,37.97,,,
2356,P76164,Uncharacterized protein ydfW in Qin prophage r...,ydfW,1,39.51,,,


In [27]:
# putting all the necessary data together for later export 
proteomics_data_combined = pd.concat([proteomics_data_raw["dataset"], proteomics_data_additional_info['uniprot_accession'],proteomics_data, error_data], axis=1)

# putting proteomics data together with cog for later plotting 

proteomics_data_to_plot = pd.concat([proteomics_data, proteomics_data_raw["annotated_functional_cog_group_description"]],axis=1)

# renaming for easier access 
#proteomics_data_combined.columns = [re.sub(r'\W+', '', "_".join(i.split()).lower()) for i in proteomics_data_combined.columns]
#proteomics_data.columns = [re.sub(r'\W+', '', "_".join(i.split()).lower()) for i in proteomics_data.columns]



In [28]:
# cell growth data names to the same convention as the headers on the rest of the data as well as only from the common strain
cell_growth_data = cell_growth_data_raw[cell_growth_data_raw["Strain"] == "BW25113"]

new_rows = [re.sub(r'\W+', '', "_".join(i.split()).lower()).replace("glucose3", "glucose") for i in cell_growth_data["Growth condition"]]
cell_growth_data["Growth condition"] = new_rows



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [29]:
# getting all data necessary for future work into one place
data = cell_size_data['Total cell volume calculated (fl)1'] * water_content * cell_density * 1e12
names = cell_size_data['Growth condition']


In [30]:
proteomics_data_raw[proteomics_data_raw["uniprot_accession"] == "P63284"]

Unnamed: 0,uniprot_accession,description,gene,peptidesusedforquantitation,confidencescore,molecular_weight_da,dataset,glucose,lb,glycerol__aa,...,mannose2,galactose_2,succinate2,fructose2,gene1,bnumber,annotated_functional_cog_groups_letter,annotated_functional_cog_group_description,annotated_functional_cog_class,unnamed_78
999,P63284,Chaperone protein ClpB OS=Escherichia coli (st...,clpB,6,3517.57,95507.92428,2,119,423,156.0,...,9.69,11.13,10.64,1.75,clpB,b2592,O,"Posttranslational modification, protein turnov...",CELLULAR PROCESSES AND SIGNALING,
2123,P63284,Isoform ClpB-3 of Chaperone protein ClpB OS=Es...,clpB,1,3038.05,95507.92428,2,0,1,0.0,...,9.69,11.13,10.64,1.75,clpB,b2592,O,"Posttranslational modification, protein turnov...",CELLULAR PROCESSES AND SIGNALING,


In [31]:
# saving proteomics data as well as growth data 
proteomics_data_combined.to_csv("../data/proteomics/proteomics_ecoli_combined.csv")
cell_growth_data.to_csv("../data/proteomics/proteomics_ecoli_growth_data_corrected_names.csv")

# generate tsvs for Proteomap
# first translate the uniprot identifiers to eco
translation_dict = pd.read_csv("../data/uniprot_to_eco.tsv", sep="\t")
translation_dict["uniprot eco"] = translation_dict["uniprot eco"].str.replace("eco:", "")
translation_dict.index = translation_dict.index.str.replace("up:", "")
translation_dict = translation_dict.to_dict()

# 
proteomics_data_raw = proteomics_data_raw[proteomics_data_raw["uniprot_accession"] != "P63284"]
proteomaps_data_raw = proteomics_data_raw
proteomaps_data_raw["uniprot_accession"] = [translation_dict[i] if i in translation_dict else i for i in proteomaps_data_raw["uniprot_accession"]]
for i in proteomaps_data_raw.columns[proteomaps_data_raw.columns.get_loc("glucose"):proteomaps_data_raw.columns.get_loc("fructose")]:
    proteomaps_data = proteomaps_data_raw[['uniprot_accession', i]][proteomaps_data_raw["dataset"] == 2]
    proteomaps_data.to_csv("../data/proteomics/proteomaps_{}.csv".format(i), sep='\t', index=False)
    


In [32]:
translation_dict = pd.read_csv("../data/uniprot_to_eco.tsv", sep="\t")
translation_dict["uniprot eco"] = translation_dict["uniprot eco"].str.replace("eco:", "")
translation_dict.index = translation_dict.index.str.replace("up:", "")
translation_dict = translation_dict.to_dict()

In [33]:
# data handling for plotting the protein content vs growth rate by category and total
proteomics_sums = proteomics_data.sum(axis=0).to_frame()
proteomics_sums_by_cog = pd.concat([proteomics_data, proteomics_data_raw["annotated_functional_cog_group_description"]])


cell_growth_data = cell_growth_data[["Growth condition", "Growth rate (h-1)"]]
proteomics_vs_growth = proteomics_sums.join(cell_growth_data.set_index("Growth condition"))
proteomics_vs_growth.columns = ["protein_sum","growth_rate"]
proteomics_data_to_plot

Unnamed: 0,glucose,lb,glycerol__aa,acetate,fumarate,glucosamine,glycerol,pyruvate,chemostat_µ05,chemostat_µ035,...,stationary_phase_3_days,osmoticstress_glucose,42c_glucose,ph6_glucose,xylose,mannose,galactose,succinate,fructose,annotated_functional_cog_group_description
0,1.392080e-05,3.588652e-05,2.255681e-05,1.092024e-05,1.224269e-05,1.661079e-05,1.425141e-05,1.367535e-05,2.394438e-05,1.953621e-05,...,9.552706e-06,1.126087e-05,1.996200e-05,1.672600e-05,1.819372e-05,1.703658e-05,1.131096e-05,1.384566e-05,2.262694e-05,Transcription
1,1.982174e-05,4.452252e-05,2.604327e-05,1.332971e-05,1.602470e-05,2.003213e-05,1.877981e-05,1.860448e-05,2.627370e-05,2.198074e-05,...,1.115568e-05,1.630021e-05,2.466071e-05,2.019744e-05,2.222620e-05,2.075347e-05,1.412618e-05,1.931079e-05,2.639392e-05,Transcription
2,3.805053e-05,8.315413e-05,8.790293e-05,1.144321e-04,9.763597e-05,6.889269e-05,5.406020e-05,8.212722e-05,7.881108e-05,1.014931e-04,...,3.787019e-06,1.410114e-05,2.425496e-05,2.328816e-05,3.354718e-05,8.273836e-05,6.449454e-05,9.791649e-05,4.399154e-05,Energy production and conversion
3,1.230280e-05,4.112623e-06,1.171672e-05,7.203351e-06,1.037423e-05,9.813189e-06,1.035920e-05,1.356514e-05,1.144622e-05,8.666063e-06,...,3.681824e-06,6.602237e-06,9.442502e-06,8.555858e-06,8.620979e-06,8.836378e-06,7.864577e-06,9.582762e-06,1.205234e-05,Nucleotide transport and metabolism
4,1.432154e-05,3.025608e-06,3.266054e-06,6.827655e-06,6.416894e-06,9.903356e-06,1.104046e-05,8.470700e-06,1.162655e-05,9.813189e-06,...,9.166991e-07,4.177743e-06,1.492266e-05,1.364028e-05,1.250317e-05,1.041430e-05,7.002980e-06,7.779419e-06,1.932582e-05,Amino acid transport and metabolism
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2352,1.372544e-06,1.848426e-06,1.217256e-06,1.462711e-06,1.572915e-06,1.542860e-06,1.282377e-06,1.532841e-06,1.432655e-06,1.337479e-06,...,8.215227e-07,1.377553e-06,1.758259e-06,1.572915e-06,1.838407e-06,1.808352e-06,1.537850e-06,2.133955e-06,1.848426e-06,
2353,1.051950e-07,1.152135e-07,1.452693e-07,1.001857e-07,1.102043e-07,1.402600e-07,8.014856e-08,1.552878e-07,9.517641e-08,7.513927e-08,...,1.051950e-07,1.853435e-07,2.905385e-07,2.204085e-07,1.853435e-07,1.001857e-07,1.051950e-07,9.517641e-08,1.152135e-07,
2355,5.009285e-09,0.000000e+00,0.000000e+00,2.003714e-08,1.502785e-08,1.001857e-08,2.003714e-08,3.005571e-08,4.508356e-08,2.003714e-08,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,
2356,8.565877e-07,1.001857e-08,2.504642e-08,2.654921e-07,1.152135e-07,2.053807e-07,2.454550e-07,2.805199e-07,2.254178e-07,1.102043e-07,...,6.512070e-08,2.654921e-07,4.758820e-07,3.656778e-07,4.207799e-07,3.155849e-07,1.202228e-07,9.517641e-08,4.458263e-07,


In [34]:
# aggregate sum of protein mass per cog
proteomics_data_to_plot_aggregated = proteomics_data_to_plot.groupby(["annotated_functional_cog_group_description"]).sum()

# sum of all protein mass per sample in percent
cog_in_percent = proteomics_data_to_plot_aggregated.div(proteomics_data_to_plot.sum()) * 100


amino_vs_growth = cog_in_percent.loc["Amino acid transport and metabolism"].to_frame().join(cell_growth_data.set_index("Growth condition"))
energy_vs_growth = cog_in_percent.loc["Energy production and conversion"].to_frame().join(cell_growth_data.set_index("Growth condition"))
translation_vs_growth = cog_in_percent.loc["Translation, ribosomal structure and biogenesis"].to_frame().join(cell_growth_data.set_index("Growth condition"))



In [35]:
# three paper subplots recreated 
plots = [amino_vs_growth, energy_vs_growth, translation_vs_growth]

# fitting robust linear models to the calculated percentages
# dotted line standard linear model, non-dotted ransac linear model
fig = plt.figure(figsize=[20,10])


fig = make_subplots(rows=1, cols=3)






for i in range(0, 3):
    x = amino_vs_growth["Growth rate (h-1)"]
    y = plots[i].iloc[:, 0]
    fig.add_trace(go.Scatter(x=x, y=y, mode='markers',text=amino_vs_growth.index),row=1, col=i+1)

    ransac = linear_model.RANSACRegressor()
    ransac.fit(x.to_numpy().reshape(-1, 1), y)
    line_X = np.arange(x.min(), x.max())[:, np.newaxis]
    line_y_ransac = ransac.predict(line_X)
    fig.add_trace(
    go.Scatter(x=line_X, y=line_y_ransac, mode='lines+markers'),row=1, col=i+1)

fig.update_layout(height=600, width=1200, title_text="Subplots")
fig.show()


<Figure size 1440x720 with 0 Axes>

In [36]:
cog_in_percent_with_means = cog_in_percent
cog_in_percent_with_means["means"] = cog_in_percent.mean(axis=1)
cog_in_percent_with_means = cog_in_percent_with_means.sort_values(by=['means'], ascending=False)
means = cog_in_percent_with_means["means"]
#cog_in_percent_sorted = cog_in_percent_with_means.drop(["means"])


# readying for plotting
x = list(range(1, cog_in_percent_with_means.shape[0]))
y = [tuple(list(cog_in_percent_with_means.iloc[i])) for i in range(0,cog_in_percent_with_means.shape[0])]


fig = go.Figure()
for xe, ye in zip(x, y):
    #plt.scatter([xe] * len(ye), ye, color="blue")
    fig.add_trace(go.Scatter(
    x=[xe] * len(ye), y=ye,
    mode='markers',
    marker_color='rgba(152, 0, 0, .8)'
    ))

fig.add_trace(go.Scatter(x=x, y=means, mode='markers',
    marker_color='rgba(255, 255, 255, 1)'))

fig.update_layout(
    xaxis = dict(
        tickmode = 'array',
        tickvals = x,
        ticktext = cog_in_percent_with_means.index
    )
)


fig.show()



In [18]:
# proteomics data basic analysis
from sklearn.decomposition import PCA
X = proteomics_data.transpose()
pca = PCA(n_components=2)
pca.fit(X)
PCA(n_components=2)
print(pca.explained_variance_ratio_)

principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(data = principalComponents, columns = ['principal component 1', 'principal component 2'], index=X.index)

principal

[0.60833932 0.20432166]


In [25]:
pca_plot = go.Figure()
pca_plot.add_trace(go.Scatter(x=principalDf['principal component 1'], y=principalDf['principal component 2'], mode='markers+text', text=principalDf.index, marker=dict(color=[f'rgb({np.random.randint(0,256)}, {np.random.randint(0,256)},  {np.random.randint(0,256)})' \
 for _ in range(principalDf.shape[0])],size=10)))

pca_plot.update_layout(showlegend=True)



reactions left: 100.0%
spontaneous reactions removed
reactions left: 83.55457227138643%
0.4298431538787622


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [32]:
len(holes)

614