In [6]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn import linear_model, datasets
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import urllib.parse
import urllib.request
import ssl
import json
# Replace the ssl module's default HTTPS context factory with the *unverified*
# one for the rest of this kernel session, so TLS certificates are not checked.
# NOTE(review): presumably a workaround so the urlopen() request to UniProt
# succeeds on this machine; it removes protection against tampered or spoofed
# responses for every HTTPS call, so confirm it is still needed.
ssl._create_default_https_context = ssl._create_unverified_context



# read in proteomics data 

In [None]:
# Load the raw proteomics table from CSV; its "uniprot_accession" column is
# what the UniProt query is built from.
# NOTE(review): absolute, user-specific path, so this cell only runs on the
# author's machine. The name proteomics_data_raw is also re-bound to a different
# file (protein_values.csv) in a later cell.
proteomics_data_raw = pd.read_csv("/Users/jonas/Documents/masters/thesis/thesis/data/raw_internal/proteomics/proteomics_ecoli_.csv")

# query UniProt for accession-to-gene-name translations

In [9]:
# Join every entry of the "uniprot_accession" column into one tab-separated
# string so the whole table can be sent as a single query.
# NOTE(review): str.join raises TypeError on non-string entries (e.g. NaN for a
# missing accession); presumably the column has no gaps, verify.
query = "\t".join(proteomics_data_raw["uniprot_accession"])

In [10]:
# Send the whole accession string to the UniProt ID-mapping endpoint in one
# POST request and keep the decoded reply text in `result`.
# NOTE(review): this looks like UniProt's legacy "uploadlists" service, which
# has reportedly been superseded by rest.uniprot.org/idmapping; confirm the
# URL still answers before relying on a re-run.
url = 'https://www.uniprot.org/uploadlists/'
params = {
    'from': 'ACC',       # identifier type being submitted
    'to': 'GENENAME',    # identifier type requested back
    'format': 'tab',     # tab-separated reply, one mapping per line
    'query': query,      # the tab-joined accession string built earlier
}

# Passing a body to Request makes urllib issue a POST.
data = urllib.parse.urlencode(params).encode('utf-8')
req = urllib.request.Request(url, data=data)
with urllib.request.urlopen(req) as f:
    response = f.read()
result = response.decode('utf-8')

<class 'str'>


# create translation dictionaries

In [30]:
# Build both lookup directions (accession -> gene name, gene name -> accession)
# from the tab-separated lines in `result`.
#   * Lines without a tab (blank lines, trailing newline) are skipped.
#   * The "From<TAB>To" column-header row of the reply is skipped so it does
#     not end up as a bogus {'From': 'To'} entry.
#   * Only the first two fields of a line are used, so an unexpected extra
#     column no longer makes dict() raise ValueError.
#   * A trailing "\r" (CRLF line endings) is stripped from each line.
# If a key occurs on several lines, the last line wins (same as before).
_mapping_pairs = []
for _line in result.split("\n"):
    _fields = _line.rstrip("\r").split("\t")
    if len(_fields) < 2:
        continue
    _pair = (_fields[0], _fields[1])
    if _pair == ("From", "To"):
        continue
    _mapping_pairs.append(_pair)

uniprot2name = dict(_mapping_pairs)
name2uniprot = {gene_name: accession for accession, gene_name in _mapping_pairs}

In [None]:
# check whether all translated gene names can be found in the model

In [20]:
# NOTE(review): imported mid-notebook rather than in the import cell at the top.
import cameo

#E. coli model:
# Load the model registered under the id "iML1515" through cameo.
eColi_model = cameo.load_model("iML1515")

# NOTE(review): this re-binds proteomics_data_raw to a different CSV than the
# one read at the top of the notebook; the following cells read its "Gene"
# column, while the UniProt query used "uniprot_accession" from the first file.
proteomics_data_raw = pd.read_csv("~/Documents/masters/thesis/thesis/data/raw_internal/proteomics/protein_values.csv")


In [60]:
# Identify all model reactions that are not covered by the proteomics dataset.
#
# The %%capture magic was removed: it discarded every print() in this cell.
# It was presumably only there to hide the [None, None, ...] echo of the
# list comprehensions that were used for their .extend() side effect; those
# are now ordinary expressions, so nothing unwanted is echoed.

# Gene names attached to each reaction of the model, in model order.
reaction_gene_names = [[gene.name for gene in reaction.genes] for reaction in eColi_model.reactions]
print("reactions left: " + str(len(reaction_gene_names)/len(eColi_model.reactions)*100) + "%")

# Remove spontaneous reactions, i.e. reactions with no gene in their gene rule.
no_spon_react = [gene_names for gene_names in reaction_gene_names if len(gene_names) != 0]
print("spontaneous reactions removed")
print("reactions left: " + str(len(no_spon_react)/len(eColi_model.reactions)*100) + "%")

# Flat list of the gene names of all remaining reactions. Duplicates and order
# are kept on purpose because later cells count over this list.
all_genes = [name for gene_names in no_spon_react for name in gene_names]

# Genes that occur in the proteomics data as well as in the model. Membership
# is tested against a set instead of scanning the list for every gene.
_model_gene_names = set(all_genes)
mappable_proteomics = [gene for gene in proteomics_data_raw["Gene"] if gene in _model_gene_names]
# Fraction (0-1) of proteomics genes that also occur in the model.
print(len(mappable_proteomics)/len(proteomics_data_raw["Gene"]))

# Gene names of all reactions for which none of the genes is found in the
# proteomics data (one entry per reaction/gene pair, duplicates kept).
_mappable_gene_names = set(mappable_proteomics)
holes = [
    name
    for gene_names in no_spon_react
    if _mappable_gene_names.isdisjoint(gene_names)
    for name in gene_names
]



In [33]:
#filter all empty strings in translation 
all_genes_without_empty = list(filter(lambda x: x != '',all_genes))
len([name2uniprot[i] for i in all_genes_without_empty if i in name2uniprot])

3002

# find out how many reactions are covered
Non-covered reactions are reactions that have at least one associated gene in the model,
but none of those genes has a corresponding entry in the data.

In [59]:
# Reactions whose only gene name is the empty string carry no usable gene, so
# they are excluded before coverage is computed.
no_spon_rxns_no_emp = [genes for genes in no_spon_react if not (len(genes) == 1 and '' in genes)]
print("Number of reactions involving a gene:", len(no_spon_rxns_no_emp))

# Gene names that have a UniProt translation. The set is built once here
# instead of being rebuilt for every reaction in each of the two filters.
_translated_gene_names = set(name2uniprot)

# A reaction counts as covered when at least one of its genes is translated.
mappable = [genes for genes in no_spon_rxns_no_emp if not _translated_gene_names.isdisjoint(genes)]
not_mappable = [genes for genes in no_spon_rxns_no_emp if _translated_gene_names.isdisjoint(genes)]
print("number of reactions covered by data:", len(mappable))
print("number of reactions not by data:", len(not_mappable))
# Note: despite the label this is a fraction in [0, 1], not a percentage.
print("percent covered:", len(mappable)/len(no_spon_rxns_no_emp))

Number of reactions involving a gene: 2229
number of reactions covered by data: 1794
number of reactions not by data: 435
percent covered: 0.8048452220726783


In [57]:
not_mappable

[['pck'],
 ['trpD'],
 ['lpxH'],
 ['thiE'],
 ['xapA'],
 ['rhaA'],
 ['dfp'],
 ['gcl'],
 ['araB'],
 ['araD', 'sgbE', 'ulaF'],
 ['ttdA', 'ttdB'],
 ['umpH', 'umpG'],
 ['cyoE'],
 ['adiA'],
 ['uxaA'],
 ['lpxP'],
 ['cpsG'],
 ['lacZ'],
 ['hcaF', 'hcaE', 'hcaD', 'hcaC'],
 ['hcaB'],
 ['hcaB'],
 ['citE', 'citX', 'citD', 'citF'],
 ['wcaG'],
 ['rhaD'],
 ['ydiF', 'atoA', 'atoD'],
 ['atoA', 'atoD'],
 ['cobC'],
 ['allB'],
 ['caiB'],
 ['caiB'],
 ['umpG'],
 ['adeD'],
 ['ubiA'],
 ['astA'],
 ['astE'],
 ['cynS'],
 ['metA'],
 ['ulaE', 'sgbU'],
 ['yiaK'],
 ['araA'],
 ['fucK'],
 ['allA'],
 ['mhpA'],
 ['gmd'],
 ['scpC'],
 ['bioD'],
 ['glsB', 'glsA'],
 ['ulaG'],
 ['cobS'],
 ['mhpB'],
 ['mhpB'],
 ['mhpC'],
 ['mhpC'],
 ['garK'],
 ['entD'],
 ['allD'],
 ['ybcF'],
 ['dfp'],
 ['cpsB'],
 ['tdcD'],
 ['mtn'],
 ['thiK'],
 ['hcaF', 'hcaE', 'hcaD', 'hcaC'],
 ['ttdT'],
 ['mhpA'],
 ['phoA'],
 ['xapB'],
 ['torY', 'torZ', 'torC', 'torA'],
 ['torY', 'torZ', 'torC', 'torA'],
 ['xapB'],
 ['xapB'],
 ['ssuD'],
 ['atoE'],
 ['nanT'],
