Skip to content

Commit

Permalink
Merge pull request #91 from cdanielmachado/development
Browse files Browse the repository at this point in the history
integrate development changes
  • Loading branch information
cdanielmachado committed Sep 16, 2020
2 parents 63a55cc + 032bfc7 commit 72f8e11
Show file tree
Hide file tree
Showing 48 changed files with 747,502 additions and 14 deletions.
5 changes: 2 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ carveme.egg-info/*
.eggs/*
docs/_build/*
Dockerfile
.idea/encodings.xml
.idea/*
*.DS_Store
carveme/data/input/bigg_proteins.dmnd
carveme/data/input/bigg_proteins.dmnd
.vscode/*
198 changes: 198 additions & 0 deletions carveme/cli/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
import pandas as pd
from reframed import load_cbmodel
from carveme.reconstruction.utils import load_media_db
from carveme.reconstruction.benchmark import benchmark_biolog, benchmark_essentiality, mcc
from carveme import project_dir
from subprocess import call
import argparse


organisms = {
'bsub': 'Bacillus subtilis (168)',
'ecol': 'Escherichia coli (K-12 MG1655)',
'mgen': 'Mycoplasma genitalium (G-37)',
'paer': 'Pseudomonas aeruginosa (PA01)',
'rsol': 'Ralstonia solenacearum (GMI1000)',
'sone': 'Shewanella oneidensis (MR-1)'
}

genomes = {
'bsub': 'Bsubtilis_168.faa',
'ecol': 'Ecoli_K12_MG1655.faa',
'mgen': 'M_genitalium_G37.faa',
'paer': 'Paeruginosa_PAO1.faa',
'rsol': 'Rsolanacearum_GMI1000.faa',
'sone': 'Soneidensis_MR1.faa'
}

data_path = project_dir + '/data/benchmark'

biolog_media = {
'bsub': 'M9',
'ecol': 'M9',
'paer': 'M9',
'rsol': 'M9',
'sone': 'ShewMM'
}

essentiality_media = {
'bsub': 'LB',
'ecol': 'M9',
'mgen': None,
'paer': 'M9[succ]',
'sone': 'LB'
}

biolog_sources = {
'bsub': ['C', 'N', 'P', 'S'],
'ecol': ['C', 'N', 'P', 'S'],
'paer': ['C'],
'rsol': ['C', 'N', 'P', 'S'],
'sone': ['C', 'N'],
}

elements = {
'C': 'carbon',
'N': 'nitrogen',
'P': 'phosphorus',
'S': 'sulfur',
}

biolog_compounds = {
'C': {'glc__D', 'lac__D', 'lac__L'},
'N': {'nh4'},
'P': {'pi'},
'S': {'so4'},
}


def build_models():
for org_id, genome in genomes.items():
print(f'Carving model for {organisms[org_id]}')

fasta_file = f"{data_path}/fasta/{genome}"
model_file = f"{data_path}/models/{org_id}.xml"
mediadb = f"{data_path}/media_db.tsv"

media = set()
if org_id in biolog_media and biolog_media[org_id]:
media.add(biolog_media[org_id])
if org_id in essentiality_media and essentiality_media[org_id]:
media.add(essentiality_media[org_id])
media = ','.join(media)

gapfill = f'-g "{media}" --mediadb {mediadb}' if media else ''

call(f'carve {fasta_file} -o {model_file} {gapfill} --fbc2', shell=True)


def load_models():
models = {}
for org_id in organisms:
models[org_id] = load_cbmodel(f"{data_path}/models/{org_id}.xml", flavor='bigg')
return models


def load_biolog_data():
biolog_data = {}
for org_id, sources in biolog_sources.items():
biolog_data[org_id] = {}
for source in sources:
biolog_data[org_id][source] = \
pd.read_csv(f'{data_path}/biolog/{org_id}/biolog_{elements[source]}.tsv', sep='\t')

return biolog_data


def load_essentiality_data():
essential = {}
non_essential = {}

for org_id in essentiality_media:
df = pd.read_csv(f'{data_path}/essentiality/{org_id}.tsv', sep='\t')
essential[org_id] = {'G_' + x for x in df.query('phenotype == "E"')['bigg_id']}
non_essential[org_id] = {'G_' + x for x in df.query('phenotype == "NE"')['bigg_id']}

return essential, non_essential


def run_biolog_benchmark(models, biolog_data, media_db):

biolog_results = []

for org_id, medium in biolog_media.items():
print(f'Running biolog benchmark for {organisms[org_id]}')
model = models[org_id]

for source in biolog_sources[org_id]:
compounds = set(media_db[medium]) - biolog_compounds[source]
data = biolog_data[org_id][source]
result = benchmark_biolog(model, compounds, data)
result = [(org_id, source, met, res) for met, res in result.items()]
biolog_results.extend(result)

return pd.DataFrame(biolog_results, columns=['org', 'source', 'met', 'value'])


def run_essentiality_benchmark(models, essential, non_essential, media_db):
essentiality_results = []

for org_id, medium in essentiality_media.items():
print(f'Running essentiality benchmark for {organisms[org_id]}')
model = models[org_id]
in_vivo = {x: True for x in essential[org_id] & set(model.genes)}
if non_essential[org_id]:
in_vivo.update({x: False for x in non_essential[org_id] & set(model.genes)})
else:
in_vivo.update({x: False for x in set(model.genes) - set(essential[org_id])})

compounds = media_db[medium] if medium else None
result = benchmark_essentiality(model, compounds, in_vivo)
result = [(org_id, gene, res) for gene, res in result.items()]
essentiality_results.extend(result)

return pd.DataFrame(essentiality_results, columns=['org', 'gene', 'value'])


def benchmark(rebuild=True, biolog=True, essentiality=True):

if rebuild:
build_models()

models = load_models()
media_db = load_media_db(f'{data_path}/media_db.tsv')

if biolog:
biolog_data = load_biolog_data()
df_biolog = run_biolog_benchmark(models, biolog_data, media_db)
df_biolog.to_csv(f'{data_path}/results/biolog.tsv', sep='\t', index=False)
value = mcc(df_biolog)
print(f'Biolog final MCC value: {value:.3f}')

if essentiality:
essential, non_essential = load_essentiality_data()
df_essentiality = run_essentiality_benchmark(models, essential, non_essential, media_db)
df_essentiality.to_csv(f'{data_path}/results/essentiality.tsv', sep='\t', index=False)
value = mcc(df_essentiality)
print(f'Essentiality final MCC value: {value:.3f}')


def main():
parser = argparse.ArgumentParser(description="Benchmark CarveMe using biolog and gene essentiality data")

parser.add_argument('--skip-rebuild', action='store_true', dest='no_rebuild',
help="Do not rebuild models during this call.")
parser.add_argument('--skip-biolog', action='store_true', dest='no_biolog',
help="Skip biolog benchmark.")
parser.add_argument('--skip-essentiality', action='store_true', dest='no_essentiality',
help="Skip biolog benchmark.")

args = parser.parse_args()

benchmark(rebuild=(not args.no_rebuild),
biolog=(not args.no_biolog),
essentiality=(not args.no_essentiality))


if __name__ == '__main__':
main()
11 changes: 4 additions & 7 deletions carveme/cli/carve.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,12 @@ def maincall(inputfile, input_type='protein', outputfile=None, diamond_args=None
else:
hard_constraints = None

if input_type == 'refseq' or input_type == 'genbank':
if input_type == 'refseq':

if verbose:
print('Downloading genome {} from NCBI...'.format(inputfile))

ncbi_table = load_ncbi_table(project_dir + config.get('ncbi', input_type))
ncbi_table = load_ncbi_table(project_dir + config.get('input', 'refseq'))
inputfile = download_ncbi_genome(inputfile, ncbi_table)

if not inputfile:
Expand Down Expand Up @@ -255,7 +255,6 @@ def main():
input_type_args.add_argument('--egg', action='store_true', help="Build from eggNOG-mapper output file")
input_type_args.add_argument('--diamond', action='store_true', help=argparse.SUPPRESS)
input_type_args.add_argument('--refseq', action='store_true', help="Download genome from NCBI RefSeq and build")
input_type_args.add_argument('--genbank', action='store_true', help="Download genome from NCBI GenBank and build")

parser.add_argument('--diamond-args', help="Additional arguments for running diamond")

Expand Down Expand Up @@ -310,8 +309,8 @@ def main():
if args.mediadb and not args.gapfill:
parser.error('--mediadb can only be used with --gapfill')

if args.recursive and (args.refseq or args.genbank):
parser.error('-r cannot be combined with --refseq or --genbank')
if args.recursive and args.refseq:
parser.error('-r cannot be combined with --refseq')

if args.egg:
input_type = 'eggnog'
Expand All @@ -321,8 +320,6 @@ def main():
input_type = 'diamond'
elif args.refseq:
input_type = 'refseq'
elif args.genbank:
input_type = 'genbank'
else:
input_type = 'protein'

Expand Down
5 changes: 1 addition & 4 deletions carveme/config.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,7 @@ bigg_models = data/input/bigg_models.csv
metabolomics = data/input/metabolomics_park2016.csv
unbalanced_metabolites = data/input/unbalanced_metabolites.csv
manually_curated = data/input/manually_curated.csv

[ncbi]
refseq = data/input/refseq_release_92.tsv.gz
genbank = data/input/genbank_release_230.tsv.gz
refseq = data/input/refseq_release_201.tsv.gz

[generated]
folder = data/generated/
Expand Down
85 changes: 85 additions & 0 deletions carveme/data/benchmark/biolog/bsub/biolog_carbon.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
bigg_id compound growth seed
4abut g-Amino-Butyric Acid - cpd00281
abt__L L-Arabitol + cpd00417
ac Acetic Acid ++ cpd00029
acac Acetoacetic Acid ++ cpd00142
acgam N-Acetyl-D-Glucosamine ++ cpd00122
acmana N-Acetyl-b-D-Mannosamine + cpd00492
acnam N-Acetyl-Neuraminic Acid ++ cpd00232
adn Adenosine ++ cpd00182
akg a-Keto-Glutaric Acid + cpd00024
ala__D D-Alanine ++ cpd00117
ala__L L-Alanine + cpd00035
arab__D D-Arabinose ++ cpd00185
arab__L L-Arabinose + cpd19109
arbt Arbutin ++ cpd03696
asn__L L-Asparagine ++ cpd00132
asp__L L-Aspartic Acid ++ cpd00041
btd_RR 2,3-Butanediol -
cellb D-Cellobiose ++ cpd03845
cit Citric Acid + cpd00137
crn D,L-Carnitine -- cpd00266
dad_2 2`-Deoxy-Adenosine ++ cpd00438
dextrin Dextrin ++ cpd11594
dha Dihydroxy-Acetone + cpd00157
drib 2-Deoxy-D-Ribose ++ cpd01242
f6p D-Fructose-6-Phosphate ++ cpd19035
for Formic Acid + cpd00047
fru D-Fructose ++ cpd00082
fum Fumaric Acid ++ cpd00106
g1p D-Glucose-1-Phosphate ++ cpd00089
g6p D-Glucose-6-Phosphate ++ cpd00079
gal D-Galactose ++ cpd00108
galctr__D Mucic Acid ++ cpd00652
galt Dulcitol ++ cpd01171
galur D-Galacturonic Acid ++ cpd00280
gam D-Glucosamine ++ cpd00276
glc__D a-D-Glucose ++ cpd00027
glcn__D D-Gluconic Acid ++ cpd00222
glcr D-Saccharic Acid ++ cpd00609
glcur D-Glucuronic Acid ++ cpd00164
gln__L L-Glutamine ++ cpd00053
glu__L L-Glutamic Acid ++ cpd00023
glx Glyoxylic Acid + cpd00040
gly Glycine -- cpd00033
glyc Glycerol ++ cpd00100
glyc3p D,L-a-Glycerol- Phosphate ++ cpd00080
glyclt Glycolic Acid + cpd00139
glycogen Glycogen ++ cpd00155
ile__L L-Isoleucine -- cpd00322
ins Inosine ++ cpd00246
lac__L L-Lactic Acid ++ cpd00159
leu__L L-Leucine -- cpd00107
lys__L L-Lysine -- cpd00039
mal__D D-Malic Acid ++ cpd00386
mal__L D,L-Malic Acid ++ cpd00130
mal__L L-Malic Acid ++ cpd00130
malt Maltose ++ cpd00179
malttr Maltotriose ++ cpd01262
man D-Mannose ++ cpd00138
mbdg b-Methyl-D-Glucoside ++ cpd15585
melib D-Melibiose ++ cpd03198
met__L L-Methionine -- cpd00060
mnl D-Mannitol ++ cpd00314
orn__L L-Ornithine + cpd00064
pala Palatinose ++ cpd01200
phe__L L-Phenylalanine -- cpd00066
ppa Propionic Acid + cpd00141
pro__L L-Proline ++ cpd00129
pyr Pyruvic Acid + cpd00020
raffin D-Raffinose ++ cpd00382
rib__D D-Ribose ++ cpd00105
rmn L-Rhamnose + cpd00396
salcn Salicin ++ cpd01030
sbt__D D-Sorbitol ++ cpd00588
ser__D D-Serine ++ cpd00550
ser__L L-Serine ++ cpd00054
srb__L L-Sorbose + cpd00212
succ Succinic Acid ++ cpd00036
sucr Sucrose ++ cpd00076
thr__L L-Threonine ++ cpd00161
thymd Thymidine ++ cpd00184
tre D-Trehalose ++ cpd00794
uri Uridine ++ cpd00249
val__L L-Valine -- cpd00156
xyl__D D-Xylose ++ cpd00154
32 changes: 32 additions & 0 deletions carveme/data/benchmark/biolog/bsub/biolog_nitrogen.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
bigg_id compound growth seed
4abut g-Amino-N-Butyric Acid + cpd00281
acgam N-Acetyl-D-Glucosamine + cpd00122
ade Adenine - cpd00128
ala__D D-Alanine ++ cpd00117
ala__L L-Alanine + cpd00035
arg__L L-Arginine ++ cpd00051
asn__L L-Asparagine ++ cpd00132
asp__L L-Aspartic Acid ++ cpd00041
citr__L L-Citrulline + cpd00274
cys__L L-Cysteine ++ cpd00084
cytd Cytidine ++ cpd00367
etha Ethanolamine ++ cpd00162
gam D-Glucosamine ++ cpd00276
gln__L L-Glutamine ++ cpd00053
glu__D D-Glutamic Acid + cpd00186
glu__L L-Glutamic Acid + cpd00023
gly Glycine + cpd00033
ins Inosine - cpd00246
met__L L-Methionine + cpd00060
nh4 Ammonia ++ cpd00013
orn__L L-Ornithine + cpd00064
pro__L L-Proline ++ cpd00129
ser__L L-Serine ++ cpd00054
thr__L L-Threonine + cpd00161
thym Thymine - cpd00151
ura Uracil - cpd00092
urate Uric Acid + cpd00300
urea Urea + cpd00073
uri Uridine - cpd00249
val__L L-Valine + cpd00156
xan Xanthine ++ cpd00309

0 comments on commit 72f8e11

Please sign in to comment.