Merge pull request #91 from cdanielmachado/development

integrate development changes
cdanielmachado · Sep 16, 2020 · 72f8e11 · 72f8e11
2 parents 63a55cc + 032bfc7
commit 72f8e11
Show file tree

Hide file tree

Showing 48 changed files with 747,502 additions and 14 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,7 +6,6 @@ carveme.egg-info/*
 .eggs/*
 docs/_build/*
 Dockerfile
-.idea/encodings.xml
+.idea/*
 *.DS_Store
-carveme/data/input/bigg_proteins.dmnd
-carveme/data/input/bigg_proteins.dmnd
+.vscode/*
diff --git a/carveme/cli/benchmark.py b/carveme/cli/benchmark.py
@@ -0,0 +1,198 @@
+import pandas as pd
+from reframed import load_cbmodel
+from carveme.reconstruction.utils import load_media_db
+from carveme.reconstruction.benchmark import benchmark_biolog, benchmark_essentiality, mcc
+from carveme import project_dir
+from subprocess import call
+import argparse
+
+
+organisms = {
+    'bsub': 'Bacillus subtilis (168)',
+    'ecol': 'Escherichia coli (K-12 MG1655)',
+    'mgen': 'Mycoplasma genitalium (G-37)',
+    'paer': 'Pseudomonas aeruginosa (PA01)',
+    'rsol': 'Ralstonia solenacearum (GMI1000)',
+    'sone': 'Shewanella oneidensis (MR-1)'
+}
+
+genomes = {
+    'bsub': 'Bsubtilis_168.faa',
+    'ecol': 'Ecoli_K12_MG1655.faa',
+    'mgen': 'M_genitalium_G37.faa',
+    'paer': 'Paeruginosa_PAO1.faa',
+    'rsol': 'Rsolanacearum_GMI1000.faa',
+    'sone': 'Soneidensis_MR1.faa'
+}
+
+data_path = project_dir + '/data/benchmark'
+
+biolog_media = {
+    'bsub': 'M9',
+    'ecol': 'M9',
+    'paer': 'M9',
+    'rsol': 'M9',
+    'sone': 'ShewMM'
+}
+
+essentiality_media = {
+    'bsub': 'LB',
+    'ecol': 'M9',
+    'mgen': None,
+    'paer': 'M9[succ]',
+    'sone': 'LB'
+}
+
+biolog_sources = {
+    'bsub': ['C', 'N', 'P', 'S'],
+    'ecol': ['C', 'N', 'P', 'S'],
+    'paer': ['C'],
+    'rsol': ['C', 'N', 'P', 'S'],
+    'sone': ['C', 'N'],
+}
+
+elements = {
+    'C': 'carbon',
+    'N': 'nitrogen',
+    'P': 'phosphorus',
+    'S': 'sulfur',
+}
+
+biolog_compounds = {
+    'C': {'glc__D', 'lac__D', 'lac__L'},
+    'N': {'nh4'},
+    'P': {'pi'},
+    'S': {'so4'},
+}
+
+
+def build_models():
+    for org_id, genome in genomes.items():
+        print(f'Carving model for {organisms[org_id]}')
+
+        fasta_file = f"{data_path}/fasta/{genome}"
+        model_file = f"{data_path}/models/{org_id}.xml"
+        mediadb = f"{data_path}/media_db.tsv"
+
+        media = set()
+        if org_id in biolog_media and biolog_media[org_id]:
+            media.add(biolog_media[org_id])
+        if org_id in essentiality_media and essentiality_media[org_id]:
+            media.add(essentiality_media[org_id])
+        media = ','.join(media)
+
+        gapfill = f'-g "{media}" --mediadb {mediadb}' if media else ''
+
+        call(f'carve {fasta_file} -o {model_file} {gapfill} --fbc2', shell=True)
+
+
+def load_models():
+    models = {}
+    for org_id in organisms:
+        models[org_id] = load_cbmodel(f"{data_path}/models/{org_id}.xml", flavor='bigg')
+    return models
+
+
+def load_biolog_data():
+    biolog_data = {}
+    for org_id, sources in biolog_sources.items():
+        biolog_data[org_id] = {}
+        for source in sources:
+            biolog_data[org_id][source] = \
+                pd.read_csv(f'{data_path}/biolog/{org_id}/biolog_{elements[source]}.tsv', sep='\t')
+
+    return biolog_data
+
+
+def load_essentiality_data():
+    essential = {}
+    non_essential = {}
+
+    for org_id in essentiality_media:
+        df = pd.read_csv(f'{data_path}/essentiality/{org_id}.tsv', sep='\t')
+        essential[org_id] = {'G_' + x for x in df.query('phenotype == "E"')['bigg_id']}
+        non_essential[org_id] = {'G_' + x for x in df.query('phenotype == "NE"')['bigg_id']}
+
+    return essential, non_essential
+
+
+def run_biolog_benchmark(models, biolog_data, media_db):
+
+    biolog_results = []
+
+    for org_id, medium in biolog_media.items():
+        print(f'Running biolog benchmark for {organisms[org_id]}')
+        model = models[org_id]
+
+        for source in biolog_sources[org_id]:
+            compounds = set(media_db[medium]) - biolog_compounds[source]
+            data = biolog_data[org_id][source]
+            result = benchmark_biolog(model, compounds, data)
+            result = [(org_id, source, met, res) for met, res in result.items()]
+            biolog_results.extend(result)
+
+    return pd.DataFrame(biolog_results, columns=['org', 'source', 'met', 'value'])
+
+
+def run_essentiality_benchmark(models, essential, non_essential, media_db):
+    essentiality_results = []
+
+    for org_id, medium in essentiality_media.items():
+        print(f'Running essentiality benchmark for {organisms[org_id]}')
+        model = models[org_id]
+        in_vivo = {x: True for x in essential[org_id] & set(model.genes)}
+        if non_essential[org_id]:
+            in_vivo.update({x: False for x in non_essential[org_id] & set(model.genes)})
+        else:
+            in_vivo.update({x: False for x in set(model.genes) - set(essential[org_id])})
+
+        compounds = media_db[medium] if medium else None
+        result = benchmark_essentiality(model, compounds, in_vivo)
+        result = [(org_id, gene, res) for gene, res in result.items()]
+        essentiality_results.extend(result)
+
+    return pd.DataFrame(essentiality_results, columns=['org', 'gene', 'value'])
+
+
+def benchmark(rebuild=True, biolog=True, essentiality=True):
+
+    if rebuild:
+        build_models()
+
+    models = load_models()
+    media_db = load_media_db(f'{data_path}/media_db.tsv')
+
+    if biolog:
+        biolog_data = load_biolog_data()
+        df_biolog = run_biolog_benchmark(models, biolog_data, media_db)
+        df_biolog.to_csv(f'{data_path}/results/biolog.tsv', sep='\t', index=False)
+        value = mcc(df_biolog)
+        print(f'Biolog final MCC value: {value:.3f}')
+
+    if essentiality:
+        essential, non_essential = load_essentiality_data()
+        df_essentiality = run_essentiality_benchmark(models, essential, non_essential, media_db)
+        df_essentiality.to_csv(f'{data_path}/results/essentiality.tsv', sep='\t', index=False)
+        value = mcc(df_essentiality)
+        print(f'Essentiality final MCC value: {value:.3f}')
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Benchmark CarveMe using biolog and gene essentiality data")
+
+    parser.add_argument('--skip-rebuild', action='store_true', dest='no_rebuild',
+                        help="Do not rebuild models during this call.")
+    parser.add_argument('--skip-biolog', action='store_true', dest='no_biolog',
+                        help="Skip biolog benchmark.")
+    parser.add_argument('--skip-essentiality', action='store_true', dest='no_essentiality',
+                        help="Skip biolog benchmark.")
+
+    args = parser.parse_args()
+
+    benchmark(rebuild=(not args.no_rebuild),
+              biolog=(not args.no_biolog),
+              essentiality=(not args.no_essentiality))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/carveme/cli/carve.py b/carveme/cli/carve.py
@@ -70,12 +70,12 @@ def maincall(inputfile, input_type='protein', outputfile=None, diamond_args=None
     else:
         hard_constraints = None
 
-    if input_type == 'refseq' or input_type == 'genbank':
+    if input_type == 'refseq':
 
         if verbose:
             print('Downloading genome {} from NCBI...'.format(inputfile))
 
-        ncbi_table = load_ncbi_table(project_dir + config.get('ncbi', input_type))
+        ncbi_table = load_ncbi_table(project_dir + config.get('input', 'refseq'))
         inputfile = download_ncbi_genome(inputfile, ncbi_table)
 
         if not inputfile:
@@ -255,7 +255,6 @@ def main():
     input_type_args.add_argument('--egg', action='store_true', help="Build from eggNOG-mapper output file")
     input_type_args.add_argument('--diamond', action='store_true', help=argparse.SUPPRESS)
     input_type_args.add_argument('--refseq', action='store_true', help="Download genome from NCBI RefSeq and build")
-    input_type_args.add_argument('--genbank', action='store_true', help="Download genome from NCBI GenBank and build")
 
     parser.add_argument('--diamond-args', help="Additional arguments for running diamond")
 
@@ -310,8 +309,8 @@ def main():
     if args.mediadb and not args.gapfill:
         parser.error('--mediadb can only be used with --gapfill')
 
-    if args.recursive and (args.refseq or args.genbank):
-        parser.error('-r cannot be combined with --refseq or --genbank')
+    if args.recursive and args.refseq:
+        parser.error('-r cannot be combined with --refseq')
 
     if args.egg:
         input_type = 'eggnog'
@@ -321,8 +320,6 @@ def main():
         input_type = 'diamond'
     elif args.refseq:
         input_type = 'refseq'
-    elif args.genbank:
-        input_type = 'genbank'
     else:
         input_type = 'protein'
 

diff --git a/carveme/config.cfg b/carveme/config.cfg
@@ -7,10 +7,7 @@ bigg_models = data/input/bigg_models.csv
 metabolomics = data/input/metabolomics_park2016.csv
 unbalanced_metabolites = data/input/unbalanced_metabolites.csv
 manually_curated = data/input/manually_curated.csv
-
-[ncbi]
-refseq = data/input/refseq_release_92.tsv.gz
-genbank = data/input/genbank_release_230.tsv.gz
+refseq = data/input/refseq_release_201.tsv.gz
 
 [generated]
 folder = data/generated/

diff --git a/carveme/data/benchmark/biolog/bsub/biolog_carbon.tsv b/carveme/data/benchmark/biolog/bsub/biolog_carbon.tsv
@@ -0,0 +1,85 @@
+bigg_id	compound	growth	seed
+4abut	g-Amino-Butyric Acid	-	cpd00281
+abt__L	L-Arabitol	+	cpd00417
+ac	Acetic Acid	++	cpd00029
+acac	Acetoacetic Acid	++	cpd00142
+acgam	N-Acetyl-D-Glucosamine	++	cpd00122
+acmana	N-Acetyl-b-D-Mannosamine	+	cpd00492
+acnam	N-Acetyl-Neuraminic Acid	++	cpd00232
+adn	Adenosine	++	cpd00182
+akg	a-Keto-Glutaric Acid	+	cpd00024
+ala__D	D-Alanine	++	cpd00117
+ala__L	L-Alanine	+	cpd00035
+arab__D	D-Arabinose	++	cpd00185
+arab__L	L-Arabinose	+	cpd19109
+arbt	Arbutin	++	cpd03696
+asn__L	L-Asparagine	++	cpd00132
+asp__L	L-Aspartic Acid	++	cpd00041
+btd_RR	2,3-Butanediol	-	
+cellb	D-Cellobiose	++	cpd03845
+cit	Citric Acid	+	cpd00137
+crn	D,L-Carnitine	--	cpd00266
+dad_2	2`-Deoxy-Adenosine	++	cpd00438
+dextrin	Dextrin	++	cpd11594
+dha	Dihydroxy-Acetone	+	cpd00157
+drib	2-Deoxy-D-Ribose	++	cpd01242
+f6p	D-Fructose-6-Phosphate	++	cpd19035
+for	Formic Acid	+	cpd00047
+fru	D-Fructose	++	cpd00082
+fum	Fumaric Acid	++	cpd00106
+g1p	D-Glucose-1-Phosphate	++	cpd00089
+g6p	D-Glucose-6-Phosphate	++	cpd00079
+gal	D-Galactose	++	cpd00108
+galctr__D	Mucic Acid	++	cpd00652
+galt	Dulcitol	++	cpd01171
+galur	D-Galacturonic Acid	++	cpd00280
+gam	D-Glucosamine	++	cpd00276
+glc__D	a-D-Glucose	++	cpd00027
+glcn__D	D-Gluconic Acid	++	cpd00222
+glcr	D-Saccharic Acid	++	cpd00609
+glcur	D-Glucuronic Acid	++	cpd00164
+gln__L	L-Glutamine	++	cpd00053
+glu__L	L-Glutamic Acid	++	cpd00023
+glx	Glyoxylic Acid	+	cpd00040
+gly	Glycine	--	cpd00033
+glyc	Glycerol	++	cpd00100
+glyc3p	D,L-a-Glycerol- Phosphate	++	cpd00080
+glyclt	Glycolic Acid	+	cpd00139
+glycogen	Glycogen	++	cpd00155
+ile__L	L-Isoleucine	--	cpd00322
+ins	Inosine	++	cpd00246
+lac__L	L-Lactic Acid	++	cpd00159
+leu__L	L-Leucine	--	cpd00107
+lys__L	L-Lysine	--	cpd00039
+mal__D	D-Malic Acid	++	cpd00386
+mal__L	D,L-Malic Acid	++	cpd00130
+mal__L	L-Malic Acid	++	cpd00130
+malt	Maltose	++	cpd00179
+malttr	Maltotriose	++	cpd01262
+man	D-Mannose	++	cpd00138
+mbdg	b-Methyl-D-Glucoside	++	cpd15585
+melib	D-Melibiose	++	cpd03198
+met__L	L-Methionine	--	cpd00060
+mnl	D-Mannitol	++	cpd00314
+orn__L	L-Ornithine	+	cpd00064
+pala	Palatinose	++	cpd01200
+phe__L	L-Phenylalanine	--	cpd00066
+ppa	Propionic Acid	+	cpd00141
+pro__L	L-Proline	++	cpd00129
+pyr	Pyruvic Acid	+	cpd00020
+raffin	D-Raffinose	++	cpd00382
+rib__D	D-Ribose	++	cpd00105
+rmn	L-Rhamnose	+	cpd00396
+salcn	Salicin	++	cpd01030
+sbt__D	D-Sorbitol	++	cpd00588
+ser__D	D-Serine	++	cpd00550
+ser__L	L-Serine	++	cpd00054
+srb__L	L-Sorbose	+	cpd00212
+succ	Succinic Acid	++	cpd00036
+sucr	Sucrose	++	cpd00076
+thr__L	L-Threonine	++	cpd00161
+thymd	Thymidine	++	cpd00184
+tre	D-Trehalose	++	cpd00794
+uri	Uridine	++	cpd00249
+val__L	L-Valine	--	cpd00156
+xyl__D	D-Xylose	++	cpd00154
diff --git a/carveme/data/benchmark/biolog/bsub/biolog_nitrogen.tsv b/carveme/data/benchmark/biolog/bsub/biolog_nitrogen.tsv
@@ -0,0 +1,32 @@
+bigg_id	compound	growth	seed
+4abut	g-Amino-N-Butyric Acid	+	cpd00281
+acgam	N-Acetyl-D-Glucosamine	+	cpd00122
+ade	Adenine	-	cpd00128
+ala__D	D-Alanine	++	cpd00117
+ala__L	L-Alanine	+	cpd00035
+arg__L	L-Arginine	++	cpd00051
+asn__L	L-Asparagine	++	cpd00132
+asp__L	L-Aspartic Acid	++	cpd00041
+citr__L	L-Citrulline	+	cpd00274
+cys__L	L-Cysteine	++	cpd00084
+cytd	Cytidine	++	cpd00367
+etha	Ethanolamine	++	cpd00162
+gam	D-Glucosamine	++	cpd00276
+gln__L	L-Glutamine	++	cpd00053
+glu__D	D-Glutamic Acid	+	cpd00186
+glu__L	L-Glutamic Acid	+	cpd00023
+gly	Glycine	+	cpd00033
+ins	Inosine	-	cpd00246
+met__L	L-Methionine	+	cpd00060
+nh4	Ammonia	++	cpd00013
+orn__L	L-Ornithine	+	cpd00064
+pro__L	L-Proline	++	cpd00129
+ser__L	L-Serine	++	cpd00054
+thr__L	L-Threonine	+	cpd00161
+thym	Thymine	-	cpd00151
+ura	Uracil	-	cpd00092
+urate	Uric Acid	+	cpd00300
+urea	Urea	+	cpd00073
+uri	Uridine	-	cpd00249
+val__L	L-Valine	+	cpd00156
+xan	Xanthine	++	cpd00309