Merge pull request #51 from cdanielmachado/develop

integrate development branch
cdanielmachado · Mar 6, 2019 · 7ed7557 · 7ed7557
2 parents 8f4314b + 9213968
commit 7ed7557
Show file tree

Hide file tree

Showing 11 changed files with 77 additions and 40 deletions.
diff --git a/carveme/config.cfg b/carveme/config.cfg
@@ -9,8 +9,8 @@ unbalanced_metabolites = data/input/unbalanced_metabolites.csv
 manually_curated = data/input/manually_curated.csv
 
 [ncbi]
-refseq = data/input/refseq_release_84.tsv.gz
-genbank = data/input/genbank_release_221.tsv.gz
+refseq = data/input/refseq_release_92.tsv.gz
+genbank = data/input/genbank_release_230.tsv.gz
 
 [generated]
 folder = data/generated/

diff --git a/carveme/data/input/genbank_release_221.tsv.gz b/carveme/data/input/genbank_release_221.tsv.gz
diff --git a/carveme/data/input/genbank_release_230.tsv.gz b/carveme/data/input/genbank_release_230.tsv.gz
diff --git a/carveme/data/input/refseq_release_84.tsv.gz b/carveme/data/input/refseq_release_84.tsv.gz
diff --git a/carveme/data/input/refseq_release_92.tsv.gz b/carveme/data/input/refseq_release_92.tsv.gz
diff --git a/carveme/reconstruction/carving.py b/carveme/reconstruction/carving.py
@@ -37,8 +37,9 @@ def inactive_reactions(model, solution):
     return inactive + inactive_ext
 
 
-def minmax_reduction(model, scores, min_growth=0.1, min_atpm=0.1, eps=1e-3, bigM=1e3, default_score=-1, uptake_score=0,
-                     soft_score=1, soft_constraints=None, hard_constraints=None, solver=None, debug_output=None):
+def minmax_reduction(model, scores, min_growth=0.1, min_atpm=0.1, eps=1e-3, bigM=1e3, default_score=-1.0,
+                     uptake_score=0.0, soft_score=1.0, soft_constraints=None, hard_constraints=None,
+                     ref_reactions=None, ref_score=0.0, solver=None, debug_output=None):
     """ Apply minmax reduction algorithm (MILP).
 
     Computes a binary reaction vector that optimizes the agreement with reaction scores (maximizes positive scores,
@@ -71,21 +72,29 @@ def minmax_reduction(model, scores, min_growth=0.1, min_atpm=0.1, eps=1e-3, bigM
     scores = scores.copy()
     reactions = list(scores.keys())
 
-    if not soft_constraints:
+    if soft_constraints:
+        reactions += [r_id for r_id in soft_constraints if r_id not in reactions]
+    else:
         soft_constraints = {}
 
-    reactions += [r_id for r_id in soft_constraints if r_id not in reactions]
+    if not ref_reactions:
+        ref_reactions = {}
 
     if hard_constraints:
         solver.set_bounds(hard_constraints)
 
-    # Add default score
     if default_score != 0:
         for r_id in model.reactions:
-            if r_id not in reactions and not r_id.startswith('R_EX') and r_id != 'R_ATPM':
+            if r_id not in reactions and r_id not in ref_reactions and not r_id.startswith('R_EX') and r_id != 'R_ATPM':
                 scores[r_id] = default_score
                 reactions.append(r_id)
 
+    if ref_score != 0:
+        for r_id in ref_reactions:
+            if r_id not in reactions and r_id != 'R_ATPM':
+                scores[r_id] = ref_score
+                reactions.append(r_id)
+
     if not hasattr(solver, '_carveme_flag'):
         solver._carveme_flag = True
 
@@ -147,16 +156,20 @@ def minmax_reduction(model, scores, min_growth=0.1, min_atpm=0.1, eps=1e-3, bigM
                 w_f, w_r = -soft_score, -soft_score
 
         if y_f in solver.pos_vars:
-            if r_id in scores:
-                objective[y_f] = scores[r_id]
             if r_id in soft_constraints:
-                objective[y_f] = w_f
+                 objective[y_f] = w_f
+            elif r_id in ref_reactions:
+                objective[y_f] = 2 * scores[r_id] + ref_score
+            else:
+                objective[y_f] = scores[r_id]
 
         if y_r in solver.neg_vars:
-            if r_id in scores:
-                objective[y_r] = scores[r_id]
             if r_id in soft_constraints:
                 objective[y_r] = w_r
+            elif r_id in ref_reactions:
+                objective[y_r] = 2 * scores[r_id] + ref_score
+            else:
+                objective[y_r] = scores[r_id]
 
     if uptake_score != 0:
         for r_id in model.reactions:
@@ -175,7 +188,7 @@ def minmax_reduction(model, scores, min_growth=0.1, min_atpm=0.1, eps=1e-3, bigM
 
 def carve_model(model, reaction_scores, outputfile=None, flavor=None, inplace=True,
                 default_score=-1.0, uptake_score=0.0, soft_score=1.0,
-                soft_constraints=None, hard_constraints=None,
+                soft_constraints=None, hard_constraints=None, ref_model=None, ref_score=0.0,
                 init_env=None, debug_output=None):
     """ Reconstruct a metabolic model using the CarveMe approach.
 
@@ -201,15 +214,6 @@ def carve_model(model, reaction_scores, outputfile=None, flavor=None, inplace=Tr
 
     scores = dict(reaction_scores[['reaction', 'normalized_score']].values)
 
-    if default_score is None:
-        default_score = -1.0
-
-    if uptake_score is None:
-        uptake_score = 0.0
-
-    if soft_score is None:
-        soft_score = 1.0
-
     if soft_constraints:
         not_in_model = set(soft_constraints) - set(model.reactions)
         if not_in_model:
@@ -222,9 +226,14 @@ def carve_model(model, reaction_scores, outputfile=None, flavor=None, inplace=Tr
             hard_constraints = {r_id: (lb, ub) for r_id, (lb, ub) in hard_constraints.items() if r_id in model.reactions}
             warnings.warn("Hard constraints contain reactions not in the model:\n" + "\n".join(not_in_model))
 
+    if ref_model:
+        ref_reactions = set(model.reactions) & set(ref_model.reactions)
+    else:
+        ref_reactions = None
+
     sol = minmax_reduction(model, scores, default_score=default_score, uptake_score=uptake_score, soft_score=soft_score,
                            soft_constraints=soft_constraints, hard_constraints=hard_constraints,
-                           debug_output=debug_output)
+                           ref_reactions=ref_reactions, ref_score=ref_score, debug_output=debug_output)
 
     if sol.status == Status.OPTIMAL:
         inactive = inactive_reactions(model, sol)

diff --git a/carveme/reconstruction/scoring.py b/carveme/reconstruction/scoring.py
@@ -113,13 +113,15 @@ def reaction_scoring(annotation, gprs, spontaneous_score=0.0, debug_output=None)
 
     avg_score = reaction_scores['score'].median()
 
+    if avg_score != 0:
+        reaction_scores['normalized_score'] = reaction_scores['score'] / avg_score
+
     if debug_output:
         gene_scores.to_csv(debug_output + '_gene_scores.tsv', sep='\t', index=False)
         protein_scores.to_csv(debug_output + '_protein_scores.tsv', sep='\t', index=False)
         reaction_scores.to_csv(debug_output + '_reaction_scores.tsv', sep='\t', index=False)
 
     if avg_score != 0:
-        reaction_scores['normalized_score'] = reaction_scores['score'] / avg_score
         return reaction_scores
     else:
         return None
diff --git a/carveme/reconstruction/utils.py b/carveme/reconstruction/utils.py
@@ -103,11 +103,14 @@ def biomass_weight(biomass_id, coeffs, model):
     for m_id, coeff in coeffs.items():
         metabolite = model.metabolites[m_id]
         if 'FORMULA' in metabolite.metadata:
-            formulae = metabolite.metadata['FORMULA'].split(';')
-            met_weight = np.mean([molecular_weight(formula) for formula in formulae])
-            contribution = -coeff * met_weight
-            bio_weight += contribution
-#            print '\t'.join([biomass_id, m_id, str(met_weight), str(coeff), str(contribution)])
+            try:
+                formulae = metabolite.metadata['FORMULA'].split(';')
+                met_weight = np.mean([molecular_weight(formula) for formula in formulae])
+                contribution = -coeff * met_weight
+                bio_weight += contribution
+            except:
+                warn('Unable to normalize {} due to invalid formula for {}:'.format(biomass_id, m_id))
+                break
         else:
             warn('Unable to normalize {} due to missing formula for {}:'.format(biomass_id, m_id))
             break

diff --git a/docs/usage.rst b/docs/usage.rst
@@ -94,7 +94,7 @@ If you already have a model, and you just want to gap-fill it, you can do it wit
 
 .. code-block:: console
 
-    $ gapfill model.xml -g M9 -o new_model.xml
+    $ gapfill model.xml -m M9 -o new_model.xml
 
 Please note that the result is not the same if you gap-fill during reconstruction. When you gap-fill during
 reconstruction, the gene annotation scores are used to prioritize the reactions selected for gap-filling based on

diff --git a/scripts/carve b/scripts/carve
@@ -27,8 +27,9 @@ def build_model_id(name):
 
 
 def main(inputfile, input_type='protein', outputfile=None, diamond_args=None, universe=None, universe_file=None,
-         ensemble_size=None, verbose=False, debug=False, flavor=None, gapfill=None, blind_gapfill=False, init=None, mediadb=None,
-         default_score=None, uptake_score=None, soft_score=None, soft=None, hard=None, recursive_mode=False):
+         ensemble_size=None, verbose=False, debug=False, flavor=None, gapfill=None, blind_gapfill=False, init=None,
+         mediadb=None, default_score=None, uptake_score=None, soft_score=None, soft=None, hard=None, reference=None,
+         ref_score=None, recursive_mode=False):
 
     if recursive_mode:
         model_id = os.path.splitext(os.path.basename(inputfile))[0]
@@ -127,6 +128,17 @@ def main(inputfile, input_type='protein', outputfile=None, diamond_args=None, un
         available = '\n'.join(glob("{}{}universe_*.xml.gz".format(project_dir, config.get('generated', 'folder'))))
         raise IOError('Failed to load universe model: {}\nAvailable universe files:\n{}'.format(universe_file, available))
 
+    if reference:
+        if verbose:
+            print('Loading reference model...')
+
+        try:
+            ref_model = load_cbmodel(reference)
+        except:
+            raise IOError('Failed to load reference model.')
+    else:
+        ref_model = None
+
     if gapfill or init:
 
         if verbose:
@@ -170,6 +182,7 @@ def main(inputfile, input_type='protein', outputfile=None, diamond_args=None, un
     if ensemble_size is None or ensemble_size <= 1:
         if verbose:
             print('Reconstructing a single model')
+
         if not gapfill:
             carve_model(universe_model, scores,
                         outputfile=outputfile,
@@ -179,6 +192,8 @@ def main(inputfile, input_type='protein', outputfile=None, diamond_args=None, un
                         soft_score=soft_score,
                         soft_constraints=soft_constraints,
                         hard_constraints=hard_constraints,
+                        ref_model=ref_model,
+                        ref_score=ref_score,
                         init_env=init_env,
                         debug_output=debug_output)
         else:
@@ -189,6 +204,8 @@ def main(inputfile, input_type='protein', outputfile=None, diamond_args=None, un
                                 soft_score=soft_score,
                                 soft_constraints=soft_constraints,
                                 hard_constraints=hard_constraints,
+                                ref_model=ref_model,
+                                ref_score=ref_score,
                                 init_env=init_env,
                                 debug_output=debug_output)
     else:
@@ -276,12 +293,14 @@ if __name__ == '__main__':
     parser.add_argument('--soft', help="Soft constraints file")
     parser.add_argument('--hard', help="Hard constraints file")
 
-    parser.add_argument('--default-score', type=float, help=argparse.SUPPRESS)
-    parser.add_argument('--uptake-score', type=float, help=argparse.SUPPRESS)
-    parser.add_argument('--soft-score', type=float, help=argparse.SUPPRESS)
+    parser.add_argument('--reference', help="Manually curated model of a close reference species.")
 
-    parser.add_argument('--blind-gapfill', action='store_true', help=argparse.SUPPRESS)
+    parser.add_argument('--default-score', type=float, default=-1.0, help=argparse.SUPPRESS)
+    parser.add_argument('--uptake-score', type=float, default=0.0, help=argparse.SUPPRESS)
+    parser.add_argument('--soft-score', type=float, default=1.0, help=argparse.SUPPRESS)
+    parser.add_argument('--reference-score', type=float, default=0.0, help=argparse.SUPPRESS)
 
+    parser.add_argument('--blind-gapfill', action='store_true', help=argparse.SUPPRESS)
 
     args = parser.parse_args()
 
@@ -340,7 +359,9 @@ if __name__ == '__main__':
             uptake_score=args.uptake_score,
             soft_score=args.soft_score,
             soft=args.soft,
-            hard=args.hard
+            hard=args.hard,
+            reference=args.reference,
+            ref_score=args.reference_score
         )
 
     else:
@@ -365,6 +386,8 @@ if __name__ == '__main__':
                 soft_score=args.soft_score,
                 soft=args.soft,
                 hard=args.hard,
+                reference=args.reference,
+                ref_score=args.reference_score,
                 recursive_mode=True
             )
 

diff --git a/scripts/carveme_init b/scripts/carveme_init
@@ -18,8 +18,8 @@ if __name__ == '__main__':
     downloads = [
         'data/input/bigg_proteins.faa',
         'data/input/equilibrator_compounds.tsv.gz',
-        'data/input/refseq_release_84.tsv.gz',
-        'data/input/genbank_release_221.tsv.gz',
+        'data/input/refseq_release_92.tsv.gz',
+        'data/input/genbank_release_230.tsv.gz',
         'data/generated/bigg_gprs.csv.gz',
         'data/generated/model_specific_data.csv.gz',
         'data/generated/universe_draft.xml.gz',