Skip to content

Commit

Permalink
Merge pull request #15 from dougli1sqrd/ontobio-questions
Browse files Browse the repository at this point in the history
some comment TODOs and a few small edits
  • Loading branch information
cmungall committed May 16, 2017
2 parents d548d93 + a20987e commit a6e0b8b
Showing 1 changed file with 71 additions and 58 deletions.
129 changes: 71 additions & 58 deletions ontobio/io/gafparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class Report():
FATAL = 'FATAL'
ERROR = 'ERROR'
WARNING = 'WARNING'

# Warnings: TODO link to gorules
INVALID_ID = "Invalid identifier"
INVALID_IDSPACE = "Invalid identifier prefix"
Expand All @@ -50,7 +50,7 @@ class Report():
3 warning levels
"""
LEVELS = [FATAL, ERROR, WARNING]

def __init__(self):
self.messages = []
self.n_lines = 0
Expand Down Expand Up @@ -113,34 +113,36 @@ def to_markdown(self):
"""
json = self.to_report_json()
summary = json['summary']

s = ""
s = s + "\n## SUMMARY\n\n";
s += "\n## SUMMARY\n\n"

s += " * Associations: {}\n" . format(summary['association_count'])
s += " * Lines in file (incl headers): {}\n" . format(summary['line_count'])
s += " * Lines skipped: {}\n" . format(summary['skipped_line_count'])

s = s + " * Associations: {}\n" . format(summary['association_count'])
s = s + " * Lines in file (incl headers): {}\n" . format(summary['line_count'])
s = s + " * Lines skipped: {}\n" . format(summary['skipped_line_count'])

stats = json['aggregate_statistics']
s = s + "\n## STATISTICS\n\n";
s += "\n## STATISTICS\n\n"
for k,v in stats.items():
s = s + " * {}: {}\n" . format(k,v)
s += " * {}: {}\n" . format(k,v)




s = s + "\n## MESSAGES\n\n";
s += "\n## MESSAGES\n\n"
for g in json['groups']:
s = s + " * {}: {}\n".format(g['level'], g['count'])
s = s + "\n\n";
s += " * {}: {}\n".format(g['level'], g['count'])
s += "\n\n"
for g in json['groups']:
level = g['level']
msgs = g['messages']
if len(msgs) > 0:
s = s + "### {}\n\n".format(level)
s += "### {}\n\n".format(level)
for m in msgs:
s = s + " * {} {} `{}`\n".format(m['type'],m['message'],m['line'])
s += " * {} {} `{}`\n".format(m['type'],m['message'],m['line'])
return s


# TODO avoid using names that are builtin python: file, id

class AssocParser():
"""
Abstract superclass of all association parser classes
Expand All @@ -149,7 +151,7 @@ class AssocParser():
def parse(self, file, outfile=None):
"""
Parse a file.
Arguments
---------
Expand All @@ -163,12 +165,13 @@ def parse(self, file, outfile=None):
skipped = []
n_lines = 0
for line in file:
n_lines = n_lines+1
n_lines += 1
if line.startswith("!"):
if outfile is not None:
outfile.write(line)
continue
line = line.strip("\n")
# Let's rename line2 to something more meaningful
line2, new_assocs = self.parse_line(line)
if new_assocs is None or new_assocs == []:
logging.warn("SKIPPING: {}".format(new_assocs))
Expand All @@ -181,20 +184,23 @@ def parse(self, file, outfile=None):
rpt.references.update(a['evidence']['has_supporting_reference'])
if 'taxon' in a['subject']:
rpt.taxa.add(a['subject']['taxon']['id'])
assocs = assocs + new_assocs
assocs += new_assocs
if outfile is not None:
outfile.write(line2 + "\n")

self.report.skipped = self.report.skipped + skipped
self.report.n_lines = self.report.n_lines + n_lines
self.report.n_assocs = self.report.n_assocs + len(assocs)
self.report.skipped += skipped
self.report.n_lines += n_lines
self.report.n_assocs += len(assocs)
logging.info("Parsed {} assocs from {} lines. Skipped: {}".
format(len(assocs),
n_lines,
len(skipped)))
file.close()
return assocs

def parse_line(self, line):
raise NotImplementedError("AssocParser.parse_line not implemented")

# split an ID/CURIE into prefix and local parts
# (not currently used)
def _parse_id(self, id):
Expand All @@ -208,7 +214,7 @@ def _parse_id(self, id):
def _get_id_prefix(self, id):
toks = id.split(":")
return toks[0]

def _validate_taxon(self, taxon, line):
if self.config.valid_taxa is None:
return True
Expand All @@ -218,9 +224,9 @@ def _validate_taxon(self, taxon, line):
else:
self.report.error(line, Report.INVALID_TAXON, taxon)
return False

def _validate_id(self, id, line, context=None):
if id.find(" ") > -1:
if " " in id:
self.report.error(line, Report.INVALID_ID, id)
return False
if id.find("|") > -1:
Expand All @@ -239,7 +245,7 @@ def _split_pipe(self, v):
ids = v.split("|")
ids = [id for id in ids if self._validate_id(id, '')]
return ids

def _pair_to_id(self, db, localid):
if self.config.remove_double_prefixes:
## Switch MGI:MGI:n to MGI:n
Expand All @@ -251,9 +257,10 @@ def _taxon_id(self,id):
id = id.replace('taxon','NCBITaxon')
self._validate_id(id,'',TAXON)
return id

def _ensure_file(self, file):
if isinstance(file,str):
# TODO Let's fix this if/elseif chain.
if file.startswith("ftp"):
f = tempfile.NamedTemporaryFile()
fn = f.name
Expand All @@ -273,8 +280,8 @@ def _ensure_file(self, file):
return open("myfile.txt", "r")
else:
return file


def _parse_class_expression(self, x):
## E.g. exists_during(GO:0000753)
## Atomic class expressions only
Expand All @@ -283,8 +290,13 @@ def _parse_class_expression(self, x):
'property':p,
'filler':v
}



# TODO consider making an Association its own class too, to give it a little
# more semantic value

# TODO consider making an ID class?


class GpadParser(AssocParser):
"""
Parser for GO GPAD Format
Expand All @@ -301,7 +313,7 @@ def __init__(self,config=AssocParserConfig()):
"""
self.config = config
self.report = Report()

def skim(self, file):
file = self._ensure_file(file)
tuples = []
Expand All @@ -320,7 +332,7 @@ def skim(self, file):
tuples.append( (id,None,t) )
return tuples

def parse_line(self, line):
def parse_line(self, line):
"""
Parses a single line of a GPAD
"""
Expand All @@ -341,10 +353,10 @@ def parse_line(self, line):
id = self._pair_to_id(db, db_object_id)
if not self._validate_id(id, line, ENTITY):
return line, []

if not self._validate_id(goid, line, ANNOTATION):
return line, []

assocs = []
xp_ors = annotation_xp.split("|")
for xp_or in xp_ors:
Expand Down Expand Up @@ -372,16 +384,16 @@ def parse_line(self, line):
},
'provided_by': assigned_by,
'date': date,

}
assocs.append(assoc)
return line, assocs

class GafParser(AssocParser):
"""
Parser for GO GAF format
"""

def __init__(self,config=AssocParserConfig()):
"""
Arguments:
Expand All @@ -391,7 +403,7 @@ def __init__(self,config=AssocParserConfig()):
"""
self.config = config
self.report = Report()

def skim(self, file):
file = self._ensure_file(file)
tuples = []
Expand All @@ -418,10 +430,10 @@ def parse_line(self, line, class_map=None, entity_map=None):
Parses a single line of a GAF
"""
config = self.config

vals = line.split("\t")
if len(vals) == 15:
vals = vals + ["",""]
vals += ["",""]
[db,
db_object_id,
db_object_symbol,
Expand All @@ -446,10 +458,10 @@ def parse_line(self, line, class_map=None, entity_map=None):
id = self._pair_to_id(db, db_object_id)
if not self._validate_id(id, line, ENTITY):
return line, []

if not self._validate_id(goid, line, ANNOTATION):
return line, []

## --
## optionally map goid and entity (gp) id
## --
Expand All @@ -459,7 +471,7 @@ def parse_line(self, line, class_map=None, entity_map=None):
if not self._validate_id(goid, line, ANNOTATION):
return line, []
vals[4] = goid

# Example use case: mapping from UniProtKB to MOD ID
if config.entity_map is not None:
id = self.map_id(id, config.entity_map)
Expand All @@ -482,7 +494,7 @@ def parse_line(self, line, class_map=None, entity_map=None):
taxon = taxa[0]
in_taxa = taxa[1:]
self._validate_taxon(taxon, line)

## --
## db_object_synonym CARD=0..*
## --
Expand Down Expand Up @@ -535,6 +547,7 @@ def parse_line(self, line, class_map=None, entity_map=None):
## --
## goid
## --
# TODO We shouldn't overload builtin keywords/functions
object = {'id':goid,
'taxon': taxon}

Expand All @@ -549,7 +562,7 @@ def parse_line(self, line, class_map=None, entity_map=None):
'id': taxon
}
}

## --
## gene_product_isoform
## --
Expand Down Expand Up @@ -582,16 +595,16 @@ def parse_line(self, line, class_map=None, entity_map=None):
'evidence': evidence,
'provided_by': assigned_by,
'date': date,

}
if len(subject_extns) > 0:
assoc['subject_extensions'] = subject_extns
if len(extns) > 0:
assoc['object_extensions'] = extns

assocs.append(assoc)
return line, assocs

class HpoaParser(GafParser):
"""
Parser for HPOA format
Expand All @@ -600,7 +613,7 @@ class HpoaParser(GafParser):
Note that there are similarities with Gaf format, so we inherit from GafParser, and override
"""

def __init__(self,config=AssocParserConfig()):
"""
Arguments:
Expand Down Expand Up @@ -639,17 +652,17 @@ def parse_line(self, line, class_map=None, entity_map=None):

# hardcode this, as HPOA is currently disease-only
db_object_type = 'disease'

## --
## db + db_object_id. CARD=1
## --
id = self._pair_to_id(db, db_object_id)
if not self._validate_id(id, line, ENTITY):
return line, []

if not self._validate_id(hpoid, line, ANNOTATION):
return line, []

## --
## optionally map hpoid and entity (disease) id
## --
Expand All @@ -659,7 +672,7 @@ def parse_line(self, line, class_map=None, entity_map=None):
if not self._validate_id(hpoid, line, ANNOTATION):
return line, []
vals[4] = hpoid

# Example use case: mapping from OMIM to Orphanet
if config.entity_map is not None:
id = self.map_id(id, config.entity_map)
Expand All @@ -673,7 +686,7 @@ def parse_line(self, line, class_map=None, entity_map=None):
## --
# regenerate line post-mapping
line = "\t".join(vals)

## --
## db_object_synonym CARD=0..*
## --
Expand Down Expand Up @@ -749,7 +762,7 @@ def parse_line(self, line, class_map=None, entity_map=None):
'evidence': evidence,
'provided_by': assigned_by,
'date': date,

}

return line, [assoc]

0 comments on commit a6e0b8b

Please sign in to comment.