Skip to content

Commit

Permalink
Merge pull request #15 from dougli1sqrd/ontobio-questions
Browse files Browse the repository at this point in the history
some comment TODOs and a few small edits
  • Loading branch information
cmungall committed May 16, 2017
2 parents d548d93 + a20987e commit a6e0b8b
Showing 1 changed file with 71 additions and 58 deletions.
129 changes: 71 additions & 58 deletions ontobio/io/gafparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class Report():
FATAL = 'FATAL'
ERROR = 'ERROR'
WARNING = 'WARNING'

# Warnings: TODO link to gorules
INVALID_ID = "Invalid identifier"
INVALID_IDSPACE = "Invalid identifier prefix"
Expand All @@ -50,7 +50,7 @@ class Report():
3 warning levels
"""
LEVELS = [FATAL, ERROR, WARNING]

def __init__(self):
self.messages = []
self.n_lines = 0
Expand Down Expand Up @@ -113,34 +113,36 @@ def to_markdown(self):
"""
json = self.to_report_json()
summary = json['summary']

s = ""
s = s + "\n## SUMMARY\n\n";
s += "\n## SUMMARY\n\n"

s += " * Associations: {}\n" . format(summary['association_count'])
s += " * Lines in file (incl headers): {}\n" . format(summary['line_count'])
s += " * Lines skipped: {}\n" . format(summary['skipped_line_count'])

s = s + " * Associations: {}\n" . format(summary['association_count'])
s = s + " * Lines in file (incl headers): {}\n" . format(summary['line_count'])
s = s + " * Lines skipped: {}\n" . format(summary['skipped_line_count'])

stats = json['aggregate_statistics']
s = s + "\n## STATISTICS\n\n";
s += "\n## STATISTICS\n\n"
for k,v in stats.items():
s = s + " * {}: {}\n" . format(k,v)
s += " * {}: {}\n" . format(k,v)




s = s + "\n## MESSAGES\n\n";
s += "\n## MESSAGES\n\n"
for g in json['groups']:
s = s + " * {}: {}\n".format(g['level'], g['count'])
s = s + "\n\n";
s += " * {}: {}\n".format(g['level'], g['count'])
s += "\n\n"
for g in json['groups']:
level = g['level']
msgs = g['messages']
if len(msgs) > 0:
s = s + "### {}\n\n".format(level)
s += "### {}\n\n".format(level)
for m in msgs:
s = s + " * {} {} `{}`\n".format(m['type'],m['message'],m['line'])
s += " * {} {} `{}`\n".format(m['type'],m['message'],m['line'])
return s


# TODO avoid using names that are builtin python: file, id

class AssocParser():
"""
Abstract superclass of all association parser classes
Expand All @@ -149,7 +151,7 @@ class AssocParser():
def parse(self, file, outfile=None):
"""
Parse a file.
Arguments
---------
Expand All @@ -163,12 +165,13 @@ def parse(self, file, outfile=None):
skipped = []
n_lines = 0
for line in file:
n_lines = n_lines+1
n_lines += 1
if line.startswith("!"):
if outfile is not None:
outfile.write(line)
continue
line = line.strip("\n")
# Let's rename line2 to something more meaningful
line2, new_assocs = self.parse_line(line)
if new_assocs is None or new_assocs == []:
logging.warn("SKIPPING: {}".format(new_assocs))
Expand All @@ -181,20 +184,23 @@ def parse(self, file, outfile=None):
rpt.references.update(a['evidence']['has_supporting_reference'])
if 'taxon' in a['subject']:
rpt.taxa.add(a['subject']['taxon']['id'])
assocs = assocs + new_assocs
assocs += new_assocs
if outfile is not None:
outfile.write(line2 + "\n")

self.report.skipped = self.report.skipped + skipped
self.report.n_lines = self.report.n_lines + n_lines
self.report.n_assocs = self.report.n_assocs + len(assocs)
self.report.skipped += skipped
self.report.n_lines += n_lines
self.report.n_assocs += len(assocs)
logging.info("Parsed {} assocs from {} lines. Skipped: {}".
format(len(assocs),
n_lines,
len(skipped)))
file.close()
return assocs

def parse_line(self, line):
raise NotImplementedError("AssocParser.parse_line not implemented")

# split an ID/CURIE into prefix and local parts
# (not currently used)
def _parse_id(self, id):
Expand All @@ -208,7 +214,7 @@ def _parse_id(self, id):
def _get_id_prefix(self, id):
toks = id.split(":")
return toks[0]

def _validate_taxon(self, taxon, line):
if self.config.valid_taxa is None:
return True
Expand All @@ -218,9 +224,9 @@ def _validate_taxon(self, taxon, line):
else:
self.report.error(line, Report.INVALID_TAXON, taxon)
return False

def _validate_id(self, id, line, context=None):
if id.find(" ") > -1:
if " " in id:
self.report.error(line, Report.INVALID_ID, id)
return False
if id.find("|") > -1:
Expand All @@ -239,7 +245,7 @@ def _split_pipe(self, v):
ids = v.split("|")
ids = [id for id in ids if self._validate_id(id, '')]
return ids

def _pair_to_id(self, db, localid):
if self.config.remove_double_prefixes:
## Switch MGI:MGI:n to MGI:n
Expand All @@ -251,9 +257,10 @@ def _taxon_id(self,id):
id = id.replace('taxon','NCBITaxon')
self._validate_id(id,'',TAXON)
return id

def _ensure_file(self, file):
if isinstance(file,str):
# TODO Let's fix this if/elseif chain.
if file.startswith("ftp"):
f = tempfile.NamedTemporaryFile()
fn = f.name
Expand All @@ -273,8 +280,8 @@ def _ensure_file(self, file):
return open("myfile.txt", "r")
else:
return file


def _parse_class_expression(self, x):
## E.g. exists_during(GO:0000753)
## Atomic class expressions only
Expand All @@ -283,8 +290,13 @@ def _parse_class_expression(self, x):
'property':p,
'filler':v
}



# TODO consider making an Association its own class too, to give it a little
# more semantic value

# TODO consider making an ID class?


class GpadParser(AssocParser):
"""
Parser for GO GPAD Format
Expand All @@ -301,7 +313,7 @@ def __init__(self,config=AssocParserConfig()):
"""
self.config = config
self.report = Report()

def skim(self, file):
file = self._ensure_file(file)
tuples = []
Expand All @@ -320,7 +332,7 @@ def skim(self, file):
tuples.append( (id,None,t) )
return tuples

def parse_line(self, line):
def parse_line(self, line):
"""
Parses a single line of a GPAD
"""
Expand All @@ -341,10 +353,10 @@ def parse_line(self, line):
id = self._pair_to_id(db, db_object_id)
if not self._validate_id(id, line, ENTITY):
return line, []

if not self._validate_id(goid, line, ANNOTATION):
return line, []

assocs = []
xp_ors = annotation_xp.split("|")
for xp_or in xp_ors:
Expand Down Expand Up @@ -372,16 +384,16 @@ def parse_line(self, line):
},
'provided_by': assigned_by,
'date': date,

}
assocs.append(assoc)
return line, assocs

class GafParser(AssocParser):
"""
Parser for GO GAF format
"""

def __init__(self,config=AssocParserConfig()):
"""
Arguments:
Expand All @@ -391,7 +403,7 @@ def __init__(self,config=AssocParserConfig()):
"""
self.config = config
self.report = Report()

def skim(self, file):
file = self._ensure_file(file)
tuples = []
Expand All @@ -418,10 +430,10 @@ def parse_line(self, line, class_map=None, entity_map=None):
Parses a single line of a GAF
"""
config = self.config

vals = line.split("\t")
if len(vals) == 15:
vals = vals + ["",""]
vals += ["",""]
[db,
db_object_id,
db_object_symbol,
Expand All @@ -446,10 +458,10 @@ def parse_line(self, line, class_map=None, entity_map=None):
id = self._pair_to_id(db, db_object_id)
if not self._validate_id(id, line, ENTITY):
return line, []

if not self._validate_id(goid, line, ANNOTATION):
return line, []

## --
## optionally map goid and entity (gp) id
## --
Expand All @@ -459,7 +471,7 @@ def parse_line(self, line, class_map=None, entity_map=None):
if not self._validate_id(goid, line, ANNOTATION):
return line, []
vals[4] = goid

# Example use case: mapping from UniProtKB to MOD ID
if config.entity_map is not None:
id = self.map_id(id, config.entity_map)
Expand All @@ -482,7 +494,7 @@ def parse_line(self, line, class_map=None, entity_map=None):
taxon = taxa[0]
in_taxa = taxa[1:]
self._validate_taxon(taxon, line)

## --
## db_object_synonym CARD=0..*
## --
Expand Down Expand Up @@ -535,6 +547,7 @@ def parse_line(self, line, class_map=None, entity_map=None):
## --
## goid
## --
# TODO We shouldn't overload builtin keywords/functions
object = {'id':goid,
'taxon': taxon}

Expand All @@ -549,7 +562,7 @@ def parse_line(self, line, class_map=None, entity_map=None):
'id': taxon
}
}

## --
## gene_product_isoform
## --
Expand Down Expand Up @@ -582,16 +595,16 @@ def parse_line(self, line, class_map=None, entity_map=None):
'evidence': evidence,
'provided_by': assigned_by,
'date': date,

}
if len(subject_extns) > 0:
assoc['subject_extensions'] = subject_extns
if len(extns) > 0:
assoc['object_extensions'] = extns

assocs.append(assoc)
return line, assocs

class HpoaParser(GafParser):
"""
Parser for HPOA format
Expand All @@ -600,7 +613,7 @@ class HpoaParser(GafParser):
Note that there are similarities with Gaf format, so we inherit from GafParser, and override
"""

def __init__(self,config=AssocParserConfig()):
"""
Arguments:
Expand Down Expand Up @@ -639,17 +652,17 @@ def parse_line(self, line, class_map=None, entity_map=None):

# hardcode this, as HPOA is currently disease-only
db_object_type = 'disease'

## --
## db + db_object_id. CARD=1
## --
id = self._pair_to_id(db, db_object_id)
if not self._validate_id(id, line, ENTITY):
return line, []

if not self._validate_id(hpoid, line, ANNOTATION):
return line, []

## --
## optionally map hpoid and entity (disease) id
## --
Expand All @@ -659,7 +672,7 @@ def parse_line(self, line, class_map=None, entity_map=None):
if not self._validate_id(hpoid, line, ANNOTATION):
return line, []
vals[4] = hpoid

# Example use case: mapping from OMIM to Orphanet
if config.entity_map is not None:
id = self.map_id(id, config.entity_map)
Expand All @@ -673,7 +686,7 @@ def parse_line(self, line, class_map=None, entity_map=None):
## --
# regenerate line post-mapping
line = "\t".join(vals)

## --
## db_object_synonym CARD=0..*
## --
Expand Down Expand Up @@ -749,7 +762,7 @@ def parse_line(self, line, class_map=None, entity_map=None):
'evidence': evidence,
'provided_by': assigned_by,
'date': date,

}

return line, [assoc]

0 comments on commit a6e0b8b

Please sign in to comment.