Skip to content

Commit

Permalink
Closes #112
Browse files Browse the repository at this point in the history
  • Loading branch information
William Hayes committed Jun 5, 2020
1 parent 87663ec commit b15e70d
Show file tree
Hide file tree
Showing 3 changed files with 192 additions and 9 deletions.
80 changes: 80 additions & 0 deletions app/namespaces/SCHEM_equivalences.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/usr/bin/env python
# -*- coding: utf-8-*-

"""
Usage: $ {1: program}.py
"""

import gzip
import json
import re

# Namespace prefix; used to build the file paths below and the keys produced
# by collect_equivalences().
prefix = "SCHEM"
# Tab-separated equivalences file read by collect_equivalences().
equivalences_fn = f"/data/bel_resources/downloads/{prefix}_equivalences.txt"
# Gzipped JSONL namespace file read by add_equivalences().
ns_fn = f"/data/bel_resources/data/namespaces/{prefix}_belns.jsonl.gz"
# Gzipped JSONL file written by add_equivalences() with equivalences attached.
ns2_fn = f"/data/bel_resources/data/namespaces/{prefix}_equiv_belns.jsonl.gz"


def collect_equivalences(fn):
    """Collect equivalences from filename

    Reads the tab-separated file ``fn`` and builds a lookup from namespace
    term key to its cross reference.  Lines starting with ``#`` or ``ID``
    (comments and the header row) and blank lines are skipped.

    Args:
        fn: path of the equivalences file.  Every data line must have exactly
            11 tab-separated columns: id, altids, label, synonyms,
            description, type, species, xref, obsolete, parents, children.

    Returns:
        dict mapping ``"<prefix>:<label>"`` to the xref string (with
        ``MESHC`` rewritten to ``MESH``).  Only rows with a non-empty xref
        are included.  Labels containing a comma, double quote, whitespace
        or parenthesis are wrapped in double quotes.

    Raises:
        ValueError: if a data line does not have exactly 11 columns.
    """

    # Labels containing any of these characters must be quoted.  Raw string
    # so the backslash escapes reach the regex engine without warnings.
    needs_quoting = re.compile(r'[,"\s\(\)]+')

    equivalences = {}

    with open(fn, "r") as f:
        for line in f:
            if line.startswith(("#", "ID")):
                continue

            # Drop the line terminator so it does not end up in the last column
            line = line.rstrip("\n")
            if not line:
                continue

            (
                _id,
                _altids,
                label,
                _synonyms,
                _description,
                _type,
                _species,
                xref,
                _obsolete,
                _parents,
                _children,
            ) = line.split("\t")

            if "," in xref:
                print("Problem with xref", xref)

            if xref:
                if needs_quoting.search(label):
                    # str.strip() returns a new string - the result has to be
                    # assigned, otherwise already-quoted labels get quoted twice
                    label = label.strip().strip('"').strip()
                    label = f'"{label}"'

                xref = xref.replace("MESHC", "MESH")

                equivalences[f"{prefix}:{label}"] = xref

    return equivalences


def add_equivalences(equivalences):
    """Copy the namespace file, attaching equivalences to matching terms

    Streams the gzipped JSONL file ``ns_fn`` into ``ns2_fn``.  Records without
    a "term" key are copied through unchanged; term records are re-serialized,
    and those whose term id is a key of ``equivalences`` get a one-element
    "equivalences" list.

    Args:
        equivalences: dict mapping term id to its equivalent xref string
    """

    with gzip.open(ns_fn, "rt") as fin, gzip.open(ns2_fn, "wt") as fout:
        for raw in fin:
            record = json.loads(raw)

            if "term" in record:
                term_id = record["term"]["id"]
                if term_id in equivalences:
                    record["term"]["equivalences"] = [equivalences[term_id]]

                fout.write(f"{json.dumps(record)}\n")
            else:
                # Non-term records (e.g. metadata) pass through verbatim
                fout.write(raw)


def main():
    """Collect the equivalences and write the augmented namespace file"""
    add_equivalences(collect_equivalences(equivalences_fn))


if __name__ == "__main__":
    main()
81 changes: 81 additions & 0 deletions app/namespaces/SCOMP_equivalences.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env python
# -*- coding: utf-8-*-

"""
Usage: $ {1: program}.py
"""

import gzip
import json
import re

# Namespace prefix; used to build the file paths below and the keys produced
# by collect_equivalences().
prefix = "SCOMP"
# Tab-separated equivalences file read by collect_equivalences().
equivalences_fn = f"/data/bel_resources/downloads/{prefix}_equivalences.txt"
# Gzipped JSONL namespace file read by add_equivalences().
ns_fn = f"/data/bel_resources/data/namespaces/{prefix}_belns.jsonl.gz"
# Gzipped JSONL file written by add_equivalences() with equivalences attached.
ns2_fn = f"/data/bel_resources/data/namespaces/{prefix}_equiv_belns.jsonl.gz"


def collect_equivalences(fn):
    """Collect equivalences from filename

    Reads the tab-separated file ``fn`` and builds a lookup from namespace
    term key to its cross reference.  Lines starting with ``#`` or ``ID``
    (comments and the header row) and blank lines are skipped.

    Args:
        fn: path of the equivalences file.  Every data line must have exactly
            11 tab-separated columns: id, altids, label, synonyms,
            description, type, species, xref, obsolete, parents, children.

    Returns:
        dict mapping ``"<prefix>:<label>"`` to the xref string (with
        ``MESHC`` rewritten to ``MESH`` and ``GOCC`` rewritten to ``GO``).
        Only rows with a non-empty xref are included.  Labels containing a
        comma, double quote, whitespace or parenthesis are wrapped in double
        quotes.

    Raises:
        ValueError: if a data line does not have exactly 11 columns.
    """

    # Labels containing any of these characters must be quoted.  Raw string
    # so the backslash escapes reach the regex engine without warnings.
    needs_quoting = re.compile(r'[,"\s\(\)]+')

    equivalences = {}

    with open(fn, "r") as f:
        for line in f:
            if line.startswith(("#", "ID")):
                continue

            # Drop the line terminator so it does not end up in the last column
            line = line.rstrip("\n")
            if not line:
                continue

            (
                _id,
                _altids,
                label,
                _synonyms,
                _description,
                _type,
                _species,
                xref,
                _obsolete,
                _parents,
                _children,
            ) = line.split("\t")

            if "," in xref:
                print("Problem with xref", xref)

            if xref:
                if needs_quoting.search(label):
                    # str.strip() returns a new string - the result has to be
                    # assigned, otherwise already-quoted labels get quoted twice
                    label = label.strip().strip('"').strip()
                    label = f'"{label}"'

                xref = xref.replace("MESHC", "MESH")
                xref = xref.replace("GOCC", "GO")

                equivalences[f"{prefix}:{label}"] = xref

    return equivalences


def add_equivalences(equivalences):
    """Copy the namespace file, attaching equivalences to matching terms

    Streams the gzipped JSONL file ``ns_fn`` into ``ns2_fn``.  Records without
    a "term" key are copied through unchanged; term records are re-serialized,
    and those whose term id is a key of ``equivalences`` get a one-element
    "equivalences" list.

    Args:
        equivalences: dict mapping term id to its equivalent xref string
    """

    with gzip.open(ns_fn, "rt") as fin, gzip.open(ns2_fn, "wt") as fout:
        for raw in fin:
            record = json.loads(raw)

            if "term" in record:
                term_id = record["term"]["id"]
                if term_id in equivalences:
                    record["term"]["equivalences"] = [equivalences[term_id]]

                fout.write(f"{json.dumps(record)}\n")
            else:
                # Non-term records (e.g. metadata) pass through verbatim
                fout.write(raw)


def main():
    """Collect the equivalences and write the augmented namespace file"""
    add_equivalences(collect_equivalences(equivalences_fn))


if __name__ == "__main__":
    main()
40 changes: 31 additions & 9 deletions app/namespaces/sp.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,21 @@
"""

import re
import copy
import datetime
import gzip
import json
import os
import re
import tempfile
import json
from typing import Any, List, Mapping

import structlog
import yaml
import datetime
import copy
import gzip
from typing import List, Mapping, Any

import app.utils as utils
import app.settings as settings

import app.setup_logging
import structlog
import app.utils as utils

log = structlog.getLogger(__name__)

Expand All @@ -31,6 +31,9 @@
namespace_def = settings.NAMESPACE_DEFINITIONS[namespace_key]
ns_prefix = namespace_def["namespace"]

# Namespace prefixes of model organism databases; joined into a regex
# alternation ("HGNC|MGI|RGD|ZFIN") that is passed to re.match() to pick
# model organism IDs out of a record's equivalences.
model_org_prefixes = ["HGNC", "MGI", "RGD", "ZFIN"]
model_org_prefix_list = "|".join(model_org_prefixes)

terms_fp = f"../data/terms/{namespace_key}.jsonl.gz"
tmpdir = tempfile.TemporaryDirectory(suffix=None, prefix=None, dir=None)
dt = datetime.datetime.now().replace(microsecond=0).isoformat()
Expand Down Expand Up @@ -186,6 +189,25 @@ def process_record(record: List[str]) -> Mapping[str, Any]:
if not name:
name = orfnames[0]

# Equivalence processing
# Remove EG ID's if HGNC/MGI/RGD or other model organism database IDs
# We do this because some SP have multiple EG IDs - we want to remove readthrough entries
# and resolve SP IFNA1_Human to EG:3439!IFNA1 instead of EG:3447!IFNA13
# Protocol therefore is:
# 1. Check if model org ID exists
# 2. If so, take first model org ID from sorted list
# 3. Else take first EG ID from sorted list

equivalences.sort()

eg_equivalences = [e for e in equivalences if e.startswith("EG")]
if len(eg_equivalences) > 1:
model_org_equivalences = [e for e in equivalences if re.match(model_org_prefix_list, e)]
if len(model_org_equivalences) >= 1:
equivalences = [model_org_equivalences[0]]
else:
equivalences = [eg_equivalences[0]]

# DE - name processing
log.debug(f"DE {de}")
de = re.sub(" {.*?}", "", de, flags=re.S)
Expand Down

0 comments on commit b15e70d

Please sign in to comment.