Closes #112

belbio · Jun 5, 2020 · b15e70d · b15e70d
1 parent 87663ec
commit b15e70d
Show file tree

Hide file tree

Showing 3 changed files with 192 additions and 9 deletions.
diff --git a/app/namespaces/SCHEM_equivalences.py b/app/namespaces/SCHEM_equivalences.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+# -*- coding: utf-8-*-
+
+"""
+Usage: $ SCHEM_equivalences.py
+"""
+
+import gzip
+import json
+import re
+
+prefix = "SCHEM"  # namespace prefix: used in the file names below and in every equivalence key
+equivalences_fn = f"/data/bel_resources/downloads/{prefix}_equivalences.txt"  # tab-separated input read by collect_equivalences()
+ns_fn = f"/data/bel_resources/data/namespaces/{prefix}_belns.jsonl.gz"  # gzipped JSONL read by add_equivalences()
+ns2_fn = f"/data/bel_resources/data/namespaces/{prefix}_equiv_belns.jsonl.gz"  # gzipped JSONL written by add_equivalences()
+
+
+def collect_equivalences(fn):
+    """Collect label-to-xref equivalences from a tab-separated file.
+
+    Lines starting with "#" or "ID" and blank lines are skipped. Every other
+    line must hold exactly 11 tab-separated columns (id, altids, label,
+    synonyms, description, type, species, xref, obsolete, parents, children);
+    only label and xref are used.
+
+    Args:
+        fn: path of the equivalences file to read.
+
+    Returns:
+        dict mapping "<prefix>:<label>" to the xref string for every row with
+        a non-empty xref.
+
+    Raises:
+        ValueError: if a data line does not have 11 columns.
+    """
+
+    # Labels containing any of these characters get normalized and quoted
+    needs_quoting = re.compile(r'[,"\s\(\)]+')
+    expected_columns = 11
+
+    equivalences = {}
+
+    with open(fn, "r") as f:
+        for line in f:
+            if line.startswith(("#", "ID")):
+                continue
+
+            # Blank lines carry no record; skip them instead of failing
+            if not line.strip():
+                continue
+
+            # Drop the line terminator so it cannot leak into the last column
+            fields = line.rstrip("\n").split("\t")
+            if len(fields) != expected_columns:
+                raise ValueError(
+                    f"Expected {expected_columns} columns, found {len(fields)}: {line!r}"
+                )
+
+            label, xref = fields[2], fields[7]
+
+            # print(f"label: {label}  xref: {xref}")
+            if "," in xref:
+                print("Problem with xref", xref)
+
+            if xref:
+                if needs_quoting.search(label):
+                    # str.strip() returns a new string - keep the result so the
+                    # label is cleaned of outer whitespace/quotes before quoting
+                    label = label.strip().strip('"').strip()
+                    label = f'"{label}"'
+
+                xref = xref.replace("MESHC", "MESH")
+
+                equivalences[f"{prefix}:{label}"] = xref
+
+    return equivalences
+
+
+def add_equivalences(equivalences):
+    """Copy the namespace file, attaching equivalences to matching terms.
+
+    Reads the gzipped JSONL file ns_fn line by line and writes ns2_fn.
+    Records without a "term" key are copied through unchanged. For records
+    whose term id is a key of `equivalences`, the term's "equivalences"
+    field is set to a one-element list holding the mapped value.
+
+    Args:
+        equivalences: dict mapping term ids to their equivalent id.
+    """
+
+    with gzip.open(ns_fn, "rt") as src, gzip.open(ns2_fn, "wt") as dst:
+        for raw in src:
+            record = json.loads(raw)
+
+            if "term" in record:
+                term = record["term"]
+                term_id = term["id"]
+                if term_id in equivalences:
+                    term["equivalences"] = [equivalences[term_id]]
+                dst.write(f"{json.dumps(record)}\n")
+            else:
+                # Non-term records (e.g. metadata) pass through verbatim
+                dst.write(raw)
+
+
+def main():
+    """Collect the equivalences and write the augmented namespace file."""
+    add_equivalences(collect_equivalences(equivalences_fn))
+
+
+# Run only when executed as a script, not on import
+if __name__ == "__main__":
+    main()
diff --git a/app/namespaces/SCOMP_equivalences.py b/app/namespaces/SCOMP_equivalences.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+# -*- coding: utf-8-*-
+
+"""
+Usage: $ SCOMP_equivalences.py
+"""
+
+import gzip
+import json
+import re
+
+prefix = "SCOMP"  # namespace prefix: used in the file names below and in every equivalence key
+equivalences_fn = f"/data/bel_resources/downloads/{prefix}_equivalences.txt"  # tab-separated input read by collect_equivalences()
+ns_fn = f"/data/bel_resources/data/namespaces/{prefix}_belns.jsonl.gz"  # gzipped JSONL read by add_equivalences()
+ns2_fn = f"/data/bel_resources/data/namespaces/{prefix}_equiv_belns.jsonl.gz"  # gzipped JSONL written by add_equivalences()
+
+
+def collect_equivalences(fn):
+    """Collect label-to-xref equivalences from a tab-separated file.
+
+    Lines starting with "#" or "ID" and blank lines are skipped. Every other
+    line must hold exactly 11 tab-separated columns (id, altids, label,
+    synonyms, description, type, species, xref, obsolete, parents, children);
+    only label and xref are used.
+
+    Args:
+        fn: path of the equivalences file to read.
+
+    Returns:
+        dict mapping "<prefix>:<label>" to the xref string for every row with
+        a non-empty xref.
+
+    Raises:
+        ValueError: if a data line does not have 11 columns.
+    """
+
+    # Labels containing any of these characters get normalized and quoted
+    needs_quoting = re.compile(r'[,"\s\(\)]+')
+    expected_columns = 11
+
+    equivalences = {}
+
+    with open(fn, "r") as f:
+        for line in f:
+            if line.startswith(("#", "ID")):
+                continue
+
+            # Blank lines carry no record; skip them instead of failing
+            if not line.strip():
+                continue
+
+            # Drop the line terminator so it cannot leak into the last column
+            fields = line.rstrip("\n").split("\t")
+            if len(fields) != expected_columns:
+                raise ValueError(
+                    f"Expected {expected_columns} columns, found {len(fields)}: {line!r}"
+                )
+
+            label, xref = fields[2], fields[7]
+
+            # print(f"label: {label}  xref: {xref}")
+            if "," in xref:
+                print("Problem with xref", xref)
+
+            if xref:
+                if needs_quoting.search(label):
+                    # str.strip() returns a new string - keep the result so the
+                    # label is cleaned of outer whitespace/quotes before quoting
+                    label = label.strip().strip('"').strip()
+                    label = f'"{label}"'
+
+                xref = xref.replace("MESHC", "MESH")
+                xref = xref.replace("GOCC", "GO")
+
+                equivalences[f"{prefix}:{label}"] = xref
+
+    return equivalences
+
+
+def add_equivalences(equivalences):
+    """Copy the namespace file, attaching equivalences to matching terms.
+
+    Reads the gzipped JSONL file ns_fn line by line and writes ns2_fn.
+    Records without a "term" key are copied through unchanged. For records
+    whose term id is a key of `equivalences`, the term's "equivalences"
+    field is set to a one-element list holding the mapped value.
+
+    Args:
+        equivalences: dict mapping term ids to their equivalent id.
+    """
+
+    with gzip.open(ns_fn, "rt") as src, gzip.open(ns2_fn, "wt") as dst:
+        for raw in src:
+            record = json.loads(raw)
+
+            if "term" in record:
+                term = record["term"]
+                term_id = term["id"]
+                if term_id in equivalences:
+                    term["equivalences"] = [equivalences[term_id]]
+                dst.write(f"{json.dumps(record)}\n")
+            else:
+                # Non-term records (e.g. metadata) pass through verbatim
+                dst.write(raw)
+
+
+def main():
+    """Collect the equivalences and write the augmented namespace file."""
+    add_equivalences(collect_equivalences(equivalences_fn))
+
+
+# Run only when executed as a script, not on import
+if __name__ == "__main__":
+    main()
diff --git a/app/namespaces/sp.py b/app/namespaces/sp.py
@@ -6,21 +6,21 @@
 
 """
 
-import re
+import copy
+import datetime
+import gzip
+import json
 import os
+import re
 import tempfile
-import json
+from typing import Any, List, Mapping
+
+import structlog
 import yaml
-import datetime
-import copy
-import gzip
-from typing import List, Mapping, Any
 
-import app.utils as utils
 import app.settings as settings
-
 import app.setup_logging
-import structlog
+import app.utils as utils
 
 log = structlog.getLogger(__name__)
 
@@ -31,6 +31,9 @@
 namespace_def = settings.NAMESPACE_DEFINITIONS[namespace_key]
 ns_prefix = namespace_def["namespace"]
 
+model_org_prefixes = ["HGNC", "MGI", "RGD", "ZFIN"]  # prefixes treated as model organism database IDs
+model_org_prefix_list = "|".join(model_org_prefixes)  # regex alternation, passed to re.match() on each equivalence
+
 terms_fp = f"../data/terms/{namespace_key}.jsonl.gz"
 tmpdir = tempfile.TemporaryDirectory(suffix=None, prefix=None, dir=None)
 dt = datetime.datetime.now().replace(microsecond=0).isoformat()
@@ -186,6 +189,25 @@ def process_record(record: List[str]) -> Mapping[str, Any]:
         if not name:
             name = orfnames[0]
 
+    # Equivalence processing
+    #    Remove EG ID's if HGNC/MGI/RGD or other model organism database IDs
+    #    We do this because some SP have multiple EG IDs - we want to remove readthrough entries
+    #        and resolve SP IFNA1_Human to EG:3439!IFNA1 instead of EG:3447!IFNA13
+    #    Protocol therefore is:
+    #        1. Check if model org ID exists
+    #        2. If so, take first model org ID from sorted list
+    #        3. Else take first EG ID from sorted list
+
+    equivalences.sort()
+
+    eg_equivalences = [e for e in equivalences if e.startswith("EG")]
+    if len(eg_equivalences) > 1:
+        model_org_equivalences = [e for e in equivalences if re.match(model_org_prefix_list, e)]
+        if len(model_org_equivalences) >= 1:
+            equivalences = [model_org_equivalences[0]]
+        else:
+            equivalences = [eg_equivalences[0]]
+
     # DE - name processing
     log.debug(f"DE {de}")
     de = re.sub(" {.*?}", "", de, flags=re.S)