#198: added scripts for investigating partial alignments

biocommons · Sep 11, 2019 · 6e0b3dd · 6e0b3dd
1 parent ea866a5
commit 6e0b3dd
Show file tree

Hide file tree

Showing 2 changed files with 140 additions and 0 deletions.
diff --git a/misc/partial-alignments/Makefile b/misc/partial-alignments/Makefile
@@ -0,0 +1,40 @@
+.DELETE_ON_ERROR:
+.PHONY: FORCE
+.PRECIOUS:
+.SUFFIXES:
+
+SHELL:=bash -o pipefail
+
+VPATH:=mirrors-ncbi/2019/09/05/refseq/H_sapiens/alignments
+
+ASSYS:=GRCh37.p13 GRCh38.p2 GRCh38.p7 GRCh38.p12
+
+
+default: $(foreach a,${ASSYS},$a/tx $a/warnings $a/warnings-tx $a/clean-tx)
+
+
+GRCh37.p13/gff: GCF_000001405.25_knownrefseq_alignments.gff3
+	mkdir -p ${@D}
+	cd ${@D}; ln -fns ../$< ${@F}
+GRCh38.p2/gff: GCF_000001405.28_knownrefseq_alignments.gff3
+	mkdir -p ${@D}
+	cd ${@D}; ln -fns ../$< ${@F}
+GRCh38.p7/gff: GCF_000001405.33_knownrefseq_alignments.gff3
+	mkdir -p ${@D}
+	cd ${@D}; ln -fns ../$< ${@F}
+GRCh38.p12/gff: GCF_000001405.38_knownrefseq_alignments.gff3
+	mkdir -p ${@D}
+	cd ${@D}; ln -fns ../$< ${@F}
+
+
+%/tx: %/gff
+	perl -lne 'print $$1 if m/Target=(\S+)/' <$< | sort -u >$@
+
+%/warnings: %/gff
+	./find-partial-alignments $< >$@ 2>&1
+
+%/warnings-tx: %/warnings
+	perl -lne 'print $$1 if m/~([.\w]+)/' <$< | sort -u >$@
+
+%/clean-tx: %/tx %/warnings-tx
+	join -v1 $^ >$@
diff --git a/misc/partial-alignments/find-partial-alignments b/misc/partial-alignments/find-partial-alignments
@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+
+import io
+import logging
+import os
+import re
+import sys
+
+import attr
+import coloredlogs
+
+
+_logger = logging.getLogger()
+
+
+@attr.s(slots=True)
+class ExonAlignment(object):
+    """alignment of a single exon"""
+    ref_ac = attr.ib()
+    origin = attr.ib()
+    match_type = attr.ib()
+    g_start = attr.ib(converter=int)
+    g_end = attr.ib(converter=int)
+    score = attr.ib(converter=float)
+    strand = attr.ib()
+    aln_id = attr.ib()
+    tx_ac = attr.ib()
+    tx_start = attr.ib(converter=int)
+    tx_end = attr.ib(converter=int)
+    pct_coverage = attr.ib(converter=float)
+    pct_identity_gap = attr.ib(converter=float)
+    pct_identity_ungap = attr.ib(converter=float)
+
+
+
+def read_exon_alignments(fn):
+    """read lines of NCBI's alignment gff file, fn, returning ExonAlignment records"""
+
+    # NC_000007.13	RefSeq	cDNA_match	50344265	50344518	254	+	.	ID=aln58042;Target=NM_001220765.2 1 254 +;gap_count=0;identity=0.0691326;idty=1;num_ident=428;num_mismatch=0;pct_coverage=6.91326;pct_identity_gap=100;pct_identity_ungap=100;score=254
+    # NC_000002.11  RefSeq  cDNA_match  179671939   179672150             212 -   .               ID=ed951d46-194c-477a-a480-4bc64530c5ba;Target=NM_001267550.2 1 212 +;gap_count=0;identity=0.999991;idty=1;num_ident=109223;num_mismatch=1;pct_coverage=100;pct_identity_gap=99.9991;pct_identity_ungap=99.9991
+    line_re = re.compile(
+        "(?P<ref_ac>\S+)\s+(?P<origin>\S+)\s+(?P<match_type>\S+)\s+"
+        "(?P<g_start>\d+)\s+(?P<g_end>\d+)\s+(?P<score>\S+)\s+"
+        "(?P<strand>[-+])\s+\.\s+ID=(?P<aln_id>[^;]+);Target=(?P<tx_ac>\S+)"
+        "\s+(?P<tx_start>\d+)\s+(?P<tx_end>\d+).+?"
+        "pct_coverage=(?P<pct_coverage>[^;]+);"
+        "pct_identity_gap=(?P<pct_identity_gap>[^;]+);"
+        "pct_identity_ungap=(?P<pct_identity_ungap>[^;]+)"
+    )
+    fh = io.open(fn, "rt")
+    for line in fh.readlines():
+        if not line.startswith('#'):
+            try:
+                d = line_re.match(line).groupdict()
+                if d["score"] == ".":   # seems to only occur in 1405.38
+                    d["score"] = 0
+                yield ExonAlignment(**d)
+            except AttributeError:
+                raise Exception("Failed at", line)
+            except ValueError:
+                import IPython; IPython.embed()	  ### TODO: Remove IPython.embed()
+
+
+if __name__ == "__main__":
+    coloredlogs.install(level="INFO")
+
+    last_key = None
+    last_end = None
+    tx_seen = set()
+    tx_5u = set()
+    tx_iu = set()
+
+    fn = sys.argv[1]
+    for ea in read_exon_alignments(fn):
+        # Use NCs only
+        if not ea.ref_ac.startswith("NC"):
+            continue
+
+        tx_seen.add(ea.tx_ac)
+        key = ea.ref_ac + "~" + ea.tx_ac
+
+        if last_key is None:
+            pass
+
+        elif last_key == key:
+            if ea.tx_start > last_end + 1:
+                tx_iu.add(ea.tx_ac)
+                _logger.warning(f"{key}: gap between {last_end} and {ea.tx_start}")
+
+        else:
+            if ea.tx_start != 1:
+                tx_5u.add(ea.tx_ac)
+                _logger.warning(f"{key}: transcript doesn't start at 1")
+
+        last_key = key
+        last_end = ea.tx_end
+
+    tx_clean = tx_seen - tx_5u - tx_iu
+
+    print(f"{os.path.basename(fn)}: {len(tx_seen)} tx seen; {len(tx_clean)} clean; {len(tx_5u)} 5' unaligned; {len(tx_iu)} internal unaligned")