In [None]:
# Pull the tRNA records from the RefSeq annotation into the CHESS annotation.
# Each tRNA gets a new CHESS ID; the original RefSeq IDs are kept as extra attributes.

In [1]:
# main imports
import os
import sys
import csv
import glob
import math
import shutil
import random
import importlib
import subprocess

import numpy as np
import pandas as pd

# notebook display settings: no limit on displayed columns, cell text capped at 50 characters
for _option, _value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 50),
):
    pd.set_option(_option, _value)

In [2]:
# Enable IPython's autoreload extension. Mode 1 reloads only the modules
# registered through %aimport, each time before user code is executed.
%load_ext autoreload
%autoreload 1

# Put the directory that is expected to hold the `definitions` module first on
# the import path, then register that module for automatic reloading.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another host without editing it.
sys.path.insert(0, "/ccb/salz4-4/avaraby/orfanage/soft")
%aimport definitions

In [12]:
# data
# NOTE(review): all paths below are hardcoded for one machine / directory layout.
# ref_fasta: reference genome FASTA; not referenced by any cell visible in this notebook.
# NOTE(review): the file name says hg38 while the annotations below are CHM13 - confirm before using it.
ref_fasta = "/home/avaraby1/genomes/human/hg38/hg38_p12_ucsc.fa"
# input_gtf: the CHESS annotation that receives the tRNAs
input_gtf = "/ccb/salz8-1/avaraby/chess_maintenance_scripts/tmp/chess3.1.2.CHM13.gtf"
# outbase: output path without extension; ".gtf" is appended when the result is written
outbase = "../tmp/chess3.1.2.CHM13.fix_tRNA"
# refseq_gtf: despite the name this initially points at a GFF3 file; a later cell
# converts it with gffread and rebinds the name to the resulting GTF
refseq_gtf = "/ccb/salz2-1/dpuiu/Homo_sapiens/T2T/chm13v2.0_RefSeq_Liftoff_v5.1.gff3"

In [13]:
# Find the largest gene number already used in the current annotation.
# The number is not read from the gene_id attribute: it is parsed from the second
# dot-separated field of each transcript_id (e.g. the 7 in "CHS.7.2", the form this
# notebook writes for the new tRNA IDs).
max_gid = 0
with open(input_gtf,"r") as inFP:
    for line in inFP:
        lcs = line.split("\t")
        # skip comments/headers and anything that is not a transcript record
        if len(lcs) != 9 or lcs[2] != "transcript":
            continue

        # raises IndexError if a transcript record has no transcript_id attribute
        tid = lcs[8].split("transcript_id \"",1)[1].split("\"",1)[0]
        try:
            gid = int(tid.split(".")[1])
        except (IndexError, ValueError):
            # transcript_id has no second field or it is not an integer: such IDs
            # cannot collide with the numeric IDs assigned later, so ignore them.
            # Only these two errors are caught so that real failures (and Ctrl-C)
            # are no longer swallowed.
            continue
        max_gid = max(max_gid,gid)
max_gid

180165

In [16]:
# Convert the RefSeq GFF3 to GTF with gffread and point refseq_gtf at the result.
# -T requests GTF output and -F asks gffread to keep the GFF attributes (see gffread docs).
refseq_converted_gtf = "../tmp/refseq.gtf"

# Guard against re-running this cell: once refseq_gtf has been rebound to the
# converted file, running gffread again would use the same path as input and
# output. Re-run the data cell first to force a fresh conversion.
if refseq_gtf != refseq_converted_gtf:
    cmd = ["gffread","-T","-F","-o",refseq_converted_gtf,refseq_gtf]
    # check_call raises CalledProcessError on a non-zero exit status, so a failed
    # conversion stops the notebook instead of leaving a missing/partial GTF behind
    subprocess.check_call(cmd)
    refseq_gtf = refseq_converted_gtf

In [14]:
# Load the per-transcript type attributes of the CHESS annotation and collect the
# transcript IDs labelled as tRNA by any of them, leaving out records whose
# column 0 is "chrM" (presumably the seqname column requested via cols=[0]).
tid2type_chs = definitions.get_attribute(
    input_gtf,
    ["gene_id","gene_biotype","transcript_biotype","gbkey","gene_name"],
    cols=[0],
)

chs_on_chrM = tid2type_chs[0]=="chrM"
chs_gene_is_trna = tid2type_chs["gene_biotype"]=="tRNA"
chs_tx_is_trna = tid2type_chs["transcript_biotype"]=="tRNA"
chs_gbkey_is_trna = tid2type_chs["gbkey"]=="tRNA"

chs_trna_mask = ~chs_on_chrM & (chs_gene_is_trna | chs_tx_is_trna | chs_gbkey_is_trna)
trna_tids_chs = set(tid2type_chs[chs_trna_mask]["tid"])
print(len(trna_tids_chs))

0


In [17]:
# Collect the RefSeq transcript IDs whose gene_biotype is tRNA
# (no chromosome filter is applied here).
tid2type_ref = definitions.get_attribute(
    refseq_gtf,
    ["gene_biotype","gene_name","gene_id"],
)
ref_gene_is_trna = tid2type_ref["gene_biotype"]=="tRNA"
trna_tids_ref = set(tid2type_ref[ref_gene_is_trna]["tid"])
print(len(trna_tids_ref))

522


In [18]:
# Build the ID maps used to relabel the RefSeq tRNAs: every RefSeq gene gets the
# next free gene number, and its transcripts are numbered 0, 1, 2, ... in file order.
# NOTE: max_gid is advanced in place, so re-running this cell without first
# re-running the cell that computes max_gid hands out a different range of IDs.

gidmap = dict() # original gene ID -> [new gene number, next unused transcript number]
tidmap = dict() # original transcript ID -> (new gene number, assigned transcript number)

with open(refseq_gtf,"r") as refseq_fp:
    for record in refseq_fp:
        fields = record.split("\t")
        # only well-formed transcript records take part in the numbering
        if len(fields) != 9 or fields[2] != "transcript":
            continue

        tid = fields[8].split("transcript_id \"",1)[1].split("\"",1)[0]
        gid = fields[8].split("gene_id \"",1)[1].split("\"",1)[0]

        if tid not in trna_tids_ref:
            continue

        gene_slot = gidmap.get(gid)
        if gene_slot is None:
            # first transcript seen for this gene: reserve a fresh gene number
            max_gid += 1
            gene_slot = [max_gid, 0]
            gidmap[gid] = gene_slot

        new_gid, new_tid = gene_slot
        gene_slot[1] += 1  # the next transcript of this gene gets the following number
        tidmap[tid] = (new_gid, new_tid)

In [19]:
# Write the merged annotation to outbase+".gtf": the CHESS records that are not
# CHESS tRNAs, followed by the RefSeq tRNA records relabelled with the IDs in tidmap.
# Lines that do not have exactly 9 tab-separated columns are not copied.

with open(outbase+".gtf","w+") as outFP:
    # pass 1: copy everything in CHESS whose transcript is not in the tRNA set
    with open(input_gtf,"r") as chess_fp:
        for record in chess_fp:
            fields = record.split("\t")
            if len(fields) != 9:
                continue

            chess_tid = fields[8].split("transcript_id \"",1)[1].split("\"",1)[0]
            if chess_tid not in trna_tids_chs:
                outFP.write(record)

    # pass 2: append the RefSeq tRNA records under their new IDs
    with open(refseq_gtf,"r") as refseq_fp:
        for record in refseq_fp:
            fields = record.split("\t")
            if len(fields) != 9:
                continue

            old_tid = fields[8].split("transcript_id \"",1)[1].split("\"",1)[0]
            if old_tid not in trna_tids_ref:
                continue

            assert old_tid in tidmap
            new_gid, new_tid = tidmap[old_tid]

            attrs = definitions.extract_attributes(fields[8])
            feature = fields[2]
            if feature == "transcript":
                # keep the original RefSeq IDs on the transcript record
                old_gid = fields[8].split("gene_id \"",1)[1].split("\"",1)[0]
                attrs["trna_og_tid"] = old_tid
                attrs["trna_og_gid"] = old_gid
                assert "gene_id" in attrs

            assert "transcript_id" in attrs
            chs_gene_id = "CHS."+str(new_gid)
            attrs["transcript_id"] = chs_gene_id+"."+str(new_tid)

            # gene_id is only rewritten on records that already carry one
            if "gene_id" in attrs:
                attrs["gene_id"] = chs_gene_id

            attr_string = definitions.to_attribute_string(attrs, False, feature)
            outFP.write("\t".join(fields[:-1]) + "\t" + attr_string + "\n")

In [20]:
# report the absolute location of the GTF written by this notebook
merged_gtf_path = os.path.abspath(outbase+".gtf")
print(merged_gtf_path)

/ccb/salz8-1/avaraby/chess_maintenance_scripts/tmp/chess3.1.2.CHM13.fix_tRNA.gtf
