In [None]:
# liftoff correctly handles assignment of IDs to transcripts in duplicated genes by appending "_#" to the end of the transcript ID
# however, for genes this fails and the gene ID remains the same for all duplicates
# this notebook fixes the issue by:
# 1. finding all transcripts associated with duplicated genes
# 2. figuring out which ones are gene duplicates
# 3. assigning new gene IDs and moving the liftoff gene IDs over to the aux attributes

In [1]:
# main imports
import os
import sys
import csv
import glob
import math
import shutil
import random
import importlib
import subprocess

import numpy as np
import pandas as pd

# notebook display settings: never hide columns, truncate cell text at 50 characters
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 50

In [2]:
# enable the autoreload extension; mode 1 reloads only the modules registered
# with %aimport before each cell is executed
%load_ext autoreload
%autoreload 1

# make the helper module "definitions" importable and register it for autoreload
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere
sys.path.insert(0, "/ccb/salz4-4/avaraby/orfanage/soft")
%aimport definitions

In [4]:
# input / output locations used throughout the notebook
ref_fasta = "/home/avaraby1/genomes/human/hg38/hg38_p12_ucsc.fa"
input_gtf = "../data/chess_github_clone/chess3.0.1.CHM13.gtf"
outbase = "../tmp/chess3.0.1.CHM13.fix_dup_gid"  # output prefix; ".gtf" is appended when writing

# True when the helper module reports the input annotation as "gff"
is_gff = (definitions.gtf_or_gff(input_gtf) == "gff")

In [5]:
# per-transcript table holding the "copy_num_id" and "gene_id" attributes
wanted_attributes = ["copy_num_id", "gene_id"]
df = definitions.get_attribute(input_gtf, wanted_attributes)
df.head()

Unnamed: 0,tid,copy_num_id,gene_id
0,CHS.23663.3_2,CHS.23663_2,CHS.23663
1,CHS.139257.1,CHS.139257_0,CHS.139257
2,CHS.19.5,CHS.19_0,CHS.19
3,CHS.170756.1_3,CHS.170756_3,CHS.170756
4,CHS.144080.1,CHS.144080_0,CHS.144080


In [33]:
# get the maximum gene number used in the current annotation
# the number is parsed from transcript IDs: it is the second "."-separated
# field, e.g. 23663 for "CHS.23663.3_2"
max_gid = 0
with open(input_gtf, "r") as inFP:
    for line in inFP:
        lcs = line.split("\t")
        # skip anything that is not a 9-column record (comments, headers, ...)
        if len(lcs) != 9:
            continue

        if lcs[2] != "transcript":
            continue

        tid = lcs[8].split("transcript_id \"", 1)[1].split("\"", 1)[0]
        try:
            gid = int(tid.split(".")[1])
        except (IndexError, ValueError):
            # ID without a "." or with a non-numeric second field: it carries
            # no usable gene number; only these two errors are skipped
            continue
        max_gid = max(max_gid, gid)
max_gid

176298

In [34]:
# assign new gene and transcript ids for each of the duplicate genes

def is_int(s):
    """Return True if the string `s` parses as an integer, False otherwise.

    Strings containing "_" are rejected up front, so copy suffixes such as
    "3_2" never count as integers.

    The check uses int() rather than float(): float() would also accept
    "1e5", "1.5", "nan" or "inf", none of which are integers.
    """
    if "_" in s:
        return False
    try:
        int(s)
        return True
    except ValueError:
        return False

def is_chess_tid(tid):
    """Return True if `tid` complies with the CHESS transcript ID format.

    A compliant ID has exactly three "."-separated fields: the first starts
    with "CHS" and the other two pass is_int(), e.g. "CHS.19.5".
    IDs carrying a copy suffix such as "CHS.23663.3_2" are not compliant
    because is_int() rejects "_".

    Anything that cannot be split like a string (e.g. a missing value)
    yields False instead of raising.
    """
    try:
        parts = tid.split(".")
    except (AttributeError, TypeError):
        # not a str - only the errors a non-string input can raise are
        # caught, so unrelated failures are no longer silently swallowed
        return False
    return len(parts) == 3 and parts[0][:3] == "CHS" and is_int(parts[1]) and is_int(parts[2])


cpn_dict = {}  # maps a copy_num_id to the gene number newly assigned to that gene copy
def assign_ids(row, idmap):
    """Decide the output IDs for one transcript and record them in `idmap`.

    `row` must provide "tid", "gene_id" and "copy_num_id". On return
    idmap[old tid] holds the pair (new transcript_id, new gene_id).

    - tid in plain CHESS format, or tid not starting with "CHS":
      both IDs are kept as they are.
    - tid starting with "CHS" whose copy_num_id ends in "_0":
      both IDs are kept after checking that tid, copy_num_id and gene_id agree.
    - any other copy number: all transcripts sharing the copy_num_id receive
      the same new gene ID "CHS.<n>", where n is the next value of the global
      `max_gid`; the transcript keeps its transcript number under that gene.

    Side effects: updates the globals `max_gid` and `cpn_dict`.
    """
    global max_gid  # rebound every time a new gene number is handed out
    global cpn_dict

    tid = row["tid"]

    # plain CHESS ID: nothing to rename, only verify the gene ID is consistent
    if is_chess_tid(tid):
        gid = tid.rsplit(".", 1)[0]
        assert gid == row["gene_id"], "incompatible gene_ids: " + gid + " : " + row["gene_id"]
        idmap[tid] = (tid, row["gene_id"])
        return

    # not a CHESS-style ID at all: leave it as is
    if tid[:3] != "CHS":
        idmap[tid] = (tid, row["gene_id"])
        return

    # likely a gene copy: copy_num_id has the form "<gene id>_<copy number>"
    copy_id = row["copy_num_id"]
    copy_num = int(copy_id.rsplit("_", 1)[-1])
    gid = copy_id.rsplit("_", 1)[0]

    if copy_num == 0:
        # copy 0 keeps its IDs, provided all three sources name the same gene
        tid_gene = tid.rsplit(".", 1)[0]
        assert tid_gene == gid == row["gene_id"], "incompatible gene_ids: " + tid_gene + " : " + gid + " : " + row["gene_id"]
        # the transcript number is parsed only as a format check
        # (a non-numeric value raises ValueError); the original IDs are stored
        int(tid.rsplit("_", 1)[0].rsplit(".", 1)[-1])
        idmap[tid] = (tid, row["gene_id"])
        return

    # additional copy: it must carry the "_<n>" suffix
    assert "_" in tid, "copy tid without _: " + tid

    # reuse the gene number already given to this copy, otherwise take the next free one
    if copy_id not in cpn_dict:
        max_gid += 1
        cpn_dict[copy_id] = max_gid

    assigned_num = cpn_dict[copy_id]
    tid_num = int(tid.rsplit("_", 1)[0].rsplit(".", 1)[-1])
    new_gid = "CHS." + str(assigned_num)
    idmap[tid] = (new_gid + "." + str(tid_num), new_gid)

# old transcript ID -> (new transcript ID, new gene ID), filled row by row
idmap = {}
df.apply(assign_ids, axis=1, args=(idmap,))

0         None
1         None
2         None
3         None
4         None
          ... 
162375    None
162376    None
162377    None
162378    None
162379    None
Length: 162380, dtype: object

In [37]:
# now iterate over the GTF and write out the conversion

# Only transcript, exon and CDS records are written; every other input line is skipped.
# Transcript records additionally keep their original IDs in the
# "liftoff_tid" / "liftoff_gid" attributes.
with open(outbase + ".gtf", "w") as outFP, open(input_gtf, "r") as inFP:
    for line in inFP:
        lcs = line.split("\t")
        # skip anything that is not a 9-column record (comments, headers, ...)
        if len(lcs) != 9:
            continue

        if lcs[2] not in ("transcript", "exon", "CDS"):
            continue

        tid = lcs[8].split("transcript_id \"", 1)[1].split("\"", 1)[0]
        assert tid in idmap, "unknown tid: " + tid

        new_tid, new_gid = idmap[tid]

        attrs = definitions.extract_attributes(lcs[8])

        if lcs[2] == "transcript":
            # verify gene_id exists BEFORE reading it, so a missing attribute
            # fails with this message instead of a bare KeyError
            assert "gene_id" in attrs, "transcript without gene_id: " + tid
            attrs["liftoff_tid"] = tid
            attrs["liftoff_gid"] = attrs["gene_id"]

        assert "transcript_id" in attrs
        attrs["transcript_id"] = new_tid

        # gene_id is rewritten only on records that carry one
        if "gene_id" in attrs:
            attrs["gene_id"] = new_gid

        res_line = "\t".join(lcs[:-1]) + "\t" + definitions.to_attribute_string(attrs, False, lcs[2])
        outFP.write(res_line + "\n")

In [58]:
# absolute location of the rewritten annotation
os.path.abspath(f"{outbase}.gtf")

'/ccb/salz8-1/avaraby/chess_maintenance_scripts/tmp/chess3.0.1.CHM13.fix_dup_gid.gtf'