In [1]:
# Multiple transcripts are inherited from RefSeq as exon-chain duplicates.
# This can cause problems for assembly and quantification,
# so we need to identify the dominant copy and remove the other duplicates.

In [1]:
# main imports
import os
import sys
import csv
import glob
import math
import shutil
import random
import importlib
import subprocess

import numpy as np
import pandas as pd

# Display settings for rendered DataFrames: never hide columns, and cut
# individual cell values off at 50 characters.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 50

In [2]:
# Load IPython's autoreload extension. Mode 1 reloads only the modules that
# were registered with %aimport, right before each cell is executed.
# NOTE: keep comments on their own lines here - text after a line magic is
# passed to the magic as an argument.
%load_ext autoreload
%autoreload 1

# Put the directory that is expected to hold definitions.py at the front of
# the module search path, then import `definitions` and register it for
# autoreload.
# NOTE(review): absolute, machine-specific path - the notebook only runs
# where this directory exists.
sys.path.insert(0, "/ccb/salz4-4/avaraby/orfanage/soft")
%aimport definitions

In [3]:
# Input / output locations used by the notebook.
# NOTE(review): all inputs are absolute, machine-specific paths.

# CHESS 3.1.2 annotation, one GTF per assembly (GRCh38 and CHM13)
grch_gtf: str = "/ccb/salz8-1/avaraby/chess_maintenance_scripts/data/chess_github_clone/chess3.1.2.GRCh38.gtf"
chm_gtf: str = "/ccb/salz8-1/avaraby/chess_maintenance_scripts/data/chess_github_clone/chess3.1.2.CHM13.gtf"

# reference genome sequence in FASTA format
ref_fasta: str = "/home/avaraby1/genomes/human/hg38/hg38_p12_ucsc.fa"

# presumably the base name for files produced by this notebook - TODO confirm
outbase: str = "../tmp/chess3.1.2.fix_duplicated_tx"

In [4]:
# find duplicates
# load exon chains and find those with the same chain
# compare against intron-chain

In [33]:
# run borf on chess
# The GRCh38 annotation is passed with -g; "-o" is presumably an output
# prefix, since a later cell reads ".../tmp/borf.gtf" - TODO confirm.
cmd = ["/ccb/salz8-1/avaraby/chess_maintenance_scripts/soft/borf/borf/main.py","--use_geneid","--score","-g",grch_gtf,"-o","/ccb/salz8-1/avaraby/chess_maintenance_scripts/tmp/borf"]
# check_call returns 0 on success (same cell output as before) but raises
# CalledProcessError on a non-zero exit status, so a failed run cannot be
# followed silently by cells that read a stale or missing borf.gtf.
subprocess.check_call(cmd)

0

In [34]:
def _format_chain(chain):
    """Render a chain of (start, end) pairs as "start-end,start-end,...".

    An empty chain yields an empty string.
    """
    return ",".join(str(x[0])+"-"+str(x[1]) for x in chain)

# Exon chain of every transcript. Each chain is turned into a string key of
# the form "<seqid><strand><start>-<end>,<start>-<end>,..." so that chains
# can be compared and grouped directly.
c3 = definitions.get_chains(grch_gtf,"exon",True)
c3["echain"] = c3["seqid"]+c3["strand"]+c3["chain"].apply(_format_chain)
# chain_inv is applied to the exon chain to obtain the intron chain
c3["ichain"] = c3["seqid"]+c3["strand"]+c3["chain"].apply(lambda chain: _format_chain(definitions.chain_inv(chain)))
# an empty intron chain leaves only "<seqid><strand>", i.e. a key that ends
# in the strand character; those are replaced by the placeholder "-"
c3["ichain"] = np.where(c3["ichain"].str.endswith(("-","+")),"-",c3["ichain"])
c3 = c3[["tid","has_cds","echain","ichain"]]

# add the gene_id of every transcript and the "mom" attribute from the borf output
c3 = c3.merge(definitions.get_attribute(grch_gtf,["gene_id"]),on="tid",how="left")
c3 = c3.merge(definitions.get_attribute("/ccb/salz8-1/avaraby/chess_maintenance_scripts/tmp/borf.gtf",["mom"]),on="tid",how="left")

# attach cds chain to it as well
c3_cds = definitions.get_chains(grch_gtf,"CDS",True)
c3_cds["cchain"] = c3_cds["seqid"]+c3_cds["strand"]+c3_cds["chain"].apply(_format_chain)
# normalise empty CDS chains to "-" BEFORE taking the column subset, so no
# column is ever assigned on a slice of another frame (SettingWithCopy)
c3_cds["cchain"] = np.where(c3_cds["cchain"].str.endswith(("-","+")),"-",c3_cds["cchain"])
c3_cds = c3_cds[["tid","cchain"]]

# merge
c3 = c3.merge(c3_cds,on="tid")
c3

Unnamed: 0,tid,has_cds,echain,ichain,gene_id,mom,cchain
0,CHS.1.1,1,"chr1+11874-12227,12613-12721,13221-14409","chr1+12227-12613,12721-13221",CHS.1,,-
1,CHS.2.1,1,"chr1-14362-14829,14970-15038,15796-15947,16607...","chr1-14829-14970,15038-15796,15947-16607,16765...",CHS.2,,-
2,CHS.166734.2,1,chr1-17369-17391,-,CHS.166734,,-
3,CHS.166734.1,1,chr1-17369-17436,-,CHS.166734,,-
4,CHS.166734.3,1,chr1-17409-17431,-,CHS.166734,,-
...,...,...,...,...,...,...,...
168446,CHS.57396.1,1,"chrUn_KI270755v1+21574-26743,27337-27719",chrUn_KI270755v1+26743-27337,CHS.57396,,-
168447,CHS.59138.3,1,"chrX_KI270881v1_alt-65309-66585,69007-69092,69...","chrX_KI270881v1_alt-66585-69007,69092-69763,69...",CHS.59138,50.0,"chrX_KI270881v1_alt-65972-66585,69007-69092,69..."
168448,CHS.59138.4,1,"chrX_KI270881v1_alt-65309-66585,69007-69092,69...","chrX_KI270881v1_alt-66585-69007,69092-69763,69...",CHS.59138,50.0,"chrX_KI270881v1_alt-65972-66585,69007-69092,69..."
168449,CHS.59138.1,1,"chrX_KI270881v1_alt-65314-66585,69007-69092,69...","chrX_KI270881v1_alt-66585-69007,69092-69763,69...",CHS.59138,50.0,"chrX_KI270881v1_alt-65972-66585,69007-69092,69..."


In [35]:
# Compute similarity to MANE and decide which transcripts to keep.
# Step 1: pair every transcript with the MANE transcript(s) of its gene.

# one row per transcript: tid, source (GTF column 1) and gene_id
mane_cmp_df = definitions.get_attribute(grch_gtf,["gene_id"],cols=[1]).rename(columns={1:"source"})

# self-join on gene_id against the MANE rows only, which adds the tid of the
# gene's MANE transcript as "tid_mane"
mane_cmp_df = mane_cmp_df.merge(
    mane_cmp_df.loc[mane_cmp_df["source"]=="MANE", ["tid","gene_id"]].rename(columns={"tid":"tid_mane"}),
    on="gene_id",
    how="left",
)

# transcripts of genes without a MANE transcript cannot be compared - drop them
mane_cmp_df = mane_cmp_df[mane_cmp_df["tid_mane"].notna()]

mane_cmp_df.head()

Unnamed: 0,tid,source,gene_id,tid_mane
11,CHS.131169.0,MANE,CHS.131169,CHS.131169.0
27,CHS.15.1,MANE,CHS.15,CHS.15.1
44,CHS.20.5,MANE,CHS.20,CHS.20.5
80,CHS.39.2,CHESS,CHS.39,CHS.39.7
81,CHS.39.5,RefSeq,CHS.39,CHS.39.7


In [36]:
# load chains for all transcripts
# CDS chains are read from the GRCh38 annotation. The displayed frame has
# one row per transcript with the columns tid, has_cds, seqid, strand,
# coords and chain; transcripts without CDS show an empty chain "()".
# NOTE(review): the meaning of the third positional argument (True) is
# defined in definitions.get_chains - confirm there.
cds_chains = definitions.get_chains(grch_gtf,"CDS",True)
cds_chains.head()

Unnamed: 0,tid,has_cds,seqid,strand,coords,chain
0,CHS.1.1,0,chr1,+,chr1:11874-14409,()
1,CHS.2.1,0,chr1,-,chr1:14362-29370,()
2,CHS.166734.2,0,chr1,-,chr1:17369-17391,()
3,CHS.166734.1,0,chr1,-,chr1:17369-17436,()
4,CHS.166734.3,0,chr1,-,chr1:17409-17431,()


In [37]:
# Attach two CDS chains to every row of mane_cmp_df:
#   - the transcript's own chain (all cds_chains columns, joined on "tid")
#   - the chain of its MANE transcript as "chain_mane" (joined on "tid_mane")
# Both sides of the second join carry a "tid" column, so pandas suffixes
# them: the transcript id becomes "tid_x" and the redundant MANE-side copy
# "tid_y", which is dropped again.
mane_cmp_df = (
    mane_cmp_df
    .merge(cds_chains,on="tid",how="left")
    .merge(
        cds_chains[["tid","chain"]].rename(columns={"chain":"chain_mane"}),
        left_on="tid_mane",
        right_on="tid",
        how="left",
    )
    .drop(columns="tid_y")
)
mane_cmp_df.head()

Unnamed: 0,tid_x,source,gene_id,tid_mane,has_cds,seqid,strand,coords,chain,chain_mane
0,CHS.131169.0,MANE,CHS.131169,CHS.131169.0,1,chr1,+,chr1:65419-71585,"((65565, 65573), (69037, 70008))","((65565, 65573), (69037, 70008))"
1,CHS.15.1,MANE,CHS.15,CHS.15.1,1,chr1,-,chr1:450740-451678,"((450740, 451678),)","((450740, 451678),)"
2,CHS.20.5,MANE,CHS.20,CHS.20.5,1,chr1,-,chr1:685716-686654,"((685716, 686654),)","((685716, 686654),)"
3,CHS.39.2,CHESS,CHS.39,CHS.39.7,1,chr1,+,chr1:923923-944574,"((924432, 924948), (925922, 926013), (930155, ...","((924432, 924948), (925922, 926013), (930155, ..."
4,CHS.39.5,RefSeq,CHS.39,CHS.39.7,1,chr1,+,chr1:923923-944574,"((924432, 924948), (925922, 926013), (930155, ...","((924432, 924948), (925922, 926013), (930155, ..."


In [38]:
# Compare each transcript's CDS chain with the CDS chain of its gene's MANE
# transcript. compare_and_extract is called once per row and its result is
# spread over the 13 columns listed below.
# NOTE(review): the exact definitions of lpi / ilpi / mlpi live in
# definitions.compare_and_extract - confirm there before interpreting them.
mane_cmp_df[[
    "mod_chain",
    "c1len","c2len",
    "match_start","match_end",
    "num_bp_extra","num_bp_missing","num_bp_inframe","num_bp_match","num_bp_outframe",
    "lpi","ilpi","mlpi",
]] = mane_cmp_df.apply(
    lambda rec: definitions.compare_and_extract(rec["chain"],rec["chain_mane"],rec["strand"]),
    axis=1,
)
# restore the plain "tid" column name (it was suffixed to "tid_x" by the merge)
mane_cmp_df = mane_cmp_df.rename(columns={"tid_x":"tid"})
mane_cmp_df.head()

Unnamed: 0,tid,source,gene_id,tid_mane,has_cds,seqid,strand,coords,chain,chain_mane,mod_chain,c1len,c2len,match_start,match_end,num_bp_extra,num_bp_missing,num_bp_inframe,num_bp_match,num_bp_outframe,lpi,ilpi,mlpi
0,CHS.131169.0,MANE,CHS.131169,CHS.131169.0,1,chr1,+,chr1:65419-71585,"((65565, 65573), (69037, 70008))","((65565, 65573), (69037, 70008))","[[65565, 65573, 0], [69037, 70008, 0]]",981,981,True,True,0,0,981,981,0,100,100,100
1,CHS.15.1,MANE,CHS.15,CHS.15.1,1,chr1,-,chr1:450740-451678,"((450740, 451678),)","((450740, 451678),)","[[450740, 451678, 0]]",939,939,True,True,0,0,939,939,0,100,100,100
2,CHS.20.5,MANE,CHS.20,CHS.20.5,1,chr1,-,chr1:685716-686654,"((685716, 686654),)","((685716, 686654),)","[[685716, 686654, 0]]",939,939,True,True,0,0,939,939,0,100,100,100
3,CHS.39.2,CHESS,CHS.39,CHS.39.7,1,chr1,+,chr1:923923-944574,"((924432, 924948), (925922, 926013), (930155, ...","((924432, 924948), (925922, 926013), (930155, ...","[[924432, 924948, 0], [925922, 926013, 0], [93...",2583,2535,True,True,48,0,2535,2535,0,100,98,98
4,CHS.39.5,RefSeq,CHS.39,CHS.39.7,1,chr1,+,chr1:923923-944574,"((924432, 924948), (925922, 926013), (930155, ...","((924432, 924948), (925922, 926013), (930155, ...","[[924432, 924948, 0], [925922, 926013, 0], [93...",2538,2535,True,True,3,0,2535,2535,0,100,99,99


In [39]:
# Bring the MANE comparison onto the chain table: every transcript gets the
# tid of its gene's MANE transcript and its "ilpi" value. The left join
# keeps transcripts of genes without MANE, with NaN in both new columns.
c3 = pd.merge(c3, mane_cmp_df[["tid","tid_mane","ilpi"]], how="left", on="tid")
c3.head()

Unnamed: 0,tid,has_cds,echain,ichain,gene_id,mom,cchain,tid_mane,ilpi
0,CHS.1.1,1,"chr1+11874-12227,12613-12721,13221-14409","chr1+12227-12613,12721-13221",CHS.1,,-,,
1,CHS.2.1,1,"chr1-14362-14829,14970-15038,15796-15947,16607...","chr1-14829-14970,15038-15796,15947-16607,16765...",CHS.2,,-,,
2,CHS.166734.2,1,chr1-17369-17391,-,CHS.166734,,-,,
3,CHS.166734.1,1,chr1-17369-17436,-,CHS.166734,,-,,
4,CHS.166734.3,1,chr1-17409-17431,-,CHS.166734,,-,,


In [53]:
# iterate over duplicates and decide what to do based on the comparisons
#
# Transcripts of one gene that share an exon chain are collapsed onto a
# single representative, chosen in this order of preference:
#   1. the highest positive "ilpi" (from the comparison with the MANE CDS chain)
#   2. the highest positive "mom" (attribute read from the borf output)
#   3. a pseudo-random pick from a seeded generator
# Within steps 1 and 2 ties go to the first row of the group.

def _top_scoring_tid(grp, col):
    """Return the tid of the first row holding the highest positive value of `col`.

    Values that cannot be read as a number (including NaN) count as 0.
    Returns None when no row of `grp` has a value above 0.
    """
    scores = pd.to_numeric(grp[col], errors="coerce").fillna(0).to_numpy()
    if len(scores) == 0:
        return None
    best = int(scores.argmax())  # argmax reports the first occurrence of the maximum
    if not scores[best] > 0:
        return None
    return grp["tid"].iloc[best]

# dedicated, seeded generator: the random fallback gives the same picks on
# every run and does not depend on the state of the global `random` module
_rng = random.Random(0)

# keep group information (all tids so we can add tag with duplicates)
# kept tid -> distinct tids of its group (the kept tid included);
# a transcript without duplicates maps to an empty list
keep_tids = {}

for (gid,echain), grp in c3.groupby(by=['gene_id','echain']):
    # a tid may occur on more than one row, so work with the distinct tids
    # of the group in row order
    grp_tids = list(dict.fromkeys(grp["tid"].tolist()))

    if len(grp_tids)==1:
        # nothing to collapse - keep the transcript and move on
        keep_tids[grp_tids[0]] = []
        continue

    # find one with highest ilpi to mane, else the one with the highest mom
    keep_tid = _top_scoring_tid(grp,"ilpi")
    if keep_tid is None:
        keep_tid = _top_scoring_tid(grp,"mom")

    # for everything else - pick at random
    if keep_tid is None:
        keep_tid = _rng.choice(grp_tids)

    keep_tids[keep_tid] = grp_tids

In [None]:
# write out the output
with open() as outFP:
    with open() as inFP:
        for line in inFP:
            