In [1]:
# multiple transcripts are inherited from refseq as exon-chain duplicates
# this can present an issue with assembly and quantification
# we need to identify the dominant copy and remove duplicate

In [1]:
# main imports
import os
import sys
import csv
import glob
import math
import shutil
import random
import importlib
import subprocess

import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 50)

In [2]:
%load_ext autoreload
%autoreload 1

sys.path.insert(0, "/ccb/salz4-4/avaraby/orfanage/soft")
%aimport definitions

In [3]:
# data
ref_fasta = "/home/avaraby1/genomes/human/hg38/hg38_p12_ucsc.fa"
grch_gtf = "/ccb/salz8-1/avaraby/chess_maintenance_scripts/data/chess_github_clone/chess3.1.2.GRCh38.gtf"
chm_gtf = "/ccb/salz8-1/avaraby/chess_maintenance_scripts/data/chess_github_clone/chess3.1.2.CHM13.gtf"
mane_gtf = "/ccb/salz8-1/avaraby/chess_maintenance_scripts/data/MANE.GRCh38.v1.3.refseq_genomic.gtf"
outbase = "../tmp/chess3.1.2.fix_duplicated_tx"

In [4]:
# find duplicates
# load exon chains and find those with the same chain
# compare against intron-chain

In [5]:
mane = definitions.get_chains(mane_gtf,"exon",True)
mane["echain"] = mane["seqid"]+mane["strand"]+mane.apply(lambda row: ",".join(str(x[0])+"-"+str(x[1]) for x in row["chain"]),axis=1)
mane["ichain"] = mane["seqid"]+mane["strand"]+mane.apply(lambda row: ",".join(str(x[0])+"-"+str(x[1]) for x in definitions.chain_inv(row["chain"])),axis=1)
mane["ichain"] = np.where(mane["ichain"].str.endswith(("-","+")),"-",mane["ichain"])
mane = mane[["tid","has_cds","echain","ichain"]]

# attach cds chain to it as well
mane_cds = definitions.get_chains(mane_gtf,"CDS",True)
mane_cds["cchain"] = mane_cds["seqid"]+mane_cds["strand"]+mane_cds.apply(lambda row: ",".join(str(x[0])+"-"+str(x[1]) for x in row["chain"]),axis=1)
mane_cds = mane_cds[["tid","cchain"]]

# merge
mane = mane.merge(mane_cds,on="tid")

# add gene names for ease of debugging
gns = definitions.get_attribute(mane_gtf,"gene_name")
mane = mane.merge(gns,on="tid")
mane.head()

Unnamed: 0,tid,has_cds,echain,ichain,cchain,gene_name
0,rna-NM_001005484.2,1,"chr1+65419-65433,65520-65573,69037-71585","chr1+65433-65520,65573-69037","chr1+65565-65573,69037-70008",OR4F5
1,rna-NM_001005221.2,1,chr1-450740-451678,-,chr1-450740-451678,OR4F29
2,rna-NM_001005277.1,1,chr1-685716-686654,-,chr1-685716-686654,OR4F16
3,rna-NM_001385641.1,1,"chr1+923923-924948,925922-926013,930155-930336...","chr1+924948-925922,926013-930155,930336-931039...","chr1+924432-924948,925922-926013,930155-930336...",SAMD11
4,rna-NM_015658.4,1,"chr1-944203-944800,945057-945146,945518-945653...","chr1-944800-945057,945146-945518,945653-946173...","chr1-944694-944800,945057-945146,945518-945653...",NOC2L


In [6]:
c3 = definitions.get_chains(grch_gtf,"exon",True)
c3["echain"] = c3["seqid"]+c3["strand"]+c3.apply(lambda row: ",".join(str(x[0])+"-"+str(x[1]) for x in row["chain"]),axis=1)
c3["ichain"] = c3["seqid"]+c3["strand"]+c3.apply(lambda row: ",".join(str(x[0])+"-"+str(x[1]) for x in definitions.chain_inv(row["chain"])),axis=1)
c3["ichain"] = np.where(c3["ichain"].str.endswith(("-","+")),"-",c3["ichain"])
c3 = c3[["tid","has_cds","echain","ichain"]]

# attach cds chain to it as well
c3_cds = definitions.get_chains(grch_gtf,"CDS",True)
c3_cds["cchain"] = c3_cds["seqid"]+c3_cds["strand"]+c3_cds.apply(lambda row: ",".join(str(x[0])+"-"+str(x[1]) for x in row["chain"]),axis=1)
c3_cds = c3_cds[["tid","cchain"]]
c3_cds["cchain"] = np.where(c3_cds["cchain"].str.endswith(("-","+")),"-",c3_cds["cchain"])

# merge
c3 = c3.merge(c3_cds,on="tid")
c3

Unnamed: 0,tid,has_cds,echain,ichain,cchain
0,CHS.1.1,1,"chr1+11874-12227,12613-12721,13221-14409","chr1+12227-12613,12721-13221",-
1,CHS.2.1,1,"chr1-14362-14829,14970-15038,15796-15947,16607...","chr1-14829-14970,15038-15796,15947-16607,16765...",-
2,CHS.166734.2,1,chr1-17369-17391,-,-
3,CHS.166734.1,1,chr1-17369-17436,-,-
4,CHS.166734.3,1,chr1-17409-17431,-,-
...,...,...,...,...,...
168446,CHS.57396.1,1,"chrUn_KI270755v1+21574-26743,27337-27719",chrUn_KI270755v1+26743-27337,-
168447,CHS.59138.3,1,"chrX_KI270881v1_alt-65309-66585,69007-69092,69...","chrX_KI270881v1_alt-66585-69007,69092-69763,69...","chrX_KI270881v1_alt-65972-66585,69007-69092,69..."
168448,CHS.59138.4,1,"chrX_KI270881v1_alt-65309-66585,69007-69092,69...","chrX_KI270881v1_alt-66585-69007,69092-69763,69...","chrX_KI270881v1_alt-65972-66585,69007-69092,69..."
168449,CHS.59138.1,1,"chrX_KI270881v1_alt-65314-66585,69007-69092,69...","chrX_KI270881v1_alt-66585-69007,69092-69763,69...","chrX_KI270881v1_alt-65972-66585,69007-69092,69..."


In [7]:
# isolate those without cds (where all duplicates are without cds

keep_noncoding_tids = set()
noncoding_duplicate_group_tids = []
edup = c3[c3.duplicated(subset='echain', keep=False)].reset_index(drop=True)
print(len(set(edup["echain"])))
for echain, grp in edup.groupby('echain'):
    # if cds in mane - keep it - we can repeat duplicate detection afterwards
    if list(set(grp["cchain"].tolist())) == ["-"]:
        noncoding_duplicate_group_tids.extend(grp["tid"].tolist())
        keep_noncoding_tids.add(grp.iloc[0]["tid"])

noncoding_duplicates = c3[c3["tid"].isin(noncoding_duplicate_group_tids)].reset_index(drop=True)
print(len(keep_noncoding_tids))
noncoding_duplicates.head()

174
12


Unnamed: 0,tid,has_cds,echain,ichain,cchain
0,CHS.144560.1,1,chr1+69879592-69879895,-,-
1,CHS.167000.1,1,chr1+69879592-69879895,-,-
2,CHS.146088.1,1,chr2+97100587-97100874,-,-
3,CHS.167737.1,1,chr2+97100587-97100874,-,-
4,CHS.136513.14,1,"chr4+9659025-9671347,9677311-9677934,9685007-9...","chr4+9671347-9677311,9677934-9685007,9685242-9...",-


In [8]:
# lets get rid of all remaining non-coding transcripts since they are now guaranteed to exist alongside coding transcripts
c3 = c3[~(c3["tid"].isin(noncoding_duplicate_group_tids))].reset_index(drop=True)
c3 = c3[~(c3["cchain"]=="-")].reset_index(drop=True)
c3.head()

Unnamed: 0,tid,has_cds,echain,ichain,cchain
0,CHS.131169.0,1,"chr1+65419-65433,65520-65573,69037-71585","chr1+65433-65520,65573-69037","chr1+65565-65573,69037-70008"
1,CHS.139257.1,1,"chr1-365134-365692,373144-373323,379769-379870...","chr1-365692-373144,373323-379769,379870-380897...","chr1-365555-365692,373144-373323,379769-379870..."
2,CHS.15.1,1,chr1-450740-451678,-,chr1-450740-451678
3,CHS.18.2,1,"chr1-586287-586358,586821-586955,601398-601577...","chr1-586358-586821,586955-601398,601577-607955...","chr1-586839-586955,601398-601577,607955-608056..."
4,CHS.20.5,1,chr1-685716-686654,-,chr1-685716-686654


In [9]:
# iterate over and check with mane

mane_matched_tids = []
mane_group_tids = set() # all tids in groups where a transcript is matched to mane
edup = c3[c3.duplicated(subset='echain', keep=False)].reset_index(drop=True)
print(len(set(edup["echain"])))
mane_cchains = set(mane["cchain"].tolist())
for echain, grp in edup.groupby('echain'):
    # if cds in mane - keep it - we can repeat duplicate detection afterwards
    for idx, row in grp.iterrows():
        if row["cchain"] in mane_cchains:
            mane_matched_tids.append(row["tid"])
            mane_group_tids.update(grp["tid"].tolist())
len(mane_matched_tids),len(mane_group_tids)

162


(56, 116)

In [10]:
# compute similarity to mane and decide which ones to keep

# for every gene, we need to get MANE transcript and CDS chain

# load transcripts with source and gene_id
mane_cmp_df = definitions.get_attribute(grch_gtf,["gene_id"],cols=[1])
mane_cmp_df.rename({1:"source"},axis=1,inplace=True)

# split off mane and join on the dataframe to mark mane_tid for each transcript
mane_cmp_df = mane_cmp_df.merge(mane_cmp_df[mane_cmp_df["source"]=="MANE"][["tid","gene_id"]].rename({"tid":"tid_mane"},axis=1),on="gene_id",how="left")

mane_cmp_df = mane_cmp_df[~(mane_cmp_df["tid_mane"].isna())]

mane_cmp_df.head()

Unnamed: 0,tid,source,gene_id,tid_mane
11,CHS.131169.0,MANE,CHS.131169,CHS.131169.0
27,CHS.15.1,MANE,CHS.15,CHS.15.1
44,CHS.20.5,MANE,CHS.20,CHS.20.5
80,CHS.39.2,CHESS,CHS.39,CHS.39.7
81,CHS.39.5,RefSeq,CHS.39,CHS.39.7


In [156]:
# load chains for all transcripts
cds_chains = definitions.get_chains(grch_gtf,"CDS",True)
cds_chains.head()

Unnamed: 0,tid,has_cds,seqid,strand,coords,chain
0,CHS.1.1,0,chr1,+,chr1:11874-14409,()
1,CHS.2.1,0,chr1,-,chr1:14362-29370,()
2,CHS.166734.2,0,chr1,-,chr1:17369-17391,()
3,CHS.166734.1,0,chr1,-,chr1:17369-17436,()
4,CHS.166734.3,0,chr1,-,chr1:17409-17431,()


In [143]:
# merge onto the dataframe mane_cmp
mane_cmp_df = mane_cmp_df.merge(cds_chains,on="tid",how="left")
mane_cmp_df = mane_cmp_df.merge(cds_chains[["tid","chain"]].rename({"chain":"chain_mane"},axis=1),left_on="tid_mane",right_on="tid",how="left")
mane_cmp_df.drop("tid_y",axis=1,inplace=True)
mane_cmp_df.head()

Unnamed: 0,tid_x,source,gene_id,tid_mane,has_cds,seqid,strand,coords,chain,chain_mane
0,CHS.131169.0,MANE,CHS.131169,CHS.131169.0,1,chr1,+,chr1:65419-71585,"((65565, 65573), (69037, 70008))","((65565, 65573), (69037, 70008))"
1,CHS.15.1,MANE,CHS.15,CHS.15.1,1,chr1,-,chr1:450740-451678,"((450740, 451678),)","((450740, 451678),)"
2,CHS.20.5,MANE,CHS.20,CHS.20.5,1,chr1,-,chr1:685716-686654,"((685716, 686654),)","((685716, 686654),)"
3,CHS.39.2,CHESS,CHS.39,CHS.39.7,1,chr1,+,chr1:923923-944574,"((924432, 924948), (925922, 926013), (930155, ...","((924432, 924948), (925922, 926013), (930155, ..."
4,CHS.39.5,RefSeq,CHS.39,CHS.39.7,1,chr1,+,chr1:923923-944574,"((924432, 924948), (925922, 926013), (930155, ...","((924432, 924948), (925922, 926013), (930155, ..."


In [144]:
mane_cmp_df[["mod_chain","c1len","c2len","match_start","match_end","num_bp_extra","num_bp_missing","num_bp_inframe","num_bp_match","num_bp_outframe","lpi","ilpi","mlpi"]] = mane_cmp_df.apply(lambda row: definitions.compare_and_extract(row["chain"],row["chain_mane"],row["strand"]),axis=1)
mane_cmp_df.rename({"tid_x":"tid"},axis=1,inplace=True)
mane_cmp_df.head()

Unnamed: 0,tid,source,gene_id,tid_mane,has_cds,seqid,strand,coords,chain,chain_mane,mod_chain,c1len,c2len,match_start,match_end,num_bp_extra,num_bp_missing,num_bp_inframe,num_bp_match,num_bp_outframe,lpi,ilpi,mlpi
0,CHS.131169.0,MANE,CHS.131169,CHS.131169.0,1,chr1,+,chr1:65419-71585,"((65565, 65573), (69037, 70008))","((65565, 65573), (69037, 70008))","[[65565, 65573, 0], [69037, 70008, 0]]",981,981,True,True,0,0,981,981,0,100,100,100
1,CHS.15.1,MANE,CHS.15,CHS.15.1,1,chr1,-,chr1:450740-451678,"((450740, 451678),)","((450740, 451678),)","[[450740, 451678, 0]]",939,939,True,True,0,0,939,939,0,100,100,100
2,CHS.20.5,MANE,CHS.20,CHS.20.5,1,chr1,-,chr1:685716-686654,"((685716, 686654),)","((685716, 686654),)","[[685716, 686654, 0]]",939,939,True,True,0,0,939,939,0,100,100,100
3,CHS.39.2,CHESS,CHS.39,CHS.39.7,1,chr1,+,chr1:923923-944574,"((924432, 924948), (925922, 926013), (930155, ...","((924432, 924948), (925922, 926013), (930155, ...","[[924432, 924948, 0], [925922, 926013, 0], [93...",2583,2535,True,True,48,0,2535,2535,0,100,98,98
4,CHS.39.5,RefSeq,CHS.39,CHS.39.7,1,chr1,+,chr1:923923-944574,"((924432, 924948), (925922, 926013), (930155, ...","((924432, 924948), (925922, 926013), (930155, ...","[[924432, 924948, 0], [925922, 926013, 0], [93...",2538,2535,True,True,3,0,2535,2535,0,100,99,99


In [146]:
c3 = c3.merge(mane_cmp_df[["tid","gene_id","tid_mane","ilpi"]],on="tid",how="left")
c3.head()

Unnamed: 0,tid,has_cds,echain,ichain,cchain,tid_mane,ilpi
0,CHS.131169.0,1,"chr1+65419-65433,65520-65573,69037-71585","chr1+65433-65520,65573-69037","chr1+65565-65573,69037-70008",CHS.131169.0,100.0
1,CHS.139257.1,1,"chr1-365134-365692,373144-373323,379769-379870...","chr1-365692-373144,373323-379769,379870-380897...","chr1-365555-365692,373144-373323,379769-379870...",,
2,CHS.15.1,1,chr1-450740-451678,-,chr1-450740-451678,CHS.15.1,100.0
3,CHS.18.2,1,"chr1-586287-586358,586821-586955,601398-601577...","chr1-586358-586821,586955-601398,601577-607955...","chr1-586839-586955,601398-601577,607955-608056...",,
4,CHS.20.5,1,chr1-685716-686654,-,chr1-685716-686654,CHS.20.5,100.0


In [149]:
# iterate over duplicates and decide what to do based on the comparisons

# keep group information (all tids so we can add tag with duplicates)

mane_matched_tids = {} # dict mapping tid to keep/discard (1/0)
mane_cmp_tids = {}

mane_cchains = set(mane["cchain"].tolist())
for echain, grp in c3.groupby('echain'):
    if len(grp)==1:
        continue
    # check if matching mane first
    matched_mane = None
    for idx, row in grp.iterrows():
        if row["cchain"] in mane_cchains:
            assert matched_mane is None or matched_mane==row["tid"], "multiple mane matches: "+str(grp)
            matched_mane = row["tid"]

    if matched_mane is not None: # found mane match
        for idx, row in grp.iterrows():
            mane_matched_tids[row["tid"]] = row["cchain"] in mane_cchains
        # continue to next group
        continue

    # for everything else - find one with highest ilpi to mane
    max_ilpi = 0
    for idx, row in grp.iterrows():
        max_ilpi = max(max_ilpi,row["ilpi"])

    if max_ilpi > 0: # found match
        for idx, row in grp.iterrows():
            mane_cmp_tids[row["tid"]] = row["ilpi"] == max_ilpi
        # continue to next group
        continue

    # for everything else - report
    print(grp["tid"].tolist())

['CHS.7654.6', 'CHS.7654.7']
['CHS.175543.1', 'CHS.175543.2']
['CHS.175545.1', 'CHS.175545.2']
['CHS.175560.1', 'CHS.175560.2']
['CHS.175566.1', 'CHS.175566.2']
['CHS.175684.1', 'CHS.175684.2']
['CHS.175688.10', 'CHS.175688.9']
['CHS.175688.3', 'CHS.175688.4']
['CHS.18537.16', 'CHS.18537.17']
['CHS.175891.2', 'CHS.175891.3']
['CHS.175891.4', 'CHS.175891.5']
['CHS.18634.3', 'CHS.18634.4', 'CHS.18634.5']
['CHS.18634.10', 'CHS.18634.11', 'CHS.18634.6', 'CHS.18634.7', 'CHS.18634.8', 'CHS.18634.9']
['CHS.175910.1', 'CHS.175910.2']
['CHS.25096.10', 'CHS.25096.9']


AssertionError: multiple mane matches:                tid  has_cds  \
87182  CHS.26116.2        1   
87183  CHS.59390.1        1   

                                                  echain  \
87182  chr19-18868545-18869390,18869983-18870619,1887...   
87183  chr19-18868545-18869390,18869983-18870619,1887...   

                                                  ichain  \
87182  chr19-18869390-18869983,18870619-18878930,1887...   
87183  chr19-18869390-18869983,18870619-18878930,1887...   

                                                  cchain     tid_mane   ilpi  
87182  chr19-18870577-18870619,18878930-18879039,1887...  CHS.26116.2  100.0  
87183          chr19-18868597-18869390,18869983-18870307  CHS.59390.1  100.0  

In [150]:
mane_matched_tids,mane_cmp_tids

({'CHS.3309.1': True,
  'CHS.3309.21': True,
  'CHS.3309.19': True,
  'CHS.657.3': True,
  'CHS.657.6': False,
  'CHS.4270.11': True,
  'CHS.4270.102': True,
  'CHS.4270.47': True,
  'CHS.732.15': True,
  'CHS.732.2': False,
  'CHS.5175.11': False,
  'CHS.5175.20': True,
  'CHS.1196.15': False,
  'CHS.1196.2': True,
  'CHS.1547.2': True,
  'CHS.1547.6': False,
  'CHS.3530.18': False,
  'CHS.3530.5': True,
  'CHS.3784.11': True,
  'CHS.3784.3': False,
  'CHS.3784.1': False,
  'CHS.3784.10': True,
  'CHS.4689.2': False,
  'CHS.4689.40': True,
  'CHS.5646.2': True,
  'CHS.5646.16': True,
  'CHS.6165.6': True,
  'CHS.6165.8': False,
  'CHS.6381.4': True,
  'CHS.6381.3': True,
  'CHS.6381.5': True,
  'CHS.6381.26': True,
  'CHS.6381.17': True,
  'CHS.6696.5': True,
  'CHS.6696.16': True,
  'CHS.6696.45': True,
  'CHS.6696.28': True,
  'CHS.6696.31': True,
  'CHS.6163.19': True,
  'CHS.6163.2': True,
  'CHS.6163.17': True,
  'CHS.6163.18': True,
  'CHS.6163.13': True,
  'CHS.6206.27': True,
