In [1]:
import pandas as pd
import numpy as np
from itertools import groupby
from operator import itemgetter
from multiprocessing import cpu_count, Pool
import subprocess
import sys
import os
import csv
import matplotlib.pyplot as plt
from matplotlib import animation
import seaborn as sns
plt.rcParams['figure.figsize'] = (20.0, 10.0)
plt.rcParams['font.family'] = "serif"
%matplotlib inline

gff3Cols=["seqid","source","type","start","end","score","strand","phase","attributes"]
novelSources=['CHESS','StringTie','FANTOM']

In [2]:
# now need to separate out the case when the only reason the known ORF does not fit in the exon chain
# is due to exon trimming

# In the end we shall provide the following information
# % fits in entirety
# % does not fit completely, but without frameshift
# % does not fit comletely due to frameshift
# % fits entirely but with trimming at the end
# % nonsense-mediated decay

In [3]:
# formBlocks formats all segments (CDS or exon) per transcript into a single line entry
def formBlocks(series):
    return ",".join([str(x) for x in sorted(series.tolist())])

# groupBlocks groups all exon/cds chains by gene
def groupBlocks(series):
    return series.tolist()

# this function is used by comparitors to find the minimum value
# based on several conditionals
def getMin(res):
    minimum=None
    numElements=len(res[0])
    for i in range(numElements):
        minimum=min(res,key=lambda k:k[i])[i]
        res=[v for v in res if v[i]==minimum]
    
    return res[0]

# def compareKnownToKnown(curChain,rest): # performs comparrisons for all
#     # stop if a single match is found - otherwise
#     finalRes=[]
#     for chain in rest:
#         chain=chain.split("$")[-1]
#         res=checkCompat(chain,curChain)
#         # will run longer, however, multiple conditionals demand that we evaluate every possibility
# #         if res==0:
# #             return 0
#         finalRes.append(res) # return the smallest code

#     # we should also consider returning the name/chain of the closest isoform to which we've found a match
#     return getMin(finalRes)

# checkCompat performs a comparison of two coordinate-chains
# The current output format is as follows:
# 1. 0 - perfect match of two chains
# 2. 1 - there is a modification to the chain (addition/removal) which does not result in a frameshift
# 3. 2 - there is a modification to the chain (addition/removal) which results in a frame shift
#================NOTES================
# It is important to note here, that several tye 2 modifications may result in a return back to the original frame
# in it's current state the function does not take it into account
# meaning that if a single modification which changes the frame is found - frameshift is reported

# checkCompat is not dependent on the strand of either the exon chain or the orf unlike the startTrim and endTrim functions
def checkCompat(orf,chain,strand):
    tmp=orf.split("-")
    orfStarts=[int(x) for x in tmp[0].split(",")]
    orfEnds=[int(x) for x in tmp[-1].split(",")]
    
    # next we need to parse the exons in the same manner
    tmp=chain.split("-")
    chainStarts=[int(x) for x in tmp[0].split(",")]
    chainEnds=[int(x) for x in tmp[-1].split(",")]
    
    orf=[]
    for t in zip(orfStarts,orfEnds):
        orf+=list(range(t[0],t[1]+1))
    
    chain=[]
    for t in zip(chainStarts,chainEnds):
        chain+=list(range(t[0],t[1]+1))

    # now remove anything in the chain that is before or after the start and end of the ORF respectively
    # trim the end
    
    tmpOrf=orf # save orf for later to verify if the chain does not fit the orf completely
    if strand=="+":
        orf=[x for x in orf if x<=chain[-1]]
    else:
        orf=[x for x in orf if x>=chain[0]]
        
    if len(orf)==0:
        return 7777777
    
    # now verify chain compatability with the original orf
    tmpChain=[x for x in chain if x>=orf[0] and x<=orf[-1]]
    if len(tmpChain)==0:
        return 9999999
    
    # if both conditions (7777777 and 9999999) pass - we shall proceede with the computation
    chain=[x for x in chain if x>=orf[0] and x<=orf[-1]]
    
    orf=set(orf)
    chain=set(chain)
    
    # now get set differences66
    orf_chain=orf-chain
    chain_orf=chain-orf
    
    if len(orf_chain)==0 and len(chain_orf)==0:
        return 0
    
    allMis=[]
    for k, g in groupby(enumerate(orf_chain), lambda ix : ix[0] - ix[1]):
        allMis.append(list(map(itemgetter(1), g)))
    for k, g in groupby(enumerate(chain_orf), lambda ix : ix[0] - ix[1]):
        allMis.append(list(map(itemgetter(1), g)))
    
    # for now this will be a very easy thing to do
    # would be better to have a method for computing how many bases are in off-frame vs on-frame regions
    for mis in allMis:
        if not len(mis)%3==0:
            return 2
    return 1

def trimming(orf,chain,strand,parent,start=True):
    if not start:
        if strand=="+":
            strand="-"
        else:
            strand="+"
    if strand=="+":
        tmp=orf.split("-")
        orfStarts=[int(x) for x in tmp[0].split(",")]
        orfEnds=[int(x) for x in tmp[-1].split(",")]

        # next we need to parse the exons in the same manner
        tmp=chain.split("-")
        chainStarts=[int(x) for x in tmp[0].split(",")]
        chainEnds=[int(x) for x in tmp[-1].split(",")]

        orf=[]
        for t in zip(orfStarts,orfEnds):
            orf+=list(range(t[0],t[1]+1))

        chain=[]
        for t in zip(chainStarts,chainEnds):
            chain+=list(range(t[0],t[1]+1))

        # now remove anything in the chain that is before or after the start and end of the ORF
        chain=[x for x in chain if x>=orf[0] and x<=orf[-1]]

        if len(chain)==0: # should hopefully never happen
            return 7777777,7777777,7777777

        setStartCodon_orf=set(orf[:3])
        setStartCodon_chain=set(chain[:3])
        startCodon_orf_diff_chain=setStartCodon_orf-setStartCodon_chain

        if orf[0]==chain[0]: # at least the first nucleotide is there
            if len(startCodon_orf_diff_chain)<3: # at least part of the start codon is there
                return len(startCodon_orf_diff_chain),0,0 # return the number of missing nucleotides

        # what we shall do instead is simply quantify how far down the orf the exon begins
        firstExonChainNT=min(chain)
        curPos=0 # position within the ORF    
        orfIter=iter(orf)
        skippedExonBases=0
        exons=0
        prevOrf=None
        curOrf=next(orfIter)
        while True:
            if not curOrf>=firstExonChainNT:
                if prevOrf and not curOrf-prevOrf==1:
                    if skippedExonBases==0:
                        skippedExonBases+=curPos
                    exons+=1
                curPos+=1
                prevOrf=curOrf
                curOrf=next(orfIter)
            else:
                if prevOrf and not curOrf-prevOrf==1:
                    if skippedExonBases==0:
                        skippedExonBases+=curPos
                    exons+=1
                break
        return curPos,exons,skippedExonBases
    else:
        tmp=orf.split("-")
        orfStarts=[int(x) for x in tmp[0].split(",")]
        orfEnds=[int(x) for x in tmp[-1].split(",")]
        
        tmp=chain.split("-")
        chainStarts=[int(x) for x in tmp[0].split(",")]
        chainEnds=[int(x) for x in tmp[-1].split(",")]
        
        orf=[]
        for t in zip(orfStarts,orfEnds):
            orf+=list(range(t[0],t[1]+1))
        orf=orf[::-1]
        
        chain=[]
        for t in zip(chainStarts,chainEnds):
            chain+=list(range(t[0],t[1]+1))
        chain=chain[::-1]
        
        chain=[x for x in chain if x<=orf[0] and x>=orf[-1]]
            
        if len(chain)==0: # should hopefully never happen
            return 7777777,7777777,7777777
        
        setStartCodon_orf=set(orf[:3])
        setStartCodon_chain=set(chain[:3])
        startCodon_orf_diff_chain=sorted(list(setStartCodon_orf-setStartCodon_chain))[::-1]

        if orf[0]==chain[0]: # at least the first nucleotide is there
            if len(startCodon_orf_diff_chain)<3: # at least part of the start codon is there
                return len(startCodon_orf_diff_chain),0,0 # return the number of missing nucleotides

        # what we shall do instead is simply quantify how far down the orf the exon begins
        firstExonChainNT=max(chain)
        curPos=0 # position within the ORF
        orfIter=iter(orf)
        skippedExonBases=0
        exons=0
        prevOrf=None
        curOrf=next(orfIter)
        while True:
            if not curOrf<=firstExonChainNT:
                if prevOrf and not prevOrf-curOrf==1:
                    if skippedExonBases==0:
                        skippedExonBases+=curPos
                    exons+=1
                curPos+=1
                prevOrf=curOrf
                curOrf=next(orfIter)
            else:
                if prevOrf and not prevOrf-curOrf==1:
                    if skippedExonBases==0:
                        skippedExonBases+=curPos
                    exons+=1
                break
        return curPos,exons,skippedExonBases

    # need to deal with the special case when the first base of the start codon is missing
    # next apply this function to the entirety of the dataframe
    
def nmdDist(orf,chain,strand):
    if strand=="+":
        strand="-"
    else:
        strand="+"
        
    if strand=="+":
        tmp=orf.split('-')
        orfStarts=[int(x) for x in tmp[0].split(',')]
        orfEnds=[int(x) for x in tmp[-1].split(',')]
        tmp=chain.split('-')
        chainStarts=[int(x) for x in tmp[0].split(',')]
        chainEnds=[int(x) for x in tmp[-1].split(',')]
        
        orf=[]
        for t in zip(orfStarts,orfEnds):
            orf+=list(range(t[0],t[1]+1))
        chain=[]
        for t in zip(chainStarts,chainEnds):
            chain+=list(range(t[0],t[1]+1))

        if len(set(chain).union(set(orf)))==0:
            return 6666666
        chain=[x for x in chain if x<orf[0]]
        if len(chain)==0:
            return 0
        
        prevNT=0
        exons=0
        dist=0
        for nt in chain:
            if prevNT-nt<-1:
                exons+=1
            if exons>0:
                dist+=1
            prevNT=nt
        return dist
    else:
        tmp=orf.split('-')
        orfStarts=[int(x) for x in tmp[0].split(',')]
        orfEnds=[int(x) for x in tmp[-1].split(',')]
        tmp=chain.split('-')
        chainStarts=[int(x) for x in tmp[0].split(',')]
        chainEnds=[int(x) for x in tmp[-1].split(',')]
        
        orf=[]
        for t in zip(orfStarts,orfEnds):
            orf+=list(range(t[0],t[1]+1))
        orf=orf[::-1]
        chain=[]
        for t in zip(chainStarts,chainEnds):
            chain+=list(range(t[0],t[1]+1))
        chain=chain[::-1]

        if len(set(chain).union(set(orf)))==0: # non compatible - report
            return 6666666
        chain=[x for x in chain if x>orf[0]]
        if len(chain)==0:
            return 0
        
        prevNT=0
        exons=0
        dist=0
        for nt in chain: # iterate over chain to see if any of novel transcripts have nonesense-mediated decay due to length (the function does not evaluate NMD due to premature codon
            if nt-prevNT<-1:
                exons+=1
            if exons>0:
                dist+=1
            prevNT=nt
        return dist
    
def outOfBounds(orfList,chain):
#     orfList=orfs.split(";")
    starts=[]
    ends=[]
    for orf in orfList:
        # get start/end
        tmp=orf.split("-")
        orfStarts=[int(x) for x in tmp[0].split(",")]
        orfEnds=[int(x) for x in tmp[1].split(",")]
        starts.append(sorted(orfStarts)[0])
        ends.append(sorted(orfEnds)[-1])
    minOrfStart=min(starts)
    maxOrfEnd=max(ends)
    
    # next we need to parse the exons in the same manner
    tmp=chain.split("-")
    chainStarts=[int(x) for x in tmp[0].split(",")]
    minChainStart=min(chainStarts)
    maxChainStart=max(chainStarts)
    chainEnds=[int(x) for x in tmp[-1].split(",")]
    minChainEnd=min(chainEnds)
    maxChainEnd=max(chainEnds)

    if maxChainEnd<minOrfStart or minChainStart>maxOrfEnd:
        return True
    return False

def compareNovelToKnown2(row,knownChains): # performs comparrisons for all
    # stop if a single match is found - otherwise
    parent=row['parent']
    finalRes=[]
    for chain in knownChains[row['gID']]:
        tmp=chain.split("&")
        chain=tmp[1]
        if not parent==tmp[0]:
            res=checkCompat(chain,row['uid'],row['strand'])
            resStart,neStart,skippedExonBases_start=trimming(chain,row['uid'],row['strand'],parent,True)
            resNonesense,neEnd,skippedExonBases_end=trimming(chain,row['uid'],row['strand'],parent,False)
            resNMD=nmdDist(chain,row['uid'],row['strand'])
            finalRes.append((res,resStart,resNonesense,neStart,neEnd,skippedExonBases_start,skippedExonBases_end,resNMD)) # return the smallest code
    if len(finalRes)==0:
        return [999999999,999999999,999999999,999999999,999999999,999999999,999999999,999999999]
    return pd.Series(list(getMin(finalRes)))

# compareNovelToKnown is a wrapper function which performs all required individual comparrisons
# between novel and known isoforms of the same gene
def compareNovelToKnown(row,knownChains): # performs comparrisons for all
    # stop if a single match is found - otherwise
    parent=row['parent']
    finalRes=[]
    for chain in knownChains[row['gID']]:
        res=checkCompat(chain,row['uid'],row['strand'])
        resStart,neStart,skippedExonBases_start=trimming(chain,row['uid'],row['strand'],parent,True)
        resNonesense,neEnd,skippedExonBases_end=trimming(chain,row['uid'],row['strand'],parent,False)
        resNMD=nmdDist(chain,row['uid'],row['strand'])
        finalRes.append((res,resStart,resNonesense,neStart,neEnd,skippedExonBases_start,skippedExonBases_end,resNMD)) # return the smallest code
    return pd.Series(list(getMin(finalRes)))

# compareKnownToKnown performs a comparison of exon/cds chains between known isoforms
# for each known isoform the function will compare it to every other known isoorms that belongs to the same gene
def compareKnownToKnown(curChain,rest,strand):
    finalRes=[]
#     oob=outOfBounds(rest,curChain)
    for chain in rest:
        chain=chain.split("$")[-1]
        res=checkCompat(chain,curChain,strand)
        resStart,neStart,skippedExonBases_start=trimming(chain,curChain,strand,"filler",True)
        resNonesense,neEnd,skippedExonBases_end=trimming(chain,curChain,strand,"filler",False)
        nmd=nmdDist(chain,curChain,strand)
        finalRes.append((res,resStart,resNonesense,neStart,neEnd,skippedExonBases_start,skippedExonBases_end,nmd))
    return list(getMin(finalRes))

In [4]:
# ==============STEP1==============
# Building CDS chains for all known protein-coding transcripts excluding anything novel

df_known_only=pd.read_csv("./chess2.02.gff",sep="\t",names=gff3Cols)
df_known_only.dropna(inplace=True,axis=0)
df_known_only.reset_index(inplace=True,drop=True)
df_known_only["start"]=df_known_only["start"].astype(int)
df_known_only["end"]=df_known_only["end"].astype(int)

# next we shall create the subset of all non-novel genes
print(set(df_known_only["source"]))
df_known_only=df_known_only[~(df_known_only['source'].isin(novelSources))].reset_index(drop=True)

# now let's isolate the protein-coding genes and their transcripts, exons and CDS
df_known_only["id"]=df_known_only.attributes.str.split("ID=",expand=True)[1].str.split(";",expand=True)[0]
df_known_only["parent"]=df_known_only.attributes.str.split("Parent=",expand=True)[1].str.split(";",expand=True)[0]
df_known_only["geneID"]=np.where(df_known_only["type"].isin(['transcript','exon','CDS']),df_known_only.parent.str.extract('(CHS.(\d)*)',expand=True)[0],df_known_only['id'])
# first create a tmpdf_known_only (to be removed right after) of just the genes, so we can get their coding potential
tmp=df_known_only[df_known_only["type"]=="gene"].reset_index(drop=True)
# next let's extract information about the gene type
tmp["gene_type"]=tmp.attributes.str.split("GENE_TYPE=",expand=True)[1].str.split(";",expand=True)[0].str.strip("\n")
print("set of all gene_types in CHESS",set(tmp["gene_type"]))

# now we can get a subset of all known protein_coding genes
tmp=tmp[tmp["gene_type"]=="protein_coding"].reset_index(drop=True)
print("number of known protein-coding genes is: %d"%len(tmp))
# now we just need to get geneIDs for the protein_coding sequences
setProtGenes_known_only=set(tmp["geneID"])
del tmp
df_known_only=df_known_only[df_known_only["geneID"].isin(setProtGenes_known_only)].reset_index(drop=True)

# next we would like to test whether all transcripts in these known protein-coding genes contain a CDS
# first get a set of parent IDs for all CDSs
cdsParents_known_only=set(df_known_only[df_known_only["type"]=="CDS"]["parent"])
# now form a set of IDs for all transcripts
transIDs_known_only=set(df_known_only[df_known_only["type"]=="transcript"]["id"])
print("the number of transcripts in protein-coding known genes is: %d\nthe number of CDSs associated with transcripts in protein-coding known genes is %d"%(len(transIDs_known_only),len(cdsParents_known_only)))
print("number of known transcripts in known protein-coding genes is: ",len(df_known_only[(df_known_only['type']=='transcript')&~(df_known_only['source'].isin(["CHESS","StringTie","FANTOM"]))&(df_known_only["id"].isin(transIDs_known_only))]))

cdsDF=df_known_only[df_known_only['type']=='CDS'].reset_index(drop=True)

cdsGDF=cdsDF.groupby("parent").agg({'start':formBlocks,'end':formBlocks}).reset_index()
mergedDF=cdsDF[['seqid','parent','strand']].merge(cdsGDF,on='parent',how='outer',indicator=True)
assert len(mergedDF[mergedDF["_merge"]=="both"])==len(mergedDF), "ids don't match"
mergedDF.drop_duplicates(inplace=True)
mergedDF.reset_index(drop=True,inplace=True)
mergedDF.drop('_merge',inplace=True,axis=1)

mergedDF['uid']=mergedDF["start"]+"-"+mergedDF["end"]
mergedDF=mergedDF[['seqid','strand','parent','uid']]

# now we can extract gene IDs and groupby them
# we should count the total number of isoforms included
# as well as the number of unique ORFs present for that gene
mergedDF["gID"]=mergedDF["parent"].reset_index(drop=True).str.extract('(CHS.(\d)*)',expand=True)[0]
setKnownTranscripts=set(mergedDF["parent"])
setKnownGenes=set(mergedDF["gID"])
# now let's get rid of any duplicates in this dataframe
mergedDF.drop_duplicates(['seqid','strand','gID','uid','parent'],inplace=True)
mergedDF.reset_index(drop=True,inplace=True)

{'RefSeq', 'FANTOM', 'HAVANA', 'Curated Genomic', 'CHESS', 'Gnomon', 'GENCODE', 'StringTie', 'ENSEMBL', 'BestRefSeq'}
set of all gene_types in CHESS {'antisense_RNA', 'lncRNA', 'misc_RNA', 'protein_coding'}
number of known protein-coding genes is: 22659
the number of transcripts in protein-coding known genes is: 169965
the number of CDSs associated with transcripts in protein-coding known genes is 129847
number of known transcripts in known protein-coding genes is:  169965


In [6]:
#================STEP2================
# Building a dataframe of all novel transcripts that were added to known genes

df=pd.read_csv("./chess2.02.gff",sep="\t",names=gff3Cols)
df.dropna(inplace=True,axis=0)
df.reset_index(inplace=True,drop=True)
df["start"]=df["start"].astype(int)
df["end"]=df["end"].astype(int)
df["id"]=df.attributes.str.split("ID=",expand=True)[1].str.split(";",expand=True)[0]
df["parent"]=df.attributes.str.split("Parent=",expand=True)[1].str.split(";",expand=True)[0]
df["geneID"]=np.where(df["type"].isin(['transcript','exon','CDS']),df.parent.str.extract('(CHS.(\d)*)',expand=True)[0],df['id'])

df=df[(df["geneID"].isin(setKnownGenes))&(df["source"].isin(novelSources))&~((df["parent"].isin(setKnownTranscripts))|(df["id"].isin(setKnownTranscripts)))].reset_index(drop=True)

# now we can further refine the original mergedDF by removing any genes that have no novel transcripts
mergedDF=mergedDF[mergedDF['gID'].isin(set(df["geneID"]))].reset_index(drop=True)
print("the final number of known genes we are examining is: %d"%(len(set(mergedDF["gID"]))))

exonDF=df[df['type']=='exon'].reset_index(drop=True)

exonGDF=exonDF.groupby("parent").agg({'start':formBlocks,'end':formBlocks}).reset_index()
mergedExonDF=exonDF[['seqid','parent','strand']].merge(exonGDF,on='parent',how='outer',indicator=True)
assert len(mergedExonDF[mergedExonDF["_merge"]=="both"])==len(mergedExonDF), "ids don't match"
mergedExonDF.drop_duplicates(inplace=True)
mergedExonDF.reset_index(drop=True,inplace=True)
mergedExonDF.drop('_merge',inplace=True,axis=1)

mergedExonDF['uid']=mergedExonDF["start"]+"-"+mergedExonDF["end"]
mergedExonDF=mergedExonDF[['seqid','strand','parent','uid']]

# now we can extract gene IDs and groupby them
# we should count the total number of isoforms included
# as well as the number of unique ORFs present for that gene
mergedExonDF["gID"]=mergedExonDF["parent"].reset_index(drop=True).str.extract('(CHS.(\d)*)',expand=True)[0]
# now let's get rid of any duplicates in this dataframe
mergedExonDF.drop_duplicates(['seqid','strand','gID','uid','parent'],inplace=True)
mergedExonDF.reset_index(drop=True,inplace=True)

print("the total number of novel transcripts in known genes we are examining is: %d"%(len(set(mergedExonDF["parent"]))))

the final number of known genes we are examining is: 16496
the total number of novel transcripts in known genes we are examining is: 95974


In [7]:
#================STEP3================
# Now we create a dictionary of all exon chains of known transcripts on known genes
geneBlocks=mergedDF.groupby("gID").agg({'uid':groupBlocks}).reset_index()
knownChains=pd.Series(geneBlocks.uid.values,index=geneBlocks.gID).to_dict()

In [6]:
#================STEP4================
# now perform the comparrison between novel and known isoforms
mergedExonDF["match"]=np.nan
mergedExonDF["startTrim"]=np.nan
mergedExonDF["nonesenseTrim"]=np.nan
mergedExonDF["numExonsStart"]=np.nan
mergedExonDF["numExonsEnd"]=np.nan
mergedExonDF["skippedExonBases_start"]=np.nan
mergedExonDF["skippedExonBases_end"]=np.nan
mergedExonDF[["match","startTrim","nonesenseTrim","numExonsStart","numExonsEnd","skippedExonBases_start","skippedExonBases_end","nmd"]]=mergedExonDF.apply(lambda row: compareNovelToKnown(row,knownChains),axis=1)
print("percent good:",len(mergedExonDF[(mergedExonDF["match"]==0)|(mergedExonDF["match"]==1)])/len(mergedExonDF))
print("number good: ",len(mergedExonDF[(mergedExonDF["match"]==0)|(mergedExonDF["match"]==1)]))
print("percent bad: ",len(mergedExonDF[mergedExonDF["match"]==2])/len(mergedExonDF))
print("number bad: ",len(mergedExonDF[mergedExonDF["match"]==2]))
print("perfect transcripts:",len(mergedExonDF[(mergedExonDF["match"].isin([0,1]))&(mergedExonDF["startTrim"]==0)])/len(mergedExonDF))
print("number of perfect transcripts: ",len(mergedExonDF[(mergedExonDF["match"].isin([0,1]))&(mergedExonDF["startTrim"]==0)]))
mergedExonDF

percent good: 0.5369579261049867
number good:  51534
percent bad:  0.44598537103798946
number bad:  42803
perfect transcripts: 0.519526121657949
number of perfect transcripts:  49861


Unnamed: 0,seqid,strand,parent,uid,gID,match,startTrim,nonesenseTrim,numExonsStart,numExonsEnd,skippedExonBases_start,skippedExonBases_end,nmd
0,chr1,-,CHS.23.9,"732160,732981,736713,740187-732211,733213,7367...",CHS.23,2,214,0,2,0,149,0,215
1,chr1,-,CHS.23.10,"733082,736713,738834,739803,740129,743286,7466...",CHS.23,2,149,0,1,0,149,0,62
2,chr1,-,CHS.37.3,"916309,925518-923461,925604",CHS.37,1,1197,0,2,0,60,0,7147
3,chr1,-,CHS.37.5,"922909,924651-923461,924924",CHS.37,2,13,0,0,0,0,0,547
4,chr1,+,CHS.39.1,"923923,925922,930155,931039,935772,939040,9392...",CHS.39,0,0,0,0,0,0,0,1476
5,chr1,+,CHS.39.2,"923923,925922,930155,931039,935772,939040,9392...",CHS.39,0,0,0,0,0,0,0,0
6,chr1,+,CHS.39.5,"924676,925922,930155,931039,935772,939040,9392...",CHS.39,0,0,0,0,0,0,0,1428
7,chr1,+,CHS.39.6,"924892,925922,930155,931039,935772,939040,9392...",CHS.39,0,0,0,0,0,0,0,1428
8,chr1,+,CHS.39.7,"924938,925922,930155,931039,935772,939040,9392...",CHS.39,0,0,0,0,0,0,0,0
9,chr1,+,CHS.39.8,"924938,925922,930155,931039,935772,939040,9392...",CHS.39,0,0,0,0,0,0,0,0


In [8]:
#================STEP4.5==============
# now let's perform the same known-to-known with exon chains against cds

df_known_only=pd.read_csv("./chess2.02.gff",sep="\t",names=gff3Cols)
df_known_only.dropna(inplace=True,axis=0)
df_known_only.reset_index(inplace=True,drop=True)
df_known_only["start"]=df_known_only["start"].astype(int)
df_known_only["end"]=df_known_only["end"].astype(int)

# next we shall create the subset of all non-novel genes
print(set(df_known_only["source"]))
df_known_only=df_known_only[~(df_known_only['source'].isin(novelSources))].reset_index(drop=True)

# now let's isolate the protein-coding genes and their transcripts, exons and CDS
df_known_only["id"]=df_known_only.attributes.str.split("ID=",expand=True)[1].str.split(";",expand=True)[0]
df_known_only["parent"]=df_known_only.attributes.str.split("Parent=",expand=True)[1].str.split(";",expand=True)[0]
df_known_only["geneID"]=np.where(df_known_only["type"].isin(['transcript','exon','CDS']),df_known_only.parent.str.extract('(CHS.(\d)*)',expand=True)[0],df_known_only['id'])
# first create a tmpdf_known_only (to be removed right after) of just the genes, so we can get their coding potential
tmp=df_known_only[df_known_only["type"]=="gene"].reset_index(drop=True)
# next let's extract information about the gene type
tmp["gene_type"]=tmp.attributes.str.split("GENE_TYPE=",expand=True)[1].str.split(";",expand=True)[0].str.strip("\n")
print("set of all gene_types in CHESS",set(tmp["gene_type"]))

# now we can get a subset of all known protein_coding genes
tmp=tmp[tmp["gene_type"]=="protein_coding"].reset_index(drop=True)
print("number of known protein-coding genes is: %d"%len(tmp))
# now we just need to get geneIDs for the protein_coding sequences
setProtGenes_known_only=set(tmp["geneID"])
del tmp
df_known_only=df_known_only[df_known_only["geneID"].isin(setProtGenes_known_only)].reset_index(drop=True)

# next we would like to test whether all transcripts in these known protein-coding genes contain a CDS
# first get a set of parent IDs for all CDSs
cdsParents_known_only=set(df_known_only[df_known_only["type"]=="CDS"]["parent"])
# now form a set of IDs for all transcripts
transIDs_known_only=set(df_known_only[df_known_only["type"]=="transcript"]["id"])
print("the number of transcripts in protein-coding known genes is: %d\nthe number of CDSs associated with transcripts in protein-coding known genes is %d"%(len(transIDs_known_only),len(cdsParents_known_only)))
print("number of known transcripts in known protein-coding genes is: ",len(df_known_only[(df_known_only['type']=='transcript')&~(df_known_only['source'].isin(["CHESS","StringTie","FANTOM"]))&(df_known_only["id"].isin(transIDs_known_only))]))

cdsDF=df_known_only[df_known_only['type']=='exon'].reset_index(drop=True)

cdsGDF=cdsDF.groupby("parent").agg({'start':formBlocks,'end':formBlocks}).reset_index()
mergedDF=cdsDF[['seqid','parent','strand']].merge(cdsGDF,on='parent',how='outer',indicator=True)
assert len(mergedDF[mergedDF["_merge"]=="both"])==len(mergedDF), "ids don't match"
mergedDF.drop_duplicates(inplace=True)
mergedDF.reset_index(drop=True,inplace=True)
mergedDF.drop('_merge',inplace=True,axis=1)

mergedDF['uid']=mergedDF["start"]+"-"+mergedDF["end"]
mergedDF=mergedDF[['seqid','strand','parent','uid']]

# now we can extract gene IDs and groupby them
# we should count the total number of isoforms included
# as well as the number of unique ORFs present for that gene
mergedDF["gID"]=mergedDF["parent"].reset_index(drop=True).str.extract('(CHS.(\d)*)',expand=True)[0]
setKnownTranscripts=set(mergedDF["parent"])
setKnownGenes=set(mergedDF["gID"])
# now let's get rid of any duplicates in this dataframe
mergedDF.drop_duplicates(['seqid','strand','gID','uid','parent'],inplace=True)
mergedDF.reset_index(drop=True,inplace=True)

df=pd.read_csv("./chess2.02.gff",sep="\t",names=gff3Cols)
df.dropna(inplace=True,axis=0)
df.reset_index(inplace=True,drop=True)
df["start"]=df["start"].astype(int)
df["end"]=df["end"].astype(int)
df["id"]=df.attributes.str.split("ID=",expand=True)[1].str.split(";",expand=True)[0]
df["parent"]=df.attributes.str.split("Parent=",expand=True)[1].str.split(";",expand=True)[0]
df["geneID"]=np.where(df["type"].isin(['transcript','exon','CDS']),df.parent.str.extract('(CHS.(\d)*)',expand=True)[0],df['id'])

df=df[(df["geneID"].isin(setKnownGenes))&(df["source"].isin(novelSources))&~((df["parent"].isin(setKnownTranscripts))|(df["id"].isin(setKnownTranscripts)))].reset_index(drop=True)

# now we can further refine the original mergedDF by removing any genes that have no novel transcripts
mergedDF=mergedDF[mergedDF['gID'].isin(set(df["geneID"]))].reset_index(drop=True)

mergedDF # should be ~150K exon chains

{'Curated Genomic', 'GENCODE', 'BestRefSeq', 'ENSEMBL', 'CHESS', 'FANTOM', 'StringTie', 'HAVANA', 'Gnomon', 'RefSeq'}
set of all gene_types in CHESS {'antisense_RNA', 'misc_RNA', 'lncRNA', 'protein_coding'}
number of known protein-coding genes is: 22659
the number of transcripts in protein-coding known genes is: 169965
the number of CDSs associated with transcripts in protein-coding known genes is 129847
number of known transcripts in known protein-coding genes is:  169965


Unnamed: 0,seqid,strand,parent,uid,gID
0,chr1,-,CHS.23.1,"732159,732981,733307,735423-732207,733213,7333...",CHS.23
1,chr1,-,CHS.23.2,"732159,732981,733307,735419-732207,733213,7333...",CHS.23
2,chr1,-,CHS.23.3,"732159,732981,733307,743286,744592-732207,7332...",CHS.23
3,chr1,-,CHS.23.4,"732159,732981,733307,743286,743917,744592-7322...",CHS.23
4,chr1,-,CHS.23.5,"732159,732981,733307,743286,744728-732207,7332...",CHS.23
5,chr1,-,CHS.23.6,"732159,732981,733307,743286,743917,744728-7322...",CHS.23
6,chr1,-,CHS.23.7,"732159,732981,733307,743286,744195,744728-7322...",CHS.23
7,chr1,-,CHS.23.8,"732159,732981,733307,743286,746695-732207,7332...",CHS.23
8,chr1,-,CHS.23.11,"733140,733307,743286,744195-733213,733364,7433...",CHS.23
9,chr1,-,CHS.37.4,"921013,923616,924878-923461,924752,924937",CHS.37


In [9]:
mergedDF=mergedDF[mergedDF['gID'].isin(list(knownChains))].reset_index(drop=True)
print(len(mergedDF))

150111


In [10]:
#================STEP3================
# Now we create a dictionary of all exon chains of known transcripts on known genes
mergedDF["uid2"]=mergedDF["parent"]+"&"+mergedDF["uid"]
geneBlocks=mergedDF.groupby("gID").agg({'uid2':groupBlocks}).reset_index()
knownChains=pd.Series(geneBlocks.uid2.values,index=geneBlocks.gID).to_dict()

In [11]:
cores = cpu_count() #Number of CPU cores on your system
partitions = cores
print(cores,partitions)

8 8


In [21]:
def compareNovelToKnown2(row,k): # performs comparrisons for all
    # stop if a single match is found - otherwise
    parent=row['parent']
    finalRes=[]
    for chain in knownChains[row['gID']]:
        tmp=chain.split("&")
        chain=tmp[1]
        if not parent==tmp[0]:
            res=checkCompat(chain,row['uid'],row['strand'])
            resStart,neStart,skippedExonBases_start=trimming(chain,row['uid'],row['strand'],parent,True)
            resNonesense,neEnd,skippedExonBases_end=trimming(chain,row['uid'],row['strand'],parent,False)
            resNMD=nmdDist(chain,row['uid'],row['strand'])
            finalRes.append((res,resStart,resNonesense,neStart,neEnd,skippedExonBases_start,skippedExonBases_end,resNMD)) # return the smallest code
    if len(finalRes)==0:
        return pd.Series([999999999,999999999,999999999,999999999,999999999,999999999,999999999,999999999])
    return pd.Series(list(getMin(finalRes)))

def work(sub):
    global knownChains()
    mergedDF[["match",
              "startTrim",
              "nonesenseTrim",
              "numExonsStart",
              "numExonsEnd",
              "skippedExonBases_start",
              "skippedExonBases_end",
              "nmd"]]=mergedDF.head().apply(lambda row: compareNovelToKnown2(row,knownChains),axis=1)
    return mergedDF

def parallelize(data, func):
    data_split = np.array_split(data, partitions)
    pool = Pool(cores)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data

mergedDF["match"]=np.nan
mergedDF["startTrim"]=np.nan
mergedDF["nonesenseTrim"]=np.nan
mergedDF["numExonsStart"]=np.nan
mergedDF["numExonsEnd"]=np.nan
mergedDF["skippedExonBases_start"]=np.nan
mergedDF["skippedExonBases_end"]=np.nan
# mergedDF[["match","startTrim","nonesenseTrim","numExonsStart","numExonsEnd","skippedExonBases_start","skippedExonBases_end","nmd"]]=mergedDF.head().apply(lambda row: compareNovelToKnown2(row,knownChains),axis=1)
mergeDF=parallelize(mergedDF,work)
print("percent good:",len(mergedDF[(mergedDF["match"]==0)|(mergedDF["match"]==1)])/len(mergedDF))
print("number good: ",len(mergedDF[(mergedDF["match"]==0)|(mergedDF["match"]==1)]))
print("percent bad: ",len(mergedDF[mergedDF["match"]==2])/len(mergedDF))
print("number bad: ",len(mergedDF[mergedDF["match"]==2]))
print("perfect transcripts:",len(mergedDF[(mergedDF["match"].isin([0,1]))&(mergedDF["startTrim"]==0)])/len(mergedDF))
print("number of perfect transcripts: ",len(mergedDF[(mergedDF["match"].isin([0,1]))&(mergedDF["startTrim"]==0)]))
mergedDF

       seqid strand       parent  \
0       chr1      -     CHS.23.1   
1       chr1      -     CHS.23.2   
2       chr1      -     CHS.23.3   
3       chr1      -     CHS.23.4   
4       chr1      -     CHS.23.5   
5       chr1      -     CHS.23.6   
6       chr1      -     CHS.23.7   
7       chr1      -     CHS.23.8   
8       chr1      -    CHS.23.11   
9       chr1      -     CHS.37.4   
10      chr1      +     CHS.39.3   
11      chr1      +    CHS.39.11   
12      chr1      +    CHS.39.13   
13      chr1      +    CHS.39.16   
14      chr1      +    CHS.39.21   
15      chr1      +    CHS.39.22   
16      chr1      +    CHS.39.25   
17      chr1      +    CHS.39.26   
18      chr1      +    CHS.39.27   
19      chr1      -     CHS.40.3   
20      chr1      -     CHS.40.4   
21      chr1      -     CHS.40.5   
22      chr1      -    CHS.40.18   
23      chr1      -    CHS.40.19   
24      chr1      +     CHS.42.1   
25      chr1      +     CHS.42.2   
26      chr1      +     CHS.

[18764 rows x 13 columns]
       seqid strand        parent  \
18764  chr10      +   CHS.6882.33   
18765  chr10      +   CHS.6882.34   
18766  chr10      +   CHS.6882.35   
18767  chr10      +    CHS.6887.1   
18768  chr10      +    CHS.6887.2   
18769  chr10      +    CHS.6887.3   
18770  chr10      +    CHS.6887.4   
18771  chr10      +    CHS.6887.5   
18772  chr10      +    CHS.6887.6   
18773  chr10      +    CHS.6887.7   
18774  chr10      +   CHS.6887.10   
18775  chr10      +    CHS.6889.1   
18776  chr10      +    CHS.6889.2   
18777  chr10      +    CHS.6889.3   
18778  chr10      +    CHS.6889.4   
18779  chr10      +    CHS.6889.5   
18780  chr10      +    CHS.6889.7   
18781  chr10      +    CHS.6889.8   
18782  chr10      +    CHS.6889.9   
18783  chr10      +   CHS.6889.10   
18784  chr10      +   CHS.6889.11   
18785  chr10      +   CHS.6889.12   
18786  chr10      +   CHS.6889.13   
18787  chr10      +   CHS.6889.14   
18788  chr10      +   CHS.6889.15   
18789  chr10

[18764 rows x 13 columns]
       seqid strand        parent  \
37528  chr12      -  CHS.13059.14   
37529  chr12      -  CHS.13059.15   
37530  chr12      -  CHS.13059.16   
37531  chr12      -  CHS.13059.17   
37532  chr12      -  CHS.13059.18   
37533  chr12      -  CHS.13059.19   
37534  chr12      -  CHS.13059.20   
37535  chr12      -  CHS.13059.21   
37536  chr12      -  CHS.13059.22   
37537  chr12      -  CHS.13059.23   
37538  chr12      -  CHS.13059.24   
37539  chr12      -  CHS.13059.25   
37540  chr12      -  CHS.13059.33   
37541  chr12      -  CHS.13059.34   
37542  chr12      -  CHS.13059.37   
37543  chr12      -  CHS.13059.38   
37544  chr12      +   CHS.13060.1   
37545  chr12      +   CHS.13060.2   
37546  chr12      +   CHS.13060.3   
37547  chr12      +   CHS.13060.4   
37548  chr12      +   CHS.13060.5   
37549  chr12      +   CHS.13060.6   
37550  chr12      +   CHS.13060.7   
37551  chr12      +   CHS.13060.8   
37552  chr12      +   CHS.13060.9   
37553  chr12

[18764 rows x 13 columns]
       seqid strand        parent  \
56292  chr16      -  CHS.20353.20   
56293  chr16      -  CHS.20353.21   
56294  chr16      -  CHS.20353.22   
56295  chr16      -  CHS.20353.23   
56296  chr16      -  CHS.20353.24   
56297  chr16      -   CHS.20355.2   
56298  chr16      -   CHS.20355.6   
56299  chr16      -   CHS.20355.7   
56300  chr16      -   CHS.20356.1   
56301  chr16      -   CHS.20356.2   
56302  chr16      -   CHS.20356.3   
56303  chr16      -   CHS.20356.4   
56304  chr16      -   CHS.20356.5   
56305  chr16      -   CHS.20356.6   
56306  chr16      -   CHS.20356.8   
56307  chr16      -   CHS.20356.9   
56308  chr16      -  CHS.20356.10   
56309  chr16      -  CHS.20356.11   
56310  chr16      -  CHS.20356.13   
56311  chr16      -   CHS.20358.1   
56312  chr16      -   CHS.20358.2   
56313  chr16      -   CHS.20359.1   
56314  chr16      -   CHS.20359.2   
56315  chr16      -   CHS.20359.3   
56316  chr16      -   CHS.20359.8   
56317  chr16

[18764 rows x 13 columns]
       seqid strand        parent  \
75056  chr19      +   CHS.26833.9   
75057  chr19      +  CHS.26833.10   
75058  chr19      -   CHS.26834.3   
75059  chr19      -   CHS.26834.4   
75060  chr19      -   CHS.26834.5   
75061  chr19      -   CHS.26834.6   
75062  chr19      -   CHS.26834.7   
75063  chr19      -   CHS.26834.8   
75064  chr19      -   CHS.26834.9   
75065  chr19      -  CHS.26834.10   
75066  chr19      -  CHS.26834.11   
75067  chr19      -  CHS.26834.12   
75068  chr19      -  CHS.26834.13   
75069  chr19      -  CHS.26834.14   
75070  chr19      -  CHS.26834.15   
75071  chr19      -  CHS.26834.16   
75072  chr19      -  CHS.26834.17   
75073  chr19      -  CHS.26834.18   
75074  chr19      -  CHS.26834.19   
75075  chr19      -  CHS.26834.20   
75076  chr19      -  CHS.26834.21   
75077  chr19      -  CHS.26834.22   
75078  chr19      -  CHS.26834.23   
75079  chr19      -  CHS.26834.24   
75080  chr19      -  CHS.26834.25   
75081  chr19

[18764 rows x 13 columns]
        seqid strand        parent  \
93820   chr21      +  CHS.34395.30   
93821   chr21      +  CHS.34395.32   
93822   chr21      +  CHS.34395.33   
93823   chr21      +  CHS.34395.34   
93824   chr21      +  CHS.34395.36   
93825   chr21      +  CHS.34395.39   
93826   chr21      +  CHS.34395.48   
93827   chr21      +   CHS.34412.1   
93828   chr21      +   CHS.34412.2   
93829   chr21      +   CHS.34412.3   
93830   chr21      +   CHS.34412.4   
93831   chr21      +   CHS.34412.5   
93832   chr21      +   CHS.34412.6   
93833   chr21      +   CHS.34412.7   
93834   chr21      +   CHS.34412.8   
93835   chr21      +   CHS.34412.9   
93836   chr21      +  CHS.34412.10   
93837   chr21      +  CHS.34412.16   
93838   chr21      -   CHS.34413.5   
93839   chr21      -   CHS.34413.6   
93840   chr21      -   CHS.34413.9   
93841   chr21      -  CHS.34413.10   
93842   chr21      -  CHS.34413.11   
93843   chr21      -  CHS.34413.16   
93844   chr21      -  CH

[18764 rows x 13 columns]
       seqid strand        parent  \
112584  chr4      -   CHS.41445.2   
112585  chr4      -   CHS.41445.3   
112586  chr4      -   CHS.41445.4   
112587  chr4      -   CHS.41445.5   
112588  chr4      -   CHS.41445.6   
112589  chr4      -   CHS.41445.7   
112590  chr4      -   CHS.41445.8   
112591  chr4      -   CHS.41445.9   
112592  chr4      -  CHS.41445.10   
112593  chr4      -  CHS.41445.11   
112594  chr4      -  CHS.41445.12   
112595  chr4      -  CHS.41445.14   
112596  chr4      -  CHS.41445.15   
112597  chr4      -  CHS.41445.16   
112598  chr4      -  CHS.41445.17   
112599  chr4      -  CHS.41445.18   
112600  chr4      -  CHS.41445.19   
112601  chr4      +   CHS.41447.1   
112602  chr4      +   CHS.41447.2   
112603  chr4      +   CHS.41447.3   
112604  chr4      +   CHS.41447.4   
112605  chr4      +   CHS.41447.5   
112606  chr4      +   CHS.41447.6   
112607  chr4      +   CHS.41447.7   
112608  chr4      +   CHS.41447.8   
112609  chr4

[18764 rows x 13 columns]
       seqid strand        parent  \
131348  chr7      -   CHS.51320.2   
131349  chr7      -   CHS.51320.5   
131350  chr7      -  CHS.51320.14   
131351  chr7      -  CHS.51320.15   
131352  chr7      -  CHS.51320.16   
131353  chr7      +   CHS.51323.1   
131354  chr7      +   CHS.51323.2   
131355  chr7      +   CHS.51323.3   
131356  chr7      +   CHS.51323.8   
131357  chr7      +   CHS.51323.9   
131358  chr7      +  CHS.51323.10   
131359  chr7      +  CHS.51323.11   
131360  chr7      +  CHS.51323.14   
131361  chr7      +  CHS.51323.15   
131362  chr7      +   CHS.51325.1   
131363  chr7      +   CHS.51325.5   
131364  chr7      +   CHS.51325.6   
131365  chr7      +   CHS.51325.7   
131366  chr7      +   CHS.51325.8   
131367  chr7      +   CHS.51325.9   
131368  chr7      +  CHS.51325.10   
131369  chr7      -   CHS.51326.3   
131370  chr7      -   CHS.51326.4   
131371  chr7      -   CHS.51326.8   
131372  chr7      -   CHS.51326.9   
131373  chr7

[18763 rows x 13 columns]


TypeError: 'Series' objects are mutable, thus they cannot be hashed

In [None]:
mergedDF.to_csv("./revision/knownExonToCDS.csv")

In [8]:
#================STEP5================
# Now perform the comparison within the CDS sequences of known isoforms
mergedDF["cds2"]=mergedDF["parent"]+"$"+mergedDF["uid"]
geneCDSBlocks=mergedDF.groupby("gID").agg({'cds2':groupBlocks}).reset_index()
knownCDSChains=pd.Series(geneCDSBlocks.cds2.values,index=geneCDSBlocks.gID).to_dict()

In [9]:
geneStrands=mergedDF.groupby("gID")[["strand"]].first().reset_index()
knownGeneStrands=pd.Series(geneStrands.strand.values,index=geneStrands.gID).to_dict()

In [10]:
resDF=pd.DataFrame([])
for g in set(knownCDSChains):
    curChains=knownCDSChains[g]
    if len(curChains)==1:
        resDF=pd.concat([resDF,pd.DataFrame([[curChains[0].split("$")[0],99999]])])
    else:
        for idx in range(len(curChains)):
            curParent,curChain=curChains[idx].split("$")
            rest=curChains[:idx]+curChains[idx+1:]
#             resDF["outOfBounds"]=resDF.apply(lambda row: outOfBounds(knownChains[row['gID']],row['uid']),axis=1)
            resDF=pd.concat([resDF,pd.DataFrame([[curParent]+compareKnownToKnown(curChain,rest,knownGeneStrands[g])])])

resDF.columns=['parent','match','startTrim','endTrim',"neStart","neEnd","skippedExonBases_start","skippedExonBases_end","nmd"]
resDF.reset_index(drop=True,inplace=True)

In [11]:
resDF

Unnamed: 0,parent,match,startTrim,endTrim,neStart,neEnd,skippedExonBases_start,skippedExonBases_end,nmd
0,CHS.36583.3,0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
1,CHS.36583.4,0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
2,CHS.36583.5,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CHS.36583.7,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CHS.36583.8,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,CHS.36583.9,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,CHS.36583.10,0,0.0,26.0,0.0,1.0,0.0,26.0,0.0
7,CHS.36583.26,0,0.0,26.0,0.0,1.0,0.0,26.0,0.0
8,CHS.36583.27,0,0.0,26.0,0.0,1.0,0.0,26.0,0.0
9,CHS.36583.31,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
mergedExonDF.to_csv("./revision/mergedExonDF.csv")
resDF.to_csv("./revision/resDF.csv")

In [14]:
resDF=resDF.dropna(axis=0)
resDF=resDF[~(resDF["startTrim"]==7777777)].reset_index(drop=True)

In [15]:
mergedExonsDF=mergedExonDF.dropna(axis=0)
mergedExonsDF=mergedExonDF[~(mergedExonDF["startTrim"]==7777777)].reset_index(drop=True)

In [16]:
print("perfect transcripts:",len(mergedExonDF[(mergedExonDF["match"].isin([0,1]))&(mergedExonDF["startTrim"]==0)&(mergedExonDF["nonesenseTrim"]==0)])/len(mergedExonDF))
# looking at cases where hisat might have potentially been the reason for incomplete transcript
display(mergedExonDF[mergedExonDF ["gID"]=="CHS.10130"].reset_index(drop=True))
print("potential hisat2 errors (soft-clip?)",len(mergedExonDF[(mergedExonDF["startTrim"]<=100)&(mergedExonDF["startTrim"]>0)]))
print("potential errors at end", len(mergedExonDF[(mergedExonDF["nonesenseTrim"]<=1)&(mergedExonDF["nonesenseTrim"]>0)]))
mergedExonDF[(mergedExonDF["nonesenseTrim"]<2)&(mergedExonDF["nonesenseTrim"]>0)]

perfect transcripts: 0.45304978431658577


Unnamed: 0,seqid,strand,parent,uid,gID,match,startTrim,nonesenseTrim,numExonsStart,numExonsEnd,skippedExonBases_start,skippedExonBases_end,nmd
0,chr11,+,CHS.10130.2,"118527496,118528502,118530654-118527612,118528...",CHS.10130,2,1,0,0,0,0,0,0


potential hisat2 errors (soft-clip?) 5063
potential errors at end 133


Unnamed: 0,seqid,strand,parent,uid,gID,match,startTrim,nonesenseTrim,numExonsStart,numExonsEnd,skippedExonBases_start,skippedExonBases_end,nmd
5605,chr1,-,CHS.2851.3,"146066319,146067704-146067288,146068007",CHS.2851,2,10523,1,83,0,175,0,0
9461,chr1,-,CHS.4714.5,"225886295,225887399,225887786,225911119-225887...",CHS.4714,2,0,1,0,0,0,0,1036
9463,chr1,-,CHS.4714.7,"225886991,225887399,225887786,225900806-225887...",CHS.4714,2,0,1,0,0,0,0,340
9854,chr1,-,CHS.5013.4,"236215565,236217391,236220832,236221924,236225...",CHS.5013,2,0,1,0,0,0,0,2985
9855,chr1,-,CHS.5013.5,"236215570,236217391,236220832,236221924,236225...",CHS.5013,2,0,1,0,0,0,0,2980
11552,chr10,-,CHS.6408.2,"70254958,70266077,70283670-70255827,70266391,7...",CHS.6408,2,0,1,0,0,0,0,0
13779,chr10,-,CHS.7569.4,"131967684,131970638,131970864,131973034,131973...",CHS.7569,2,0,1,0,0,0,0,886
13780,chr10,-,CHS.7569.6,"131967684,131970638,131970864,131973034,131973...",CHS.7569,2,0,1,0,0,0,0,1636
13781,chr10,-,CHS.7569.7,"131967684,131970638,131970864,131973034,131973...",CHS.7569,2,0,1,0,0,0,0,1649
13782,chr10,-,CHS.7569.8,"131967684,131970638,131970864,131973034,131973...",CHS.7569,2,0,1,0,0,0,0,886


In [None]:
# gencode biotype tab - nonsense_mediated_decay definition

In [30]:
mergedExonDF[(mergedExonDF["nonesenseTrim"]<=1)&(mergedExonDF["nonesenseTrim"]>0)]

Unnamed: 0,seqid,strand,parent,uid,gID,match,startTrim,nonesenseTrim,numExonsStart,numExonsEnd,skippedExonBases_start,skippedExonBases_end
5605,chr1,-,CHS.2851.3,"146066319,146067704-146067288,146068007",CHS.2851,2,10523,1,83,0,175,0
9461,chr1,-,CHS.4714.5,"225886295,225887399,225887786,225911119-225887...",CHS.4714,2,0,1,0,0,0,0
9462,chr1,-,CHS.4714.6,"225886987,225887786,225888817-225887090,225888...",CHS.4714,2,0,1,0,0,0,0
9463,chr1,-,CHS.4714.7,"225886991,225887399,225887786,225900806-225887...",CHS.4714,2,0,1,0,0,0,0
9854,chr1,-,CHS.5013.4,"236215565,236217391,236220832,236221924,236225...",CHS.5013,2,0,1,0,0,0,0
9855,chr1,-,CHS.5013.5,"236215570,236217391,236220832,236221924,236225...",CHS.5013,2,0,1,0,0,0,0
11552,chr10,-,CHS.6408.2,"70254958,70266077,70283670-70255827,70266391,7...",CHS.6408,2,0,1,0,0,0,0
13293,chr10,-,CHS.7246.12,"114151120,114157554,114162637,114173141-114151...",CHS.7246,2,0,1,0,0,0,0
13779,chr10,-,CHS.7569.4,"131967684,131970638,131970864,131973034,131973...",CHS.7569,2,0,1,0,0,0,0
13780,chr10,-,CHS.7569.6,"131967684,131970638,131970864,131973034,131973...",CHS.7569,2,0,1,0,0,0,0


In [23]:
print("percent good:",len(resDF[(resDF["match"]==0)|(resDF["match"]==1)])/len(resDF))
print("perfect transcripts:",len(resDF[(resDF["match"].isin([0,1]))&(resDF["startTrim"]==0)])/len(resDF))
# looking at cases where hisat might have potentially been the reason for incomplete transcript
# display(resDF[resDF["gID"]=="CHS.10130"].reset_index(drop=True))
print("potential errors at start",len(resDF[(resDF["startTrim"]<=1)&(resDF["startTrim"]>0)]))
display(resDF[(resDF["startTrim"]<=1)&(resDF["startTrim"]>0)])
print("potential errors at end", len(resDF[(resDF["endTrim"]<=1)&(resDF["endTrim"]>0)]))

percent good: 0.7948382564885126
perfect transcripts: 0.7628044423451901
potential errors at start 30


Unnamed: 0,parent,match,startTrim,endTrim,neStart,neEnd,skippedExonBases_start,skippedExonBases_end
433,CHS.7763.4,2,1.0,0.0,0.0,0.0,0.0,0.0
1001,CHS.38571.3,2,1.0,0.0,1.0,0.0,1.0,0.0
4556,CHS.36600.43,2,1.0,4.0,0.0,1.0,0.0,4.0
5000,CHS.13259.20,2,1.0,0.0,0.0,0.0,0.0,0.0
6510,CHS.12626.2,2,1.0,1.0,0.0,0.0,0.0,0.0
6792,CHS.39465.1,2,1.0,0.0,0.0,0.0,0.0,0.0
12715,CHS.41084.9,2,1.0,0.0,1.0,0.0,1.0,0.0
22786,CHS.53769.15,2,1.0,97.0,0.0,1.0,0.0,97.0
26428,CHS.10501.14,2,1.0,0.0,1.0,0.0,1.0,0.0
26430,CHS.10501.17,2,1.0,0.0,1.0,0.0,1.0,0.0


potential errors at end 190


In [40]:
# to answer the question of how much falls into the category of nonesense-mediated decay given the GENCODE definition
display(mergedExonDF.head())
nmdDF=mergedExonDF[(mergedExonDF["numExonsEnd"]>0)&~(mergedExonDF["numExonsStart"]==7777777)].reset_index(drop=True)
nmdDF["numAfterSkippedExon"]=nmdDF["nonesenseTrim"]-nmdDF["skippedExonBases_end"]
print("before",len(nmdDF))
nmdDF=nmdDF[nmdDF["numAfterSkippedExon"]>=50].reset_index(drop=True)
print("after",len(nmdDF))
nmdDF.head()

Unnamed: 0,seqid,strand,parent,uid,gID,match,startTrim,nonesenseTrim,numExonsStart,numExonsEnd,skippedExonBases_start,skippedExonBases_end
0,chr1,-,CHS.23.9,"732160,732981,736713,740187-732211,733213,7367...",CHS.23,2,214,0,2,0,149,0
1,chr1,-,CHS.23.10,"733082,736713,738834,739803,740129,743286,7466...",CHS.23,2,149,0,1,0,149,0
2,chr1,-,CHS.37.3,"916309,925518-923461,925604",CHS.37,1,1197,0,2,0,60,0
3,chr1,-,CHS.37.5,"922909,924651-923461,924924",CHS.37,2,13,0,0,0,0,0
4,chr1,+,CHS.39.1,"923923,925922,930155,931039,935772,939040,9392...",CHS.39,0,0,0,0,0,0,0


before 9125
after 5616


Unnamed: 0,seqid,strand,parent,uid,gID,match,startTrim,nonesenseTrim,numExonsStart,numExonsEnd,skippedExonBases_start,skippedExonBases_end,numAfterSkippedExon
0,chr1,-,CHS.43.19,"978190,981425-981325,982116",CHS.43,2,0,158,0,2,0,98,60
1,chr1,+,CHS.48.9,"1022227,1035277,1040665-1022462,1035324,1040881",CHS.48,2,227,5288,1,33,201,23,5265
2,chr1,+,CHS.64.17,"1180106,1180302,1180483-1180239,1180340,1180498",CHS.64,2,52,912,0,6,0,174,738
3,chr1,-,CHS.68.18,"1223834,1228468-1223968,1228830",CHS.68,2,0,148,0,2,0,32,116
4,chr1,+,CHS.80.2,"1331280,1332024-1331938,1332806",CHS.80,2,0,1284,0,3,0,959,325


In [None]:
# now let's check all 99999 and 77777 there
# we shall verify that the codes are assigned correctly

# lastly we need to take a few to check startTrim and endTrim

In [None]:
# now need summaries on a per-gene level
# how many zeros, ones and twos are per gene with respect to known and novel transcripts

# we can then provide these summaries as histograms or in some other graphical form


In [None]:
# I wonder if we could use anything else here like gffcompare - unlikely
# However, we might also need to account for a missing start codon as a special code

In [None]:
# the next logical step would be to analyze existing/known ORFs only
# to see how frequently ORFs within the same gene have frameshifts/deletions/insertions

In [None]:
# lastly, we need to investigate novel genes and the transcripts annotated there
# to see if the distribution of frameshifts/insertions/deletions is similar
# to that observed for the known transcripts inside known genes

# all in all these steps should answer the question completely

one other important consideration is (not critical at the moment, but will be necessary for the final results)

At the moment we are removing duplicates of any transcripts which appear to have identical intron-exon chain,
whihc is done to avoid making comparisons multiple times

What we should be doing instead is:
1. grouping them by the same ['seqid','strand','blocks']
2. keeping track of the number that went into the same group
3. keeping only one single representative for the group, thus effectively performing the same function as the drop_duplicates()