In [1]:
# used to check how the results change after fixing some of the errors

In [1]:
import pandas as pd
import numpy as np
from itertools import groupby
from operator import itemgetter
import subprocess
import sys
import os
import csv
import matplotlib.pyplot as plt
from matplotlib import animation
import seaborn as sns
plt.rcParams['figure.figsize'] = (20.0, 10.0)
plt.rcParams['font.family'] = "serif"
%matplotlib inline

gff3Cols=["seqid","source","type","start","end","score","strand","phase","attributes"]
novelSources=['CHESS','StringTie','FANTOM']

In [2]:
# formBlocks formats all segments (CDS or exon) per transcript into a single line entry
def formBlocks(series):
    return ",".join([str(x) for x in sorted(series.tolist())])

# groupBlocks groups all exon/cds chains by gene
def groupBlocks(series):
    return series.tolist()

# this function is used by comparitors to find the minimum value
# based on several conditionals
def getMin(res):
    minimum=None
    numElements=len(res[0])
    for i in range(numElements):
        minimum=min(res,key=lambda k:k[i])[i]
        res=[v for v in res if v[i]==minimum]
    
    return res[0]

# def compareKnownToKnown(curChain,rest): # performs comparrisons for all
#     # stop if a single match is found - otherwise
#     finalRes=[]
#     for chain in rest:
#         chain=chain.split("$")[-1]
#         res=checkCompat(chain,curChain)
#         # will run longer, however, multiple conditionals demand that we evaluate every possibility
# #         if res==0:
# #             return 0
#         finalRes.append(res) # return the smallest code

#     # we should also consider returning the name/chain of the closest isoform to which we've found a match
#     return getMin(finalRes)

# checkCompat performs a comparison of two coordinate-chains
# The current output format is as follows:
# 1. 0 - perfect match of two chains
# 2. 1 - there is a modification to the chain (addition/removal) which does not result in a frameshift
# 3. 2 - there is a modification to the chain (addition/removal) which results in a frame shift
#================NOTES================
# It is important to note here, that several tye 2 modifications may result in a return back to the original frame
# in it's current state the function does not take it into account
# meaning that if a single modification which changes the frame is found - frameshift is reported

# checkCompat is not dependent on the strand of either the exon chain or the orf unlike the startTrim and endTrim functions
def checkCompat(orf,chain,strand):
    tmp=orf.split("-")
    orfStarts=[int(x) for x in tmp[0].split(",")]
    orfEnds=[int(x) for x in tmp[-1].split(",")]
    
    # next we need to parse the exons in the same manner
    tmp=chain.split("-")
    chainStarts=[int(x) for x in tmp[0].split(",")]
    chainEnds=[int(x) for x in tmp[-1].split(",")]
    
    orf=[]
    for t in zip(orfStarts,orfEnds):
        orf+=list(range(t[0],t[1]+1))
    
    chain=[]
    for t in zip(chainStarts,chainEnds):
        chain+=list(range(t[0],t[1]+1))

    # now remove anything in the chain that is before or after the start and end of the ORF respectively
    # trim the end
    
    tmpOrf=orf # save orf for later to verify if the chain does not fit the orf completely
    if strand=="+":
        orf=[x for x in orf if x<=chain[-1]]
    else:
        orf=[x for x in orf if x>=chain[0]]
        
    if len(orf)==0:
        return 7777777
    
    # now verify chain compatability with the original orf
    tmpChain=[x for x in chain if x>=orf[0] and x<=orf[-1]]
    if len(tmpChain)==0:
        return 9999999
    
    # if both conditions (7777777 and 9999999) pass - we shall proceede with the computation
    chain=[x for x in chain if x>=orf[0] and x<=orf[-1]]
    
    orf=set(orf)
    chain=set(chain)
    
    # now get set differences66
    orf_chain=orf-chain
    chain_orf=chain-orf
    
    if len(orf_chain)==0 and len(chain_orf)==0:
        return 0
    
    allMis=[]
    for k, g in groupby(enumerate(orf_chain), lambda ix : ix[0] - ix[1]):
        allMis.append(list(map(itemgetter(1), g)))
    for k, g in groupby(enumerate(chain_orf), lambda ix : ix[0] - ix[1]):
        allMis.append(list(map(itemgetter(1), g)))
    
    # for now this will be a very easy thing to do
    # would be better to have a method for computing how many bases are in off-frame vs on-frame regions
    for mis in allMis:
        if not len(mis)%3==0:
            return 2
    return 1

def trimming(orf,chain,strand,parent,start=True):
    if not start:
        if strand=="+":
            strand="-"
        else:
            strand="+"
    if strand=="+":
        tmp=orf.split("-")
        orfStarts=[int(x) for x in tmp[0].split(",")]
        orfEnds=[int(x) for x in tmp[-1].split(",")]

        # next we need to parse the exons in the same manner
        tmp=chain.split("-")
        chainStarts=[int(x) for x in tmp[0].split(",")]
        chainEnds=[int(x) for x in tmp[-1].split(",")]

        orf=[]
        for t in zip(orfStarts,orfEnds):
            orf+=list(range(t[0],t[1]+1))

        chain=[]
        for t in zip(chainStarts,chainEnds):
            chain+=list(range(t[0],t[1]+1))

        # now remove anything in the chain that is before or after the start and end of the ORF
        chain=[x for x in chain if x>=orf[0] and x<=orf[-1]]

        if len(chain)==0: # should hopefully never happen
            return 7777777,7777777,7777777

        setStartCodon_orf=set(orf[:3])
        setStartCodon_chain=set(chain[:3])
        startCodon_orf_diff_chain=setStartCodon_orf-setStartCodon_chain

        if orf[0]==chain[0]: # at least the first nucleotide is there
            if len(startCodon_orf_diff_chain)<3: # at least part of the start codon is there
                return len(startCodon_orf_diff_chain),0,0 # return the number of missing nucleotides

        # what we shall do instead is simply quantify how far down the orf the exon begins
        firstExonChainNT=min(chain)
        curPos=0 # position within the ORF    
        orfIter=iter(orf)
        skippedExonBases=0
        exons=0
        prevOrf=None
        curOrf=next(orfIter)
        while True:
            if not curOrf>=firstExonChainNT:
                if prevOrf and not curOrf-prevOrf==1:
                    if skippedExonBases==0:
                        skippedExonBases+=curPos
                    exons+=1
                curPos+=1
                prevOrf=curOrf
                curOrf=next(orfIter)
            else:
                if prevOrf and not curOrf-prevOrf==1:
                    if skippedExonBases==0:
                        skippedExonBases+=curPos
                    exons+=1
                break
        return curPos,exons,skippedExonBases
    else:
        tmp=orf.split("-")
        orfStarts=[int(x) for x in tmp[0].split(",")]
        orfEnds=[int(x) for x in tmp[-1].split(",")]
        
        tmp=chain.split("-")
        chainStarts=[int(x) for x in tmp[0].split(",")]
        chainEnds=[int(x) for x in tmp[-1].split(",")]
        
        orf=[]
        for t in zip(orfStarts,orfEnds):
            orf+=list(range(t[0],t[1]+1))
        orf=orf[::-1]
        
        chain=[]
        for t in zip(chainStarts,chainEnds):
            chain+=list(range(t[0],t[1]+1))
        chain=chain[::-1]
        
        chain=[x for x in chain if x<=orf[0] and x>=orf[-1]]
            
        if len(chain)==0: # should hopefully never happen
            return 7777777,7777777,7777777
        
        setStartCodon_orf=set(orf[:3])
        setStartCodon_chain=set(chain[:3])
        startCodon_orf_diff_chain=sorted(list(setStartCodon_orf-setStartCodon_chain))[::-1]

        if orf[0]==chain[0]: # at least the first nucleotide is there
            if len(startCodon_orf_diff_chain)<3: # at least part of the start codon is there
                return len(startCodon_orf_diff_chain),0,0 # return the number of missing nucleotides

        # what we shall do instead is simply quantify how far down the orf the exon begins
        firstExonChainNT=max(chain)
        curPos=0 # position within the ORF
        orfIter=iter(orf)
        skippedExonBases=0
        exons=0
        prevOrf=None
        curOrf=next(orfIter)
        while True:
            if not curOrf<=firstExonChainNT:
                if prevOrf and not prevOrf-curOrf==1:
                    if skippedExonBases==0:
                        skippedExonBases+=curPos
                    exons+=1
                curPos+=1
                prevOrf=curOrf
                curOrf=next(orfIter)
            else:
                if prevOrf and not prevOrf-curOrf==1:
                    if skippedExonBases==0:
                        skippedExonBases+=curPos
                    exons+=1
                break
        return curPos,exons,skippedExonBases

    # need to deal with the special case when the first base of the start codon is missing
    # next apply this function to the entirety of the dataframe
    
def nmdDist(orf,chain,strand):
    if strand=="+":
        strand="-"
    else:
        strand="+"
        
    if strand=="+":
        tmp=orf.split('-')
        orfStarts=[int(x) for x in tmp[0].split(',')]
        orfEnds=[int(x) for x in tmp[-1].split(',')]
        tmp=chain.split('-')
        chainStarts=[int(x) for x in tmp[0].split(',')]
        chainEnds=[int(x) for x in tmp[-1].split(',')]
        
        orf=[]
        for t in zip(orfStarts,orfEnds):
            orf+=list(range(t[0],t[1]+1))
        chain=[]
        for t in zip(chainStarts,chainEnds):
            chain+=list(range(t[0],t[1]+1))

        if len(set(chain).union(set(orf)))==0:
            return 6666666
        chain=[x for x in chain if x<orf[0]]
        if len(chain)==0:
            return 0
        
        prevNT=0
        exons=0
        dist=0
        for nt in chain:
            if prevNT-nt<-1:
                exons+=1
            if exons>0:
                dist+=1
            prevNT=nt
        return dist
    else:
        tmp=orf.split('-')
        orfStarts=[int(x) for x in tmp[0].split(',')]
        orfEnds=[int(x) for x in tmp[-1].split(',')]
        tmp=chain.split('-')
        chainStarts=[int(x) for x in tmp[0].split(',')]
        chainEnds=[int(x) for x in tmp[-1].split(',')]
        
        orf=[]
        for t in zip(orfStarts,orfEnds):
            orf+=list(range(t[0],t[1]+1))
        orf=orf[::-1]
        chain=[]
        for t in zip(chainStarts,chainEnds):
            chain+=list(range(t[0],t[1]+1))
        chain=chain[::-1]

        if len(set(chain).union(set(orf)))==0: # non compatible - report
            return 6666666
        chain=[x for x in chain if x>orf[0]]
        if len(chain)==0:
            return 0
        
        prevNT=0
        exons=0
        dist=0
        for nt in chain: # iterate over chain to see if any of novel transcripts have nonesense-mediated decay due to length (the function does not evaluate NMD due to premature codon
            if nt-prevNT<-1:
                exons+=1
            if exons>0:
                dist+=1
            prevNT=nt
        return dist
    
def outOfBounds(orfList,chain):
#     orfList=orfs.split(";")
    starts=[]
    ends=[]
    for orf in orfList:
        # get start/end
        tmp=orf.split("-")
        orfStarts=[int(x) for x in tmp[0].split(",")]
        orfEnds=[int(x) for x in tmp[1].split(",")]
        starts.append(sorted(orfStarts)[0])
        ends.append(sorted(orfEnds)[-1])
    minOrfStart=min(starts)
    maxOrfEnd=max(ends)
    
    # next we need to parse the exons in the same manner
    tmp=chain.split("-")
    chainStarts=[int(x) for x in tmp[0].split(",")]
    minChainStart=min(chainStarts)
    maxChainStart=max(chainStarts)
    chainEnds=[int(x) for x in tmp[-1].split(",")]
    minChainEnd=min(chainEnds)
    maxChainEnd=max(chainEnds)

    if maxChainEnd<minOrfStart or minChainStart>maxOrfEnd:
        return True
    return False

# compareNovelToKnown is a wrapper function which performs all required individual comparrisons
# between novel and known isoforms of the same gene
def compareNovelToKnown(row,knownChains): # performs comparrisons for all
    # stop if a single match is found - otherwise
    parent=row['parent']
    finalRes=[]
    for chain in knownChains[row['gID']]:
        res=checkCompat(chain,row['uid'],row['strand'])
        resStart,neStart,skippedExonBases_start=trimming(chain,row['uid'],row['strand'],parent,True)
        resNonesense,neEnd,skippedExonBases_end=trimming(chain,row['uid'],row['strand'],parent,False)
        resNMD=nmdDist(chain,row['uid'],row['strand'])
        finalRes.append((res,resStart,resNonesense,neStart,neEnd,skippedExonBases_start,skippedExonBases_end,resNMD)) # return the smallest code
    return pd.Series(list(getMin(finalRes)))

# compareKnownToKnown performs a comparison of exon/cds chains between known isoforms
# for each known isoform the function will compare it to every other known isoorms that belongs to the same gene
def compareKnownToKnown(curChain,rest,strand):
    finalRes=[]
#     oob=outOfBounds(rest,curChain)
    for chain in rest:
        chain=chain.split("$")[-1]
        res=checkCompat(chain,curChain,strand)
        resStart,neStart,skippedExonBases_start=trimming(chain,curChain,strand,"filler",True)
        resNonesense,neEnd,skippedExonBases_end=trimming(chain,curChain,strand,"filler",False)
        nmd=nmdDist(chain,curChain,strand)
        finalRes.append((res,resStart,resNonesense,neStart,neEnd,skippedExonBases_start,skippedExonBases_end,nmd))
    return list(getMin(finalRes))

In [26]:
#==============STEP1==============
# Building CDS chains for all known protein-coding transcripts excluding anything novel

df_known_only=pd.read_csv("./chess2.03.gff",sep="\t",names=gff3Cols)
df_known_only.dropna(inplace=True,axis=0)
df_known_only.reset_index(inplace=True,drop=True)
df_known_only["start"]=df_known_only["start"].astype(int)
df_known_only["end"]=df_known_only["end"].astype(int)

# next we shall create the subset of all non-novel genes
print(set(df_known_only["source"]))
df_known_only=df_known_only[~(df_known_only['source'].isin(novelSources))].reset_index(drop=True)

# now let's isolate the protein-coding genes and their transcripts, exons and CDS
df_known_only["id"]=df_known_only.attributes.str.split("ID=",expand=True)[1].str.split(";",expand=True)[0]
df_known_only["parent"]=df_known_only.attributes.str.split("Parent=",expand=True)[1].str.split(";",expand=True)[0]
df_known_only["geneID"]=np.where(df_known_only["type"].isin(['transcript','exon','CDS']),df_known_only.parent.str.extract('(CHS.(\d)*)',expand=True)[0],df_known_only['id'])
# first create a tmpdf_known_only (to be removed right after) of just the genes, so we can get their coding potential
tmp=df_known_only[df_known_only["type"]=="gene"].reset_index(drop=True)
# next let's extract information about the gene type
tmp["gene_type"]=tmp.attributes.str.split("GENE_TYPE=",expand=True)[1].str.split(";",expand=True)[0].str.strip("\n")
print("set of all gene_types in CHESS",set(tmp["gene_type"]))

# now we can get a subset of all known protein_coding genes
tmp=tmp[tmp["gene_type"]=="protein_coding"].reset_index(drop=True)
print("number of known protein-coding genes is: %d"%len(tmp))
# now we just need to get geneIDs for the protein_coding sequences
setProtGenes_known_only=set(tmp["geneID"])
del tmp
df_known_only=df_known_only[df_known_only["geneID"].isin(setProtGenes_known_only)].reset_index(drop=True)

# next we would like to test whether all transcripts in these known protein-coding genes contain a CDS
# first get a set of parent IDs for all CDSs
cdsParents_known_only=set(df_known_only[df_known_only["type"]=="CDS"]["parent"])
# now form a set of IDs for all transcripts
transIDs_known_only=set(df_known_only[df_known_only["type"]=="transcript"]["id"])
print("the number of transcripts in protein-coding known genes is: %d\nthe number of CDSs associated with transcripts in protein-coding known genes is %d"%(len(transIDs_known_only),len(cdsParents_known_only)))

cdsDF=df_known_only[df_known_only['type']=='CDS'].reset_index(drop=True)

cdsGDF=cdsDF.groupby("parent").agg({'start':formBlocks,'end':formBlocks}).reset_index()
mergedDF=cdsDF[['seqid','parent','strand']].merge(cdsGDF,on='parent',how='outer',indicator=True)
assert len(mergedDF[mergedDF["_merge"]=="both"])==len(mergedDF), "ids don't match"
mergedDF.drop_duplicates(inplace=True)
mergedDF.reset_index(drop=True,inplace=True)
mergedDF.drop('_merge',inplace=True,axis=1)

mergedDF['uid']=mergedDF["start"]+"-"+mergedDF["end"]
mergedDF=mergedDF[['seqid','strand','parent','uid']]

# now we can extract gene IDs and groupby them
# we should count the total number of isoforms included
# as well as the number of unique ORFs present for that gene
mergedDF["gID"]=mergedDF["parent"].reset_index(drop=True).str.extract('(CHS.(\d)*)',expand=True)[0]
setKnownTranscripts=set(mergedDF["parent"])
setKnownGenes=set(mergedDF["gID"])
# now let's get rid of any duplicates in this dataframe
mergedDF.drop_duplicates(['seqid','strand','gID','uid'],inplace=True)
mergedDF.reset_index(drop=True,inplace=True)

{'StringTie', 'FANTOM', 'Gnomon', 'BestRefSeq', 'CHESS', 'GENCODE', 'HAVANA', 'RefSeq', 'ENSEMBL', 'Curated Genomic'}
set of all gene_types in CHESS {'misc_RNA', 'protein_coding', 'lncRNA', 'antisense_RNA'}
number of known protein-coding genes is: 22659
the number of transcripts in protein-coding known genes is: 169965
the number of CDSs associated with transcripts in protein-coding known genes is 129847


In [27]:
#================STEP2================
# Building a dataframe of all novel transcripts that were added to known genes

df=pd.read_csv("./chess2.03.gff",sep="\t",names=gff3Cols)
df.dropna(inplace=True,axis=0)
df.reset_index(inplace=True,drop=True)
df["start"]=df["start"].astype(int)
df["end"]=df["end"].astype(int)
df["id"]=df.attributes.str.split("ID=",expand=True)[1].str.split(";",expand=True)[0]
df["parent"]=df.attributes.str.split("Parent=",expand=True)[1].str.split(";",expand=True)[0]
df["geneID"]=np.where(df["type"].isin(['transcript','exon','CDS']),df.parent.str.extract('(CHS.(\d)*)',expand=True)[0],df['id'])

df=df[(df["geneID"].isin(setKnownGenes))&(df["source"].isin(novelSources))&~((df["parent"].isin(setKnownTranscripts))|(df["id"].isin(setKnownTranscripts)))].reset_index(drop=True)

# now we can further refine the original mergedDF by removing any genes that have no novel transcripts
mergedDF=mergedDF[mergedDF['gID'].isin(set(df["geneID"]))].reset_index(drop=True)
print("the final number of known genes we are examining is: %d"%(len(set(mergedDF["gID"]))))

exonDF=df[df['type']=='exon'].reset_index(drop=True)

exonGDF=exonDF.groupby("parent").agg({'start':formBlocks,'end':formBlocks}).reset_index()
mergedExonDF=exonDF[['seqid','parent','strand']].merge(exonGDF,on='parent',how='outer',indicator=True)
assert len(mergedExonDF[mergedExonDF["_merge"]=="both"])==len(mergedExonDF), "ids don't match"
mergedExonDF.drop_duplicates(inplace=True)
mergedExonDF.reset_index(drop=True,inplace=True)
mergedExonDF.drop('_merge',inplace=True,axis=1)

mergedExonDF['uid']=mergedExonDF["start"]+"-"+mergedExonDF["end"]
mergedExonDF=mergedExonDF[['seqid','strand','parent','uid']]

# now we can extract gene IDs and groupby them
# we should count the total number of isoforms included
# as well as the number of unique ORFs present for that gene
mergedExonDF["gID"]=mergedExonDF["parent"].reset_index(drop=True).str.extract('(CHS.(\d)*)',expand=True)[0]
# now let's get rid of any duplicates in this dataframe
mergedExonDF.drop_duplicates(['seqid','strand','gID','uid'],inplace=True)
mergedExonDF.reset_index(drop=True,inplace=True)

print("the total number of novel transcripts in known genes we are examining is: %d"%(len(set(mergedExonDF["parent"]))))

the final number of known genes we are examining is: 16496
the total number of novel transcripts in known genes we are examining is: 95974


In [28]:
#================STEP3================
# Now we create a dictionary of all exon chains of known transcripts on known genes
geneBlocks=mergedDF.groupby("gID").agg({'uid':groupBlocks}).reset_index()
knownChains=pd.Series(geneBlocks.uid.values,index=geneBlocks.gID).to_dict()

In [35]:
#================STEP4================
# now perform the comparrison between novel and known isoforms
mergedExonDF["match"]=np.nan
mergedExonDF["startTrim"]=np.nan
mergedExonDF["nonesenseTrim"]=np.nan
mergedExonDF["numExonsStart"]=np.nan
mergedExonDF["numExonsEnd"]=np.nan
mergedExonDF["skippedExonBases_start"]=np.nan
mergedExonDF["skippedExonBases_end"]=np.nan
mergedExonDF[["match","startTrim","nonesenseTrim","numExonsStart","numExonsEnd","skippedExonBases_start","skippedExonBases_end","nmd"]]=mergedExonDF.apply(lambda row: compareNovelToKnown(row,knownChains),axis=1)
print("percent good:",len(mergedExonDF[(mergedExonDF["match"]==0)|(mergedExonDF["match"]==1)])/len(mergedExonDF))
print("number good: ",len(mergedExonDF[(mergedExonDF["match"]==0)|(mergedExonDF["match"]==1)]))
print("percent bad: ",len(mergedExonDF[mergedExonDF["match"]==2])/len(mergedExonDF))
print("number bad: ",len(mergedExonDF[mergedExonDF["match"]==2]))
print("perfect transcripts:",len(mergedExonDF[(mergedExonDF["match"].isin([0,1]))&(mergedExonDF["startTrim"]==0)])/len(mergedExonDF))
print("number of perfect transcripts: ",len(mergedExonDF[(mergedExonDF["match"].isin([0,1]))&(mergedExonDF["startTrim"]==0)]))
mergedExonDF.head()

percent good: 0.5400525142226019
number good:  51831
percent bad:  0.44286994394315127
number bad:  42504
perfect transcripts: 0.5241315356242315
number of perfect transcripts:  50303


Unnamed: 0,seqid,strand,parent,uid,gID,match,startTrim,nonesenseTrim,numExonsStart,numExonsEnd,skippedExonBases_start,skippedExonBases_end,nmd
0,chr1,-,CHS.23.9,"732160,732981,736713,740187-732211,733213,7367...",CHS.23,2,214,0,2,0,149,0,215
1,chr1,-,CHS.23.10,"733082,736713,738834,739803,740129,743286,7466...",CHS.23,2,149,0,1,0,149,0,62
2,chr1,-,CHS.37.3,"916309,925518-923461,925604",CHS.37,1,1197,0,2,0,60,0,7147
3,chr1,-,CHS.37.5,"922909,924651-923461,924937",CHS.37,2,0,0,0,0,0,0,547
4,chr1,+,CHS.39.1,"923923,925922,930155,931039,935772,939040,9392...",CHS.39,0,0,0,0,0,0,0,1476


In [30]:
#================STEP5================
# Now perform the comparison within the CDS sequences of known isoforms
mergedDF["cds2"]=mergedDF["parent"]+"$"+mergedDF["uid"]
geneCDSBlocks=mergedDF.groupby("gID").agg({'cds2':groupBlocks}).reset_index()
knownCDSChains=pd.Series(geneCDSBlocks.cds2.values,index=geneCDSBlocks.gID).to_dict()

In [31]:
geneStrands=mergedDF.groupby("gID")[["strand"]].first().reset_index()
knownGeneStrands=pd.Series(geneStrands.strand.values,index=geneStrands.gID).to_dict()

In [32]:
resDF=pd.DataFrame([])
for g in set(knownCDSChains):
    curChains=knownCDSChains[g]
    if len(curChains)==1:
        resDF=pd.concat([resDF,pd.DataFrame([[curChains[0].split("$")[0],99999]])])
    else:
        for idx in range(len(curChains)):
            curParent,curChain=curChains[idx].split("$")
            rest=curChains[:idx]+curChains[idx+1:]
#             resDF["outOfBounds"]=resDF.apply(lambda row: outOfBounds(knownChains[row['gID']],row['uid']),axis=1)
            resDF=pd.concat([resDF,pd.DataFrame([[curParent]+compareKnownToKnown(curChain,rest,knownGeneStrands[g])])])

resDF.columns=['parent','match','startTrim','endTrim',"neStart","neEnd","skippedExonBases_start","skippedExonBases_end","nmd"]
resDF.reset_index(drop=True,inplace=True)

In [33]:
mergedExonDF.to_csv("./revision/mergedExonDF_2.03.csv")
resDF.to_csv("./revision/resDF_2.03.csv")

In [5]:
mergedExonDF=pd.read_csv("./revision/mergedExonDF_2.03.csv")
resDF=pd.read_csv("./revision/resDF.csv")
mergedExonDF.drop("Unnamed: 0",axis=1,inplace=True)
resDF.drop("Unnamed: 0",axis=1,inplace=True)

print("known incompatibilities: ",len(resDF[resDF["match"].isin([7777777,9999999])]))
print("novel incompatibilities: ",len(mergedExonDF[mergedExonDF["match"].isin([7777777,99999999])]))

print("\n")

print("known good: ",len(resDF[resDF['match'].isin([0,1])])/len(resDF))
print("known number good: ",len(resDF[(resDF["match"]==0)|(resDF["match"]==1)]))
print("novel good: ",len(mergedExonDF[mergedExonDF['match'].isin([0,1])])/len(mergedExonDF))
print("novel number good: ",len(mergedExonDF[(mergedExonDF["match"]==0)|(mergedExonDF["match"]==1)]))

print("\n")

print("known bad:",len(resDF[resDF['match']==2])/len(resDF))
print("known number bad: ",len(resDF[resDF["match"]==2]))
print("novel bad:",len(mergedExonDF[mergedExonDF['match']==2])/len(mergedExonDF))
print("novel number bad: ",len(mergedExonDF[mergedExonDF["match"]==2]))

print("\n")

print("known other",len(resDF[~(resDF['match'].isin([0,1,2]))])/len(resDF))
print("known number other",len(resDF[~(resDF['match'].isin([0,1,2]))]))
print("novel other",len(mergedExonDF[~(mergedExonDF['match'].isin([0,1,2]))])/len(mergedExonDF))
print("novel number other",len(mergedExonDF[~(mergedExonDF['match'].isin([0,1,2]))]))

print("\n")

print("known perfect transcripts with indel:",len(resDF[(resDF["match"].isin([0,1]))&(resDF["startTrim"]==0)])/len(resDF))
print("known number of perfect transcripts with indel: ",len(resDF[(resDF["match"].isin([0,1]))&(resDF["startTrim"]==0)]))
print("novel perfect transcripts with indel:",len(mergedExonDF[(mergedExonDF["match"].isin([0,1]))&(mergedExonDF["startTrim"]==0)])/len(mergedExonDF))
print("novel number of perfect transcripts with indel: ",len(mergedExonDF[(mergedExonDF["match"].isin([0,1]))&(mergedExonDF["startTrim"]==0)]))

print("\n")

print("known perfect transcripts without indel:",len(resDF[(resDF["match"].isin([0]))&(resDF["startTrim"]==0)])/len(resDF))
print("known number of perfect transcripts without indel: ",len(resDF[(resDF["match"].isin([0]))&(resDF["startTrim"]==0)]))
print("novel perfect transcripts without indel:",len(mergedExonDF[(mergedExonDF["match"].isin([0]))&(mergedExonDF["startTrim"]==0)])/len(mergedExonDF))
print("novel number of perfect transcripts without indel: ",len(mergedExonDF[(mergedExonDF["match"].isin([0]))&(mergedExonDF["startTrim"]==0)]))

known incompatibilities:  61
novel incompatibilities:  1023


known good:  0.8499056454030793
known number good:  96381
novel good:  0.5400525142226019
novel number good:  51831


known bad: 0.1310382532935927
known number bad:  14860
novel bad: 0.44286994394315127
novel number bad:  42504


known other 0.019056101303327983
known number other 2161
novel other 0.017077541834246774
novel number other 1639


known perfect transcripts with indel: 0.8305409075677678
known number of perfect transcripts with indel:  94185
novel perfect transcripts with indel: 0.5241315356242315
novel number of perfect transcripts with indel:  50303


known perfect transcripts without indel: 0.6740886404119857
known number of perfect transcripts without indel:  76443
novel perfect transcripts without indel: 0.4088815720924417
novel number of perfect transcripts without indel:  39242


In [32]:
resDF["gID"]=resDF.parent.str.extract('(CHS.(\d)*)',expand=True)[0]
print(len(set(resDF["gID"])))
print(len(set(mergedExonDF["gID"])))

16496
16495


In [8]:
df=pd.read_csv("./chess2.02.gff",sep="\t",names=gff3Cols)
df.dropna(inplace=True,axis=0)
df.reset_index(inplace=True,drop=True)
df=df[df["type"]=='CDS'].reset_index(drop=True)
df["id"]=df.attributes.str.split("Parent=",expand=True)[1].str.split(";",expand=True)[0]
df["geneID"]=np.where(df["type"].isin(['transcript','exon','CDS']),df.id.str.extract('(CHS.(\d)*)',expand=True)[0],df['id'])

In [34]:
# now group by geneID to compute the number of transcripts per gene
df2=df.groupby("geneID").count().reset_index()
df2=df2[(df2['seqid']>1)&(df2['geneID'].isin(set(mergedExonDF["gID"])))].reset_index(drop=True)
setGenes=set(df2['geneID'])
print(len(set(df[df['geneID'].isin(setGenes)]["id"])))
print(len(set(df[df["geneID"].isin(setGenes)]["geneID"])))

113073
16166


In [18]:
resDF=resDF.dropna(axis=0)
resDF=resDF[~(resDF["startTrim"]==7777777)].reset_index(drop=True)

In [19]:
mergedExonsDF=mergedExonDF.dropna(axis=0)
mergedExonsDF=mergedExonDF[~(mergedExonDF["startTrim"]==7777777)].reset_index(drop=True)

In [20]:
print("perfect transcripts:",len(mergedExonDF[(mergedExonDF["match"].isin([0,1]))&(mergedExonDF["startTrim"]==0)&(mergedExonDF["nonesenseTrim"]==0)])/len(mergedExonDF))
# looking at cases where hisat might have potentially been the reason for incomplete transcript
display(mergedExonDF[mergedExonDF["gID"]=="CHS.10130"].reset_index(drop=True))
print("potential hisat2 errors (soft-clip?)",len(mergedExonDF[(mergedExonDF["startTrim"]<=100)&(mergedExonDF["startTrim"]>0)]))
print("potential errors at end", len(mergedExonDF[(mergedExonDF["nonesenseTrim"]<=1)&(mergedExonDF["nonesenseTrim"]>0)]))
mergedExonDF[(mergedExonDF["nonesenseTrim"]<2)&(mergedExonDF["nonesenseTrim"]>0)]

perfect transcripts: 0.45617563090003543


Unnamed: 0,seqid,strand,parent,uid,gID,match,startTrim,nonesenseTrim,numExonsStart,numExonsEnd,skippedExonBases_start,skippedExonBases_end,nmd
0,chr11,+,CHS.10130.2,"118527495,118528502,118530654-118527612,118528...",CHS.10130,2,0,0,0,0,0,0,0


potential hisat2 errors (soft-clip?) 4132
potential errors at end 116


Unnamed: 0,seqid,strand,parent,uid,gID,match,startTrim,nonesenseTrim,numExonsStart,numExonsEnd,skippedExonBases_start,skippedExonBases_end,nmd
9461,chr1,-,CHS.4714.5,"225886296,225887399,225887786,225911119-225887...",CHS.4714,2,0,1,0,0,0,0,1035
9463,chr1,-,CHS.4714.7,"225886992,225887399,225887786,225900806-225887...",CHS.4714,2,0,1,0,0,0,0,339
9854,chr1,-,CHS.5013.4,"236215566,236217391,236220832,236221924,236225...",CHS.5013,2,0,1,0,0,0,0,2984
9855,chr1,-,CHS.5013.5,"236215571,236217391,236220832,236221924,236225...",CHS.5013,2,0,1,0,0,0,0,2979
13779,chr10,-,CHS.7569.4,"131967685,131970638,131970864,131973034,131973...",CHS.7569,2,0,1,0,0,0,0,885
13780,chr10,-,CHS.7569.6,"131967685,131970638,131970864,131973034,131973...",CHS.7569,2,0,1,0,0,0,0,1635
13781,chr10,-,CHS.7569.7,"131967685,131970638,131970864,131973034,131973...",CHS.7569,2,0,1,0,0,0,0,1648
13782,chr10,-,CHS.7569.8,"131967685,131970638,131970864,131973034,131973...",CHS.7569,2,0,1,0,0,0,0,885
13783,chr10,-,CHS.7569.9,"131967685,131970638,131970864,131973034,131973...",CHS.7569,2,0,1,0,0,0,0,946
13785,chr10,-,CHS.7569.11,"131967690,131970638,131970864,131973034,131973...",CHS.7569,2,163,1,1,0,163,0,1187


In [None]:
# gencode biotype tab - nonsense_mediated_decay definition

In [21]:
mergedExonDF[(mergedExonDF["nonesenseTrim"]<=1)&(mergedExonDF["nonesenseTrim"]>0)]

Unnamed: 0,seqid,strand,parent,uid,gID,match,startTrim,nonesenseTrim,numExonsStart,numExonsEnd,skippedExonBases_start,skippedExonBases_end,nmd
9461,chr1,-,CHS.4714.5,"225886296,225887399,225887786,225911119-225887...",CHS.4714,2,0,1,0,0,0,0,1035
9463,chr1,-,CHS.4714.7,"225886992,225887399,225887786,225900806-225887...",CHS.4714,2,0,1,0,0,0,0,339
9854,chr1,-,CHS.5013.4,"236215566,236217391,236220832,236221924,236225...",CHS.5013,2,0,1,0,0,0,0,2984
9855,chr1,-,CHS.5013.5,"236215571,236217391,236220832,236221924,236225...",CHS.5013,2,0,1,0,0,0,0,2979
13779,chr10,-,CHS.7569.4,"131967685,131970638,131970864,131973034,131973...",CHS.7569,2,0,1,0,0,0,0,885
13780,chr10,-,CHS.7569.6,"131967685,131970638,131970864,131973034,131973...",CHS.7569,2,0,1,0,0,0,0,1635
13781,chr10,-,CHS.7569.7,"131967685,131970638,131970864,131973034,131973...",CHS.7569,2,0,1,0,0,0,0,1648
13782,chr10,-,CHS.7569.8,"131967685,131970638,131970864,131973034,131973...",CHS.7569,2,0,1,0,0,0,0,885
13783,chr10,-,CHS.7569.9,"131967685,131970638,131970864,131973034,131973...",CHS.7569,2,0,1,0,0,0,0,946
13785,chr10,-,CHS.7569.11,"131967690,131970638,131970864,131973034,131973...",CHS.7569,2,163,1,1,0,163,0,1187


In [22]:
print("perfect transcripts:",len(resDF[(resDF["match"].isin([0,1]))&(resDF["startTrim"]==0)&(resDF["endTrim"]==0)])/len(resDF))
# looking at cases where hisat might have potentially been the reason for incomplete transcript
# display(resDF[resDF["gID"]=="CHS.10130"].reset_index(drop=True))
print("potential errors at start",len(resDF[(resDF["startTrim"]<=1)&(resDF["startTrim"]>0)]))
display(resDF[(resDF["startTrim"]<=1)&(resDF["startTrim"]>0)])
print("potential errors at end", len(resDF[(resDF["endTrim"]<=1)&(resDF["endTrim"]>0)]))

perfect transcripts: 0.5477443972179289
potential errors at start 38


Unnamed: 0,parent,match,startTrim,endTrim,neStart,neEnd,skippedExonBases_start,skippedExonBases_end,nmd
1201,CHS.38571.3,2,1.0,0.0,1.0,0.0,1.0,0.0,0.0
8997,CHS.57239.9,0,1.0,1036.0,0.0,5.0,0.0,175.0,0.0
11000,CHS.41084.9,2,1.0,0.0,1.0,0.0,1.0,0.0,0.0
12959,CHS.9843.5,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0
14387,CHS.17100.9,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0
16496,CHS.35527.16,0,1.0,922.0,0.0,10.0,0.0,128.0,0.0
16843,CHS.31657.17,2,1.0,53.0,0.0,0.0,0.0,0.0,0.0
16908,CHS.27184.12,0,1.0,1345.0,0.0,3.0,0.0,1093.0,0.0
24877,CHS.10501.14,2,1.0,0.0,1.0,0.0,1.0,0.0,0.0
24879,CHS.10501.17,2,1.0,0.0,1.0,0.0,1.0,0.0,0.0


potential errors at end 195


In [14]:
print("good: ",len(resDF[resDF['match'].isin([0,1])])/len(resDF))
print("bad: ",len(resDF[resDF['match']==2])/len(resDF))
print("other: ",len(resDF[resDF['match']==7777777])/len(resDF))

good:  0.7693466280590323
bad:  0.19804315337194098
other:  0.0003619465720156921


In [15]:
resDF[resDF['match']==7777777]

Unnamed: 0,parent,match,startTrim,endTrim,neStart,neEnd,skippedExonBases_start,skippedExonBases_end
3144,CHS.46653.23,7777777,7777777.0,7777777.0,7777777.0,7777777.0,7777777.0,7777777.0
7173,CHS.50850.9,7777777,7777777.0,7777777.0,7777777.0,7777777.0,7777777.0,7777777.0
9877,CHS.19079.2,7777777,7777777.0,7777777.0,7777777.0,7777777.0,7777777.0,7777777.0
9886,CHS.22752.8,7777777,7777777.0,7777777.0,7777777.0,7777777.0,7777777.0,7777777.0
10109,CHS.36693.5,7777777,7777777.0,7777777.0,7777777.0,7777777.0,7777777.0,7777777.0
14389,CHS.51.6,7777777,7777777.0,7777777.0,7777777.0,7777777.0,7777777.0,7777777.0
15038,CHS.20975.10,7777777,7777777.0,7777777.0,7777777.0,7777777.0,7777777.0,7777777.0
18706,CHS.7743.1,7777777,7777777.0,7777777.0,7777777.0,7777777.0,7777777.0,7777777.0
18794,CHS.29592.13,7777777,7777777.0,7777777.0,7777777.0,7777777.0,7777777.0,7777777.0
21614,CHS.26404.1,7777777,7777777.0,7777777.0,7777777.0,7777777.0,7777777.0,7777777.0


In [3]:
# lastly let's compare known ORFtoORF and known ExonToORF
odf=pd.read_csv("./revision/resDF.csv")
odf.drop("Unnamed: 0",axis=1,inplace=True)
odf.sort_values(by="parent",inplace=True)
edf=pd.read_csv("./revision/knownExonToCDS_2.csv")
edf.drop("Unnamed: 0",axis=1,inplace=True)
edf=edf[["parent","match","startTrim","nonesenseTrim","numExonsStart","numExonsEnd","skippedExonBases_start","skippedExonBases_end","nmd"]]
edf.columns=["parent","match","startTrim","endTrim","neStart","neEnd","skippedExonBases_start","skippedExonBases_end","nmd"]
edf.sort_values(by="parent",inplace=True)

print(len(odf))
print(len(edf))

edf.replace(999999999.0,99999.0,inplace=True)
mdf=edf.merge(odf,on="parent",how="outer",indicator=True)
mdf=mdf[mdf["_merge"]=="both"].reset_index(drop=True)
mdf=mdf[~(mdf["match_x"]==mdf["match_y"])].reset_index(drop=True)
mdf["geneID"]=mdf.parent.str.extract('(CHS.(\d)*)',expand=True)[0]
print(len(mdf))
mdf.head()

113402
150111
4221


Unnamed: 0,parent,match_x,startTrim_x,endTrim_x,neStart_x,neEnd_x,skippedExonBases_start_x,skippedExonBases_end_x,nmd_x,match_y,startTrim_y,endTrim_y,neStart_y,neEnd_y,skippedExonBases_start_y,skippedExonBases_end_y,nmd_y,_merge,geneID
0,CHS.10003.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1887.0,2.0,450.0,0.0,5.0,0.0,58.0,0.0,0.0,both,CHS.10003
1,CHS.10028.7,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,363.0,0.0,1.0,0.0,363.0,0.0,both,CHS.10028
2,CHS.10030.21,0.0,0.0,746.0,0.0,2.0,0.0,533.0,0.0,2.0,0.0,796.0,0.0,2.0,0.0,536.0,0.0,both,CHS.10030
3,CHS.10036.26,0.0,0.0,770.0,0.0,5.0,0.0,119.0,0.0,2.0,0.0,1120.0,0.0,7.0,0.0,119.0,0.0,both,CHS.10036
4,CHS.1005.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,both,CHS.1005


In [4]:
# ok, clearly something is fundamentally wrong here
# we need to find out what the hell it is

# let's try to find a gene with the smalles number of transcripts quickly
# we can use that gene to manually inspect all the differences
df=pd.read_csv("./chess2.02.gff",sep="\t",names=gff3Cols)
df.dropna(inplace=True,axis=0)
df.reset_index(inplace=True,drop=True)
df["start"]=df["start"].astype(int)
df["end"]=df["end"].astype(int)
df["id"]=df.attributes.str.split("ID=",expand=True)[1].str.split(";",expand=True)[0]
df["parent"]=df.attributes.str.split("Parent=",expand=True)[1].str.split(";",expand=True)[0]
df["geneID"]=np.where(df["type"].isin(['transcript','exon','CDS']),df.parent.str.extract('(CHS.(\d)*)',expand=True)[0],df['id'])

# now need to get transcript counts for those
df=df[df["type"]=="exon"].reset_index(drop=True) # focus on exons, this way we can also evaluate the number of exons - ideally as few as possible
edf=df[["parent","source"]].groupby(by='parent').count()
edf.reset_index(inplace=True)
# now group by geneID
edf["geneID"]=edf.parent.str.extract('(CHS.(\d)*)',expand=True)[0]
countsDF=edf.groupby("geneID").agg({"source":["count","sum","mean"]}) # sum of all exons and count for the number of transcripts per gene
countsDF.reset_index(inplace=True)
countsDF.columns=['geneID','numTrans','totalNumExons','meanNumExons']
countsDF=countsDF[countsDF["geneID"].isin(set(mdf["geneID"]))].reset_index(drop=True)
countsDF.sort_values(by=["numTrans","meanNumExons"],ascending=True,inplace=True)
countsDF.reset_index(drop=True,inplace=True)
countsDF

Unnamed: 0,geneID,numTrans,totalNumExons,meanNumExons
0,CHS.32423,3,5,1.666667
1,CHS.13628,3,6,2.000000
2,CHS.14540,3,7,2.333333
3,CHS.38696,3,7,2.333333
4,CHS.50055,3,7,2.333333
5,CHS.29030,3,9,3.000000
6,CHS.33527,3,9,3.000000
7,CHS.58210,3,9,3.000000
8,CHS.51484,3,11,3.666667
9,CHS.17952,3,12,4.000000


In [6]:
odf=pd.read_csv("./revision/resDF.csv")
odf.drop("Unnamed: 0",axis=1,inplace=True)
odf.sort_values(by="parent",inplace=True)
edf=pd.read_csv("./revision/knownExonToCDS_2.csv")
edf.drop("Unnamed: 0",axis=1,inplace=True)
edf=edf[["parent","match","startTrim","nonesenseTrim","numExonsStart","numExonsEnd","skippedExonBases_start","skippedExonBases_end","nmd"]]
edf.columns=["parent","match","startTrim","endTrim","neStart","neEnd","skippedExonBases_start","skippedExonBases_end","nmd"]
edf.sort_values(by="parent",inplace=True)

In [7]:
odf["geneID"]=odf.parent.str.extract('(CHS.(\d)*)',expand=True)[0]
edf["geneID"]=edf.parent.str.extract('(CHS.(\d)*)',expand=True)[0]
display(odf[odf["geneID"]=="CHS.32423"])
display(edf[edf["geneID"]=="CHS.32423"])
display(mdf[mdf["geneID"]=="CHS.32423"])

Unnamed: 0,parent,match,startTrim,endTrim,neStart,neEnd,skippedExonBases_start,skippedExonBases_end,nmd,geneID
91531,CHS.32423.2,0,0.0,31.0,0.0,1.0,0.0,31.0,0.0,CHS.32423
91532,CHS.32423.3,2,0.0,4.0,0.0,0.0,0.0,0.0,31.0,CHS.32423


Unnamed: 0,parent,match,startTrim,endTrim,neStart,neEnd,skippedExonBases_start,skippedExonBases_end,nmd,geneID
89253,CHS.32423.2,2,0,0,0,0,0,0,2765,CHS.32423
89254,CHS.32423.3,2,0,4,0,0,0,0,2796,CHS.32423


Unnamed: 0,parent,match_x,startTrim_x,endTrim_x,neStart_x,neEnd_x,skippedExonBases_start_x,skippedExonBases_end_x,nmd_x,match_y,startTrim_y,endTrim_y,neStart_y,neEnd_y,skippedExonBases_start_y,skippedExonBases_end_y,nmd_y,_merge,geneID
2028,CHS.32423.2,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2765.0,0.0,0.0,31.0,0.0,1.0,0.0,31.0,0.0,both,CHS.32423
