In [1]:
import pandas as pd
import numpy as np
import subprocess
import sys
import os
import csv
import matplotlib.pyplot as plt
from matplotlib import animation
import seaborn as sns
plt.rcParams['figure.figsize'] = (20.0, 10.0)
plt.rcParams['font.family'] = "serif"
%matplotlib inline

# names of the nine tab-separated GFF3 columns; passed as `names=` when the annotation is read
gff3Cols=["seqid","source","type","start","end","score","strand","phase","attributes"]
# values of the `source` column that are treated as novel; rows from these sources are filtered out below
novelSources=['CHESS','StringTie','FANTOM']

We shall investigate whether novel transcripts of known genes (RefSeq and GENCODE) tend to preserve the original ORF
or instead result in a novel protein.

Later we shall also investigate whether novel genes contain multiple ORFs or whether their different isoforms
tend to be ORF-compatible.

To achieve this, the following must be done:
1. we need to extract all genes and the corresponding transcript, CDS and exon entries
    - this must be done only for protein-coding genes


eventually we need to look at whether novel ORFs introduce a frameshift or not

In [2]:
def formBlocks(series):
    """Return the values of `series` in ascending order as one comma-separated string."""
    ordered = series.tolist()
    ordered.sort()
    return ",".join(map(str, ordered))

def getUniqueOrfCount(series):
    """Return how many distinct values `series` contains."""
    distinct = set()
    distinct.update(series.tolist())
    return len(distinct)

In [3]:
def gffAttribute(attributes,key):
    """Extract the value of one `key=value` pair from GFF3 attribute strings.

    Parameters
    ----------
    attributes : pandas.Series of str
        The GFF3 `attributes` column (`;`-separated `key=value` pairs).
    key : str
        Attribute name to look up, e.g. "ID" or "Parent".

    Returns
    -------
    pandas.Series
        The value for each row, NaN where the key is absent.

    The key is anchored to the start of the string or to a preceding ';' so that
    e.g. "ID" does not accidentally match inside another key ending in "ID".
    """
    return attributes.str.extract(r"(?:^|;)\s*"+key+r"=([^;]*)",expand=True)[0]

# first build a dataframe of all known transcripts only
df=pd.read_csv("./chess2.02.gff",sep="\t",names=gff3Cols)
# drop every row with a missing field (presumably the '#' header lines of the GFF, which do not fill all nine columns)
df=df.dropna(axis=0).reset_index(drop=True)
df["start"]=df["start"].astype(int)
df["end"]=df["end"].astype(int)

# next we shall create the subset of all non-novel genes
print(set(df["source"]))
df=df[~(df['source'].isin(novelSources))].reset_index(drop=True)

# now let's isolate the protein-coding genes and their transcripts, exons and CDS

# first let's extract IDs ("id" and "parent" are kept on df: later cells group CDS rows by "parent")
df["id"]=gffAttribute(df["attributes"],"ID")
df["parent"]=gffAttribute(df["attributes"],"Parent")
# transcripts, exons and CDSs get the gene ID from the "CHS.<number>" prefix of their parent;
# every other row type (i.e. genes) uses its own ID
df["geneID"]=np.where(df["type"].isin(['transcript','exon','CDS']),df["parent"].str.extract(r'(CHS\.\d+)',expand=True)[0],df['id'])

# temporary frame of just the gene rows (removed right after), so we can get their coding potential
geneDF=df[df["type"]=="gene"].reset_index(drop=True)
# next let's extract information about the gene type
geneDF["gene_type"]=gffAttribute(geneDF["attributes"],"GENE_TYPE").str.strip("\n")
print("set of all gene_types in CHESS",set(geneDF["gene_type"]))

# now we can get a subset of all known protein_coding genes
geneDF=geneDF[geneDF["gene_type"]=="protein_coding"].reset_index(drop=True)
print("number of known protein-coding genes is: %d"%len(geneDF))
# now we just need to get geneIDs for the protein_coding sequences
setProtGenes=set(geneDF["geneID"])
del geneDF
df=df[df["geneID"].isin(setProtGenes)].reset_index(drop=True)

# next we would like to test whether all transcripts in these known protein-coding genes contain a CDS

# first get a set of parent IDs for all CDSs
cdsParents=set(df[df["type"]=="CDS"]["parent"])
# now form a set of IDs for all transcripts
transIDs=set(df[df["type"]=="transcript"]["id"])
print("the number of transcripts in protein-coding known genes is: %d\nthe number of CDSs associated with transcripts in protein-coding known genes is %d"%(len(transIDs),len(cdsParents)))

{'Gnomon', 'HAVANA', 'CHESS', 'StringTie', 'GENCODE', 'BestRefSeq', 'Curated Genomic', 'RefSeq', 'FANTOM', 'ENSEMBL'}
set of all gene_types in CHESS {'lncRNA', 'misc_RNA', 'antisense_RNA', 'protein_coding'}
number of known protein-coding genes is: 22659
the number of transcripts in protein-coding known genes is: 169965
the number of CDSs associated with transcripts in protein-coding known genes is 129847


In [4]:
def readKnownProteins(fastaPath,knownIsoforms):
    """Read protein sequences of the known isoforms from a FASTA file.

    Parameters
    ----------
    fastaPath : str
        Path to the protein FASTA file.
    knownIsoforms : set of str
        Isoform IDs to keep; records with any other ID are skipped.

    Returns
    -------
    dict
        Maps isoform ID -> full protein sequence (sequence lines concatenated).

    Raises
    ------
    AssertionError
        If the same known isoform ID appears in more than one FASTA record.
    """
    chunks={}
    curID=None # None while the current record is being skipped
    with open(fastaPath,"r") as inFP:
        for line in inFP: # stream the file instead of loading it whole
            if line[0]==">":
                # the isoform ID is the first tab-separated field of the header
                tmpID=line[1:].split("\t")[0].strip("\n")
                if tmpID in knownIsoforms:
                    assert tmpID not in chunks,"duplicate isoform"
                    chunks[tmpID]=[]
                    curID=tmpID
                else:
                    curID=None
            elif curID is not None:
                chunks[curID].append(line.strip("\n"))
    return {isoID:"".join(parts) for isoID,parts in chunks.items()}

# First order of business is to find out how to get ORFs out of the gffs

# one way would be to group all CDS-sequences from same transcript
# and to compile all starts and ends into format similar to tlst
# the tlst block format can then be used as a unique identifier of the intron chain as well as the orf in general
# if multiple transcripts exist for the same gene with the same CDS - those can later be merged as well.
# Otherwise a gene has isoforms coding different proteins
cdsDF=df[df['type']=='CDS'].reset_index(drop=True)

# one row per transcript: sorted CDS starts and sorted CDS ends as comma-separated strings
cdsGDF=cdsDF.groupby("parent").agg({'start':formBlocks,'end':formBlocks}).reset_index()
mergedDF=cdsDF[['seqid','parent','strand']].merge(cdsGDF,on='parent',how='outer',indicator=True)
assert len(mergedDF[mergedDF["_merge"]=="both"])==len(mergedDF), "ids don't match"
# every CDS row of a transcript now carries the same block strings - keep one row per transcript
mergedDF=mergedDF.drop_duplicates().reset_index(drop=True).drop('_merge',axis=1)

# unique ORF identifier: <seqid><strand><starts>-<ends>
mergedDF['uid']=mergedDF['seqid']+mergedDF['strand']+mergedDF["start"]+"-"+mergedDF["end"]
# explicit copy so the column added below does not write into a slice of the wider frame
mergedDF=mergedDF[['parent','uid']].copy()

# now we can extract gene IDs and groupby them
# we should count the total number of isoforms included
# as well as the number of unique ORFs present for that gene

# first get the gene ID (the "CHS.<number>" prefix of the transcript ID)
mergedDF["gID"]=mergedDF["parent"].str.extract(r'(CHS\.\d+)',expand=True)[0]

# now we shall group these sequences by the gene ID and compute things of interest
mergedGDF=mergedDF.groupby('gID').agg({'uid':[getUniqueOrfCount,'count']}).reset_index()
mergedGDF.columns=['gID','uniqueORFs','numTranscripts']

# now perform a protein search from gffread output
# the historical absolute path stays the default; set CHESS_PROTEIN_FASTA to run elsewhere
PROTEIN_FASTA=os.environ.get("CHESS_PROTEIN_FASTA","/home/sparrow/JHU/chess/www/data_update/chess2.02.protein.fa")
allKnownIso=set(mergedDF["parent"].tolist())
proteinSeqs=readKnownProteins(PROTEIN_FASTA,allKnownIso)

# collect the protein sequences of every gene; grouping through a dict (instead of flushing
# whenever the gene ID changes between consecutive records) also counts the very last
# record of the file and does not depend on a gene's records being contiguous
protsByGene={}
for isoID,seq in proteinSeqs.items():
    protsByGene.setdefault("CHS."+isoID.split(".")[1],[]).append(seq)

# genes without any protein record stay NaN and are caught by the assertion below
mergedGDF["numProteins"]=mergedGDF["gID"].map({g:len(seqs) for g,seqs in protsByGene.items()})
mergedGDF["numUniqueProteins"]=mergedGDF["gID"].map({g:len(set(seqs)) for g,seqs in protsByGene.items()})

assert len(mergedGDF[mergedGDF["numUniqueProteins"].isnull()])==0,"something is wrong with the proteins - null entries present"
mergedGDF["numProteins"]=mergedGDF["numProteins"].astype(int)
mergedGDF["numUniqueProteins"]=mergedGDF["numUniqueProteins"].astype(int)

# now here is an interesting observation
# Some of the proteins are the same despite having been translated from different exon chains
# (kept in a variable for inspection - a bare expression in the middle of a cell is not displayed)
orfProteinMismatch=mergedGDF[mergedGDF["uniqueORFs"]!=mergedGDF["numUniqueProteins"]]

# important to verify that the number of isoforms present in the fasta file is the same as the ones deduced
isoformCountMismatch=mergedGDF[mergedGDF["numProteins"]!=mergedGDF["numTranscripts"]]

# the ORF summary used to be recomputed here from the very same `df`, giving identical frames;
# the *_2 names and the resulting uniqueORFs_x/_y and numTranscripts_x/_y columns are kept
# so that resDF (and the CSV written from it) keeps its existing layout
mergedDF_2=mergedDF.copy()
mergedGDF_2=mergedGDF[['gID','uniqueORFs','numTranscripts']].copy()

resDF=mergedGDF.merge(right=mergedGDF_2,how="left",on="gID")
resDF

500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500
22000
22500


Unnamed: 0,gID,uniqueORFs_x,numTranscripts_x,numProteins,numUniqueProteins,uniqueORFs_y,numTranscripts_y
0,CHS.10000,8,8,8,8,8,8
1,CHS.10003,10,14,14,10,10,14
2,CHS.10004,6,6,6,6,6,6
3,CHS.10005,6,7,7,5,6,7
4,CHS.10008,6,7,7,6,6,7
5,CHS.10009,3,3,3,3,3,3
6,CHS.1001,3,8,8,3,3,8
7,CHS.10010,1,1,1,1,1,1
8,CHS.10011,28,30,30,28,28,30
9,CHS.10013,3,3,3,3,3,3


In [5]:
# persist the per-gene ORF/protein summary; with the default settings the DataFrame index
# is written as the first (unnamed) column. NOTE: the ./revision directory must already exist
resDF.to_csv("./revision/ORFpreservation.csv")

In [None]:
# One issue is that two transcripts may have different exon/intron chains
# while their ORFs nevertheless stay the same.
# Perhaps we should translate all ORFs and compare the proteins directly instead of relying on the intron chain,
# in order to learn whether there are any cases where two different chains result in the same ORF.

In [None]:
# next, let's see if all transcripts within protein-coding genes contain CDS sequences
# if so - no questions
# if not - is that also the case for novel protein-coding genes?