In [1]:
import os
import sys
import pandas as pd
import numpy as np
from shutil import copyfile

version="2.04"
gff3Cols=["seqid","source","type","start","end","score","strand","phase","attributes"]

In [2]:
# this notebook is dedicated to parsing the chess2.02.gff files, and introducing several improvements/corrections
# the corrections include:
# 1. Extension of the transcript start based on a suspected trimmed ORF (as inferred from the known isoforms)
# 2. Extension of the transctipt end based on a suspected trimming og the ORF (inferred from the known isoforms)
# 3. 

In [3]:
# first need to load the frame
df=pd.read_csv("./revision/mergedExonDF.csv")
df.drop("Unnamed: 0",axis=1,inplace=True)

# case 99999 should never exist for novel however, since all novel should have a corresponding known transcript
# assert len(df[df["match"]>2])==0, 'something is wrong witth the known and novel transcripts. Some novel transcript does not have a known pair'
df=df[df["match"].isin([0,1,2])].reset_index(drop=True)
# let's isolate suspicious cases such as 777777
setStartTrimShort=set(df[df["startTrim"]==7777777]["parent"])
setEndTrimShort=set(df[df["nonesenseTrim"]==7777777]["parent"])
assert setStartTrimShort==setEndTrimShort, "something is wrong with the ORF fitting inside the novel transcript"

print("number of transcripts with extremely short or non-fitting ORFs is: ",len(setStartTrimShort))
print("total number of transcripts considered: ",len(df))

number of transcripts with extremely short or non-fitting ORFs is:  0
total number of transcripts considered:  94337


In [4]:
# now we need to work with the rest of the transcripts and correct their annotation in the CHESS.gff

# first let's isolate those transcripts for which the start is missing:
startDF=df[(df["startTrim"]<=100)&(df["startTrim"]>0)&(df["numExonsStart"]==0)].reset_index(drop=True)
endDF=df[(df["nonesenseTrim"]<=100)&(df["nonesenseTrim"]>0)&(df["numExonsEnd"]==0)].reset_index(drop=True)
print('numnber of transcripts with trimmed start AND end is ',len(set(startDF['parent']).intersection(set(endDF['parent']))))
print('total number of transcripts that require adjustment',len(set(startDF['parent']).union(set(endDF['parent']))))
# now need to build dictionaries linking each parent to the number of bases trimmed of either start OR end (separate dataframes)
startDict=pd.Series(startDF.startTrim.values,index=startDF.parent).to_dict()
endDict=pd.Series(endDF.nonesenseTrim.values,index=endDF.parent).to_dict()
setStarts=set(startDict) # to be used for quick lookup when traversing the GFF
setEnds=set(endDict) # to be used for quick lookup when traversing the GFF

numnber of transcripts with trimmed start AND end is  38
total number of transcripts that require adjustment 2971


In [5]:
len(startDF[startDF["startTrim"]<5])

278

In [6]:
outFP=open("./chess2.04.gff","w+")
tmpLine=None
with open("./chess2.03.gff","r") as fp:
    ifp=iter(fp)
    while True:
        try:
            line=""
            if tmpLine==None:
                line=next(ifp)
            else:
                line=tmpLine
                tmpLine=None
                
            if line[0]=="#":
                if line[:7]=="##CHESS":
                    outFP.write("##CHESS"+version+"\n")
                else:
                    outFP.write(line)
                continue
            else:
                lineCols=line.split("\t")
                if lineCols[2]=="transcript":
                    chsID=lineCols[8].split("ID=")[-1].split(";")[0].strip()
                    if chsID in setStarts:
                        # now we shall collect all the necessary parts (exons) for the given novel transcript
                        if lineCols[6]=="+":
                            # change right-most coordinate of the transcript
                            transStart=lineCols[3]
                            lineCols[3]=str(int(transStart)-startDict[chsID])
                            outFP.write("\t".join(lineCols))
                            foundExon=False
                            while True:
                                line=next(ifp)
                                if line[0]=="#":
                                    tmpLine=line
                                    assert foundExon==True,"- wrong element, something is wrong with the gff format "+chsID+" "+transStart
                                    break
                                lineCols=line.split("\t")
                                if not lineCols[2]=="exon":
                                    tmpLine=line
                                    assert foundExon==True,"+ wrong element, something is wrong with the gff format "+chsID+" "+transStart
                                    break
                                else:
                                    parentID=lineCols[8].split("Parent=")[-1].split(";")[-1].strip()
                                    assert parentID in setStarts and parentID==chsID, "wrong exon identified "+parentID+" "+chsID
                                    if lineCols[3]==transStart:
                                        lineCols[3]=str(int(lineCols[3])-startDict[chsID])
                                        foundExon=True
                                        outFP.write("\t".join(lineCols))
                                    else:
                                        outFP.write(line)
                        else:
                            # change left-most coordinate of the transcript
                            transEnd=lineCols[4]
                            lineCols[4]=str(int(transEnd)+startDict[chsID])
                            outFP.write("\t".join(lineCols))
                            foundExon=False
                            while True:
                                line=next(ifp)
                                if line[0]=="#":
                                    tmpLine=line
                                    assert foundExon==True,"- wrong element, something is wrong with the gff format "+chsID+" "+transEnd
                                    break
                                lineCols=line.split("\t")
                                if not lineCols[2]=="exon":
                                    tmpLine=line
                                    assert foundExon==True,"- wrong element, something is wrong with the gff format "+chsID+" "+transEnd
                                    break
                                else:
                                    parentID=lineCols[8].split("Parent=")[-1].split(";")[-1].strip()
                                    assert parentID in setStarts and parentID==chsID, "wrong exon identified "+parentID+" "+chsID
                                    if lineCols[4]==transEnd:
                                        lineCols[4]=str(int(lineCols[4])+startDict[chsID])
                                        foundExon=True
                                        outFP.write("\t".join(lineCols))
                                    else:
                                        outFP.write(line)
                                        
                    elif chsID in setEnds:
                        # now we shall collect all the necessary parts (exons) for the given novel transcript
                        if lineCols[6]=="+":
                            # change right-most coordinate of the transcript
                            transEnd=lineCols[4]
                            lineCols[4]=str(int(transEnd)+endDict[chsID])
                            outFP.write("\t".join(lineCols))
                            foundExon=False
                            while True:
                                line=next(ifp)
                                if line[0]=="#":
                                    tmpLine=line
                                    assert foundExon==True,"- wrong element, something is wrong with the gff format "+chsID+" "+transEnd
                                    break
                                lineCols=line.split("\t")
                                if not lineCols[2]=="exon":
                                    tmpLine=line
                                    assert foundExon==True,"+ wrong element, something is wrong with the gff format "+chsID+" "+transEnd
                                    break
                                else:
                                    parentID=lineCols[8].split("Parent=")[-1].split(";")[-1].strip()
                                    assert parentID in setEnds and parentID==chsID, "wrong exon identified "+parentID+" "+chsID
                                    if lineCols[4]==transEnd:
                                        lineCols[4]=str(int(lineCols[4])-endDict[chsID])
                                        foundExon=True
                                        outFP.write("\t".join(lineCols))
                                    else:
                                        outFP.write(line)
                        else:
                            # change left-most coordinate of the transcript
                            transStart=lineCols[3]
                            lineCols[3]=str(int(transStart)+endDict[chsID])
                            outFP.write("\t".join(lineCols))
                            foundExon=False
                            while True:
                                line=next(ifp)
                                if line[0]=="#":
                                    tmpLine=line
                                    assert foundExon==True,"- wrong element, something is wrong with the gff format "+chsID+" "+transStart
                                    break
                                lineCols=line.split("\t")
                                if not lineCols[2]=="exon":
                                    tmpLine=line
                                    assert foundExon==True,"- wrong element, something is wrong with the gff format "+chsID+" "+transStart
                                    break
                                else:
                                    parentID=lineCols[8].split("Parent=")[-1].split(";")[-1].strip()
                                    assert parentID in setEnds and parentID==chsID, "wrong exon identified "+parentID+" "+chsID
                                    if lineCols[3]==transStart:
                                        lineCols[3]=str(int(lineCols[3])+endDict[chsID])
                                        foundExon=True
                                        outFP.write("\t".join(lineCols))
                                    else:
                                        outFP.write(line)
                    else:
                        outFP.write(line)
                else:
                    outFP.write(line)
        except StopIteration:
            break
            
outFP.close()

In [7]:
# now to do the same with the chess2.02_assembly.gff version

outFP=open("./chess2.04_assembly.gff","w+")
tmpLine=None
with open("./chess2.03_assembly.gff","r") as fp:
    ifp=iter(fp)
    while True:
        try:
            line=""
            if tmpLine==None:
                line=next(ifp)
            else:
                line=tmpLine
                tmpLine=None
                
            if line[0]=="#":
                if line[:7]=="##CHESS":
                    outFP.write("##CHESS"+version+"\n")
                else:
                    outFP.write(line)
                continue
            else:
                lineCols=line.split("\t")
                if lineCols[2]=="transcript":
                    chsID=lineCols[8].split("ID=")[-1].split(";")[0].strip()
                    if chsID in setStarts:
                        # now we shall collect all the necessary parts (exons) for the given novel transcript
                        if lineCols[6]=="+":
                            # change right-most coordinate of the transcript
                            transStart=lineCols[3]
                            lineCols[3]=str(int(transStart)-startDict[chsID])
                            outFP.write("\t".join(lineCols))
                            foundExon=False
                            while True:
                                line=next(ifp)
                                if line[0]=="#":
                                    tmpLine=line
                                    assert foundExon==True,"- wrong element, something is wrong with the gff format "+chsID+" "+transStart
                                    break
                                lineCols=line.split("\t")
                                if not lineCols[2]=="exon":
                                    tmpLine=line
                                    assert foundExon==True,"+ wrong element, something is wrong with the gff format "+chsID+" "+transStart
                                    break
                                else:
                                    parentID=lineCols[8].split("Parent=")[-1].split(";")[-1].strip()
                                    assert parentID in setStarts and parentID==chsID, "wrong exon identified "+parentID+" "+chsID
                                    if lineCols[3]==transStart:
                                        lineCols[3]=str(int(lineCols[3])-startDict[chsID])
                                        foundExon=True
                                        outFP.write("\t".join(lineCols))
                                    else:
                                        outFP.write(line)
                        else:
                            # change left-most coordinate of the transcript
                            transEnd=lineCols[4]
                            lineCols[4]=str(int(transEnd)+startDict[chsID])
                            outFP.write("\t".join(lineCols))
                            foundExon=False
                            while True:
                                line=next(ifp)
                                if line[0]=="#":
                                    tmpLine=line
                                    assert foundExon==True,"- wrong element, something is wrong with the gff format "+chsID+" "+transEnd
                                    break
                                lineCols=line.split("\t")
                                if not lineCols[2]=="exon":
                                    tmpLine=line
                                    assert foundExon==True,"- wrong element, something is wrong with the gff format "+chsID+" "+transEnd
                                    break
                                else:
                                    parentID=lineCols[8].split("Parent=")[-1].split(";")[-1].strip()
                                    assert parentID in setStarts and parentID==chsID, "wrong exon identified "+parentID+" "+chsID
                                    if lineCols[4]==transEnd:
                                        lineCols[4]=str(int(lineCols[4])+startDict[chsID])
                                        foundExon=True
                                        outFP.write("\t".join(lineCols))
                                    else:
                                        outFP.write(line)
                                        
                    elif chsID in setEnds:
                        # now we shall collect all the necessary parts (exons) for the given novel transcript
                        if lineCols[6]=="+":
                            # change right-most coordinate of the transcript
                            transEnd=lineCols[4]
                            lineCols[4]=str(int(transEnd)+endDict[chsID])
                            outFP.write("\t".join(lineCols))
                            foundExon=False
                            while True:
                                line=next(ifp)
                                if line[0]=="#":
                                    tmpLine=line
                                    assert foundExon==True,"- wrong element, something is wrong with the gff format "+chsID+" "+transEnd
                                    break
                                lineCols=line.split("\t")
                                if not lineCols[2]=="exon":
                                    tmpLine=line
                                    assert foundExon==True,"+ wrong element, something is wrong with the gff format "+chsID+" "+transEnd
                                    break
                                else:
                                    parentID=lineCols[8].split("Parent=")[-1].split(";")[-1].strip()
                                    assert parentID in setEnds and parentID==chsID, "wrong exon identified "+parentID+" "+chsID
                                    if lineCols[4]==transEnd:
                                        lineCols[4]=str(int(lineCols[4])-endDict[chsID])
                                        foundExon=True
                                        outFP.write("\t".join(lineCols))
                                    else:
                                        outFP.write(line)
                        else:
                            # change left-most coordinate of the transcript
                            transStart=lineCols[3]
                            lineCols[3]=str(int(transStart)+endDict[chsID])
                            outFP.write("\t".join(lineCols))
                            foundExon=False
                            while True:
                                line=next(ifp)
                                if line[0]=="#":
                                    tmpLine=line
                                    assert foundExon==True,"- wrong element, something is wrong with the gff format "+chsID+" "+transStart
                                    break
                                lineCols=line.split("\t")
                                if not lineCols[2]=="exon":
                                    tmpLine=line
                                    assert foundExon==True,"- wrong element, something is wrong with the gff format "+chsID+" "+transStart
                                    break
                                else:
                                    parentID=lineCols[8].split("Parent=")[-1].split(";")[-1].strip()
                                    assert parentID in setEnds and parentID==chsID, "wrong exon identified "+parentID+" "+chsID
                                    if lineCols[3]==transStart:
                                        lineCols[3]=str(int(lineCols[3])+endDict[chsID])
                                        foundExon=True
                                        outFP.write("\t".join(lineCols))
                                    else:
                                        outFP.write(line)
                    else:
                        outFP.write(line)
                else:
                    outFP.write(line)
        except StopIteration:
            break
            
outFP.close()

In [8]:
# now to do the same with the chess2.02_and_refseq.gff version

outFP=open("./chess2.04_and_refseq.gff","w+")
tmpLine=None
with open("./chess2.03_and_refseq.gff","r") as fp:
    ifp=iter(fp)
    while True:
        try:
            line=""
            if tmpLine==None:
                line=next(ifp)
            else:
                line=tmpLine
                tmpLine=None
                
            if line[0]=="#":
                if line[:7]=="##CHESS":
                    outFP.write("##CHESS"+version+"\n")
                else:
                    outFP.write(line)
                continue
            else:
                lineCols=line.split("\t")
                if lineCols[2]=="transcript":
                    chsID=lineCols[8].split("ID=")[-1].split(";")[0].strip()
                    if chsID in setStarts:
                        # now we shall collect all the necessary parts (exons) for the given novel transcript
                        if lineCols[6]=="+":
                            # change right-most coordinate of the transcript
                            transStart=lineCols[3]
                            lineCols[3]=str(int(transStart)-startDict[chsID])
                            outFP.write("\t".join(lineCols))
                            foundExon=False
                            while True:
                                line=next(ifp)
                                if line[0]=="#":
                                    tmpLine=line
                                    assert foundExon==True,"- wrong element, something is wrong with the gff format "+chsID+" "+transStart
                                    break
                                lineCols=line.split("\t")
                                if not lineCols[2]=="exon":
                                    tmpLine=line
                                    assert foundExon==True,"+ wrong element, something is wrong with the gff format "+chsID+" "+transStart
                                    break
                                else:
                                    parentID=lineCols[8].split("Parent=")[-1].split(";")[-1].strip()
                                    assert parentID in setStarts and parentID==chsID, "wrong exon identified "+parentID+" "+chsID
                                    if lineCols[3]==transStart:
                                        lineCols[3]=str(int(lineCols[3])-startDict[chsID])
                                        foundExon=True
                                        outFP.write("\t".join(lineCols))
                                    else:
                                        outFP.write(line)
                        else:
                            # change left-most coordinate of the transcript
                            transEnd=lineCols[4]
                            lineCols[4]=str(int(transEnd)+startDict[chsID])
                            outFP.write("\t".join(lineCols))
                            foundExon=False
                            while True:
                                line=next(ifp)
                                if line[0]=="#":
                                    tmpLine=line
                                    assert foundExon==True,"- wrong element, something is wrong with the gff format "+chsID+" "+transEnd
                                    break
                                lineCols=line.split("\t")
                                if not lineCols[2]=="exon":
                                    tmpLine=line
                                    assert foundExon==True,"- wrong element, something is wrong with the gff format "+chsID+" "+transEnd
                                    break
                                else:
                                    parentID=lineCols[8].split("Parent=")[-1].split(";")[-1].strip()
                                    assert parentID in setStarts and parentID==chsID, "wrong exon identified "+parentID+" "+chsID
                                    if lineCols[4]==transEnd:
                                        lineCols[4]=str(int(lineCols[4])+startDict[chsID])
                                        foundExon=True
                                        outFP.write("\t".join(lineCols))
                                    else:
                                        outFP.write(line)
                                        
                    elif chsID in setEnds:
                        # now we shall collect all the necessary parts (exons) for the given novel transcript
                        if lineCols[6]=="+":
                            # change right-most coordinate of the transcript
                            transEnd=lineCols[4]
                            lineCols[4]=str(int(transEnd)+endDict[chsID])
                            outFP.write("\t".join(lineCols))
                            foundExon=False
                            while True:
                                line=next(ifp)
                                if line[0]=="#":
                                    tmpLine=line
                                    assert foundExon==True,"- wrong element, something is wrong with the gff format "+chsID+" "+transEnd
                                    break
                                lineCols=line.split("\t")
                                if not lineCols[2]=="exon":
                                    tmpLine=line
                                    assert foundExon==True,"+ wrong element, something is wrong with the gff format "+chsID+" "+transEnd
                                    break
                                else:
                                    parentID=lineCols[8].split("Parent=")[-1].split(";")[-1].strip()
                                    assert parentID in setEnds and parentID==chsID, "wrong exon identified "+parentID+" "+chsID
                                    if lineCols[4]==transEnd:
                                        lineCols[4]=str(int(lineCols[4])-endDict[chsID])
                                        foundExon=True
                                        outFP.write("\t".join(lineCols))
                                    else:
                                        outFP.write(line)
                        else:
                            # change left-most coordinate of the transcript
                            transStart=lineCols[3]
                            lineCols[3]=str(int(transStart)+endDict[chsID])
                            outFP.write("\t".join(lineCols))
                            foundExon=False
                            while True:
                                line=next(ifp)
                                if line[0]=="#":
                                    tmpLine=line
                                    assert foundExon==True,"- wrong element, something is wrong with the gff format "+chsID+" "+transStart
                                    break
                                lineCols=line.split("\t")
                                if not lineCols[2]=="exon":
                                    tmpLine=line
                                    assert foundExon==True,"- wrong element, something is wrong with the gff format "+chsID+" "+transStart
                                    break
                                else:
                                    parentID=lineCols[8].split("Parent=")[-1].split(";")[-1].strip()
                                    assert parentID in setEnds and parentID==chsID, "wrong exon identified "+parentID+" "+chsID
                                    if lineCols[3]==transStart:
                                        lineCols[3]=str(int(lineCols[3])+endDict[chsID])
                                        foundExon=True
                                        outFP.write("\t".join(lineCols))
                                    else:
                                        outFP.write(line)
                    else:
                        outFP.write(line)
                else:
                    outFP.write(line)
        except StopIteration:
            break
            
outFP.close()

In [9]:
# mapfile stays the same here
copyfile("./mapfile2.03.txt","mapfile2.04.txt")
# genes file should be just fine, although it would be best to verify that everything is indeed correct
ann=pd.read_csv("./chess2.04.gff",sep="\t",names=gff3Cols)
ann.dropna(inplace=True,axis=0)
ann=ann[ann["type"]=="gene"].reset_index(drop=True)
ann.head()
# the real problem here is the transcripts file, since we need to provide updated coordinates for it.

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
0,chr1,RefSeq,gene,11874.0,14409.0,.,+,.,ID=CHS.1;GENE_TYPE=misc_RNA;STATUS=known_refse...
1,chr1,RefSeq,gene,14362.0,29370.0,.,-,.,ID=CHS.2;GENE_TYPE=misc_RNA;STATUS=known_refse...
2,chr1,RefSeq,gene,29926.0,31295.0,.,+,.,ID=CHS.3;GENE_TYPE=lncRNA;STATUS=known_refseq;...
3,chr1,RefSeq,gene,34611.0,36081.0,.,-,.,ID=CHS.4;GENE_TYPE=lncRNA;STATUS=known_refseq;...
4,chr1,RefSeq,gene,51943.0,53959.0,.,+,.,ID=CHS.5;GENE_TYPE=lncRNA;STATUS=known_refseq;...


In [10]:
ann["id"]=ann.attributes.str.split("ID=",expand=True)[1].str.split(";",expand=True)[0]
ann.head()
ann["start"]==ann['start'].astype(int)
ann["end"]==ann['end'].astype(int)
ann.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,id
0,chr1,RefSeq,gene,11874.0,14409.0,.,+,.,ID=CHS.1;GENE_TYPE=misc_RNA;STATUS=known_refse...,CHS.1
1,chr1,RefSeq,gene,14362.0,29370.0,.,-,.,ID=CHS.2;GENE_TYPE=misc_RNA;STATUS=known_refse...,CHS.2
2,chr1,RefSeq,gene,29926.0,31295.0,.,+,.,ID=CHS.3;GENE_TYPE=lncRNA;STATUS=known_refseq;...,CHS.3
3,chr1,RefSeq,gene,34611.0,36081.0,.,-,.,ID=CHS.4;GENE_TYPE=lncRNA;STATUS=known_refseq;...,CHS.4
4,chr1,RefSeq,gene,51943.0,53959.0,.,+,.,ID=CHS.5;GENE_TYPE=lncRNA;STATUS=known_refseq;...,CHS.5


In [11]:
genes=pd.read_csv("./chess2.03.genes",sep="\t")
genes.head()

Unnamed: 0,Gene_Type,Gene_Name,GFF_ID,Location,Database,RefSeq_GeneID,Description
0,protein_coding,LOC105371921,CHS.23496,chr17:80885776-80892131:-,RefSeq,105371921,uncharacterized LOC105371921
1,lncRNA,LOC107986127,CHS.38280,chr3:127744219-127751505:-,RefSeq,107986127,uncharacterized LOC107986127
2,protein_coding,RPS11,CHS.27254,chr19:49496365-49499712:+,RefSeq,6205,ribosomal protein S11
3,protein_coding,CREB3L1,CHS.8579,chr11:46276634-46321430:+,RefSeq,90993,cAMP responsive element binding protein 3-like 1
4,lncRNA,LOC101930053,CHS.54970,chr9:2385141-2494223:-,RefSeq,101930053,uncharacterized LOC101930053


In [12]:
# credit goes to https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows/40449726#40449726
def explode(df, lst_cols, fill_value=''):
    # make sure `lst_cols` is a list
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)

    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()

    if (lens > 0).all():
        # ALL lists in cells aren't empty
        return pd.DataFrame({
            col:np.repeat(df[col].values, lens)
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .loc[:, df.columns]
    else:
        # at least one list in cells is empty
        return pd.DataFrame({
            col:np.repeat(df[col].values, lens)
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .append(df.loc[lens==0, idx_cols]).fillna(fill_value) \
          .loc[:, df.columns]
    
genes=explode(genes.assign(GFF_ID=genes.GFF_ID.str.split(',')), 'GFF_ID')

In [13]:
mdf=genes.merge(ann[['id','start','end']],how="outer",left_on='GFF_ID',right_on='id',indicator=True)
mdf.head()

Unnamed: 0,Gene_Type,Gene_Name,GFF_ID,Location,Database,RefSeq_GeneID,Description,id,start,end,_merge
0,protein_coding,LOC105371921,CHS.23496,chr17:80885776-80892131:-,RefSeq,105371921,uncharacterized LOC105371921,CHS.23496,80885776.0,80892131.0,both
1,lncRNA,LOC107986127,CHS.38280,chr3:127744219-127751505:-,RefSeq,107986127,uncharacterized LOC107986127,CHS.38280,127744219.0,127751505.0,both
2,protein_coding,RPS11,CHS.27254,chr19:49496365-49499712:+,RefSeq,6205,ribosomal protein S11,CHS.27254,49496365.0,49499712.0,both
3,protein_coding,CREB3L1,CHS.8579,chr11:46276634-46321430:+,RefSeq,90993,cAMP responsive element binding protein 3-like 1,CHS.8579,46276634.0,46321430.0,both
4,lncRNA,LOC101930053,CHS.54970,chr9:2385141-2494223:-,RefSeq,101930053,uncharacterized LOC101930053,CHS.54970,2385141.0,2494223.0,both


In [14]:
assert len(mdf[mdf["_merge"]=="left_only"])==0, "there are genes present in the .genes file but not in the annotation: "+";".join(mdf[mdf["_merge"]=="left_only"]["id"])
assert len(mdf[mdf["_merge"]=="right_only"])==0, " there are genes present in the annotation but not in the .genes file: "+";".join(mdf[mdf["_merge"]=="right_only"]["id"])

In [19]:
# genes file should be identical to the old one, since transcript coordinates should have never extended the old genes
# so let's verify this assumption is also correct
mdf['start']=mdf['start'].astype(int)
mdf['end']=mdf['end'].astype(int)
mdf[['start_old','end_old']]=mdf['Location'].str.split(':',expand=True)[1].str.split("-",expand=True)
mdf['start_old']=mdf['start_old'].astype(int)
mdf['end_old']=mdf['end_old'].astype(int)
display(mdf[~((mdf['start']==mdf['start_old'])&(mdf['end']==mdf['end_old']))])
assert len(mdf[(mdf['start']==mdf['start_old'])&(mdf['end']==mdf['end_old'])])==len(mdf),"something's wrong with the gene start and end coordinates"

Unnamed: 0,Gene_Type,Gene_Name,GFF_ID,Location,Database,RefSeq_GeneID,Description,id,start,end,_merge,start_old,end_old
48,protein_coding,ZBTB12,CHS.48511,"chr6:31899194-31902331:-,chr6_GL000251v2_alt:3...",RefSeq,221527,zinc finger and BTB domain containing 12,CHS.48511,3377042,3379417,both,31899194,31902331
49,protein_coding,ZBTB12,CHS.48732,"chr6:31899194-31902331:-,chr6_GL000251v2_alt:3...",RefSeq,221527,zinc finger and BTB domain containing 12,CHS.48732,3147391,3149766,both,31899194,31902331
50,protein_coding,ZBTB12,CHS.49124,"chr6:31899194-31902331:-,chr6_GL000251v2_alt:3...",RefSeq,221527,zinc finger and BTB domain containing 12,CHS.49124,3241692,3244452,both,31899194,31902331
51,protein_coding,ZBTB12,CHS.49345,"chr6:31899194-31902331:-,chr6_GL000251v2_alt:3...",RefSeq,221527,zinc finger and BTB domain containing 12,CHS.49345,3155590,3157967,both,31899194,31902331
75,lncRNA,LOC105375115,CHS.52474,"chr7:174920-176013:+,chr7_KI270804v1_alt:13432...",RefSeq,105375115,uncharacterized LOC105375115,CHS.52474,134324,135417,both,174920,176013
76,lncRNA,LOC105375115,CHS.52499,"chr7:174920-176013:+,chr7_KI270804v1_alt:13432...",RefSeq,105375115,uncharacterized LOC105375115,CHS.52499,167241,168334,both,174920,176013
85,protein_coding,SERPINF1,CHS.23883,"chr17:1761965-1779392:+,chr17_KI270861v1_alt:1...",RefSeq,5176,serpin family F member 1,CHS.23883,190927,195947,both,1761965,1779392
166,protein_coding,ABR,CHS.23887,"chr17:999953-1229713:-,chr17_KI270862v1_alt:42...",RefSeq,29,active BCR-related,CHS.23887,42765,231794,both,999953,1229713
167,protein_coding,ABR,CHS.23947,"chr17:999953-1229713:-,chr17_KI270862v1_alt:42...",RefSeq,29,active BCR-related,CHS.23947,42765,153205,both,999953,1229713
169,protein_coding,OR9G1,CHS.10525,"chr11:56700388-56701305:+,chr11_JH159137v1_alt...",RefSeq,390174,olfactory receptor family 9 subfamily G member 1,CHS.10525,7154,8071,both,56700388,56701305


AssertionError: something's wrong with the gene start and end coordinates

In [37]:
outFP=open("./chess2.04.genes","w+")

genesEvaluated=[]
wrongCoords=[]
header=True
with open("./chess2.03.genes") as inFP:
    for line in inFP.readlines():
        if header: # skip header
            outFP.write(line)
            header=False
            continue
        lineCols=line.split("\t")
        gids=lineCols[2].split(",")
        locs=lineCols[3].split(",")
        assert len(gids)==len(locs),"something is wrong with the following locus: "+lineCols[2]+"\t"+lineCols[3]
        newLocs=[]
        for gl in zip(gids,locs):
            genesEvaluated.append(gl[0]) # append geneID for verification later
            location=gl[1].split(":")
            coords=[int(x) for x in location[1].split("-")]
            ann_row=ann[ann['id']==gl[0]].reset_index(drop=True)
            assert len(ann_row)==1,"multiple rows?"
            if not int(ann_row['start'])==coords[0] and int(ann_row['end'])==coords[1]:
                wrongCoords.append(gl[0])
                
            newLocs.append(ann_row['seqid'].iloc[0]+":"+str(int(ann_row['start'].iloc[0]))+"-"+str(int(ann_row['end'].iloc[0]))+":"+location[-1])
        lineCols[3]=",".join(newLocs)
        outFP.write('\t'.join(lineCols))
inFP.close()
outFP.close()

In [41]:
print(wrongCoords)
print(len(genesEvaluated))
len(ann[~(ann["id"].isin(genesEvaluated))])

['CHS.4228']
46972


323817

In [39]:
# now we need to replace old transcript coordinates with updated ones
ann=pd.read_csv("./chess2.04.gff",sep="\t",names=gff3Cols)
ann.dropna(inplace=True,axis=0)
ann=ann[ann["type"]=="transcript"].reset_index(drop=True)
ann["id"]=ann.attributes.str.split("ID=",expand=True)[1].str.split(";",expand=True)[0]
ann["start"]==ann['start'].astype(int)
ann["end"]==ann['end'].astype(int)
ann.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,id
0,chr1,BestRefSeq,transcript,11874.0,14409.0,.,+,.,ID=CHS.1.1;Parent=CHS.1;gene_name=DDX11L1;Dbxr...,CHS.1.1
1,chr1,BestRefSeq,transcript,14362.0,29370.0,.,-,.,ID=CHS.2.1;Parent=CHS.2;gene_name=WASH7P;Dbxre...,CHS.2.1
2,chr1,StringTie,transcript,14410.0,29369.0,.,-,.,ID=CHS.2.2;Parent=CHS.2;STATUS=novel;ASSEMBLED...,CHS.2.2
3,chr1,StringTie,transcript,14410.0,29369.0,.,-,.,ID=CHS.2.3;Parent=CHS.2;STATUS=novel;ASSEMBLED...,CHS.2.3
4,chr1,StringTie,transcript,14410.0,29369.0,.,-,.,ID=CHS.2.4;Parent=CHS.2;STATUS=novel;ASSEMBLED...,CHS.2.4


In [51]:
ann['coords']=ann["start"].astype(int).astype(str)+"-"+ann["end"].astype(int).astype(str)
annDict=pd.Series(ann.coords.values,index=ann.id).to_dict()

In [54]:
outFP=open("./chess2.04.transcripts","w+")

transcriptsEvaluated=[]
wrongCoords=[]
header=True
with open("./chess2.03.transcripts") as inFP:
    for line in inFP.readlines():
        if header: # skip header
            outFP.write(line)
            header=False
            continue
        lineCols=line.split("\t")
        tid=lineCols[0]
        location=lineCols[7].split(":")
        transcriptsEvaluated.append(tid) # append geneID for verification later
        coords=[int(x) for x in location[1].split("-")]
        annCoords=[int(x) for x in annDict[tid].split("-")]
        if not int(annCoords[0])==coords[0] and int(annCoords[1])==coords[1]:
            wrongCoords.append(tl[0])
        newLoc=location[0]+":"+str(annCoords[0])+"-"+str(annCoords[1])+":"+location[-1]
        lineCols[7]=newLoc
        outFP.write('\t'.join(lineCols))
inFP.close()
outFP.close()

In [55]:
print(wrongCoords)
print(len(transcriptsEvaluated))
print(len(ann[~(ann["id"].isin(transcriptsEvaluated))]))

['CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.36462.6', 'CHS.3646