In [1]:
import pandas as pd
import numpy as np
import os
import re

# Names given to the nine tab-separated columns of the annotation files read below.
gff3Cols = [
    "seqid", "source", "type", "start", "end",
    "score", "strand", "phase", "attributes",
]
# Names given to the columns of the assembly-report table loaded into `chrMap`.
chrMapCols = [
    "name", "role", "molecule", "type", "genbank",
    "rel", "refseq", "unit", "seqLen", "ucsc",
]

In [2]:
# Here we shall develop a method for merging two GFF files
# Main idea - need to convert any arbitrary features into gene-transcript-exon features and discard anything unnecessary (such as CDS)
#    - first we shall try to guess parent-child hierarchy
#    - based on the hierarchy we can decide which entries to keep
#    - alternatively, we can rely on exons: label each exon's parent feature as "transcript" and that parent's own parent (the exon's grandparent) as "gene"

In [2]:
# Load the assembly report as a tab-separated table. Text following a "#" is
# treated as a comment, and the columns are named from chrMapCols because the
# file is read without a header row.
chrMap = pd.read_csv(
    "/home/sparrow/genomicData/hg38/GCF_000001405.38_GRCh38.p12_assembly_report.txt",
    sep="\t",
    comment="#",
    names=chrMapCols,
)
chrMap

Unnamed: 0,name,role,molecule,type,genbank,rel,refseq,unit,seqLen,ucsc
0,1,assembled-molecule,1,Chromosome,CM000663.2,=,NC_000001.11,Primary Assembly,248956422,chr1
1,2,assembled-molecule,2,Chromosome,CM000664.2,=,NC_000002.12,Primary Assembly,242193529,chr2
2,3,assembled-molecule,3,Chromosome,CM000665.2,=,NC_000003.12,Primary Assembly,198295559,chr3
3,4,assembled-molecule,4,Chromosome,CM000666.2,=,NC_000004.12,Primary Assembly,190214555,chr4
4,5,assembled-molecule,5,Chromosome,CM000667.2,=,NC_000005.10,Primary Assembly,181538259,chr5
5,6,assembled-molecule,6,Chromosome,CM000668.2,=,NC_000006.12,Primary Assembly,170805979,chr6
6,7,assembled-molecule,7,Chromosome,CM000669.2,=,NC_000007.14,Primary Assembly,159345973,chr7
7,8,assembled-molecule,8,Chromosome,CM000670.2,=,NC_000008.11,Primary Assembly,145138636,chr8
8,9,assembled-molecule,9,Chromosome,CM000671.2,=,NC_000009.12,Primary Assembly,138394717,chr9
9,10,assembled-molecule,10,Chromosome,CM000672.2,=,NC_000010.11,Primary Assembly,133797422,chr10


In [None]:
# Read the first annotation file: tab-separated, first 7 lines skipped, columns
# named from gff3Cols.
df = pd.read_csv(
    "/home/sparrow/genomicData/hg38/annotations/hg38_p12_gencode_v29.gff3",
    sep="\t",
    names=gff3Cols,
    skiprows=7,
)

# Feature identifier: the text after the first "ID=" up to the next ";".
df["id"] = (
    df["attributes"]
    .str.split("ID=", expand=True)[1]
    .str.split(";", expand=True)[0]
)

# Exon rows, each tagged with the ID of its parent (text after "Parent=" up to ";").
dfE = df[df["type"] == "exon"].reset_index(drop=True)
dfE["transID"] = (
    dfE["attributes"]
    .str.split("Parent=", expand=True)[1]
    .str.split(";", expand=True)[0]
)

# Every feature that is the parent of at least one exon is treated as a transcript.
transcriptSet = set(dfE["transID"])
dfT = df[df["id"].isin(transcriptSet)].reset_index(drop=True)
# each exon parent must be found among the feature IDs
assert len(transcriptSet) == len(set(dfT["id"]))
dfT["geneID"] = (
    dfT["attributes"]
    .str.split("Parent=", expand=True)[1]
    .str.split(";", expand=True)[0]
)
dfT["transID"] = dfT["id"]

# Dealing with inconsistencies:
# transcript-level entries without a parent do not fit the gene-transcript-exon
# structure assumed here, so they are dropped together with their exons.
# This also removes PSEUDOGENES, which may not be desired - still to be discussed.
dfT = dfT[dfT["geneID"].notnull()].reset_index(drop=True)
dfE = dfE[dfE["transID"].isin(dfT["transID"])].reset_index(drop=True)

# Iteratively pull the parents down to a single level.
# A feature treated as a transcript here (e.g. miRNA) may have another
# transcript-level feature as its parent; in that case its geneID is replaced by
# the parent's geneID, and the step is repeated until no replaced geneID is
# itself a transcript-level ID.
# NOTE(review): a cyclic Parent relationship would keep this loop running forever.
while True:
    # pair every row whose geneID is a transcript-level ID with that parent's geneID
    dfI=dfT[["transID","geneID"]].merge(dfT[["transID","geneID"]],how="inner",left_on="geneID",right_on="transID")
    dfI=dfI[["transID_x","geneID_y"]]
    dfT=dfT.merge(dfI,how="left",left_on="transID",right_on="transID_x")
    dfT["geneID"]=np.where(dfT["geneID_y"].isnull(),dfT["geneID"],dfT["geneID_y"])
    # Drop the helper columns before the next pass: if they stay on dfT, the next
    # merge renames them with suffixes and the "geneID_y" lookup raises KeyError.
    dfT=dfT.drop(columns=["transID_x","geneID_y"])
    if len(set(dfI["geneID_y"]).intersection(set(dfT["transID"])))==0:
        break

# Relabel every transcript-level feature as "transcript", keeping the original
# type in the attributes as old_type.
dfT["attributes"] = dfT["attributes"] + ";old_type=" + dfT["type"]
dfT["type"] = "transcript"

# Features referenced as geneID by the transcripts become the "gene" rows.
geneSet = set(dfT["geneID"])
dfG = df[df["id"].isin(geneSet)].reset_index(drop=True)
dfG["attributes"] = dfG["attributes"] + ";old_type=" + dfG["type"]
dfG["type"] = "gene"
# every referenced gene ID must exist among the features
assert len(geneSet) == len(set(dfG["id"]))
dfG["geneID"] = dfG["id"]
# numeric placeholder in the transcript-ID column of gene rows
# NOTE(review): the sort below then mixes floats and strings in one key - confirm
# the installed pandas orders such a column as intended.
dfG["transID"] = np.inf

# Give every exon the gene of its transcript; an exon without one is an error.
dfE = dfE.merge(dfT[["transID", "geneID"]], how="left", left_on="transID", right_on="transID")
assert not dfE["geneID"].isnull().any(), "incompatible features"

# Stack genes, transcripts and exons into one frame, ordered by gene, then
# transcript, then feature type (gene < transcript < exon).
mergedCols = gff3Cols + ["transID", "geneID"]
dfM = pd.concat([dfG[mergedCols], dfT[mergedCols], dfE[mergedCols]], axis=0).reset_index(drop=True)
dfM["type"] = pd.Categorical(dfM["type"], categories=["gene", "transcript", "exon"], ordered=True)
dfM = dfM.sort_values(by=["geneID", "transID", "type"]).reset_index(drop=True)
del dfE, dfT, dfG
# Convert the sequence names to a single standard (the genbank accession).
# A name is looked up in each naming column of chrMap; when several columns match,
# the last one in the list below wins. Names with no match are left unchanged.
chrCurMap={}
setNames=set(dfM["seqid"])
for i in setNames:
    for j in ["name","molecule","genbank","refseq","ucsc"]:
        hits=chrMap.loc[chrMap[j]==i,"genbank"]
        if len(hits)>0:
            chrCurMap[i]=hits.iloc[0]
# Rewrite only the seqid column: a whole-frame replace would also scan (and could
# alter) every other column, including the long attributes strings.
dfM["seqid"]=dfM["seqid"].map(lambda name: chrCurMap.get(name,name))
dfM["start"]=dfM["start"].astype(int)
dfM["end"]=dfM["end"].astype(int)
# some pseudogenes are still present

# Prepare for merging annotations: build one "chain" key per transcript of the form
#   seqid:strand@<exon ends joined by "+">-<starts of exons 2..n joined by "+">
# together with the transcript's smallest start and largest end.
# NOTE(review): the ends part lists the end of EVERY exon, the last one included,
# so the key also carries the transcript's end coordinate - confirm this is
# intended, since the merge step compares end_max between transcripts that share a key.
dfM_g = dfM[dfM["type"].isin(["exon"])][["seqid", "strand", "type", "start", "end", "transID"]].reset_index(drop=True)
dfM_g = dfM_g.sort_values(by=["transID", "start"]).reset_index(drop=True)

# For every exon: the start of the following exon of the same transcript, or the
# "***" marker when the following row belongs to another transcript.
dfM_g["nid"] = dfM_g["transID"].shift(-1)
dfM_g["ns"] = np.where(
    dfM_g["transID"] == dfM_g["nid"],
    dfM_g["start"].astype(str).shift(-1),
    "***",
)
# "+"-terminated pieces, so that summing the strings per transcript concatenates them
dfM_g["ns"] = dfM_g["ns"].astype(str) + "+"
dfM_g["ends"] = dfM_g["end"].astype(str) + "+"

dfM_g = (
    dfM_g
    .groupby(by="transID")
    .agg({'start': 'min', 'ns': 'sum', 'ends': 'sum', 'end': 'max', 'strand': 'min', 'seqid': 'min'})
    .reset_index()
)
# after this rename "end" holds the concatenated exon ends and "end_max" the largest end
dfM_g.columns = ['transID', 'start_min', "ns", 'end', 'end_max', 'strand', 'seqid']
# strip the trailing "+" separators and "***" markers
for joined in ("ns", "end"):
    dfM_g[joined] = dfM_g[joined].str.rstrip("***+")
dfM_g["chain"] = dfM_g['seqid'] + ":" + dfM_g['strand'] + '@' + dfM_g['end'] + '-' + dfM_g['ns']
dfM_g = dfM_g[['transID', 'start_min', 'end_max', 'chain']]

# Release the helper objects that are no longer needed.
del setNames, chrCurMap, geneSet, transcriptSet
# Keep only the GFF columns plus the transcript ID.
dfM = dfM[gff3Cols + ["transID"]]

# Remove duplicate chains: only the first transcript of every distinct chain key
# stays in the per-transcript table.
dfM_g = dfM_g.drop_duplicates(subset="chain", keep="first")
dfM_g

In [None]:
# Read the second annotation file: tab-separated, first 7 lines skipped, columns
# named from gff3Cols.
df = pd.read_csv(
    "/home/sparrow/genomicData/hg38/annotations/hg38_p12_refseq.gff",
    sep="\t",
    names=gff3Cols,
    skiprows=7,
)

# Feature identifier: the text after the first "ID=" up to the next ";".
df["id"] = (
    df["attributes"]
    .str.split("ID=", expand=True)[1]
    .str.split(";", expand=True)[0]
)

# Exon rows, each tagged with the ID of its parent (text after "Parent=" up to ";").
dfE = df[df["type"] == "exon"].reset_index(drop=True)
dfE["transID"] = (
    dfE["attributes"]
    .str.split("Parent=", expand=True)[1]
    .str.split(";", expand=True)[0]
)

# Every feature that is the parent of at least one exon is treated as a transcript.
transcriptSet = set(dfE["transID"])
dfT = df[df["id"].isin(transcriptSet)].reset_index(drop=True)
# each exon parent must be found among the feature IDs
assert len(transcriptSet) == len(set(dfT["id"]))
dfT["geneID"] = (
    dfT["attributes"]
    .str.split("Parent=", expand=True)[1]
    .str.split(";", expand=True)[0]
)
dfT["transID"] = dfT["id"]

# Dealing with inconsistencies:
# transcript-level entries without a parent do not fit the gene-transcript-exon
# structure assumed here, so they are dropped together with their exons.
# This also removes PSEUDOGENES, which may not be desired - still to be discussed.
dfT = dfT[dfT["geneID"].notnull()].reset_index(drop=True)
dfE = dfE[dfE["transID"].isin(dfT["transID"])].reset_index(drop=True)

# Iteratively pull the parents down to a single level.
# A feature treated as a transcript here (e.g. miRNA) may have another
# transcript-level feature as its parent; in that case its geneID is replaced by
# the parent's geneID, and the step is repeated until no replaced geneID is
# itself a transcript-level ID.
# NOTE(review): a cyclic Parent relationship would keep this loop running forever.
while True:
    # pair every row whose geneID is a transcript-level ID with that parent's geneID
    dfI=dfT[["transID","geneID"]].merge(dfT[["transID","geneID"]],how="inner",left_on="geneID",right_on="transID")
    dfI=dfI[["transID_x","geneID_y"]]
    dfT=dfT.merge(dfI,how="left",left_on="transID",right_on="transID_x")
    dfT["geneID"]=np.where(dfT["geneID_y"].isnull(),dfT["geneID"],dfT["geneID_y"])
    # Drop the helper columns before the next pass: if they stay on dfT, the next
    # merge renames them with suffixes and the "geneID_y" lookup raises KeyError.
    dfT=dfT.drop(columns=["transID_x","geneID_y"])
    if len(set(dfI["geneID_y"]).intersection(set(dfT["transID"])))==0:
        break

# Relabel every transcript-level feature as "transcript", keeping the original
# type in the attributes as old_type.
dfT["attributes"]=dfT["attributes"]+";old_type="+dfT["type"]
dfT["type"]="transcript"
# Features referenced as geneID by the transcripts become the "gene" rows.
geneSet=set(dfT["geneID"])
dfG=df[df["id"].isin(geneSet)].reset_index(drop=True)
# the raw table is not needed past this point
del df
dfG["attributes"]=dfG["attributes"]+";old_type="+dfG["type"]
dfG["type"]="gene"
# every referenced gene ID must exist among the features
assert len(set(dfG["id"]))==len(geneSet)
dfG["geneID"]=dfG["id"]
# numeric placeholder in the transcript-ID column of gene rows
# NOTE(review): the sort below then mixes floats and strings in one key - confirm
# the installed pandas orders such a column as intended.
dfG["transID"]=np.inf

# Give every exon the gene of its transcript; an exon without one is an error.
dfE=dfE.merge(dfT[["transID","geneID"]],how="left",left_on="transID",right_on="transID")
assert len(dfE[dfE["geneID"].isnull()])==0,"incompatible features"

# Stack genes, transcripts and exons into one frame, ordered by gene, then
# transcript, then feature type (gene < transcript < exon).
dfM_2=pd.concat([dfG[["seqid","source","type","start","end","score","strand","phase","attributes","transID","geneID"]],dfT[["seqid","source","type","start","end","score","strand","phase","attributes","transID","geneID"]],dfE[["seqid","source","type","start","end","score","strand","phase","attributes","transID","geneID"]]],axis=0).reset_index(drop=True)
dfM_2["type"]=pd.Categorical(dfM_2["type"],categories=["gene","transcript","exon"],ordered=True)
dfM_2=dfM_2.sort_values(by=["geneID","transID","type"]).reset_index(drop=True)
# Release the intermediate frames. The exon frame used to be left behind by a bare
# `dfE` expression, which does nothing in the middle of a cell.
del dfE
del dfT
del dfG
# Convert the sequence names to a single standard (the genbank accession).
# A name is looked up in each naming column of chrMap; when several columns match,
# the last one in the list below wins. Names with no match are left unchanged.
chrCurMap={}
setNames=set(dfM_2["seqid"])
for i in setNames:
    for j in ["name","molecule","genbank","refseq","ucsc"]:
        hits=chrMap.loc[chrMap[j]==i,"genbank"]
        if len(hits)>0:
            chrCurMap[i]=hits.iloc[0]
# Rewrite only the seqid column: a whole-frame replace would also scan (and could
# alter) every other column, including the long attributes strings.
dfM_2["seqid"]=dfM_2["seqid"].map(lambda name: chrCurMap.get(name,name))
dfM_2["start"]=dfM_2["start"].astype(int)
dfM_2["end"]=dfM_2["end"].astype(int)
# some pseudogenes are still present

# now we can proceed to merge annotations
# dfM_2_g=dfM_2[dfM_2["type"].isin(["exon"])][["seqid","strand","type","start","end","transID"]].reset_index(drop=True)
# dfM_2_g=dfM_2_g.sort_values(by=["transID","start"]).reset_index(drop=True)
# dfM_2_g["start"]=dfM_2_g.start.astype(str)+"+"
# dfM_2_g["end"]=dfM_2_g.end.astype(str)+"+"
# dfM_2_g=dfM_2_g.groupby(by="transID").agg({'start':'sum','end':'sum','strand':'min','seqid':'min'})
# dfM_2_g.reset_index(inplace=True)
# dfM_2_g["chain"]=dfM_2_g['seqid']+":"+dfM_2_g['strand']+'@'+dfM_2_g['start']+'-'+dfM_2_g['end']
# dfM_2_g

dfM_2_g=dfM_2[dfM_2["type"].isin(["exon"])][["seqid","strand","type","start","end","transID"]].reset_index(drop=True)
dfM_2_g=dfM_2_g.sort_values(by=["transID","start"]).reset_index(drop=True)
dfM_2_g["ns"]=dfM_2_g.start.astype(str).shift(-1)
dfM_2_g["nid"]=dfM_2_g.transID.shift(-1)
dfM_2_g["ns"]=np.where(dfM_2_g["transID"]==dfM_2_g["nid"],dfM_2_g["ns"],"***")
dfM_2_g["ns"]=dfM_2_g.ns.astype(str)+"+"
dfM_2_g["ends"]=dfM_2_g.end.astype(str)+"+"
dfM_2_g=dfM_2_g.groupby(by="transID").agg({'start':'min','ns':'sum','ends':'sum','end':'max','strand':'min','seqid':'min'})
dfM_2_g.reset_index(inplace=True)
dfM_2_g.columns=['transID','start_min',"ns",'end','end_max','strand','seqid']
dfM_2_g["ns"]=dfM_2_g["ns"].str.rstrip("***+")
dfM_2_g["end"]=dfM_2_g["end"].str.rstrip("***+")
dfM_2_g["chain"]=dfM_2_g['seqid']+":"+dfM_2_g['strand']+'@'+dfM_2_g['end']+'-'+dfM_2_g['ns']
dfM_2_g=dfM_2_g[['transID','start_min','end_max','chain']]

# Release the helper objects that are no longer needed.
del setNames, chrCurMap, geneSet, transcriptSet
# Keep only the GFF columns plus the transcript ID.
dfM_2 = dfM_2[gff3Cols + ["transID"]]

# Remove duplicate chains: only the first transcript of every distinct chain key
# stays in the per-transcript table.
dfM_2_g = dfM_2_g.drop_duplicates(subset="chain", keep="first")
dfM_2_g

In [1]:
# Merge the two annotations.
# Columns suffixed _x come from the first annotation (dfM / dfM_g), columns
# suffixed _y from the second one (dfM_2 / dfM_2_g).

# Pair up the transcripts of both annotations that have an identical chain key.
# The per-transcript tables are intentionally not deleted here, so this cell can
# be re-run without rebuilding them.
shared_chain=dfM_g.merge(dfM_2_g,how="inner",on=["chain"])
# pairs that also agree on start and end (kept for inspection; not used below)
shared_all=shared_chain[(shared_chain['start_min_x']==shared_chain['start_min_y']) & (shared_chain['end_max_x']==shared_chain['end_max_y'])]

# Transcripts that share a chain but differ in start/end: widen the first
# annotation's transcript (its transcript row and the boundary exon both carry the
# boundary coordinate) to the coordinates of its partner.
# first case: the partner starts further upstream
sc=shared_chain[shared_chain["start_min_x"]>shared_chain["start_min_y"]]
dfM=dfM.merge(sc,how="left",left_on="transID",right_on="transID_x")
# The merged helper columns hold NaN for unmatched rows, which makes np.where
# return floats; cast back so coordinates are written as integers, not "123.0".
dfM['start']=np.where(dfM['start']==dfM['start_min_x'],dfM['start_min_y'],dfM['start']).astype(int)
dfM=dfM[gff3Cols+["transID"]]
# second case: the partner ends further downstream
# NOTE(review): the chain key built upstream contains the end of every exon,
# including the last one, so paired transcripts presumably always have equal
# end_max and this selection stays empty - confirm.
sc=shared_chain[shared_chain["end_max_x"]<shared_chain["end_max_y"]]
dfM=dfM.merge(sc,how="left",left_on="transID",right_on="transID_x")
dfM['end']=np.where(dfM['end']==dfM['end_max_x'],dfM['end_max_y'],dfM['end']).astype(int)
# drop the helper columns again so the merges above do not collide on a re-run
dfM=dfM[gff3Cols+["transID"]]

# Now only the rest of the transcripts needs to be added.
# There is no need to create new gene entries or to standardize the parents:
# this annotation is only used to build a transcriptome fasta reference, which
# extracts transcripts only, so it is enough to avoid intron-chain duplicates.
# The second annotation's transcripts are identified by transID_y (transID_x
# holds IDs of the first annotation and would never match any row of dfM_2).
dfM_2=dfM_2[~(dfM_2['transID'].isin(set(shared_chain["transID_y"])))].reset_index(drop=True)
# lastly, build the final GFF by stacking the two annotations
dfF=pd.concat([dfM[gff3Cols],dfM_2[gff3Cols]],axis=0)
dfF.to_csv("./test.gff3",sep='\t',index=False,header=False)

NameError: name 'dfM_g' is not defined

In [118]:
# another thing: how should we handle pseudogenes?

In [None]:
# at the end we shall add this interface to trans2genome as an option to pass multiple annotation files