In [1]:
import pandas as pd
import numpy as np
import os

gff3Cols=["seqid","source","type","start","end","score","strand","phase","attributes"]

In [2]:
# this assumes that intersection has already been computed

df = pd.read_csv("./prepAnnotation/refseq_vs_gencode_intersection.gff",sep="\t",comment="#",names=gff3Cols)
display(df.head())
df_tab_header = pd.read_csv("./prepAnnotation/refseq_dta_gencode_dta.tab.header",sep="\t",comment="1")
df_tab_header.reset_index(inplace=True)
tab_header = list(df_tab_header.iloc[0])
df_tab = pd.read_csv("./prepAnnotation/refseq_dta_gencode_dta.tab",sep="\t",names=tab_header)
print(len(df_tab))
df_tab.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
0,chr1,Curated Genomic,pseudogene,258511,259036,.,-,.,ID=gene-RPL23AP21;geneID=gene-RPL23AP21;gene_n...
1,chr1,Curated Genomic,exon,258511,259036,.,-,.,Parent=gene-RPL23AP21
2,chr1,BestRefSeq,lnc_RNA,490756,495445,.,-,.,ID=rna-NR_028322.1;geneID=gene-LOC100132287;ge...
3,chr1,BestRefSeq,exon,490756,494898,.,-,.,Parent=rna-NR_028322.1
4,chr1,BestRefSeq,exon,494992,495049,.,-,.,Parent=rna-NR_028322.1


61879


  return _read(filepath_or_buffer, kwds)


Unnamed: 0,chromosome,refseqID,covlen,CDSlen,gbkey,gene_biotype,gencodeID,covlen.1,CDSlen.1,transcript_type,gene_type
0,chr10,gene-AKR1B10P1,1201,0,Gene,pseudogene,ENST00000425765.1,942,0,processed_pseudogene,processed_pseudogene
1,chr10,gene-ALDOAP2,1377,0,Gene,pseudogene,ENST00000398050.3,1076,0,processed_pseudogene,processed_pseudogene
2,chr10,gene-ATP5MC1P7,550,0,Gene,pseudogene,ENST00000444398.1,400,0,processed_pseudogene,processed_pseudogene
3,chr10,gene-CCND3P1,628,0,Gene,pseudogene,ENST00000438347.1,580,0,processed_pseudogene,processed_pseudogene
4,chr10,gene-CEP164P1,504,0,Gene,pseudogene,ENST00000455142.1,418,0,transcribed_processed_pseudogene,transcribed_processed_pseudogene


In [3]:
# first we need to replicate the intersection counts and a table for the gencode vs refseq
dfg_tab = df_tab[["refseqID","gene_biotype","transcript_type","gene_type"]].groupby(by=["gene_biotype","gene_type"]).count().reset_index()
dfg_tab.columns = ["refseq","gencode","count","transcript_type"]

# define types of interest
include_refseq_either = ['protein_coding','lncRNA']
include_gencode_either= ['non_coding','bidirectional_promoter_lncRNA','protein_coding','processed_transcript']
additional_gencode_for_lncRNA_refseq = ['lincRNA','antisense']

# select by conservative thresholds
cons_df=dfg_tab[(dfg_tab["refseq"].isin(include_refseq_either))&(dfg_tab["gencode"].isin(include_gencode_either))|
                (dfg_tab["refseq"].isin(include_refseq_either))&(dfg_tab["gencode"].isin(additional_gencode_for_lncRNA_refseq))]
print(cons_df["count"].sum())
display(cons_df.head()) # this is where the conservatively selected transcripts are

# now select the non-conservative thresholds
noncons_df = dfg_tab[~dfg_tab.isin(cons_df)].dropna()
print(noncons_df["count"].sum())
print(set(noncons_df["refseq"]))
noncons_df[noncons_df["refseq"]=="protein_coding"]

54166


Unnamed: 0,refseq,gencode,count,transcript_type
32,lncRNA,antisense,2125,2125
33,lncRNA,bidirectional_promoter_lncRNA,43,43
34,lncRNA,lincRNA,2735,2735
35,lncRNA,non_coding,1,1
36,lncRNA,processed_transcript,586,586


7713.0
{'telomerase_RNA', 'rRNA', 'vault_RNA', 'C_region', 'V_segment', 'scRNA', 'ncRNA_pseudogene', 'lncRNA', 'C_region_pseudogene', 'snRNA', 'tRNA', 'guide_RNA', 'misc_RNA', 'D_segment', 'transcribed_pseudogene', 'J_segment', 'antisense_RNA', 'Y_RNA', 'miRNA', 'protein_coding', 'pseudogene', 'other', 'RNase_MRP_RNA', 'V_segment_pseudogene', 'snoRNA', 'J_segment_pseudogene'}


Unnamed: 0,refseq,gencode,count,transcript_type
54,protein_coding,IG_V_gene,1.0,1.0
55,protein_coding,TEC,3.0,3.0
59,protein_coding,polymorphic_pseudogene,54.0,54.0
60,protein_coding,processed_pseudogene,5.0,5.0
63,protein_coding,sense_intronic,1.0,1.0
64,protein_coding,sense_overlapping,3.0,3.0
65,protein_coding,transcribed_processed_pseudogene,1.0,1.0
66,protein_coding,transcribed_unitary_pseudogene,8.0,8.0
67,protein_coding,transcribed_unprocessed_pseudogene,37.0,37.0
68,protein_coding,unitary_pseudogene,2.0,2.0


In [4]:
# now need entries from the original tab that correspond to the identified pairs of types
df_tab_final = df_tab[(df_tab["gene_biotype"].isin(include_refseq_either))&(df_tab["gene_type"].isin(include_gencode_either))|
       (df_tab["gene_biotype"].isin(include_refseq_either))&(df_tab["gene_type"].isin(additional_gencode_for_lncRNA_refseq))]
df_tab_final.head()
df_tab_final[df_tab_final["transcript_type"]=="transcribed_unprocessed_pseudogene"]
print(len(df_tab_final))

54166


In [5]:
# now need to create a new GFF

# begin by separating into a hierarchy
has_parent = set(df[df["attributes"].str.contains("Parent=")]["type"])
print(has_parent)
has_no_parent = set(df[~(df["attributes"].str.contains("Parent="))]["type"])
print(has_no_parent)
lower_df = df[df["type"].isin(has_parent)].reset_index(drop=True) # contains lower end of the hierarchy
lower_df["parent"] = lower_df["attributes"].str.split("Parent=",expand=True)[1].str.split(";",expand=True)[0]
print("number of unique parents before subsetting is: "+str(len(set(lower_df["parent"]))))
display(lower_df.head())
upper_df = df[df["type"].isin(has_no_parent)].reset_index(drop=True) #contains upper end of the hierarchy
upper_df["transid"] = upper_df["attributes"].str.split("ID=",expand=True)[1].str.split(";",expand=True)[0]
upper_df["geneid"] = upper_df["attributes"].str.split("geneID=",expand=True)[1].str.split(";",expand=True)[0]
print("number of transcripts before subsetting is: "+str(len(upper_df["transid"])))
print("number of unique transcripts before subsetting is: "+str(len(set(upper_df["transid"]))))
print("number of genes before subsetting is "+str(len(upper_df["geneid"])))
print("number of unique genes before subsetting is: "+str(len(set(upper_df["geneid"]))))
display(upper_df.head())

{'exon', 'CDS'}
{'J_gene_segment', 'pseudogene', 'mRNA', 'rRNA', 'C_gene_segment', 'lnc_RNA', 'snRNA', 'snoRNA', 'antisense_RNA', 'V_gene_segment', 'transcript', 'guide_RNA'}
number of unique parents before subsetting is: 61879


Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,parent
0,chr1,Curated Genomic,exon,258511,259036,.,-,.,Parent=gene-RPL23AP21,gene-RPL23AP21
1,chr1,BestRefSeq,exon,490756,494898,.,-,.,Parent=rna-NR_028322.1,rna-NR_028322.1
2,chr1,BestRefSeq,exon,494992,495049,.,-,.,Parent=rna-NR_028322.1,rna-NR_028322.1
3,chr1,BestRefSeq,exon,495277,495445,.,-,.,Parent=rna-NR_028322.1,rna-NR_028322.1
4,chr1,BestRefSeq,exon,827591,827775,.,+,.,Parent=rna-NR_047526.1,rna-NR_047526.1


number of transcripts before subsetting is: 61879
number of unique transcripts before subsetting is: 61879
number of genes before subsetting is 61879
number of unique genes before subsetting is: 20892


Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,transid,geneid
0,chr1,Curated Genomic,pseudogene,258511,259036,.,-,.,ID=gene-RPL23AP21;geneID=gene-RPL23AP21;gene_n...,gene-RPL23AP21,gene-RPL23AP21
1,chr1,BestRefSeq,lnc_RNA,490756,495445,.,-,.,ID=rna-NR_028322.1;geneID=gene-LOC100132287;ge...,rna-NR_028322.1,gene-LOC100132287
2,chr1,BestRefSeq,lnc_RNA,827591,843604,.,+,.,ID=rna-NR_047526.1;geneID=gene-LINC01128;gene_...,rna-NR_047526.1,gene-LINC01128
3,chr1,BestRefSeq,lnc_RNA,827591,859446,.,+,.,ID=rna-NR_047519.1;geneID=gene-LINC01128;gene_...,rna-NR_047519.1,gene-LINC01128
4,chr1,BestRefSeq,lnc_RNA,827591,859446,.,+,.,ID=rna-NR_047521.1;geneID=gene-LINC01128;gene_...,rna-NR_047521.1,gene-LINC01128


In [6]:
# now need to get the correct genes

# first select transcripts
upper_df = upper_df[(upper_df["transid"].isin(set(df_tab_final["gencodeID"])))|
                    (upper_df["transid"].isin(set(df_tab_final["refseqID"])))]
upper_df["type"] = "transcript"
print(len(upper_df))
upper_df.head()

# second - select exons
lower_df = lower_df[(lower_df["parent"].isin(set(df_tab_final["gencodeID"])))|
                    (lower_df["parent"].isin(set(df_tab_final["refseqID"])))]
print(len(lower_df))
print(len(set(lower_df["parent"])))
lower_df.head()

54166
982115
54166


Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,parent
4,chr1,BestRefSeq,exon,827591,827775,.,+,.,Parent=rna-NR_047526.1,rna-NR_047526.1
5,chr1,BestRefSeq,exon,829003,829104,.,+,.,Parent=rna-NR_047526.1,rna-NR_047526.1
6,chr1,BestRefSeq,exon,841200,843604,.,+,.,Parent=rna-NR_047526.1,rna-NR_047526.1
7,chr1,BestRefSeq,exon,827591,827775,.,+,.,Parent=rna-NR_047519.1,rna-NR_047519.1
8,chr1,BestRefSeq,exon,829003,829104,.,+,.,Parent=rna-NR_047519.1,rna-NR_047519.1


In [7]:
# now to join the upper and lower
full_df = pd.concat([upper_df,lower_df],axis=0)
# now to get the parent and transid in the same column
full_df["parent"] = np.where(full_df["parent"].isnull(),full_df["transid"],full_df["parent"])
# now to sort the transcripts and exons and CDS accordingly
full_df["type"]=pd.Categorical(full_df["type"],categories=["transcript","exon","CDS"],ordered=True)
full_df=full_df.sort_values(by=["parent","type"]).reset_index(drop=True)
# now to select only the relevant columns and order them
full_df = full_df[gff3Cols]
full_df.to_csv("./prepAnnotation/refseq_vs_gencode_intersection.prot_lnc.gff",sep='\t',index=False,header=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [53]:
# now need to add genes to the intersection
full_df_tr = full_df[full_df["type"]=="transcript"].reset_index(drop=True)
full_df_tr["gene_id"] = full_df_tr.attributes.str.split("gene_id=",expand=True)[1].str.split(";",expand=True)[0]
full_df_tr.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,gene_id
0,chr7,HAVANA,transcript,127588345,127591705,.,+,.,ID=ENST00000000233.9;gene_id=ENSG00000004059.1...,ENSG00000004059.10
1,chr12,HAVANA,transcript,8940365,8949955,.,-,.,ID=ENST00000000412.7;gene_id=ENSG00000003056.7...,ENSG00000003056.7
2,chr2,HAVANA,transcript,72129238,72148038,.,-,.,ID=ENST00000001146.6;gene_id=ENSG00000003137.8...,ENSG00000003137.8
3,chr4,HAVANA,transcript,11393150,11429765,.,-,.,ID=ENST00000002596.5;gene_id=ENSG00000002587.9...,ENSG00000002587.9
4,chr3,HAVANA,transcript,50155129,50189075,.,+,.,ID=ENST00000002829.7;gene_id=ENSG00000001617.1...,ENSG00000001617.11


In [54]:
set_geneID = set(full_df_tr[~full_df_tr["gene_id"].isnull()]["source"])
print(set_geneID)
set_no_geneID = set(full_df_tr[full_df_tr["gene_id"].isnull()]["source"])
print(set_no_geneID)
assert len(set_geneID.intersection(set_no_geneID)) == 0, "there's an overlap"

{'HAVANA', 'ENSEMBL'}
{'Gnomon', 'BestRefSeq', 'Curated Genomic'}


In [63]:
full_df_tr["geneID"] = full_df_tr.attributes.str.split("geneID=",expand=True)[1].str.split(";",expand=True)[0]
full_df_tr["geneID"] = np.where(full_df_tr.geneID.isnull(),
                        full_df_tr.attributes.str.split("gene_id=",expand=True)[1].str.split(";",expand=True)[0],
                        full_df_tr.geneID)
full_df_tr.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,gene_id,geneID
0,chr7,HAVANA,transcript,127588345,127591705,.,+,.,ID=ENST00000000233.9;gene_id=ENSG00000004059.1...,ENSG00000004059.10,ENSG00000004059.10
1,chr12,HAVANA,transcript,8940365,8949955,.,-,.,ID=ENST00000000412.7;gene_id=ENSG00000003056.7...,ENSG00000003056.7,ENSG00000003056.7
2,chr2,HAVANA,transcript,72129238,72148038,.,-,.,ID=ENST00000001146.6;gene_id=ENSG00000003137.8...,ENSG00000003137.8,ENSG00000003137.8
3,chr4,HAVANA,transcript,11393150,11429765,.,-,.,ID=ENST00000002596.5;gene_id=ENSG00000002587.9...,ENSG00000002587.9,ENSG00000002587.9
4,chr3,HAVANA,transcript,50155129,50189075,.,+,.,ID=ENST00000002829.7;gene_id=ENSG00000001617.1...,ENSG00000001617.11,ENSG00000001617.11


In [65]:
assert len(full_df_tr[full_df_tr["geneID"].isnull()]) == 0, "some gene ids have not been extracted"

In [68]:
# now we need merge two types of geneIDs together
full_df_tr["trans_parent"] = full_df_tr.geneID
full_df_tr.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,gene_id,geneID,trans_parent
0,chr7,HAVANA,transcript,127588345,127591705,.,+,.,ID=ENST00000000233.9;gene_id=ENSG00000004059.1...,ENSG00000004059.10,ENSG00000004059.10,ENSG00000004059.10
1,chr12,HAVANA,transcript,8940365,8949955,.,-,.,ID=ENST00000000412.7;gene_id=ENSG00000003056.7...,ENSG00000003056.7,ENSG00000003056.7,ENSG00000003056.7
2,chr2,HAVANA,transcript,72129238,72148038,.,-,.,ID=ENST00000001146.6;gene_id=ENSG00000003137.8...,ENSG00000003137.8,ENSG00000003137.8,ENSG00000003137.8
3,chr4,HAVANA,transcript,11393150,11429765,.,-,.,ID=ENST00000002596.5;gene_id=ENSG00000002587.9...,ENSG00000002587.9,ENSG00000002587.9,ENSG00000002587.9
4,chr3,HAVANA,transcript,50155129,50189075,.,+,.,ID=ENST00000002829.7;gene_id=ENSG00000001617.1...,ENSG00000001617.11,ENSG00000001617.11,ENSG00000001617.11


In [69]:
print("number of genes = "+str(len(set(full_df_tr["trans_parent"]))))
print("number of genes in upper: "+str(len(set(upper_df["geneid"]))))
print("number of geneID extracted genes from full_df_tr: "+str(len(set(full_df_tr["geneID"]))))
len(set(full_df_tr["attributes"].str.split("geneID=",expand=True)[1].str.split(";",expand=True)[0]))

number of genes = 32522
number of genes in upper: 17282
number of geneID extracted genes from full_df_tr: 32522


17282

In [78]:
# now let's see if we can merge transcripts together by genes
gene_df = full_df_tr.groupby(by="trans_parent").agg({"start":"min",
                                                   "end":"max",
                                                   "strand":lambda row: ",".join(list(set(row))),
                                                   "seqid":lambda row: ",".join(list(set(row))),
                                                   "geneID":"count"}).reset_index()
gene_df["score"] = "."
gene_df["phase"] = "."
gene_df["source"]= "intersect"
gene_df["type"] = "gene"
gene_df["attributes"] = "ID="+gene_df["trans_parent"]
gene_df = gene_df[gff3Cols]
gene_df.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
0,chr1,intersect,gene,169849631,169893959,.,-,.,ID=ENSG00000000457.13
1,chr1,intersect,gene,169794730,169854080,.,+,.,ID=ENSG00000000460.16
2,chr1,intersect,gene,27612064,27635277,.,-,.,ID=ENSG00000000938.12
3,chr1,intersect,gene,196651878,196747504,.,+,.,ID=ENSG00000000971.15
4,chr6,intersect,gene,53497341,53545129,.,-,.,ID=ENSG00000001084.12


In [82]:
# now to modify the transcript entries to include Parent attribute
full_df_tr["attributes"] = full_df_tr.attributes.str.split(";",expand=True,n=1)[0]+\
                            ";Parent="+full_df_tr["trans_parent"]+";"+\
                            full_df_tr.attributes.str.split(";",expand=True,n=1)[1]
full_df_tr.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,gene_id,geneID,trans_parent
0,chr7,HAVANA,transcript,127588345,127591705,.,+,.,ID=ENST00000000233.9;Parent=ENSG00000004059.10...,ENSG00000004059.10,ENSG00000004059.10,ENSG00000004059.10
1,chr12,HAVANA,transcript,8940365,8949955,.,-,.,ID=ENST00000000412.7;Parent=ENSG00000003056.7;...,ENSG00000003056.7,ENSG00000003056.7,ENSG00000003056.7
2,chr2,HAVANA,transcript,72129238,72148038,.,-,.,ID=ENST00000001146.6;Parent=ENSG00000003137.8;...,ENSG00000003137.8,ENSG00000003137.8,ENSG00000003137.8
3,chr4,HAVANA,transcript,11393150,11429765,.,-,.,ID=ENST00000002596.5;Parent=ENSG00000002587.9;...,ENSG00000002587.9,ENSG00000002587.9,ENSG00000002587.9
4,chr3,HAVANA,transcript,50155129,50189075,.,+,.,ID=ENST00000002829.7;Parent=ENSG00000001617.11...,ENSG00000001617.11,ENSG00000001617.11,ENSG00000001617.11


In [85]:
# now separate out exons and CDS and concatenate them with genes and transcripts and then sort accordingly
lower_df = full_df[full_df["type"].isin(has_parent)].reset_index(drop=True)
lower_df = lower_df[gff3Cols]
full_df_tr = full_df_tr[gff3Cols]
resdf = pd.concat([gene_df,full_df_tr,lower_df],axis=0)
resdf.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
0,chr1,intersect,gene,169849631,169893959,.,-,.,ID=ENSG00000000457.13
1,chr1,intersect,gene,169794730,169854080,.,+,.,ID=ENSG00000000460.16
2,chr1,intersect,gene,27612064,27635277,.,-,.,ID=ENSG00000000938.12
3,chr1,intersect,gene,196651878,196747504,.,+,.,ID=ENSG00000000971.15
4,chr6,intersect,gene,53497341,53545129,.,-,.,ID=ENSG00000001084.12


In [87]:
# lastly we need to sort the gff correctly

lower_df["transID"]=lower_df.attributes.str.split("Parent=",expand=True)[1].str.split(";",expand=True)[0]
assert len(lower_df[lower_df["transID"].isnull()])==0,"some parents are missing from exons"
lower_df.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,transID
0,chr7,HAVANA,exon,127588345,127588565,.,+,.,Parent=ENST00000000233.9,ENST00000000233.9
1,chr7,HAVANA,exon,127589083,127589163,.,+,.,Parent=ENST00000000233.9,ENST00000000233.9
2,chr7,HAVANA,exon,127589485,127589594,.,+,.,Parent=ENST00000000233.9,ENST00000000233.9
3,chr7,HAVANA,exon,127590066,127590137,.,+,.,Parent=ENST00000000233.9,ENST00000000233.9
4,chr7,HAVANA,exon,127590963,127591088,.,+,.,Parent=ENST00000000233.9,ENST00000000233.9
5,chr7,HAVANA,exon,127591213,127591705,.,+,.,Parent=ENST00000000233.9,ENST00000000233.9
6,chr7,HAVANA,CDS,127588499,127588565,.,+,0,Parent=ENST00000000233.9,ENST00000000233.9
7,chr7,HAVANA,CDS,127589083,127589163,.,+,2,Parent=ENST00000000233.9,ENST00000000233.9
8,chr7,HAVANA,CDS,127589485,127589594,.,+,2,Parent=ENST00000000233.9,ENST00000000233.9
9,chr7,HAVANA,CDS,127590066,127590137,.,+,0,Parent=ENST00000000233.9,ENST00000000233.9


In [89]:
full_df_tr["geneID"]=full_df_tr.attributes.str.split("Parent=",expand=True)[1].str.split(";",expand=True)[0]
assert len(full_df_tr[full_df_tr["geneID"].isnull()])==0,"some parents are missing from transcripts"
full_df_tr["transID"]=full_df_tr.attributes.str.split("ID=",expand=True)[1].str.split(";",expand=True)[0]
assert len(full_df_tr[full_df_tr["transID"].isnull()])==0,"some IDs are missing from transcripts"
full_df_tr.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,geneID,transID
0,chr7,HAVANA,transcript,127588345,127591705,.,+,.,ID=ENST00000000233.9;Parent=ENSG00000004059.10...,ENSG00000004059.10,ENST00000000233.9
1,chr12,HAVANA,transcript,8940365,8949955,.,-,.,ID=ENST00000000412.7;Parent=ENSG00000003056.7;...,ENSG00000003056.7,ENST00000000412.7
2,chr2,HAVANA,transcript,72129238,72148038,.,-,.,ID=ENST00000001146.6;Parent=ENSG00000003137.8;...,ENSG00000003137.8,ENST00000001146.6
3,chr4,HAVANA,transcript,11393150,11429765,.,-,.,ID=ENST00000002596.5;Parent=ENSG00000002587.9;...,ENSG00000002587.9,ENST00000002596.5
4,chr3,HAVANA,transcript,50155129,50189075,.,+,.,ID=ENST00000002829.7;Parent=ENSG00000001617.11...,ENSG00000001617.11,ENST00000002829.7


In [91]:
gene_df["geneID"]=gene_df.attributes.str.split("ID=",expand=True)[1].str.split(";",expand=True)[0]
assert len(gene_df[gene_df["geneID"].isnull()])==0,"some IDs are missing from genes"
gene_df["transID"]=np.inf
gene_df.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,geneID,transID
0,chr1,intersect,gene,169849631,169893959,.,-,.,ID=ENSG00000000457.13,ENSG00000000457.13,inf
1,chr1,intersect,gene,169794730,169854080,.,+,.,ID=ENSG00000000460.16,ENSG00000000460.16,inf
2,chr1,intersect,gene,27612064,27635277,.,-,.,ID=ENSG00000000938.12,ENSG00000000938.12,inf
3,chr1,intersect,gene,196651878,196747504,.,+,.,ID=ENSG00000000971.15,ENSG00000000971.15,inf
4,chr6,intersect,gene,53497341,53545129,.,-,.,ID=ENSG00000001084.12,ENSG00000001084.12,inf


In [93]:
lower_df=lower_df.merge(full_df_tr[["transID","geneID"]],how="left",left_on="transID",right_on="transID")

dfM=pd.concat([gene_df[["seqid","source","type","start","end","score","strand","phase","attributes","transID","geneID"]],
        full_df_tr[["seqid","source","type","start","end","score","strand","phase","attributes","transID","geneID"]],
        lower_df[["seqid","source","type","start","end","score","strand","phase","attributes","transID","geneID"]]],axis=0).reset_index(drop=True)
dfM["type"]=pd.Categorical(dfM["type"],categories=["gene","transcript","exon","CDS"],ordered=True)
dfM=dfM.sort_values(by=["seqid","strand","geneID","transID","type"]).reset_index(drop=True)

In [95]:
# lastly need to remove alternative scaffolds
setChr=set(dfM['seqid'])
alts = set(dfM[dfM["seqid"].str.endswith("_alt")]["seqid"])
prims = ["chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13",
         "chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chrX","chrY","chrM"]
rest = setChr - (alts.union(set(prims)))

subdf = dfM[dfM["seqid"].isin(set(prims).union(set(rest)))].reset_index(drop=True)
subdf.to_csv("./prepAnnotation/refseq_vs_gencode_intersection.prot_lnc.no_alts.gff",sep="\t",index=False,header=False)