In [1]:
import pandas as pd
import numpy as np
import os

gff3Cols=["seqid","source","type","start","end","score","strand","phase","attributes"]
outDir="./res_tmp_full"

In [2]:
df=pd.read_csv(outDir+"/dfGFF_blat.gtf",sep="\t",names=gff3Cols)

df["id"]=df.attributes.str.split("ID=",expand=True)[1].str.split(";",expand=True)[0]
dfE=df[df["type"]=="exon"].reset_index(drop=True)
dfE["transID"]=dfE.attributes.str.split("Parent=",expand=True)[1].str.split(";",expand=True)[0]
transcriptSet=set(dfE["transID"])
dfT=df[df["id"].isin(transcriptSet)].reset_index(drop=True)
assert len(set(dfT["id"]))==len(transcriptSet)
dfT["transID"]=dfT["id"]

# here's the time to deal with inconsistencies
# first remove any enties that do not have a parent - those do not conform to the expected structure that we define
# however, this way we remove PSEUDOGENES, which may not be the desired effect - need to think more/discuss
dfE=dfE[dfE["transID"].isin(dfT["transID"])].reset_index(drop=True)
dfT["type"]="transcript"

dfE=dfE.merge(dfT[["transID"]],how="left",left_on="transID",right_on="transID")

# now need to concatenate all these three dataframes
dfM=pd.concat([dfT[["seqid","source","type","start","end","score","strand","phase","attributes","transID"]],dfE[["seqid","source","type","start","end","score","strand","phase","attributes","transID"]]],axis=0).reset_index(drop=True)
dfM["type"]=pd.Categorical(dfM["type"],categories=["transcript","exon"],ordered=True)
dfM=dfM.sort_values(by=["transID","type"]).reset_index(drop=True)
del dfE
del dfT
del df

dfM["start"]=dfM["start"].astype(int)+1
dfM["end"]=dfM["end"].astype(int)
# some pseudogenes are still present

dfM_g=dfM[dfM["type"].isin(["exon"])][["seqid","strand","type","start","end","transID"]].reset_index(drop=True)
dfM_g=dfM_g.sort_values(by=["transID","start"]).reset_index(drop=True)
dfM_g["ns"]=dfM_g.start.astype(str).shift(-1)
dfM_g["nid"]=dfM_g.transID.shift(-1)
dfM_g["ns"]=np.where(dfM_g["transID"]==dfM_g["nid"],dfM_g["ns"],"***")
dfM_g["ns"]=dfM_g.ns.astype(str)+"+"
dfM_g["ends"]=dfM_g.end.astype(str)+"+"
dfM_g=dfM_g[~(dfM_g['ns']=="***+")].reset_index(drop=True)
dfM_g=dfM_g.groupby(by="transID").agg({'start':'min',\
                                       'ns':'sum',\
                                       'ends':'sum',\
                                       'end':'max',\
                                       'strand':'min',\
                                       'seqid':'min',\
                                       'nid':'count'})
dfM_g.rename({"nid":'count'},axis=1,inplace=True)
dfM_g.reset_index(inplace=True)
dfM_g.columns=['transID','start_min',"ns",'end','end_max','strand','seqid','count']
dfM_g["ns"]=dfM_g["ns"].str.rstrip("***+")
dfM_g["end"]=dfM_g["end"].str.rstrip("***+")
dfM_g["chain"]=dfM_g['seqid']+":"+dfM_g['strand']+'@'+dfM_g['end']+'-'+dfM_g['ns']
dfM_g=dfM_g[['transID','start_min','end_max','chain','count']]

del transcriptSet
dfM=dfM[gff3Cols+["transID"]]
dfM

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,transID
0,CM000675.2,ref,transcript,18834403,18871969,.,-,.,ID=0,0
1,CM000675.2,ref,exon,18834403,18871969,.,-,.,Parent=0,0
2,CM000675.2,ref,transcript,113946084,113954541,.,-,.,ID=1,1
3,CM000675.2,ref,exon,113946084,113951962,.,-,.,Parent=1,1
4,CM000675.2,ref,exon,113953503,113954541,.,-,.,Parent=1,1
5,CM000675.2,ref,transcript,113956426,113972373,.,-,.,ID=10,10
6,CM000675.2,ref,exon,113956426,113956839,.,-,.,Parent=10,10
7,CM000675.2,ref,exon,113964927,113964994,.,-,.,Parent=10,10
8,CM000675.2,ref,exon,113967341,113967388,.,-,.,Parent=10,10
9,CM000675.2,ref,exon,113968467,113968514,.,-,.,Parent=10,10


In [4]:
df=pd.read_csv(outDir+"/chessPrim.gff",sep='\t',names=gff3Cols)
df=df[df["type"].isin(['transcript','exon'])].reset_index(drop=True)

df["id"]=df.attributes.str.split("ID=",expand=True)[1].str.split(";",expand=True)[0]
dfE=df[df["type"]=="exon"].reset_index(drop=True)
dfE["transID"]=dfE.attributes.str.split("Parent=",expand=True)[1].str.split(";",expand=True)[0]
transcriptSet=set(dfE["transID"])
dfT=df[df["id"].isin(transcriptSet)].reset_index(drop=True)
assert len(set(dfT["id"]))==len(transcriptSet)
dfT["transID"]=dfT["id"]

# here's the time to deal with inconsistencies
# first remove any enties that do not have a parent - those do not conform to the expected structure that we define
# however, this way we remove PSEUDOGENES, which may not be the desired effect - need to think more/discuss
dfE=dfE[dfE["transID"].isin(dfT["transID"])].reset_index(drop=True)
dfT["type"]="transcript"

dfE=dfE.merge(dfT[["transID"]],how="left",left_on="transID",right_on="transID")

# now need to concatenate all these three dataframes
dfM_2=pd.concat([dfT[["seqid","source","type","start","end","score","strand","phase","attributes","transID"]],dfE[["seqid","source","type","start","end","score","strand","phase","attributes","transID"]]],axis=0).reset_index(drop=True)
dfM_2["type"]=pd.Categorical(dfM_2["type"],categories=["transcript","exon"],ordered=True)
dfM_2=dfM_2.sort_values(by=["transID","type"]).reset_index(drop=True)
del dfE
del dfT
del df

dfM_2["start"]=dfM_2["start"].astype(int)
dfM_2["end"]=dfM_2["end"].astype(int)
# some pseudogenes are still present

dfM_2_g=dfM_2[dfM_2["type"].isin(["exon"])][["seqid","strand","type","start","end","transID"]].reset_index(drop=True)
dfM_2_g=dfM_2_g.sort_values(by=["transID","start"]).reset_index(drop=True)
dfM_2_g["ns"]=dfM_2_g.start.astype(str).shift(-1)
dfM_2_g["nid"]=dfM_2_g.transID.shift(-1)
dfM_2_g["ns"]=np.where(dfM_2_g["transID"]==dfM_2_g["nid"],dfM_2_g["ns"],"***")
dfM_2_g["ns"]=dfM_2_g.ns.astype(str)+"+"
dfM_2_g["ends"]=dfM_2_g.end.astype(str)+"+"
dfM_2_g=dfM_2_g[~(dfM_2_g['ns']=="***+")].reset_index(drop=True)
dfM_2_g=dfM_2_g.groupby(by="transID").agg({'start':'min','ns':'sum','ends':'sum','end':'max','strand':'min','seqid':'min','nid':'count'})
dfM_2_g.rename({"nid":'count'},axis=1,inplace=True)
dfM_2_g.reset_index(inplace=True)
dfM_2_g.columns=['transID','start_min',"ns",'end','end_max','strand','seqid','count']
dfM_2_g["ns"]=dfM_2_g["ns"].str.rstrip("***+")
dfM_2_g["end"]=dfM_2_g["end"].str.rstrip("***+")
dfM_2_g["chain"]=dfM_2_g['seqid']+":"+dfM_2_g['strand']+'@'+dfM_2_g['end']+'-'+dfM_2_g['ns']
dfM_2_g=dfM_2_g[['transID','start_min','end_max','chain','count']]

del transcriptSet
dfM_2=dfM_2[gff3Cols+["transID"]]
dfM_2

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,transID
0,CM000663.2,BestRefSeq,transcript,11874,14409,.,+,.,ID=CHS.1.1;Parent=CHS.1;gene_name=DDX11L1;Dbxr...,CHS.1.1
1,CM000663.2,BestRefSeq,exon,11874,12227,.,+,.,Parent=CHS.1.1,CHS.1.1
2,CM000663.2,BestRefSeq,exon,12613,12721,.,+,.,Parent=CHS.1.1,CHS.1.1
3,CM000663.2,BestRefSeq,exon,13221,14409,.,+,.,Parent=CHS.1.1,CHS.1.1
4,CM000663.2,Gnomon,transcript,184878,199860,.,-,.,ID=CHS.10.1;Parent=CHS.10;gene_name=LOC1027238...,CHS.10.1
5,CM000663.2,Gnomon,exon,184878,185350,.,-,.,Parent=CHS.10.1,CHS.10.1
6,CM000663.2,Gnomon,exon,185491,185559,.,-,.,Parent=CHS.10.1,CHS.10.1
7,CM000663.2,Gnomon,exon,186317,186469,.,-,.,Parent=CHS.10.1,CHS.10.1
8,CM000663.2,Gnomon,exon,187129,187287,.,-,.,Parent=CHS.10.1,CHS.10.1
9,CM000663.2,Gnomon,exon,187380,187577,.,-,.,Parent=CHS.10.1,CHS.10.1


In [9]:
# Step 1 - check if there are any that correspond to any intron chains entirely
#          these are the ones that already exist
intron_compatible=dfM_2_g.merge(dfM_g,how='inner',on='chain')
intron_compatible=intron_compatible[['transID_x','transID_y']]
intron_compatible.columns=['transID_prim','transID_alt']
intron_compatible.to_csv(outDir+"/intron_compatible.csv",index=False)

# step 2, check any single-exon transcripts that are contained within any other single-exon transcripts
singleDF_2=dfM_2[~(dfM_2["transID"].isin(dfM_2_g["transID"]))&(dfM_2['type']=='transcript')].reset_index(drop=True)
singleDF=dfM[~(dfM["transID"].isin(dfM_g["transID"]))&(dfM['type']=='transcript')].reset_index(drop=True)
singleDF=singleDF_2.merge(singleDF,on=['seqid','start','end','strand'],how="inner")
singleDF=singleDF[["transID_x","transID_y"]].reset_index(drop=True)
singleDF.columns=['transID_prim','transID_alt']
singleDF.to_csv(outDir+"/singleDF.csv",index=False)

# Step 2.5 we can now take the dataframes above, concatenate them and process
# these transcripts belong to genes that are guaranteed to exist on the primary scaffolds so we can move them
# and the rest of the sibling transcripts to the primary scaffolds
# and remove from the alternative scaffolds
perfectMatch=pd.concat([intron_compatible,singleDF])
mapDF=pd.read_csv(outDir+"/blat_map.csv")
mapDF['uid']=mapDF['uid'].astype(str)
perfectMatch=perfectMatch.merge(mapDF,how='left',left_on='transID_alt',right_on='uid')
perfectMatch.drop('uid',axis=1,inplace=True)
perfectMatch.columns=['transID_prim','transID_alt','transID_alt_2']
perfectMatch.to_csv(outDir+"/perfectMatch_blat.csv",index=False)
perfectMatch

Unnamed: 0,transID_prim,transID_alt,transID_alt_2
0,CHS.10151.4,4545,CHS.59603.1
1,CHS.10152.14,4531,CHS.59604.1
2,CHS.10152.15,4538,CHS.59604.2
3,CHS.10152.16,4539,CHS.59604.3
4,CHS.10152.17,4543,CHS.59604.4
5,CHS.10154.2,4654,CHS.59606.1
6,CHS.10155.1,4536,CHS.59607.1
7,CHS.10155.2,4541,CHS.59607.2
8,CHS.10155.3,4557,CHS.59607.3
9,CHS.10155.4,4555,CHS.59607.4


In [13]:
# now we can also add any other transcripts that belong to same alt genes

dfT=dfM[dfM["type"]=="transcript"].reset_index(drop=True)
dfT=dfT.merge(mapDF,how='left',left_on='transID',right_on='uid')
dfT['geneID_alt']="CHS."+dfT.qName.str.split(".",expand=True)[1]
perfectMatch["geneID_alt"]="CHS."+perfectMatch.transID_alt_2.str.split(".",expand=True)[1]
perfectMatch["geneID_prim"]="CHS."+perfectMatch.transID_prim.str.split(".",expand=True)[1]
dfT=dfT[dfT['geneID_alt'].isin(set(perfectMatch['geneID_alt']))].reset_index(drop=True)
dfT=dfT.merge(perfectMatch[["geneID_alt",'geneID_prim']],how='left',left_on='geneID_alt',right_on='geneID_alt')
dfT[["qName",'geneID_prim']].to_csv(outDir+"/additionalTrans_blat.csv",index=False)
dfT[["qName",'geneID_prim']]

Unnamed: 0,qName,geneID_prim
0,CHS.14872.1,CHS.13462
1,CHS.59631.1,CHS.14839
2,CHS.59632.7,CHS.14842
3,CHS.59632.7,CHS.14842
4,CHS.59632.7,CHS.14842
5,CHS.59632.7,CHS.14842
6,CHS.59632.7,CHS.14842
7,CHS.59632.7,CHS.14842
8,CHS.20830.6,CHS.19178
9,CHS.20830.6,CHS.19178


In [10]:
df=pd.read_csv(outDir+"/chessPrim.gff",sep="\t",names=gff3Cols)

df=df[df["type"]=="exon"].reset_index(drop=True)
df["transID"]=df.attributes.str.split("Parent=",expand=True)[1].str.split(";",expand=True)[0]
df["start"]=df["start"].astype(int)
df["end"]=df["end"].astype(int)
# some pseudogenes are still present

dfI_2=df[["seqid","strand","type","start","end","transID"]].reset_index(drop=True)
dfI_2=dfI_2.sort_values(by=["transID","start"]).reset_index(drop=True)
dfI_2["ns"]=dfI_2.start.astype(str).shift(-1)
dfI_2["nid"]=dfI_2.transID.shift(-1)
dfI_2["ns"]=np.where(dfI_2["transID"]==dfI_2["nid"],dfI_2["ns"],"***")
dfI_2["ns"]=dfI_2.ns
dfI_2["ends"]=dfI_2.end
dfI_2["start"]=dfI_2.ends
dfI_2["end"]=dfI_2.ns
dfI_2=dfI_2[~(dfI_2["end"]=="***")].reset_index(drop=True)
dfI_2.drop(["ns","nid","ends"],axis=1,inplace=True)
dfI_2["end"]=dfI_2["end"].astype(int)
dfI_2["id"]=dfI_2["seqid"]+":"+dfI_2["strand"]+"@"+dfI_2["start"].astype(str)+"-"+dfI_2["end"].astype(str)

dfI_2

Unnamed: 0,seqid,strand,type,start,end,transID,id
0,CM000663.2,+,exon,12227,12613,CHS.1.1,CM000663.2:+@12227-12613
1,CM000663.2,+,exon,12721,13221,CHS.1.1,CM000663.2:+@12721-13221
2,CM000663.2,-,exon,185350,185491,CHS.10.1,CM000663.2:-@185350-185491
3,CM000663.2,-,exon,185559,186317,CHS.10.1,CM000663.2:-@185559-186317
4,CM000663.2,-,exon,186469,187129,CHS.10.1,CM000663.2:-@186469-187129
5,CM000663.2,-,exon,187287,187380,CHS.10.1,CM000663.2:-@187287-187380
6,CM000663.2,-,exon,187577,187755,CHS.10.1,CM000663.2:-@187577-187755
7,CM000663.2,-,exon,187890,188130,CHS.10.1,CM000663.2:-@187890-188130
8,CM000663.2,-,exon,188266,188439,CHS.10.1,CM000663.2:-@188266-188439
9,CM000663.2,-,exon,188584,188791,CHS.10.1,CM000663.2:-@188584-188791


In [11]:
# merge two dataframes on introns
dfI=dfI_2.merge(dfI,on="id",how="inner")

# extract geneID
dfI["geneID"]="CHS."+dfI["transID_x"].str.split(".",expand=True)[1]
# check if any have more than one associated geneID (should not be the case ever)
oneExonShared=dfI.groupby(by="transID_y").agg({"geneID":{"genes":lambda x: set(x),"geneCount":lambda x: len(set(x))}}).reset_index()
oneExonShared.columns=["transID_alt","geneID_prim","geneCount_prim"]
oneExonShared

# separate into those with
oneExonShared_single=oneExonShared[oneExonShared["geneCount_prim"]==1].reset_index(drop=True)
oneExonShared_single.to_csv(outDir+"/oneIntronShared_single_gmap.csv",index=False)
oneExonShared_multi=oneExonShared[oneExonShared["geneCount_prim"]>1].reset_index(drop=True)
oneExonShared_multi.to_csv(outDir+"/oneIntronShared_multi_gmap.csv",index=False)
display(oneExonShared_single)
display(oneExonShared_multi)

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0,transID_alt,geneID_prim,geneCount_prim
0,1,{CHS.14839},1
1,10,{CHS.14842},1
2,100,{CHS.19178},1
3,1000,{CHS.49661},1
4,10000,{CHS.45955},1
5,10001,{CHS.46058},1
6,10002,{CHS.46049},1
7,10003,{CHS.45921},1
8,10004,{CHS.45921},1
9,10005,{CHS.45948},1


Unnamed: 0,transID_alt,geneID_prim,geneCount_prim
0,10032,"{CHS.46063, CHS.46056}",2
1,10052,"{CHS.45996, CHS.45995}",2
2,10053,"{CHS.45996, CHS.45995}",2
3,10059,"{CHS.46063, CHS.46056}",2
4,10068,"{CHS.45996, CHS.45997}",2
5,10069,"{CHS.45996, CHS.45997}",2
6,10070,"{CHS.45996, CHS.45997}",2
7,10071,"{CHS.45996, CHS.45997}",2
8,10072,"{CHS.45996, CHS.45997}",2
9,10093,"{CHS.46032, CHS.46031}",2


In [None]:
# move transcripts

# not yet

In [None]:
# step 4. see if there is significant overlap between any features
# perhaps with bedtools, maybe something else

In [None]:
# for the remainder, we should first find those alignments that have good alignment quality, and figure out which one whould be used

In [None]:
# first let's create geneIDs for all transcripts
# - need to identify overlapping transcripts
dfT=df[df["type"]=="transcript"].reset_index(drop=True)
dfT["start"]=dfT['start'].astype(int)
dfT["end"]=dfT['end'].astype(int)
dfT.sort_values(by=['seqid','strand','start','end'],ascending=True,inplace=True)
dfT.reset_index(drop=True,inplace=True)
dfT

dup_dfT=dfT[dfT.duplicated(["seqid","start","end","strand"],keep=False)].reset_index()
dup_dfT["transID"]=dup_dfT['attributes'].str.split("ID=",expand=True)[1].str.split(";",expand=True)[0]
dfT.groupby(by=["seqid","start","end","strand"]).agg({"type":'min',\
                                                       "source":'min',\
                                                       "score":'min',\
                                                       "phase":'min',\
                                                       "attributes":'min'}).reset_index()
dup_dfT

# now need to reassign transcript IDS to all exons that belong to duuplicated transcripts
dfE=df[df['type']=='exon'].reset_index(drop=True)
dfE_dupT

