In [7]:
import pandas as pd
import numpy as np
import os

# Column names applied to the header-less, tab-separated annotation files read below
gff3Cols=[
    "seqid",
    "source",
    "type",
    "start",
    "end",
    "score",
    "strand",
    "phase",
    "attributes",
]
# Directory that holds every input read and every output written by this notebook
outDir="./res_tmp_full"

In [8]:
# Step 1 - unify blat and gmap perfect gene maps

# perfect matches reported by each aligner
gmapDF_perfect=pd.read_csv(outDir+"/perfectMatch_gmap.csv")
blatDF_perfect=pd.read_csv(outDir+"/perfectMatch_blat.csv")
# additional transcripts reported by each aligner
gmapDF_rest=pd.read_csv(outDir+"/additionalTrans_gmap.csv")
blatDF_rest=pd.read_csv(outDir+"/additionalTrans_blat.csv")

# every transcript id that appears in any of the four tables
# (the id column is named differently in each table)
setAllUsedTranscripts_alt=set().union(
    gmapDF_perfect["transID_alt_2"],
    blatDF_perfect["transID_alt_2"],
    gmapDF_rest["QNAME"],
    blatDF_rest["qName"],
)
print(len(setAllUsedTranscripts_alt))

8930


In [9]:
# Step 2 - remove all that have been identified in the previous step by either gmap or blat

# uid <-> QNAME lookup; uid is cast to str so it can be joined against the string transID
mapDF=pd.read_csv(outDir+'/gmap_map.csv').astype({'uid':str})

df=(
    pd.read_csv(outDir+"/dfGFF_gmap.gtf",sep="\t",names=gff3Cols)
    # keep exon records only
    .loc[lambda rows: rows["type"]=="exon"]
    .reset_index(drop=True)
    # transID is whatever follows "Parent=" in the attributes, up to the next ";"
    .assign(transID=lambda rows: rows["attributes"].str.split("Parent=",expand=True)[1].str.split(";",expand=True)[0])
    # attach the QNAME that belongs to each uid
    .merge(mapDF,how='left',left_on='transID',right_on='uid')
    # drop the transcripts already collected in Step 1
    .loc[lambda rows: ~rows['QNAME'].isin(setAllUsedTranscripts_alt)]
)
df

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,transID,QNAME,uid
5,CM000670.2,ref,exon,1371622,1371935,.,-,.,Parent=2,2,CHS.54822.1,2
6,CM000670.2,ref,exon,1372364,1373222,.,-,.,Parent=2,2,CHS.54822.1,2
20,CM000670.2,ref,exon,1565509,1565739,.,-,.,Parent=8,8,CHS.54827.1,8
23,CM000670.2,ref,exon,12378651,12380286,.,-,.,Parent=10,10,CHS.54831.1,10
78,CM000670.2,ref,exon,144693006,144693813,.,-,.,Parent=30,30,CHS.54850.1,30
81,CM000670.2,ref,exon,144738004,144738707,.,-,.,Parent=32,32,CHS.54854.1,32
82,CM000670.2,ref,exon,144738813,144739653,.,-,.,Parent=32,32,CHS.54854.1,32
98,CM000670.2,ref,exon,143536356,143536552,.,-,.,Parent=37,37,CHS.54855.1,37
99,CM000670.2,ref,exon,143538101,143539419,.,-,.,Parent=37,37,CHS.54855.1,37
100,CM000670.2,ref,exon,143543023,143543098,.,-,.,Parent=37,37,CHS.54855.1,37


In [10]:
# Step 3 - find all that have at least one shared intron
#          these are the ones that need to be added as new transcripts to pre-existing genes

# to do this - create a dataframe with introns instead of exons
# Work on a private copy instead of assigning back into `df`: writing start+1 into `df`
# shifted the coordinates by one more base on every re-run of this cell, and `df` is a
# filtered slice, so the assignment also raised SettingWithCopyWarning.
gmapExons=df[["seqid","strand","type","start","end","transID"]].copy()
# NOTE(review): the +1 on exon starts is kept from the original cell; presumably it aligns
# these coordinates with the ones in chessPrim.gff - confirm.
gmapExons["start"]=gmapExons["start"].astype(int)+1
gmapExons["end"]=gmapExons["end"].astype(int)
# some pseudogenes are still present

# order the exons of each transcript by position so that consecutive rows are neighbours
gmapExons=gmapExons.sort_values(by=["transID","start"]).reset_index(drop=True)

# a row yields an intron only when the following row belongs to the same transcript;
# the last row compares against NaN and is therefore excluded as well
nextStart=gmapExons["start"].shift(-1)
hasNextExon=gmapExons["transID"]==gmapExons["transID"].shift(-1)

# intron = (end of this exon, start of the next exon of the same transcript)
dfI=gmapExons[hasNextExon].copy()
dfI["start"]=dfI["end"]
dfI["end"]=nextStart[hasNextExon].astype(int)
dfI=dfI.reset_index(drop=True)

# key used to match introns between annotations: seqid:strand@start-end
dfI["id"]=dfI["seqid"]+":"+dfI["strand"]+"@"+dfI["start"].astype(str)+"-"+dfI["end"].astype(str)

dfI.sort_values(by="id",ascending=True)

Unnamed: 0,seqid,strand,type,start,end,transID,id
260,CM000663.2,+,exon,13027622,13029340,10349,CM000663.2:+@13027622-13029340
261,CM000663.2,+,exon,13029651,13031765,10349,CM000663.2:+@13029651-13031765
262,CM000663.2,+,exon,13032343,13035403,10349,CM000663.2:+@13032343-13035403
253,CM000663.2,+,exon,143499497,143507500,10341,CM000663.2:+@143499497-143507500
233,CM000663.2,+,exon,143499497,143507500,10336,CM000663.2:+@143499497-143507500
240,CM000663.2,+,exon,143499497,143507500,10338,CM000663.2:+@143499497-143507500
241,CM000663.2,+,exon,143507593,143517220,10338,CM000663.2:+@143507593-143517220
254,CM000663.2,+,exon,143507593,143517220,10341,CM000663.2:+@143507593-143517220
234,CM000663.2,+,exon,143507593,143517220,10336,CM000663.2:+@143507593-143517220
242,CM000663.2,+,exon,143517338,143529105,10338,CM000663.2:+@143517338-143529105


In [11]:
# exon records of chessPrim.gff, each tagged with the transcript named in its Parent= attribute
df=pd.read_csv(outDir+"/chessPrim.gff",sep="\t",names=gff3Cols)
df=df.loc[df["type"]=="exon"].reset_index(drop=True)
df["transID"]=df["attributes"].str.split("Parent=",expand=True)[1].str.split(";",expand=True)[0]
df=df.astype({"start":int,"end":int})
# some pseudogenes are still present

# order the exons of each transcript by position so that consecutive rows are neighbours
primExons=df[["seqid","strand","type","start","end","transID"]]\
            .sort_values(by=["transID","start"])\
            .reset_index(drop=True)

# a row yields an intron only when the following row belongs to the same transcript
followingStart=primExons["start"].shift(-1)
sameTranscript=primExons["transID"]==primExons["transID"].shift(-1)

# intron = (end of this exon, start of the next exon of the same transcript)
dfI_2=primExons[sameTranscript].copy()
dfI_2["start"]=dfI_2["end"]
dfI_2["end"]=followingStart[sameTranscript].astype(int)
dfI_2=dfI_2.reset_index(drop=True)

# key used to match introns between annotations: seqid:strand@start-end
dfI_2["id"]=dfI_2["seqid"]+":"+dfI_2["strand"]+"@"+dfI_2["start"].astype(str)+"-"+dfI_2["end"].astype(str)

dfI_2

Unnamed: 0,seqid,strand,type,start,end,transID,id
0,CM000663.2,+,exon,12227,12613,CHS.1.1,CM000663.2:+@12227-12613
1,CM000663.2,+,exon,12721,13221,CHS.1.1,CM000663.2:+@12721-13221
2,CM000663.2,-,exon,185350,185491,CHS.10.1,CM000663.2:-@185350-185491
3,CM000663.2,-,exon,185559,186317,CHS.10.1,CM000663.2:-@185559-186317
4,CM000663.2,-,exon,186469,187129,CHS.10.1,CM000663.2:-@186469-187129
5,CM000663.2,-,exon,187287,187380,CHS.10.1,CM000663.2:-@187287-187380
6,CM000663.2,-,exon,187577,187755,CHS.10.1,CM000663.2:-@187577-187755
7,CM000663.2,-,exon,187890,188130,CHS.10.1,CM000663.2:-@187890-188130
8,CM000663.2,-,exon,188266,188439,CHS.10.1,CM000663.2:-@188266-188439
9,CM000663.2,-,exon,188584,188791,CHS.10.1,CM000663.2:-@188584-188791


In [12]:
# merge two dataframes on introns
# after the merge, *_x columns come from the primary annotation (dfI_2) and *_y from gmap (dfI)
dfI=dfI_2.merge(dfI,on="id",how="inner")
del dfI_2

# extract geneID: the second dot-separated field of the primary transcript id, prefixed with "CHS."
dfI["geneID"]="CHS."+dfI["transID_x"].str.split(".",expand=True)[1]

# check if any have more than one associated geneID (should not be the case ever)
# Collect, per gmap transcript, the set of primary genes it shares an intron with.
# The earlier nested-dict form agg({"geneID": {...}}) is deprecated and raises
# SpecificationError on pandas >= 1.0, so the set and its size are built explicitly.
oneExonShared=dfI.groupby(by="transID_y")["geneID"].apply(set)\
                 .rename_axis("transID_alt")\
                 .reset_index(name="geneID_prim")
oneExonShared["geneCount_prim"]=oneExonShared["geneID_prim"].map(len)

# separate into those with exactly one associated gene and those with several
oneExonShared_single=oneExonShared[oneExonShared["geneCount_prim"]==1].reset_index(drop=True)
# flatten the one-element set into a plain string (element-wise, so an empty frame is fine too)
oneExonShared_single['geneID_prim']=oneExonShared_single['geneID_prim'].map(",".join)
# translate the gmap uid back to the transcript name (QNAME)
oneExonShared_single=oneExonShared_single.merge(mapDF,how='left',left_on='transID_alt',right_on='uid')[['QNAME','geneID_prim']]
oneExonShared_single.to_csv(outDir+"/oneIntronShared_single_gmap.csv",index=False)

oneExonShared_multi=oneExonShared[oneExonShared["geneCount_prim"]>1].reset_index(drop=True)
# NOTE(review): geneID_prim is still a Python set here, so the CSV stores its repr - confirm
# that whatever reads this file expects that format.
oneExonShared_multi=oneExonShared_multi.merge(mapDF,how='left',left_on='transID_alt',right_on='uid')[['QNAME','geneID_prim']]
oneExonShared_multi.to_csv(outDir+"/oneIntronShared_multi_gmap.csv",index=False)
display(oneExonShared_single)
display(oneExonShared_multi)

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0,QNAME,geneID_prim
0,CHS.39517.2,CHS.36569
1,CHS.27902.3,CHS.27564
2,CHS.27902.4,CHS.27564
3,CHS.28088.3,CHS.27564
4,CHS.45027.3,CHS.43081
5,CHS.36303.14,CHS.35988
6,CHS.36303.15,CHS.35988
7,CHS.36303.30,CHS.35988
8,CHS.36303.31,CHS.35988
9,CHS.36303.32,CHS.35988


Unnamed: 0,QNAME,geneID_prim
