In [1]:
import pandas as pd
import numpy as np
import os

gff3Cols=["seqid","source","type","start","end","score","strand","phase","attributes"]
outDir="./res_tmp_full"

In [2]:
# Step 1 - unify blat and gmap perfect gene maps
gmapDF_perfect=pd.read_csv(outDir+"/perfectMatch_gmap.csv")
blatDF_perfect=pd.read_csv(outDir+"/perfectMatch_blat.csv")
gmapDF_rest=pd.read_csv(outDir+"/additionalTrans_gmap.csv")
blatDF_rest=pd.read_csv(outDir+"/additionalTrans_blat.csv")
setAllUsedTranscripts_alt=set(gmapDF_perfect["transID_alt_2"])\
                            .union(set(blatDF_perfect["transID_alt_2"])\
                            .union(set(gmapDF_rest["QNAME"])\
                            .union(set(blatDF_rest["qName"]))))
print(len(setAllUsedTranscripts_alt))

8930


In [3]:
# Step 2 - remove all that have been identified in the previous step by either gmap or blat
df=pd.read_csv(outDir+"/dfGFF_blat.gtf",sep="\t",names=gff3Cols)
df=df[df["type"]=="exon"].reset_index(drop=True)
df["transID"]=df.attributes.str.split("Parent=",expand=True)[1].str.split(";",expand=True)[0]

mapDF=pd.read_csv(outDir+'/blat_map.csv')
mapDF['uid']=mapDF['uid'].astype(str)
df=df.merge(mapDF,how='left',left_on='transID',right_on='uid')
df=df[~(df['qName'].isin(setAllUsedTranscripts_alt))]
df

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,transID,qName,uid
3,CM000675.2,ref,exon,112313466,112318737,.,+,.,Parent=2,2,CHS.14863.1,2
4,CM000675.2,ref,exon,112318839,112319716,.,+,.,Parent=2,2,CHS.14863.1,2
5,CM000675.2,ref,exon,112321111,112321766,.,+,.,Parent=2,2,CHS.14863.1,2
6,CM000675.2,ref,exon,113741868,113742279,.,-,.,Parent=3,3,CHS.14874.1,3
7,CM000675.2,ref,exon,113743309,113743430,.,-,.,Parent=3,3,CHS.14874.1,3
8,CM000675.2,ref,exon,113744673,113745021,.,-,.,Parent=3,3,CHS.14874.1,3
9,CM000675.2,ref,exon,113751447,113751992,.,-,.,Parent=3,3,CHS.14874.1,3
10,CM000675.2,ref,exon,113752032,113752802,.,-,.,Parent=3,3,CHS.14874.1,3
11,CM000675.2,ref,exon,113752802,113753539,.,-,.,Parent=3,3,CHS.14874.1,3
12,CM000675.2,ref,exon,113753539,113754127,.,-,.,Parent=3,3,CHS.14874.1,3


In [4]:
# Step 3 - find all that have at least one shared intron
#          these are the ones that need to be added as new transcripts to pre-existing genes

# to do this - create a dataframe with introns instead of exons
df["start"]=df["start"].astype(int)+1
df["end"]=df["end"].astype(int)
# some pseudogenes are still present

dfI=df[["seqid","strand","type","start","end","transID"]].reset_index(drop=True)
dfI=dfI.sort_values(by=["transID","start"]).reset_index(drop=True)
dfI["ns"]=dfI.start.astype(str).shift(-1)
dfI["nid"]=dfI.transID.shift(-1)
dfI["ns"]=np.where(dfI["transID"]==dfI["nid"],dfI["ns"],"***")
dfI["ns"]=dfI.ns
dfI["ends"]=dfI.end
dfI["start"]=dfI.ends
dfI["end"]=dfI.ns
dfI=dfI[~(dfI["end"]=="***")].reset_index(drop=True)
dfI.drop(["ns","nid","ends"],axis=1,inplace=True)
dfI["end"]=dfI["end"].astype(int)
dfI["id"]=dfI["seqid"]+":"+dfI["strand"]+"@"+dfI["start"].astype(str)+"-"+dfI["end"].astype(str)

dfI.sort_values(by="id",ascending=True)

Unnamed: 0,seqid,strand,type,start,end,transID,id
7560,CM000663.2,+,exon,13027621,13029339,5149,CM000663.2:+@13027621-13029339
7561,CM000663.2,+,exon,13029650,13030047,5149,CM000663.2:+@13029650-13030047
7562,CM000663.2,+,exon,13030625,13031572,5149,CM000663.2:+@13030625-13031572
7641,CM000663.2,+,exon,144324981,144333928,5236,CM000663.2:+@144324981-144333928
7668,CM000663.2,+,exon,144324981,144333928,5245,CM000663.2:+@144324981-144333928
7637,CM000663.2,+,exon,144324981,144333928,5234,CM000663.2:+@144324981-144333928
7638,CM000663.2,+,exon,144334021,144335655,5234,CM000663.2:+@144334021-144335655
7669,CM000663.2,+,exon,144334021,144335655,5245,CM000663.2:+@144334021-144335655
7642,CM000663.2,+,exon,144334021,144335655,5236,CM000663.2:+@144334021-144335655
7639,CM000663.2,+,exon,144335773,144337905,5234,CM000663.2:+@144335773-144337905


In [5]:
df=pd.read_csv(outDir+"/chessPrim.gff",sep="\t",names=gff3Cols)

df=df[df["type"]=="exon"].reset_index(drop=True)
df["transID"]=df.attributes.str.split("Parent=",expand=True)[1].str.split(";",expand=True)[0]
df["start"]=df["start"].astype(int)
df["end"]=df["end"].astype(int)
# some pseudogenes are still present

dfI_2=df[["seqid","strand","type","start","end","transID"]].reset_index(drop=True)
dfI_2=dfI_2.sort_values(by=["transID","start"]).reset_index(drop=True)
dfI_2["ns"]=dfI_2.start.astype(str).shift(-1)
dfI_2["nid"]=dfI_2.transID.shift(-1)
dfI_2["ns"]=np.where(dfI_2["transID"]==dfI_2["nid"],dfI_2["ns"],"***")
dfI_2["ns"]=dfI_2.ns
dfI_2["ends"]=dfI_2.end
dfI_2["start"]=dfI_2.ends
dfI_2["end"]=dfI_2.ns
dfI_2=dfI_2[~(dfI_2["end"]=="***")].reset_index(drop=True)
dfI_2.drop(["ns","nid","ends"],axis=1,inplace=True)
dfI_2["end"]=dfI_2["end"].astype(int)
dfI_2["id"]=dfI_2["seqid"]+":"+dfI_2["strand"]+"@"+dfI_2["start"].astype(str)+"-"+dfI_2["end"].astype(str)

dfI_2

Unnamed: 0,seqid,strand,type,start,end,transID,id
0,CM000663.2,+,exon,12227,12613,CHS.1.1,CM000663.2:+@12227-12613
1,CM000663.2,+,exon,12721,13221,CHS.1.1,CM000663.2:+@12721-13221
2,CM000663.2,-,exon,185350,185491,CHS.10.1,CM000663.2:-@185350-185491
3,CM000663.2,-,exon,185559,186317,CHS.10.1,CM000663.2:-@185559-186317
4,CM000663.2,-,exon,186469,187129,CHS.10.1,CM000663.2:-@186469-187129
5,CM000663.2,-,exon,187287,187380,CHS.10.1,CM000663.2:-@187287-187380
6,CM000663.2,-,exon,187577,187755,CHS.10.1,CM000663.2:-@187577-187755
7,CM000663.2,-,exon,187890,188130,CHS.10.1,CM000663.2:-@187890-188130
8,CM000663.2,-,exon,188266,188439,CHS.10.1,CM000663.2:-@188266-188439
9,CM000663.2,-,exon,188584,188791,CHS.10.1,CM000663.2:-@188584-188791


In [6]:
# merge two dataframes on introns
dfI=dfI_2.merge(dfI,on="id",how="inner")
del dfI_2

# extract geneID
dfI["geneID"]="CHS."+dfI["transID_x"].str.split(".",expand=True)[1]
# check if any have more than one associated geneID (should not be the case ever)
oneExonShared=dfI.groupby(by="transID_y").agg({"geneID":{"genes":lambda x: set(x),"geneCount":lambda x: len(set(x))}}).reset_index()
oneExonShared.columns=["transID_alt","geneID_prim","geneCount_prim"]
oneExonShared

# separate into those with
oneExonShared_single=oneExonShared[oneExonShared["geneCount_prim"]==1].reset_index(drop=True)
oneExonShared_single['geneID_prim']=oneExonShared_single.apply(lambda row: ",".join(list(row['geneID_prim'])),axis=1)
oneExonShared_single=oneExonShared_single.merge(mapDF,how='left',left_on='transID_alt',right_on='uid')[['qName','geneID_prim']]
oneExonShared_single.to_csv(outDir+"/oneIntronShared_single_blat.csv",index=False)
oneExonShared_multi=oneExonShared[oneExonShared["geneCount_prim"]>1].reset_index(drop=True)
oneExonShared_multi['geneID_prim']=oneExonShared_multi.apply(lambda row: ",".join(list(row['geneID_prim'])),axis=1)
oneExonShared_multi=oneExonShared_multi.merge(mapDF,how='left',left_on='transID_alt',right_on='uid')[['qName','geneID_prim']]
oneExonShared_multi.to_csv(outDir+"/oneIntronShared_multi_blat.csv",index=False)
display(oneExonShared_single)
display(oneExonShared_multi)

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0,qName,geneID_prim
0,CHS.52476.2,CHS.49661
1,CHS.49295.6,CHS.45955
2,CHS.48757.7,CHS.46058
3,CHS.49371.4,CHS.46049
4,CHS.49054.5,CHS.45921
5,CHS.49054.9,CHS.45921
6,CHS.48891.13,CHS.45948
7,CHS.48755.7,CHS.46049
8,CHS.48715.1,CHS.45994
9,CHS.48951.11,CHS.46049


Unnamed: 0,qName,geneID_prim
0,CHS.48756.5,"CHS.46056,CHS.46063"
1,CHS.48928.3,"CHS.45995,CHS.45996"
2,CHS.48928.4,"CHS.45995,CHS.45996"
3,CHS.49155.4,"CHS.46056,CHS.46063"
4,CHS.48496.1,"CHS.45996,CHS.45997"
5,CHS.48749.1,"CHS.46032,CHS.46031"
6,CHS.48853.1,"CHS.45887,CHS.45888"
7,CHS.48853.2,"CHS.45887,CHS.45888"
8,CHS.48853.3,"CHS.45887,CHS.45888"
9,CHS.27908.8,"CHS.27573,CHS.27574"


Unnamed: 0,qName,geneID_prim
0,CHS.52476.2,CHS.49661
1,CHS.49295.6,CHS.45955
2,CHS.48757.7,CHS.46058
3,CHS.49371.4,CHS.46049
4,CHS.49054.5,CHS.45921
5,CHS.49054.9,CHS.45921
6,CHS.48891.13,CHS.45948
7,CHS.48755.7,CHS.46049
8,CHS.48715.1,CHS.45994
9,CHS.48951.11,CHS.46049
