In [22]:
from pybedtools import BedTool
import pandas as pd
import numpy as np
import os

pd.set_option('display.max_columns',500)

gff3Cols=["seqid","source","type","start","end","score","strand","phase","attributes"]

intersectCols=["chrom",
                "source",
                "type",
                "start",
                "end",
                "score",
                "strand",
                "phase",
                "attributes",
                "chromK",
                "sourceK",
                "typeK",
                "startK",
                "endK",
                "scoreK",
                "strandK",
                "phaseK",
                "attributesK",
                "distance"]

outDir="./res_tmp_full"

In [23]:
mapDF=pd.read_csv(outDir+"/blat_map.csv")
mapDF

Unnamed: 0,qName,uid
0,CHS.14872.1,0
1,CHS.59631.1,1
2,CHS.14863.1,2
3,CHS.14874.1,3
4,CHS.59633.2,4
5,CHS.59633.1,5
6,CHS.59633.3,6
7,CHS.14859.6,7
8,CHS.59632.1,8
9,CHS.14875.1,9


In [24]:
# Step 1 - unify blat and gmap perfect gene maps as well as the imperfect ones
gmapDF_perfect=pd.read_csv(outDir+"/perfectMatch_gmap.csv")
blatDF_perfect=pd.read_csv(outDir+"/perfectMatch_blat.csv")
gmapDF_rest=pd.read_csv(outDir+"/additionalTrans_gmap.csv")
blatDF_rest=pd.read_csv(outDir+"/additionalTrans_blat.csv")

gmapDF_single=pd.read_csv(outDir+"/oneIntronShared_single_gmap.csv")
blatDF_single=pd.read_csv(outDir+"/oneIntronShared_single_blat.csv")
gmapDF_multi=pd.read_csv(outDir+"/oneIntronShared_multi_gmap.csv")
blatDF_multi=pd.read_csv(outDir+"/oneIntronShared_multi_blat.csv")

setAllUsedTranscripts_alt=set(gmapDF_perfect["transID_alt_2"])\
                            .union(set(blatDF_perfect["transID_alt_2"])\
                            .union(set(gmapDF_rest["QNAME"])\
                            .union(set(blatDF_rest["qName"])\
                            .union(set(gmapDF_single["QNAME"])\
                            .union(set(blatDF_single["qName"])\
                            .union(set(gmapDF_multi["QNAME"])\
                            .union(set(blatDF_multi["qName"]))))))))
print(len(setAllUsedTranscripts_alt))

10936


In [25]:
# Step 2 - remove all that have been identified in the previous step by either gmap or blat
df=pd.read_csv(outDir+"/dfGFF_blat.gtf",sep="\t",names=gff3Cols)
df=df[df["type"]=="exon"].reset_index(drop=True)
df["transID"]=df.attributes.str.split("Parent=",expand=True)[1].str.split(";",expand=True)[0]

mapDF=pd.read_csv(outDir+'/blat_map.csv')
mapDF['uid']=mapDF['uid'].astype(str)
df=df.merge(mapDF,how='left',left_on='transID',right_on='uid')
df=df[~(df['qName'].isin(setAllUsedTranscripts_alt))].reset_index(drop=True)
df

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,transID,qName,uid
0,CM000675.2,ref,exon,113748510,113750815,.,-,.,Parent=9,9,CHS.14875.1,9
1,CM000675.2,ref,exon,113750817,113751089,.,-,.,Parent=9,9,CHS.14875.1,9
2,CM000675.2,ref,exon,112331225,112332719,.,-,.,Parent=22,22,CHS.14865.1,22
3,CM000675.2,ref,exon,112333592,112333660,.,-,.,Parent=22,22,CHS.14865.1,22
4,CM000675.2,ref,exon,114126157,114127591,.,+,.,Parent=24,24,CHS.59635.1,24
5,CM000675.2,ref,exon,114125286,114126062,.,-,.,Parent=31,31,CHS.59634.1,31
6,CM000675.2,ref,exon,114126100,114126282,.,-,.,Parent=31,31,CHS.59634.1,31
7,CM000675.2,ref,exon,18334816,18334938,.,+,.,Parent=55,55,CHS.59625.1,55
8,CM000678.2,ref,exon,15363623,15363810,.,-,.,Parent=56,56,CHS.20829.1,56
9,CM000678.2,ref,exon,15363822,15363878,.,-,.,Parent=56,56,CHS.20829.1,56


In [26]:
# this time, since we've already identified all transcripts that exist, we can just check for the overlap with genes
# instead of overlap with exons
dfPrim=pd.read_csv(outDir+'/chessPrim.gff',sep='\t',names=gff3Cols)
dfPrim=dfPrim[dfPrim['type']=='gene'].reset_index(drop=True)
dfPrim

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
0,CM000663.2,RefSeq,gene,11874,14409,.,+,.,ID=CHS.1;GENE_TYPE=misc_RNA;STATUS=known_refse...
1,CM000663.2,RefSeq,gene,14362,29370,.,-,.,ID=CHS.2;GENE_TYPE=misc_RNA;STATUS=known_refse...
2,CM000663.2,RefSeq,gene,29926,31295,.,+,.,ID=CHS.3;GENE_TYPE=lncRNA;STATUS=known_refseq;...
3,CM000663.2,RefSeq,gene,34611,36081,.,-,.,ID=CHS.4;GENE_TYPE=lncRNA;STATUS=known_refseq;...
4,CM000663.2,RefSeq,gene,51943,53959,.,+,.,ID=CHS.5;GENE_TYPE=lncRNA;STATUS=known_refseq;...
5,CM000663.2,RefSeq,gene,69091,70008,.,+,.,ID=CHS.6;GENE_TYPE=protein_coding;STATUS=known...
6,CM000663.2,RefSeq,gene,91169,297504,.,-,.,ID=CHS.7;GENE_TYPE=misc_RNA;STATUS=known_refse...
7,CM000663.2,RefSeq,gene,134773,140566,.,-,.,ID=CHS.8;GENE_TYPE=lncRNA;STATUS=known_refseq;...
8,CM000663.2,RefSeq,gene,181049,184258,.,+,.,ID=CHS.9;GENE_TYPE=protein_coding;STATUS=known...
9,CM000663.2,RefSeq,gene,184878,199860,.,-,.,ID=CHS.10;GENE_TYPE=misc_RNA;STATUS=known_refs...


In [28]:
annotation=BedTool.from_dataframe(dfPrim)
## now to run bedtools on the entire thing
sites=BedTool.from_dataframe(df[gff3Cols])
nearby=sites.intersect(annotation,wao=True)
dfIntersect=pd.read_table(nearby.fn,names=intersectCols,index_col=False)

def writeGFF(row):
    percentOverlaps=[]
    percentOverlaps2=[]
    percentOverlaps3=[]
    wpo=[]
    wpo2=[]
    wpoL=1
    wpo2L=1
    
    if row.distance==0:
        percentOverlaps.append(0)
        percentOverlaps2.append(0)
        wpo2L=wpo2L+float(int(row.end)-int(row.start))
    else:
        po=round(float(row.distance)/float(int(row.endK)-int(row.startK)),4)
        if po>1.0:
            po=1.0
        percentOverlaps.append(po)
        percentOverlaps2.append(round(float(row.distance)/float(int(row.end)-int(row.start)),4))
        percentOverlaps3.append(".".join(row.attributesK.split("Parent=")[-1].split(".")[:-1]))

    return [";".join([str(x) for x in percentOverlaps]),";".join([str(x) for x in percentOverlaps2]),";".join(list(set(percentOverlaps3)))]

dfIntersect["po_prim"]=np.nan
dfIntersect["po_alt"]=np.nan
dfIntersect["ann"]=np.nan
dfIntersect[["po_prim","po_alt","ann"]]=pd.DataFrame([x for x in dfIntersect.apply(lambda row: writeGFF(row),axis=1)])
dfIntersect.to_csv(outDir+"/finalMappingOverlap_gene_blat.csv",index=False)
dfIntersect['po_prim'].astype(float)
dfIntersect['po_alt'].astype(float)
# dfIntersect=dfIntersect.merge(df[['uid','blockCount']],how="left",left_on="attributes",right_on="uid")

dfIntersect['id']=dfIntersect.attributes.str.split("Parent=",expand=True)[1]
mapDF['uid']=mapDF['uid'].astype(str)
dfIntersect=dfIntersect.merge(mapDF,how='left',left_on='id',right_on='uid')
dfIntersect.rename({'qName':'transID_alt'},axis=1,inplace=True)
dfIntersect["geneID_alt"]='CHS.'+dfIntersect['transID_alt'].str.split(".",expand=True)[1]

dfIntersect['po_alt']=dfIntersect['po_alt'].astype(float)
dfIntersect['distance']=dfIntersect['distance'].astype(int)
dfGroups=pd.DataFrame(dfIntersect[['geneID_alt','distance','po_alt','transID_alt']].groupby(by='geneID_alt').agg({'distance':'sum',
                                                                                                    'po_alt':{'po_alt_min':'min',
                                                                                                              'po_alt_max':'max'},
                                                                                                    'transID_alt':{'transID_alt_names':lambda x:",".join(list(set(x))),
                                                                                                                  'transID_alt_count':lambda x:len(set(x))}})).reset_index()
dfGroups.columns=['geneID_alt','total_distance','po_alt_min','po_alt_max','transID_alt_names','transID_count']


# for these a separate step is needed, create new gene entries (standardize boundaries, common exons, etc)

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


In [34]:
# now can identify all cases, when all transcripts of the same alt gene
# have no overlap with the primary scaffolds
dfNone=dfGroups[dfGroups['total_distance']==0].reset_index(drop=True)
dfNone.to_csv(outDir+'/none_blat.csv',index=False)
dfNone

Unnamed: 0,geneID_alt,total_distance,po_alt_min,po_alt_max,transID_alt_names,transID_count
0,CHS.10528,0,0.0,0.0,"CHS.10528.2,CHS.10528.1",2
1,CHS.10590,0,0.0,0.0,CHS.10590.1,1
2,CHS.16641,0,0.0,0.0,CHS.16641.1,1
3,CHS.16642,0,0.0,0.0,CHS.16642.1,1
4,CHS.18532,0,0.0,0.0,CHS.18532.1,1
5,CHS.18552,0,0.0,0.0,"CHS.18552.1,CHS.18552.2",2
6,CHS.18607,0,0.0,0.0,CHS.18607.1,1
7,CHS.20848,0,0.0,0.0,CHS.20848.1,1
8,CHS.20855,0,0.0,0.0,CHS.20855.1,1
9,CHS.23680,0,0.0,0.0,"CHS.23680.1,CHS.23680.2",2


In [35]:
# cases when on alt scaffolds transcripts belong to the same alternative gene
# and where some transcripts do have an overlap with a gene on primary scaffold, and some do not have any such overlap
dfSome=dfGroups[(dfGroups['po_alt_min']==0)&(dfGroups['po_alt_max']>0)].reset_index(drop=True)
dfSome.to_csv(outDir+'/some_blat.csv',index=False)
dfSome

Unnamed: 0,geneID_alt,total_distance,po_alt_min,po_alt_max,transID_alt_names,transID_count
0,CHS.10544,212,0.0,1.2,"CHS.10544.2,CHS.10544.1",2
1,CHS.10545,26,0.0,1.0909,CHS.10545.1,1
2,CHS.10560,732,0.0,1.0014,CHS.10560.1,1
3,CHS.10592,2020,0.0,1.001,CHS.10592.1,1
4,CHS.13394,131,0.0,1.0169,CHS.13394.1,1
5,CHS.13428,1401,0.0,1.0009,CHS.13428.1,1
6,CHS.13451,2516,0.0,1.0004,CHS.13451.1,1
7,CHS.14865,1,0.0,0.0007,CHS.14865.1,1
8,CHS.16653,401,0.0,1.0101,"CHS.16653.1,CHS.16653.3,CHS.16653.2",3
9,CHS.18585,486,0.0,1.0026,"CHS.18585.7,CHS.18585.2,CHS.18585.5,CHS.18585....",7


In [32]:
# find all where all transcripts are covered decently
dfGood=dfGroups[dfGroups['po_alt_min']>0.9].reset_index(drop=True)
dfGood.to_csv(outDir+'/good_blat.csv',index=False)
dfGood

Unnamed: 0,geneID_alt,total_distance,po_alt_min,po_alt_max,transID_alt_names,transID_count
0,CHS.10523,863,1.0000,1.0000,CHS.10523.1,1
1,CHS.10556,1446,1.0010,1.0022,CHS.10556.1,1
2,CHS.10575,2079,1.0009,1.0010,CHS.10575.1,1
3,CHS.10584,814,1.0012,1.0012,CHS.10584.1,1
4,CHS.10599,3899,1.0004,1.3333,CHS.10599.2,1
5,CHS.13388,248,1.0040,1.0040,CHS.13388.1,1
6,CHS.13418,3600,1.0011,1.0011,CHS.13418.1,1
7,CHS.14875,5157,1.0000,1.0037,CHS.14875.1,1
8,CHS.16635,1317,1.0008,1.0008,CHS.16635.1,1
9,CHS.16638,904,1.0022,1.0022,CHS.16638.1,1


In [36]:
# how many are left?
dfRest=dfGroups[~(dfGroups["geneID_alt"].isin(set(dfNone['geneID_alt'])\
                                        .union(dfSome['geneID_alt'])\
                                        .union(dfGood['geneID_alt'])))].reset_index(drop=True)
dfRest.to_csv(outDir+'/rest_blat.csv',index=False)
dfRest

Unnamed: 0,geneID_alt,total_distance,po_alt_min,po_alt_max,transID_alt_names,transID_count
0,CHS.10568,577,0.8376,1.0094,CHS.10568.1,1
1,CHS.10591,2132,0.8333,1.006,"CHS.10591.2,CHS.10591.1",2
2,CHS.10597,12035,0.524,1.0016,"CHS.10597.1,CHS.10597.2,CHS.10597.3",3
3,CHS.13447,9776,0.8966,1.0833,CHS.13447.1,1
4,CHS.13449,6173,0.1543,1.0022,CHS.13449.1,1
5,CHS.18608,7550,0.622,1.0028,"CHS.18608.1,CHS.18608.2",2
6,CHS.23708,1205,0.6756,1.0141,CHS.23708.1,1
7,CHS.23758,2093,0.8939,1.1667,"CHS.23758.3,CHS.23758.2,CHS.23758.1",3
8,CHS.23889,7488,0.4532,1.0049,"CHS.23889.1,CHS.23889.2",2
9,CHS.23949,7488,0.4532,1.0049,"CHS.23949.2,CHS.23949.1",2
