In [1]:
from pybedtools import BedTool
import pandas as pd
import numpy as np
import shutil
import csv
import os

pd.set_option('display.max_columns',500)

outDir="./res_tmp_full"
alignmentDir=outDir+"/alt_alignments"

blatCols=["matches",
        "misMatches",
        "repMatches",
        "nCount",
        "qNumInsert",
        "qBaseInsert",
        "tNumInsert",
        "tBaseInsert",
        "strand",
        "qName",
        "qSize",
        "qStart",
        "qEnd",
        "tName",
        "tSize",
        "tStart",
        "tEnd",
        "blockCount",
        "blockSizes",
        "qStarts",
        "tStarts"]
intersectCols=["chrom",
                "source",
                "type",
                "start",
                "end",
                "score",
                "strand",
                "phase",
                "attributes",
                "chromK",
                "sourceK",
                "typeK",
                "startK",
                "endK",
                "scoreK",
                "strandK",
                "phaseK",
                "attributesK",
                "distance"]
samCols=['QNAME',
         'FLAG',
         'RNAME',
         'POS',
         'MAPQ',
         'CIGAR',
         'RNEXT',
         'PNEXT',
         'TLEN',
         'SEQ',
         'QUAL']

gff3Cols=["seqid","source","type","start","end","score","strand","phase","attributes"]

In [2]:
# get all alignments into a single dataframe tagged by chromosome from SAM files (GMAP)
dfAll=pd.DataFrame([])
for psl in os.listdir(alignmentDir):
    if psl[-4:]==".psl":
        chrom=".".join(psl.split(".")[:-1])
        fp=alignmentDir+"/"+psl
        psl=pd.read_csv(fp,sep="\t",skiprows=5,names=blatCols)
        psl=psl[psl["tName"]==chrom].sort_values(by="matches",ascending=False).reset_index(drop=True).drop_duplicates(["qName"])#.drop(["repMatches","nCount","qBaseInsert","tNumInsert"],axis=1)
        dfAll=pd.concat([dfAll,psl],axis=0).reset_index(drop=True)
dfAll

Unnamed: 0,matches,misMatches,repMatches,nCount,qNumInsert,qBaseInsert,tNumInsert,tBaseInsert,strand,qName,qSize,qStart,qEnd,tName,tSize,tStart,tEnd,blockCount,blockSizes,qStarts,tStarts
0,37565,2,0,0,0,0,0,0,-,CHS.14872.1,37567,0,37567,CM000675.2,114364328,18834402,18871969,1,37567,0,18834402
1,6918,0,0,0,0,0,1,1540,-,CHS.59631.1,6918,0,6918,CM000675.2,114364328,113946083,113954541,2,58791039,05879,113946083113953502
2,6799,4,0,0,0,0,2,1497,+,CHS.14863.1,6803,0,6803,CM000675.2,114364328,112313466,112321766,3,5271877655,052716148,112313466112318839112321111
3,4917,19,0,0,2,42,5,12261,-,CHS.14874.1,4978,0,4978,CM000675.2,114364328,113741868,113759065,8,4111213485457707375881416,04115328801425223529743562,"113741868,113743309,113744673,113751447,113752..."
4,4669,0,0,0,0,0,25,150172,-,CHS.59633.2,4669,0,4669,CM000675.2,114364328,113977782,114132623,26,"1640,184,104,209,83,107,74,78,78,107,124,75,11...","0,1640,1824,1928,2137,2220,2327,2401,2479,2557...","113977782,113981674,113992484,113996530,113999..."
5,4203,0,0,0,0,0,23,150638,-,CHS.59633.1,4203,0,4203,CM000675.2,114364328,113977782,114132623,24,"1640,184,104,209,83,107,74,78,78,107,124,75,11...","0,1640,1824,1928,2137,2220,2327,2401,2479,2557...","113977782,113981674,113992484,113996530,113999..."
6,4191,0,0,0,0,0,23,96006,-,CHS.59633.3,4191,0,4191,CM000675.2,114364328,113977782,114077979,24,"1640,184,104,209,83,107,74,78,78,107,124,75,11...","0,1640,1824,1928,2137,2220,2327,2401,2479,2557...","113977782,113981674,113992484,113996530,113999..."
7,3232,0,0,0,0,0,3,6799,-,CHS.14859.6,3232,0,3232,CM000675.2,114364328,112185657,112195688,4,252353360296,0252325762936,112185657112189466112194816112195392
8,2831,0,0,0,0,0,3,13403,-,CHS.59632.1,2831,0,2831,CM000675.2,114364328,113954564,113970798,4,22756852436,0227523432395,113954564113964926113970170113970362
9,2572,5,0,0,0,0,1,2,-,CHS.14875.1,2577,0,2577,CM000675.2,114364328,113748510,113751089,2,2305272,02305,113748510113750817


In [3]:
# need to re-write the writeGFF function to make faster
includeCols=["matches",
                "misMatches",
                "repMatches",
                "nCount",
                "qNumInsert",
                "qBaseInsert",
                "tNumInsert",
                "tBaseInsert",
                "strand",
                "qName",
                "qSize",
                "qStart",
                "qEnd",
                "tName",
                "tSize",
                "tStart",
                "tEnd",
                "blockCount"]
splitCols=["blockSizes",
           "qStarts",
           "tStarts"]
# make unique id from index
dfAll["uid"]=dfAll.reset_index(drop=True).reset_index()["index"]

dfGFF=pd.DataFrame([])
for col in splitCols:
    tmp=pd.concat([pd.Series(row['uid'], row[col].split(','))              
                        for _, row in dfAll.iterrows()]).reset_index()
    tmp.columns=[col,"uid"]
    dfGFF=pd.concat([dfGFF,tmp],axis=1)
    
dfGFF=dfGFF[~((dfGFF["blockSizes"]==dfGFF["qStarts"])\
              &(dfGFF["qStarts"]==dfGFF["tStarts"])\
              &(dfGFF["tStarts"]==""))].reset_index(drop=True) # get rid of empty lines
dfGFF.columns=["blockSizes","uid1","qStarts","uid2","tStarts","uid"]
dfGFF.drop(["uid1","uid2"],axis=1,inplace=True)
dfGFF=dfGFF.merge(dfAll[["tName","strand","uid"]],on="uid",how="left")
dfGFF["phase"]="."
dfGFF["score"]="."
dfGFF["ref"]="ref"
dfGFF["type"]="exon"
dfGFF["attribute"]=dfGFF["uid"]
dfGFF["start"]=dfGFF['tStarts']
dfGFF['end']=dfGFF['start'].astype(int)+dfGFF["blockSizes"].astype(int)
dfGFF=dfGFF[['tName',\
             'ref',\
             'type',\
             'start',\
             'end',\
             'score',\
             'strand',\
             'phase',\
             'attribute']]
dfAll[["qName","uid"]].to_csv(outDir+"/blat_map.csv",index=False)

In [6]:
# we can try to write out a gff of the blocks for the gff compare

# first need to add a transcript feature for each exon group
dft=dfGFF.groupby(by='attribute').agg({"start":"min","end":"max","ref":"min","tName":"min","score":"min","strand":"min","phase":"min"}).reset_index()
dft['type']='transcript'
dft['attribute']='ID='+dft['attribute'].astype(str)
dfGFF['attribute']='Parent='+dfGFF['attribute'].astype(str)
dfGFF_f=pd.concat([dft[['tName','ref','type','start','end','score','strand','phase','attribute']],dfGFF])
dfGFF_f.to_csv(outDir+'/dfGFF_blat.gtf',sep="\t",index=False,header=False,quoting=csv.QUOTE_NONE)

# gffread -E dfGFF.gtf  -o- > dfGFF.gff

In [None]:
# Step 1 - check if there are any that correspond to any intron chains entirely
#          these are the ones that already exist



dfGFF_f

In [None]:
# step 2 - find all that have at least one shared intron
#          these are the ones that need to be added as new transcripts to pre-existing genes

# step 3 - find all that do not share any introns but overlap a known gene
#          not sure what to do here

# step 4 - investigate what's remaining
#          new genes should be created
#          is the alignment quality good?
#          does GMAP agree with BLAT

In [None]:
dfPrimExon=pd.DataFrame([])
for chrom in set(dfAll["tName"]):
    dfPrimExon=pd.concat([dfPrimExon,pd.read_csv(outDir+"/prim_exons/chessPrim_exon_"+chrom+".gff",sep='\t',names=gff3Cols)])
dfPrimExon.reset_index(drop=True,inplace=True)
    
annotation=BedTool.from_dataframe(dfPrimExon)

In [None]:
## now to run bedtools on the entire thing
sites=BedTool.from_dataframe(dfGFF)
nearby=sites.intersect(annotation,wao=True)
dfIntersect=pd.read_table(nearby.fn,names=intersectCols,index_col=False)
dfIntersect

In [75]:
def writeGFF(row):
    percentOverlaps=[]
    percentOverlaps2=[]
    percentOverlaps3=[]
    wpo=[]
    wpo2=[]
    wpoL=1
    wpo2L=1
    
    if row.distance==0:
        percentOverlaps.append(0)
        percentOverlaps2.append(0)
        wpo2L=wpo2L+float(int(row.end)-int(row.start))
    else:
        po=round(float(row.distance)/float(int(row.endK)-int(row.startK)),4)
        if po>1.0:
            po=1.0
        percentOverlaps.append(po)
        percentOverlaps2.append(round(float(row.distance)/float(int(row.end)-int(row.start)),4))
        percentOverlaps3.append(".".join(row.attributesK.split("Parent=")[-1].split(".")[:-1]))

    return [";".join([str(x) for x in percentOverlaps]),";".join([str(x) for x in percentOverlaps2]),";".join(list(set(percentOverlaps3)))]

dfIntersect["po_prim"]=np.nan
dfIntersect["po_alt"]=np.nan
dfIntersect["ann"]=np.nan
dfIntersect[["po_prim","po_alt","ann"]]=pd.DataFrame([x for x in dfIntersect.apply(lambda row: writeGFF(row),axis=1)])
dfIntersect.to_csv(outDir+"/finalMappingOverlap_exon_2.csv",index=False)
dfIntersect['po_prim'].astype(float)
dfIntersect['po_alt'].astype(float)
dfIntersect=dfIntersect.merge(dfAll[['uid','blockCount']],how="left",left_on="attributes",right_on="uid")
dfIntersect

Unnamed: 0,chrom,source,type,start,end,score,strand,phase,attributes,chromK,sourceK,typeK,startK,endK,scoreK,strandK,phaseK,attributesK,distance,po_prim,po_alt,ann,uid,blockCount
0,CM000683.2,ref,exon,46188446,46191235,.,-,.,0,CM000683.2,HAVANA,exon,46188446,46188821,.,-,.,Parent=CHS.35045.1,376,1.0,0.1348,CHS.35045,0,15
1,CM000683.2,ref,exon,46188446,46191235,.,-,.,0,CM000683.2,HAVANA,exon,46191076,46191235,.,-,.,Parent=CHS.35045.1,160,1.0,0.0574,CHS.35045,0,15
2,CM000683.2,ref,exon,46188446,46191235,.,-,.,0,CM000683.2,Gnomon,exon,46188446,46188821,.,-,.,Parent=CHS.35045.2,376,1.0,0.1348,CHS.35045,0,15
3,CM000683.2,ref,exon,46188446,46191235,.,-,.,0,CM000683.2,Gnomon,exon,46191076,46191235,.,-,.,Parent=CHS.35045.2,160,1.0,0.0574,CHS.35045,0,15
4,CM000683.2,ref,exon,46188446,46191235,.,-,.,0,CM000683.2,Gnomon,exon,46188446,46188821,.,-,.,Parent=CHS.35045.3,376,1.0,0.1348,CHS.35045,0,15
5,CM000683.2,ref,exon,46188446,46191235,.,-,.,0,CM000683.2,Gnomon,exon,46191076,46191235,.,-,.,Parent=CHS.35045.3,160,1.0,0.0574,CHS.35045,0,15
6,CM000683.2,ref,exon,46188446,46191235,.,-,.,0,CM000683.2,Gnomon,exon,46188446,46188821,.,-,.,Parent=CHS.35045.4,376,1.0,0.1348,CHS.35045,0,15
7,CM000683.2,ref,exon,46188446,46191235,.,-,.,0,CM000683.2,Gnomon,exon,46191076,46191235,.,-,.,Parent=CHS.35045.4,160,1.0,0.0574,CHS.35045,0,15
8,CM000683.2,ref,exon,46188446,46191235,.,-,.,0,CM000683.2,Gnomon,exon,46188446,46188821,.,-,.,Parent=CHS.35045.5,376,1.0,0.1348,CHS.35045,0,15
9,CM000683.2,ref,exon,46188446,46191235,.,-,.,0,CM000683.2,Gnomon,exon,46191076,46191235,.,-,.,Parent=CHS.35045.5,160,1.0,0.0574,CHS.35045,0,15


In [76]:
# let's try to write cases manually

# first we shall extract all the ones that match perfectly - any such transcripts can be discarded, since they already exist on the primary scaffolds
# additionally, a threshold can be set for each PO lower than 1.0 - can be discussed
dfPerfect=dfIntersect[(dfIntersect["po_prim"]=="1.0")&(dfIntersect["po_alt"]=="1.0")]# of these we need to figure out how many constitute full transcripts
dfExists=dfPerfect[['attributesK','po_prim','blockCount','start','end','strand','attributes']].groupby(by=['attributesK','start','end','strand','attributes']).agg({'po_prim':'count','blockCount':'min'}).reset_index()
dfExists[dfExists['po_prim']==dfExists['blockCount']]
# tofind such transcripts we need to have an expected exon count per transcript and the transcript count for each corresponding primary transcript

Unnamed: 0,attributesK,start,end,strand,attributes,po_prim,blockCount
975,Parent=CHS.35042.15,46142108,46142741,-,62,1,1


In [None]:
# next would be to identify any transcripts that are correctly aligned, but do not overlap any known transcripts completely
# e.g. one of the exons missing from the alignment


In [3]:
### now let's try to make a gff of exons for each of the identified transcripts

# I guess a sane way to do this is to use apply lambda over rows and work with each row independently
def writeGFF(row,annotation):
    if not os.path.exists(outDir+"/gffs"):
        os.mkdir(outDir+"/gffs")
    tstarts=row["tStarts"].split(",")
    blocksizes=row["blockSizes"].split(",")
    percentOverlaps=[]
    percentOverlaps2=[]
    percentOverlaps3=[]
    wpo=[]
    wpo2=[]
    wpoL=1
    wpo2L=1
    for i in range(row["blockCount"]):
        dataBed=pd.DataFrame([[row["tName"],"ref","exon",int(tstarts[i]),int(tstarts[i])+int(blocksizes[i]),".",row["strand"],".","nothing"]])
        dataBed.to_csv(outDir+"/gffs/"+row["qName"]+".gff",sep="\t",index=False,header=False,quoting=csv.QUOTE_NONE)
        sites=BedTool.from_dataframe(dataBed)
#         display(pd.read_table(annotations[row["tName"]].fn))
        nearby=sites.intersect(annotations[row["tName"]], wao=True)
        df=pd.read_table(nearby.fn,names=intersectCols,index_col=False)
        if df.iloc[0].distance==0:
            percentOverlaps.append(0)
            percentOverlaps2.append(0)
            wpo2L=wpo2L+float(int(df.iloc[0].end)-int(df.iloc[0].start))
        else:
            percentOverlaps.append(round(float(df.iloc[0].distance)/float(int(df.iloc[0].endK)-int(df.iloc[0].startK)),4))
            wpoL=wpoL+float(int(df.iloc[0].endK)-int(df.iloc[0].startK))
            wpo.append(percentOverlaps[-1]*float(int(df.iloc[0].endK)-int(df.iloc[0].startK)))
            percentOverlaps2.append(round(float(df.iloc[0].distance)/float(int(df.iloc[0].end)-int(df.iloc[0].start)),4))
            wpo2L=wpo2L+float(int(df.iloc[0].end)-int(df.iloc[0].start))
            wpo2.append(percentOverlaps2[-1]*float(int(df.iloc[0].end)-int(df.iloc[0].start)))
            percentOverlaps3.append(".".join(df.iloc[0].attributesK.split("Parent=")[-1].split(".")[:-1]))
#     shutil.rmtree(outDir+"/gffs")
    return [";".join([str(x) for x in percentOverlaps]),";".join([str(x) for x in percentOverlaps2]),";".join(list(set(percentOverlaps3))),sum(wpo)/wpoL,sum(wpo2)/wpo2L]

annotations={}
for chrom in set(dfAll["tName"]):
    annotations[chrom]=BedTool(outDir+"/prim_exons/chessPrim_exon_"+chrom+".gff")

dfAll["po"]=np.nan
dfAll["po2"]=np.nan
dfAll["ann"]=np.nan
dfAll[["po","po2","ann","wpo","wpo2"]]=pd.DataFrame([x for x in dfAll.apply(lambda row: writeGFF(row,annotations),axis=1)])
dfAll.to_csv(outDir+"/finalMappingOverlap_exon.csv",index=False)
dfAll

Unnamed: 0,matches,misMatches,repMatches,nCount,qNumInsert,qBaseInsert,tNumInsert,tBaseInsert,strand,qName,...,tEnd,blockCount,blockSizes,qStarts,tStarts,po,po2,ann,wpo,wpo2
0,6246,0,0,0,0,0,14,21104,-,CHS.35073.4,...,46215796,15,"2789,1385,1035,81,66,106,97,150,51,72,57,28,98...","0,2789,4174,5209,5290,5356,5462,5559,5709,5760...","46188446,46191880,46193626,46195675,46196201,4...",1.0027;0.8624;1.0059;1.0125;1.0154;1.0095;1.01...,0.1348;1.0;0.1652;1.0;1.0;1.0;1.0;1.0;1.0;1.0;...,CHS.35045,0.934478,0.474778
1,6074,0,0,0,0,0,16,24371,-,CHS.35073.11,...,46219588,17,"2092,1385,1154,81,66,106,97,150,51,232,57,28,9...","0,2092,3477,4631,4712,4778,4884,4981,5131,5182...","46189143,46191880,46193507,46195675,46196201,4...",1.0063;0.8624;1.0059;1.0125;1.0154;1.0095;1.01...,0.0765;1.0;0.1482;1.0;1.0;1.0;1.0;1.0;1.0;0.31...,CHS.35045,0.935409,0.491037
2,5923,0,0,0,0,0,17,27118,-,CHS.35073.1,...,46220512,18,"3764,79,171,81,66,106,97,150,51,72,57,28,98,11...","0,3764,3843,4014,4095,4161,4267,4364,4514,4565...","46187471,46191880,46194490,46195675,46196201,4...",1.0027;0.0492;1.0059;1.0125;1.0154;1.0095;1.01...,0.0999;1.0;1.0;1.0;1.0;1.0;1.0;1.0;1.0;1.0;1.0...,CHS.35045,0.555957,0.319552
3,5327,0,0,0,0,0,9,9302,-,CHS.35071.3,...,46155570,10,2191162130621321350114901184101,0219123532483254526774027414150425226,"46140941,46145416,46145817,46146265,46150118,4...",0.3959;1.0062;1.0078;1.0164;1.0076;1.0073;1.00...,1.0005;1.0;1.0;1.0;1.0;0.1022;1.0;0.0988;1.0;0...,CHS.35042,0.497379,0.619761
4,5079,3,0,0,0,0,7,5801,-,CHS.35071.9,...,46155570,8,8911306213213501141426977,0891102110831215256526794105,"46144687,46145817,46146265,46150118,46150387,4...",1.0062;1.0078;1.0164;1.0076;1.0073;1.0088;1.01...,0.1818;1.0;1.0;1.0;0.1022;1.0;0.0624;0.1013,CHS.35042,1.007614,0.182157
5,4853,3,0,0,0,0,9,9773,-,CHS.35071.2,...,46155570,10,164116213062132135089129184977,0164118031933199521273477356636953879,"46140941,46145416,46145817,46146265,46150118,4...",0.2966;1.0062;1.0078;1.0164;1.0076;1.0073;0.78...,1.0006;1.0;1.0;1.0;1.0;0.1022;1.0;0.6899;1.0;0...,CHS.35042,0.410797,0.561442
6,4605,0,0,0,0,0,10,10015,-,CHS.35071.6,...,46155572,11,21801621306213213508912984184103,"0,2180,2342,2472,2534,2666,4016,4105,4234,4318...","46140952,46145416,46145817,46146265,46150118,4...",0.3939;1.0062;1.0078;1.0164;1.0076;1.0073;0.78...,1.0005;1.0;1.0;1.0;1.0;0.1022;1.0;0.6899;0;1.0...,CHS.35042,0.491947,0.709088
7,4475,0,0,0,0,0,10,10147,-,CHS.35071.7,...,46155577,11,1450654162130621321350114129184108,"0,1450,2104,2266,2396,2458,2590,3940,4054,4183...","46140955,46142478,46145416,46145817,46146265,4...",0.2621;0.1183;1.0062;1.0078;1.0164;1.0076;1.00...,1.0007;1.0015;1.0;1.0;1.0;1.0;0.1022;1.0;0.689...,CHS.35042,0.264149,0.718491
8,4371,3,0,0,0,0,10,10255,-,CHS.35071.1,...,46155570,11,21911621306213213818089129184977,"0,2191,2353,2483,2545,2677,2815,2995,3084,3213...","46140941,46145416,46145817,46146265,46150118,4...",0.3959;1.0062;1.0078;1.0164;1.0076;1.0073;1.00...,1.0005;1.0;1.0;1.0;1.0;1.0;1.0;1.0;0.6899;1.0;...,CHS.35042,0.507056,0.790186
9,4334,0,0,0,0,0,16,26790,-,CHS.35073.6,...,46219574,17,"2785,79,171,81,96,106,97,150,51,72,57,28,98,11...","0,2785,2864,3035,3116,3212,3318,3415,3565,3616...","46188450,46191880,46194490,46195675,46196201,4...",0.992;0.0492;1.0059;1.0125;1.0154;1.0095;1.010...,0.1336;1.0;1.0;1.0;0.6875;1.0;1.0;1.0;1.0;1.0;...,CHS.35045,0.554778,0.435773


In [None]:
# functions for the results parsing

def extractFlagBits(data):
    data["paired"]=data["FLAG"]               &1 #template having multiple segments in sequencing
    data["aligned2Mates"]=data["FLAG"]        &2 #each segment properly aligned according to the aligner
    data["unmappedCurr"]=data["FLAG"]         &4 #segment unmapped
    data["unmappedMate"]=data["FLAG"]         &8 #next segment in the template unmapped
    data["reversedCurr"]=data["FLAG"]         &16 #SEQ being reverse complemented
    data["reversedMate"]=data["FLAG"]         &32 #SEQ of the next segment in the template being reverse complemented
    data["firstRead"]=data["FLAG"]            &64 #the first segment in the template
    data["lastRead"]=data["FLAG"]             &128 #the last segment in the template
    data["secondaryAlignment"]=data["FLAG"]   &256 #secondary alignment
    data["noPassFilter"]=data["FLAG"]         &512 #not passing filters, such as platform/vendor quality controls
    data["PCRdup"]=data["FLAG"]               &1024 #PCR or optical duplicate
    data["suppAl"]=data["FLAG"]               &2048 #supplementary alignment

def se(row):
    cigar=row["CIGAR"]
    chars=re.findall(r"[\D']+", cigar)
    ints=[int(x) for x in re.findall(r"[\d']+",cigar)]
    readLen=0
    pre=0
    post=0
    n=0
    m_pre_tem=0
    m_pre_ref=0
    m_post_tem=0
    m_post_ref=0
    indexN=0
    di=0
    blockCount=1
    blockSizes=[0]
    tStarts=[0]
    if "N" in chars:
        indexN=chars.index("N")
        blockCount=len(cigar.split("N"))
    for i in range(len(chars)):
        if i==0 and chars[i] in "SH":
            pre=ints[i]
            readLen=readLen+ints[i]
            tStarts[0]=tStarts[0]+ints[i]
        if i==len(chars)-1 and chars[i] in "SH":
            post=ints[i]
            readLen=readLen+ints[i]
        if chars[i]=="N":
            n=n+ints[i]
            tStarts.append(tStarts[-1]+blockSizes[-1]+n)
            blockSizes.append(0)
        if i<indexN and chars[i]=="M":
            m_pre_tem=m_pre_tem+ints[i]
            m_pre_ref=m_pre_ref+ints[i]
            readLen=readLen+ints[i]
            blockSizes[-1]=blockSizes[-1]+ints[i]
        if i>=indexN and chars[i]=="M":
            m_post_tem=m_post_tem+ints[i]
            m_post_ref=m_post_ref+ints[i]
            readLen=readLen+ints[i]
            blockSizes[-1]=blockSizes[-1]+ints[i]
        if i<indexN and chars[i]=="D":
            m_pre_ref=m_pre_ref+ints[i]
            di=di+ints[i]
        if i>=indexN and chars[i]=="D":
            m_post_ref=m_post_ref+ints[i]
            di=di+ints[i]
        if i<indexN and chars[i]=="I":
            readLen=readLen+ints[i]
            m_pre_tem=m_pre_tem+ints[i]
            di=di+ints[i]
        if i>=indexN and chars[i]=="I":
            readLen=readLen+ints[i]
            m_post_tem=m_post_tem+ints[i]
            di=di+ints[i]
    return pd.Series([pre,post,m_pre_ref,m_pre_tem,m_post_ref,m_post_tem,n,readLen,di,blockCount,",".join([str(x) for x in blockSizes]),",".join([str(x) for x in tStarts])])

def parseCIGAR(data):
    data["CIGAR"].replace("*",np.nan,inplace=True)
    data.dropna(axis=0,inplace=True)
    data.reset_index(drop=True,inplace=True)

#     data["READ_LEN"]=data.SEQ.str.len()
    data["CIGAR_POST"]=data.CIGAR.str.extract("[M]([0-9]+)[A-Z]$",expand=False).replace(np.nan,0).astype(int)
    data["END"]=data.READ_LEN-data.CIGAR_POST
    data["CIGAR_PRE"]=data.CIGAR.str.extract("^([0-9]+)[SH]",expand=False).replace(np.nan,0).astype(int)

    data16=data[data["reversedCurr"]==16].reset_index(drop=True)
    data0=data[data["reversedCurr"]==0].reset_index(drop=True)
    data16["Template_start"]=data16.READ_LEN-data16.END
    data16["Template_end"]=data16.READ_LEN-data16.CIGAR_PRE
    data0["Template_start"]=data0.CIGAR_PRE
    data0["Template_end"]=data0.END

    data16["Reference_start"]=data16.READ_LEN-data16.END+data16.POS-data16.Template_start
    data16["Reference_end"]=data16.READ_LEN-data16.CIGAR_PRE-1+data16.POS-data16.Template_start+data16.N
    data0["Reference_start"]=data0.POS
    data0["Reference_end"]=data0.END+data0.POS-data0.CIGAR_PRE+data0.N 
    
    data=pd.concat([data16,data0]).reset_index(drop=True)
    data.drop(["CIGAR_POST","CIGAR_PRE"],axis=1,inplace=True)
    return data

def tStarts(row):
    re=row["Reference_start"]
    tStarts=[]
    qStarts=row.qStarts.split(",")
    qs2=[int(x)-int(qStarts[0]) for x in qStarts]
    blockSizes=row.blockSizes.split(",")
    for i in range(row["blockCount"]):
        tStarts.append(re+int(qStarts[i]))
    return pd.Series([",".join([str(x) for x in tStarts])])

# get all alignments into a single dataframe tagged by chromosome from SAM files (GMAP)
dfAllSam=pd.DataFrame([])
for sam in os.listdir(alignmentDir):
    if sam[-4:]==".sam":
        chrom=sam.split("_")[0]
        fp=alignmentDir+"/"+sam
        df=pd.read_csv(fp,sep="\t",comment='@',usecols=[0,1,2,3,4,5,6,7,8,9,10],names=samCols['QNAME',
                                                                                         'FLAG',
                                                                                         'RNAME',
                                                                                         'POS',
                                                                                         'MAPQ',
                                                                                         'CIGAR',
                                                                                         'RNEXT',
                                                                                         'PNEXT',
                                                                                         'TLEN',
                                                                                         'SEQ',
                                                                                         'QUAL'])
        df["MD"]=pd.read_csv(fp,usecols=[0],comment="@",names=["full"])["full"].str.split("\tMD:Z:",expand=True)[1].str.split("\t",expand=True)[0]
        extractFlagBits(df)
        dfAllSam=pd.concat([dfAllSam,df],axis=0).reset_index(drop=True)

dfAllSam["PRE"]=np.nan
dfAllSam["POST"]=np.nan
dfAllSam["MPRER"]=np.nan
dfAllSam["MPRET"]=np.nan
dfAllSam["MPOSTR"]=np.nan
dfAllSam["MPOSTT"]=np.nan
dfAllSam["N"]=np.nan
dfAllSam["blockCount"]=0
dfAllSam["blockSizes"]=""
dfAllSam["qStarts"]=""
dfAllSam["tStarts"]=""
dfAllSam[["PRE","POST","MPRER","MPRET","MPOSTR","MPOSTT","N","READ_LEN","DI","blockCount","blockSizes","qStarts"]]=pd.DataFrame(dfAllSam.apply(lambda row: se(row),axis=1))
dfAllSam=parseCIGAR(dfAllSam)
dfAllSam["tStarts"]=dfAllSam.apply(lambda row: tStarts(row),axis=1)
dfAllSam.drop(["FLAG","QUAL","paired","aligned2Mates","unmappedCurr","unmappedMate","reversedMate","firstRead","lastRead","secondaryAlignment","noPassFilter","PCRdup","suppAl","qStarts"],axis=1,inplace=True)

# now let's do the same with the sam output
# one thing to try is to extract blocks similarly to the psl format and then use the same writeGFF function as with blat output
# this operation can only be performed on those reads mapped once for now
# we hall deal with those that have multiple alignments separately
def writeGFFSam(row,annotation):
    if not os.path.exists("./out"):
        os.mkdir("./out/gffs")
    
    tstarts=row["tStarts"].split(",")
    blocksizes=row["blockSizes"].split(",")
    percentOverlaps=[]
    percentOverlaps2=[]
    percentOverlaps3=[]
    strand=""
    wpoL=1
    wpo2L=1
    wpo=[]
    wpo2=[]
    for i in range(row["blockCount"]):
        if row["reversedCurr"]==16:
            strand="-"
        else:
            strand="+"
        dataBed=pd.DataFrame([[row["RNAME"],"ref","exon",int(tstarts[i]),int(tstarts[i])+int(blocksizes[i]),".",strand,".","nothing"]])
        dataBed.to_csv("./out/gffs/"+row["QNAME"]+".gff",sep="\t",index=False,header=False,quoting=csv.QUOTE_NONE)
        sites=BedTool.from_dataframe(dataBed)
        nearby=sites.intersect(annotations[row["RNAME"]], wao=True)
        df=pd.read_table(nearby.fn,names=["chrom",
                                          "source",
                                          "type",
                                          "start",
                                          "end",
                                          "score",
                                          "strand",
                                          "phase",
                                          "attributes",
                                          "chromK",
                                          "sourceK",
                                          "typeK",
                                          "startK",
                                          "endK",
                                          "scoreK",
                                          "strandK",
                                          "phaseK",
                                          "attributesK",
                                          "distance"],index_col=False)

        df=df[df["type"]=="exon"].sort_values(by="distance",ascending=False).drop_duplicates(["chrom"]).reset_index(drop=True)
        if df.iloc[0].distance==0:
            percentOverlaps.append(0)
            percentOverlaps2.append(0)
            wpo2L=wpo2L+float(int(df.iloc[0].end)-int(df.iloc[0].start))
        else:
            percentOverlaps.append(round(float(df.iloc[0].distance)/float(int(df.iloc[0].endK)-int(df.iloc[0].startK)),4))
            wpoL=wpoL+float(int(df.iloc[0].endK)-int(df.iloc[0].startK))
            wpo.append(percentOverlaps[-1]*float(int(df.iloc[0].endK)-int(df.iloc[0].startK)))
            percentOverlaps2.append(round(float(df.iloc[0].distance)/float(int(df.iloc[0].end)-int(df.iloc[0].start)),4))
            wpo2L=wpo2L+float(int(df.iloc[0].end)-int(df.iloc[0].start))
            wpo2.append(percentOverlaps2[-1]*float(int(df.iloc[0].end)-int(df.iloc[0].start)))
            percentOverlaps3.append(".".join(df.iloc[0].attributesK.split("Parent=")[-1].split(".")[:-1]))
    return [";".join([str(x) for x in percentOverlaps]),";".join([str(x) for x in percentOverlaps2]),";".join(list(set(percentOverlaps3))),sum(wpo)/wpoL,sum(wpo2)/wpo2L]

annotations={}
for chrom in set(dfAllSam["RNAME"]):
    annotations[chrom]=BedTool("./out/chessPrim_exon_"+chrom+".gff")
dfAllSam[["po","po2","ann","wpo","wpo2"]]=pd.DataFrame([x for x in dfAllSam.apply(lambda row: writeGFFSam(row,annotations),axis=1)])
dfAllSam.to_csv("./finalMappingOverlapSam_exon.csv",index=False)
dfAllSam1=dfAllSam.drop_duplicates("QNAME",keep=False).reset_index(drop=True)
dfAllSam1.to_csv("./finalMappingOverlapSam1_exon.csv",index=False)
dfAllSam1