-
Notifications
You must be signed in to change notification settings - Fork 0
/
sam2pileup.py
executable file
·122 lines (109 loc) · 4.12 KB
/
sam2pileup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import sys
import pysam
if len(sys.argv) != 3:
sys.stderr.write("usage:\npython XX.py test.bam test.annotate.meaningful.xls\n")
exit(0)
bamfile=pysam.AlignmentFile(sys.argv[1])
fh=file(sys.argv[2])
def formatDict(tmpdict,totalDepth):
outstr=[]
for basestr in ['A','C','G','T']:
baseformat=''
basedepth,[revDepth,forDepth] = tmpdict[basestr]
depthRatio = float(basedepth)/totalDepth*100
formatstr=';'.join(map(str,[basedepth,'%.0f%%'%depthRatio,'%d+,%d-'%(forDepth,revDepth)]))
outstr.append(formatstr)
return '\t'.join(outstr)
def consensusGenerate(context):
ret=[]
for element in context:
if len(element)==0:continue
if len(element) == 6:
ret.append(element)
else:
for tmpelement in ret:
if element in tmpelement:
break
else:
ret.append(element)
return ret
for line in fh:
if line.startswith("#"):
continue
arr=line.rstrip().split("\t")
if arr[8] != "SNP":
continue
chr,start,end,samplename,ref,alt,owner = [arr[4],arr[5],arr[6],arr[0],arr[9],arr[10],'smone']
#chr,start,end,samplename,ref,alt,owner,other=line.rstrip().split("\t",7)
start,end=map(int,[start,end])
pileupcolumns=bamfile.pileup(chr,start-1,start,truncate=True)
for pileupcolumn in pileupcolumns:
print samplename+'\t'+pileupcolumn.reference_name + '\t' + str(pileupcolumn.pos) + '\t'+ ref+'\t'+alt +'\t'+ str(pileupcolumn.n) + "\t",
baseQarr=[]
baseChararr=[]
mappingOrient=[]
upstreamContext=[]
downstreamContext=[]
borderDistances = Readtotallen=0
readMapq=[]
for pileupread in pileupcolumn.pileups:
readalignment = pileupread.alignment
#print line
#if not pileupread.query_position:
if pileupread.is_del or pileupread.is_refskip:
#print readalignment
continue
baseChararr.append(readalignment.query_sequence[pileupread.query_position])
baseQarr.append(readalignment.query_qualities[pileupread.query_position])
readMapq.append(readalignment.mapping_quality)
qlen=len(readalignment.query_sequence)
borderDistance = qlen-pileupread.query_position if pileupread.query_position > 0.5*qlen else pileupread.query_position
#print str(qlen) + '\t'+str(pileupread.query_position)+ '\t' + str(borderDistance)
borderDistances+=borderDistance
Readtotallen+=qlen
if readalignment.is_reverse:
mappingOrient.append(0)
else:
mappingOrient.append(1)
'''
try:
upstreamContext.append(readalignment.query_sequence[pileupread.query_position-10:pileupread.query_position])
downstreamContext.append(readalignment.query_sequence[pileupread.query_position+1:pileupread.query_position+10])
except:
continue
'''
#print readalignment.get_tag('NM')
if qlen==readalignment.query_alignment_length and readalignment.get_tag('NM')==0:
#print readalignment
#print str(qlen) + '\t'+str(readalignment.query_alignment_length)
upstreamContext.append(readalignment.query_sequence[pileupread.query_position-6:pileupread.query_position])
downstreamContext.append(readalignment.query_sequence[pileupread.query_position+1:pileupread.query_position+7])
tmpdict={'A':[0,[0,0]],'C':[0,[0,0]],'G':[0,[0,0]],'T':[0,[0,0]]}
for i,tmpstr in enumerate(baseChararr):
strandflag=mappingOrient[i]
if tmpstr=="A":
tmpdict[tmpstr][0]+=1
tmpdict[tmpstr][1][strandflag]+=1
elif tmpstr=='T':
tmpdict[tmpstr][0]+=1
tmpdict[tmpstr][1][strandflag]+=1
elif tmpstr=='G':
tmpdict[tmpstr][0]+=1
tmpdict[tmpstr][1][strandflag]+=1
elif tmpstr=='C':
tmpdict[tmpstr][0]+=1
tmpdict[tmpstr][1][strandflag]+=1
else:
sys.stderr.write("[Warning]There is an ambiguous Base here!! Plz check %s in %dth read\n"%(tmpstr,i))
continue
#print ''.join(map(str,baseQarr)) + "\t" ''.join(baseChararr)
print formatDict(tmpdict,pileupcolumn.n) + '\t',
print '%.2f'%(float(borderDistances)/Readtotallen)+'\t',
upstreamContext=consensusGenerate(upstreamContext)
downstreamContext=consensusGenerate(downstreamContext)
print ','.join(list(set(upstreamContext))) + "\t"+ ','.join(list(set(downstreamContext))) + "\t",
print ';'.join(map(str,baseQarr))+'\t'+''.join(baseChararr)+"\t",
print ";".join(map(str,readMapq)),
print "\t"+owner
bamfile.close()
fh.close()