-
Notifications
You must be signed in to change notification settings - Fork 2
/
flag_duplicates.py
217 lines (208 loc) · 8.15 KB
/
flag_duplicates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
#!/usr/local/env python2.7
"""Reads in a sam file and filters for nonduplicated and duplicated records. Duplicated records are further split into PCR duplicates and optical duplicates. Usage: python flag_duplicates.py sorted.sam
"""
import pandas as pd
from pandas import DataFrame
import sys
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os
from matplotlib.font_manager import FontProperties
import random
class bcolors: #print colored text to screen (should work linux, OS X, and windows -- only tested on linux)
WARNING = '\033[91m'
COMPLETE = '\033[92m'
def find_pcr_opt_dups(dups, pixDist): #From a dataframe generated using read_sam_get_nondups splits putative PCR duplicates from putative optical duplicates
optDups = []
pcrDups = []
possibleOptNames = []
dupsBin = dups.assign(bin_group = dups['tileCig'] + dups['ref']) #add bin groups and group dataframe
dupGroup = dupsBin.groupby(['bin_group', 'start'], as_index=False)
for name, group in dupGroup: #if there are duplicates but all from same sample == PCR duplicate, else add group name to list
grouped_samples = dupGroup.get_group(name)['sampleID']
if len(grouped_samples.unique()) == 1:
sameSamp = dupGroup.get_group(name)['record'].reset_index()
pcrDups.append(sameSamp['record'][0]) #only keeps first instance of the duplicate
else:
possibleOptNames.append(name)
dupsGroup_counts = dups.assign(duplicates = dups['tileCig'] + "_" + dups['ref'] + "_" + dups['start']) #get crosstab table by sample per bin group
count_table = pd.crosstab([dupsGroup_counts.ref, dupsGroup_counts.start, dupsGroup_counts.tileCig], dupsGroup_counts.sampleID, margins=True)
with open("count_table.txt", "w") as countTable:
count_table.to_csv(countTable)
countTable.close()
nit = 0 #process duplicate records
print "Processing possible optical duplicates..."
for i in range(0, len(possibleOptNames)):
xvals = dupGroup.get_group(possibleOptNames[nit]).sort('x')['x'].reset_index()
yvals = dupGroup.get_group(possibleOptNames[nit]).sort('x')['y'].reset_index()
records = dupGroup.get_group(possibleOptNames[nit])['record'].reset_index()
nit += 1
fin = 0
sin = 1
for k in range(0, len(xvals)):
try:
t = xvals['x'][sin] #if you have reached last record break the loop
except KeyError:
break
xi = xvals['x'][fin]
xj = xvals['x'][sin]
yi = yvals['y'][fin]
yj = yvals['y'][sin]
if abs(int(xi)-int(xj)) > 100 or abs(int(yi)-int(yj)) > 100: #if either x or y coordinates more than 100 pixels == PCR duplicate
if records['record'][fin] not in pcrDups:
pcrDups.append(records['record'][fin]) #add first record
if records['record'][sin] not in pcrDups:
pcrDups.append(records['record'][sin]) #add second record
fin += 1
sin += 1
elif abs(int(yi)-int(yj)) <= 100 and abs(int(xi)-int(xj)) <= 100: #if y and x coordinates under 100 pixels == optical duplicate
if records['record'][fin] not in optDups:
optDups.append(records['record'][fin])
if records['record'][sin] not in optDups:
optDups.append(records['record'][sin])
fin += 1
sin += 1
else:
print "Records do not match critiera:"
print records['record'][fin]
print records['record'][sin]
print "Found %i PCR duplicates" % len(pcrDups)
print "Found %i optical duplicates" % len(optDups)
with open("pcr_duplicates.txt", "w") as pcrOut:
for record in pcrDups:
pcrOut.write(record)
pcrOut.close()
with open("optical_duplicates.txt", "w") as optOut:
for record in optDups:
optOut.write(record)
optOut.close()
if len(optDups) >= 1: #if optical duplicates detected, make figures?
print "-----------------------"
next = raw_input("Generate tile figures?: ")
if 'y' in next:
plotOpticalDups()
else:
print bcolors.COMPLETE + "Complete, no figures generated"
def plotOpticalDups(): #read in optical duplicates file, extract info
data = []
with open("optical_duplicates.txt", "r") as f:
for line in f:
if not line.strip():
continue
else:
try:
optRecord = line.split()
t = optRecord[2]
except IndexError:
break
optRecord = line.split()
read = optRecord[0].split(":")
tile = read[4]
x = read[5]
y = read[6]
sampleID = read[0]
data.append([tile, sampleID, x, y])
dfOptDups = DataFrame(data)
dfOptDups.columns = ['tile', 'sampleID', 'x', 'y']
tileGroups = dfOptDups.groupby('tile')
tiles = []
for name, group in tileGroups:
tiles.append(name)
git = 0
fontP = FontProperties()
fontP.set_size('small')
for i in range(0, len(tiles)): #get colors, x y values for group
colorDict = {}
currentGroup = tileGroups.get_group(tiles[git])
currentName = currentGroup.tile[0]
x = map(int, currentGroup.x.tolist())
y = map(int, currentGroup.y.tolist())
xnorm = [float(i)/sum(x) for i in x]
ynorm = [float(i)/sum(y) for i in y]
for i in range(0, len(currentGroup.sampleID.unique())):
colorDict[currentGroup.sampleID.unique()[i]] = matplotlib.colors.cnames.values()[random.randrange(1, 150)] #link random color to each sample
fig = plt.figure() #loop to create figures
backgroundColor = '#ffffff'
ax = fig.add_subplot(111, axisbg=backgroundColor)
ax.scatter(xnorm, ynorm, color=colorDict.values(), marker=',', s=5)
ax.spines['top'].set_visible(False) #remove plot borders
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
plt.xlim(0, 1.0)
plt.ylim(0, 1.0)
plt.tick_params(bottom='off', top='off', right='off', left='off') #no tick marks
plt.grid(color="white")
markers = [plt.Line2D([0,0], [0,0], color=color, marker='s', linestyle='') for color in colorDict.values()]
lgd = plt.legend(markers, colorDict.keys(), numpoints=1, prop=fontP, bbox_to_anchor=(0.5, -0.1), ncol=5, fancybox=True)
fig.savefig(currentName + '.pdf', bbox_extra_artists=(lgd,), bbox_inches='tight', format='pdf')
git += 1
print bcolors.COMPLETE + "Complete! Figures successfully generated"
def read_sam_get_nondups(inputfile): #Loads and extracts data from sam file
data = []
header = []
with open(inputfile, "r") as f:
print "Reading in %s..." % inputfile
for line in f:
try:
record = line.split()
t=record[2]
except IndexError:
break
if line.startswith("@"):
header.append(line)
else:
record = line.split()
ref = record[2]
start = record[3]
cigar = record[5]
read = record[0].split(":")
sampleID = read[0]
tile = read[4]
x = read[5]
y = read[6]
tileCig = tile + "_" + cigar
data.append([tileCig, ref, start, x, y, sampleID, line])
dfSam = DataFrame(data)
print "Found %i records in %s" % (len(dfSam), inputfile)
print "Finding nondupicated records..."
dfSam.columns = ['tileCig', 'ref', 'start', 'x', 'y', 'sampleID', 'record']
dfSam['count'] = dfSam.groupby('start')['start'].transform('count')
nondups = dfSam[dfSam['count'] == 1]['record']
print "Found %i nonduplicated samples" % len(nondups)
with open("nonduplicates.txt", "w") as f:
f.write(''.join(header)) #add header to nonduplicates file
for line in nondups:
f.write(line)
dups = dfSam[dfSam['count'] > 1]
if len(nondups) == len(dfSam): #break if no duplicates are found
print "-----------------------"
print bcolors.WARNING + "Exit: No duplicated records found"
sys.exit(1)
print "-----------------------"
next = raw_input("Set pixel distance to value other than default (100)? ") #set pixel distance
if 'y' in next:
pixDist = raw_input("New pixel distance: ")
else:
pixDist = 100
find_pcr_opt_dups(dups, pixDist)
def main():
print """
___ _ ___ _ _ _
| __| |__ _ __ _ | \ _ _ _ __| (_)__ __ _| |_ ___ ___
| _|| / _` / _` | | |) | || | '_ \ | / _/ _` | _/ -_|_-<
|_| |_\__,_\__, | |___/ \_,_| .__/_|_\__\__,_|\__\___/__/
|___/ |_|
"""
if len(sys.argv) != 2:
print bcolors.WARNING + "Error! No sam file specified"
sys.exit(1)
inputfile = sys.argv[1]
assert os.path.exists(inputfile), bcolors.WARNING + 'Error! File does not exist: %s. Is the path correct?' %inputfile
read_sam_get_nondups(inputfile)
main()
__author__ = "Allison E. Mann"
__license__ = "GPL"
__version__ = "1.0.1"
__email__="allison.e.mann@gmail.com"