This repository has been archived by the owner on Nov 9, 2023. It is now read-only.
/
split_libraries.py
1467 lines (1175 loc) · 59.3 KB
/
split_libraries.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
#file split_libraries.py
"""Performs preprocessing steps for barcoded library analysis, e.g. 454.
Specifically, does the quality-filtering step (using several criteria) and
renames each read with the appropriate library id.
This module reads the linker+primer sequence from the input mapping file, and
associates these with the barcodes from the mapping file. If a barcode is
read that does not correspond to any in the mapping file, this module checks
against all possible primers from the mapping file. A rare situation could
arise if a barcode does not match any from the mapping file (either because
of sequencing errors or because the mapping file is incomplete) and variations
of the same primer are used for sequencing (e.g., a truncated form of the same
primer), where it is impossible to distinguish what primer was actually used
to amplify a given sequence. In these cases, portions of the given sequence
are sliced out in ascending order of primer sizes and compared to all possible
primers from the mapping file. The first matching primer is considered a hit
for the purposes of determining where a primer ends and the actual
sequence data begins. Because of this, one should be careful about using
truncated forms of the same primer with this module. The -c option can be
used to disable attempts at barcode correction, and sequences that do not
have a barcode that matches the mapping file will not be recorded.
"""
__author__ = "Rob Knight and Micah Hamady"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Rob Knight", "Micah Hamady", "Greg Caporaso", "Kyle Bittinger","Jesse Stombaugh","William Walters","Jens Reeder"] #remember to add yourself
__license__ = "GPL"
__version__ = "1.8.0"
__maintainer__ = "William Walters"
__email__ = "rob@spot.colorado.edu, william.a.walters@colorado.edu"
import re
from gzip import GzipFile
from os import mkdir, stat
from collections import defaultdict
from string import upper
from numpy import array, mean, arange, histogram
from numpy import __version__ as numpy_version
import warnings
warnings.filterwarnings('ignore', 'Not using MPI as mpi4py not found')
from cogent.parse.fasta import MinimalFastaParser
from cogent.seqsim.sequence_generators import SequenceGenerator, IUPAC_DNA
from cogent.core.moltype import IUPAC_DNA_ambiguities
from cogent import DNA, LoadSeqs
from cogent.align.align import make_dna_scoring_dict, local_pairwise
from cogent.util.misc import remove_files
from qiime.check_id_map import process_id_map
from qiime.barcode import correct_barcode
from qiime.hamming import decode_barcode_8
from qiime.golay import decode as decode_golay_12
from qiime.format import format_histograms
from qiime.parse import QiimeParseError, parse_qual_scores
from qiime.util import create_dir, median_absolute_deviation
## Including new=True in the histogram() call is necessary to
## get the correct result in versions prior to NumPy 1.2.0,
## but the new keyword will be removed in NumPy 1.4. In NumPy 1.2.0
## or later, new=True raises a Warning regarding
## deprecation of new. One solution to this is to check the
## numpy version, and if it's less than 1.2.0, overwrite histogram
## with new=True as the default. This avoids a deprecation warning message
## in versions 1.2.0 through 1.3.*, and a try/except to handle errors from
## versions 1.4.0 or later.
# Parse the NumPy version string into a comparable tuple of ints,
# e.g. "1.3.0" -> (1, 3, 0).
numpy_version = re.split("[^\d]", numpy_version)
numpy_version = tuple([int(i) for i in numpy_version if i.isdigit()])
if numpy_version < (1,3,0):
    numpy_histogram = histogram
    # Shadow numpy's histogram with a wrapper that forces new=True so
    # the rest of this module gets consistent binning on old NumPy.
    def histogram(a, bins=10, range=None, normed=False, weights=None):
        return numpy_histogram(a,bins=bins,range=range,\
            normed=normed,weights=weights,new=True)
# Supported barcode types - need to adapt these functions to ignore list
# of valid barcodes that the generic decoder requires
# Maps barcode type name -> (expected barcode length or None for variable,
# decoder). Each decoder takes (barcode, valid_barcodes) and returns
# (corrected_barcode, num_mismatches).
BARCODE_TYPES = {
    "golay_12":(12, lambda bc, bcodes: decode_golay_12(bc)),
    "hamming_8":(8, lambda bc, bcodes: decode_barcode_8(bc)),
    # The decode function for variable length barcode does nothing -
    # it's just provided to comply with the interface of the other
    # barcode types. The returned barcode is always the same as the
    # one passed in, and the number of mismatches is always 0. The
    # length is None, corresponding to variable length.
    "variable_length":(None, lambda bc, bcodes: (bc, 0))}
def get_infile(filename):
    """Open *filename* for reading, transparently handling gzip input."""
    if filename.endswith(".gz"):
        return GzipFile(filename, "rb")
    return open(filename, "U")
def count_mismatches(seq1, seq2, max_mm):
    """Count positional mismatches between two sequences.

    Only the overlapping prefix (length of the shorter sequence) is
    compared. Counting stops as soon as the tally exceeds max_mm, so the
    returned value is exact up to max_mm and at most max_mm + 1 otherwise.
    """
    mismatches = 0
    for base1, base2 in zip(seq1, seq2):
        if base1 != base2:
            mismatches += 1
            if mismatches > max_mm:
                break
    return mismatches
def ok_mm_primer(primer_seq, all_primer_seqs, primer_mm):
    """Return True if primer_seq matches any primer within primer_mm mismatches.

    TODO: if we start using highly degenerate primers, should refactor using
    faster algorithm.
    """
    return any(
        count_mismatches(primer_seq, candidate, primer_mm) <= primer_mm
        for candidate in all_primer_seqs)
def MatchScorerAmbigs(match, mismatch, matches=None):
    """ Alternative scorer factory for sw_align which allows match to ambiguous chars

    It allows for matching to ambiguous characters which is useful for
    primer/sequence matching. Not sure what should happen with gaps, but they
    shouldn't be passed to this function anyway. Currently a gap will only match
    a gap.

    match and mismatch should both be numbers. Typically, match should be
    positive and mismatch should be negative.

    Resulting function has signature f(x,y) -> number.

    Code original from Greg Caporaso
    """
    # Start from strict identity (plus gap-to-gap) unless a custom match
    # table was supplied.
    matches = matches or \
        {'A': {'A': None}, 'G': {'G': None}, 'C': {'C': None},
         'T': {'T': None}, '-': {'-': None}}
    # Extend the table so each IUPAC ambiguity code matches its expansion
    # characters, and vice versa.
    for ambig, chars in IUPAC_DNA_ambiguities.items():
        try:
            matches[ambig].update({}.fromkeys(chars))
        except KeyError:
            matches[ambig] = {}.fromkeys(chars)
        for char in chars:
            try:
                matches[char].update({ambig: None})
            except KeyError:
                matches[char] = {ambig: None}

    def scorer(x, y):
        # need a better way to disallow unknown characters (could
        # try/except for a KeyError on the next step, but that would only
        # test one of the characters)
        if x not in matches or y not in matches:
            # Parenthesized raise form: identical behavior, but valid under
            # both Python 2 and Python 3 (the comma form is Python-2-only).
            raise ValueError("Unknown character: %s or %s" % (x, y))
        if y in matches[x]:
            return match
        else:
            return mismatch
    return scorer
# The scoring function which can be passed to cogent.alignment.algorithms.sw_align
equality_scorer_ambigs = MatchScorerAmbigs(1, -1)
# Looser scorer whose base table also pairs each base with the extra
# alternatives listed explicitly below (in addition to the IUPAC
# ambiguity expansion MatchScorerAmbigs always applies).
expanded_equality_scorer_ambigs = MatchScorerAmbigs(1, -1,\
    matches=\
    {'A':{'A':None,'G':None},\
    'G':{'G':None,'A':None,'T':None,'C':None,},\
    'C':{'C':None,'G':None},\
    'T':{'T':None,'G':None},\
    '-':{'-':None}})
def pair_hmm_align_unaligned_seqs(seqs, moltype=DNA, params=None):
    """
    Checks parameters for pairwise alignment, returns alignment.

    seqs: exactly two unaligned sequences
    moltype: molecule type for LoadSeqs
    params: optional dict that may supply 'gap_open', 'gap_extend' and
        'score_matrix'; missing entries fall back to the defaults below.

    Raises ValueError when seqs does not contain exactly two sequences.
    Code from Greg Caporaso.
    """
    # None default instead of a shared mutable {} default argument; params
    # is only read here, but None is the safer convention.
    params = params or {}
    seqs = LoadSeqs(data=seqs, moltype=moltype, aligned=False)
    try:
        s1, s2 = seqs.values()
    except ValueError:
        # Parenthesized raise keeps Python 2/3 compatibility.
        raise ValueError(
            "Pairwise aligning of seqs requires exactly two seqs.")
    gap_open = params.get('gap_open', 5)
    gap_extend = params.get('gap_extend', 2)
    try:
        score_matrix = params['score_matrix']
    except KeyError:
        score_matrix = make_dna_scoring_dict(
            match=1, transition=-1, transversion=-1)
    return local_pairwise(s1, s2, score_matrix, gap_open, gap_extend)
def local_align_primer_seq(primer, sequence, sw_scorer=equality_scorer_ambigs):
    """Perform local alignment of primer and sequence

    primer: Input primer sequence
    sequence: target sequence to test primer against

    Returns the number of mismatches (insertions + deletions +
    substitutions) and the start position in sequence of the hit.

    Modified from code written by Greg Caporaso.
    """
    query_primer = primer
    query_sequence = str(sequence)
    # Get alignment object from primer, target sequence
    alignment = pair_hmm_align_unaligned_seqs([query_primer, query_sequence])
    # Extract sequence of primer, target site; these may contain gaps if
    # insertions or deletions have occurred.
    primer_hit = str(alignment.Seqs[0])
    target_hit = str(alignment.Seqs[1])
    # Count insertions and deletions
    insertions = primer_hit.count('-')
    deletions = target_hit.count('-')
    mismatches = 0
    for i in range(len(target_hit)):
        # using the scoring function to check for
        # matches, but might want to just access the dict
        if sw_scorer(target_hit[i], primer_hit[i]) == -1 and \
                target_hit[i] != '-' and primer_hit[i] != '-':
            mismatches += 1
    try:
        hit_start = query_sequence.index(target_hit.replace('-', ''))
    except ValueError:
        # Parenthesized raise keeps Python 2/3 compatibility.
        raise ValueError('substring not found, query string %s, target_hit %s'
                         % (query_sequence, target_hit))
    # sum total mismatches
    mismatch_count = insertions + deletions + mismatches
    return mismatch_count, hit_start
def expand_degeneracies(raw_primers):
    """ Returns all non-degenerate versions of the given primer sequences """
    expanded = []
    for raw_primer in raw_primers:
        generator = SequenceGenerator(template=raw_primer.strip(),
                                      alphabet=IUPAC_DNA)
        expanded.extend(generator)
    return expanded
def check_map(infile, disable_primer_check, barcode_type="golay_12",
              added_demultiplex_field=None, has_barcodes=True):
    """Check mapping file and extract list of valid barcodes, primers

    Returns (headers, id_map, barcode_to_sample_id, warnings, errors,
    primer_seqs_lens, all_primers). Raises ValueError when the mapping
    file has errors or (unless disabled) when a sample has no primer.
    """
    var_len_barcodes = (barcode_type == "variable_length")
    if barcode_type == "0":
        has_barcodes = False
    # hds, id_map, dsp, run_description, errors, warnings
    hds, mapping_data, run_description, errors, warnings = \
        process_id_map(infile, has_barcodes=has_barcodes,
                       disable_primer_check=disable_primer_check,
                       added_demultiplex_field=added_demultiplex_field,
                       variable_len_barcodes=var_len_barcodes)
    if errors:
        # Parenthesized raise keeps Python 2/3 compatibility.
        raise ValueError('Errors were found with mapping file, ' +
                         'please run check_id_map.py to identify problems.')
    # Build {sample_id: {header: value}} from the parallel header/row lists.
    id_map = {}
    for curr_data in mapping_data:
        id_map[curr_data[0]] = {}
    for header in range(len(hds)):
        for curr_data in mapping_data:
            id_map[curr_data[0]][hds[header]] = curr_data[header]
    barcode_to_sample_id = {}
    primer_seqs_lens = {}
    all_primers = {}
    for sample_id, sample in id_map.items():
        if added_demultiplex_field:
            # Demultiplexing key is "<barcode>,<extra field value>".
            barcode_to_sample_id[sample['BarcodeSequence'].upper() + "," +
                                 sample[added_demultiplex_field]] = sample_id
        else:
            barcode_to_sample_id[sample['BarcodeSequence'].upper()] = sample_id
        if not disable_primer_check:
            raw_primers = sample['LinkerPrimerSequence'].upper().split(',')
            if len(raw_primers[0].strip()) == 0:
                raise ValueError('No primers detected, please use the ' +
                                 '-p parameter to disable primer detection.')
            expanded_primers = expand_degeneracies(raw_primers)
            curr_bc_primers = {}
            for primer in expanded_primers:
                curr_bc_primers[primer] = len(primer)
                all_primers[primer] = len(primer)
            primer_seqs_lens[sample['BarcodeSequence']] = curr_bc_primers
    return (hds, id_map, barcode_to_sample_id, warnings, errors,
            primer_seqs_lens, all_primers)
def fasta_ids(fasta_files, verbose=False):
    """ Returns set of ids in FASTA files

    Raises ValueError when any id occurs more than once across the files.
    verbose is accepted for interface compatibility and is unused here.
    """
    all_ids = set()
    for fasta_in in fasta_files:
        for label, seq in MinimalFastaParser(fasta_in):
            rid = label.split()[0]
            if rid in all_ids:
                # Parenthesized raise keeps Python 2/3 compatibility.
                raise ValueError(
                    "Duplicate ID found in FASTA/qual file: %s" % label)
            all_ids.add(rid)
    return all_ids
def count_ambig(curr_seq, valid_chars='ATCG'):
    """Count characters in curr_seq that are not in valid_chars.

    Comparison is case-insensitive with respect to curr_seq.
    """
    up_seq = curr_seq.upper()
    valid_total = sum(up_seq.count(ch) for ch in valid_chars)
    return len(curr_seq) - valid_total
def split_seq(curr_seq, barcode_len, primer_seq_len):
    """Split a read into its barcode, primer and remainder pieces."""
    barcode = curr_seq[:barcode_len]
    primer_end = barcode_len + primer_seq_len
    primer = curr_seq[barcode_len:primer_end]
    remainder = curr_seq[primer_end:]
    return barcode, primer, remainder
def get_barcode(curr_seq, barcode_len):
    """Split a read into its barcode prefix and the remaining sequence.

    The linker and primer stay in the remainder: the barcode must be read
    first to find the associated primer in the mapping file.
    """
    return curr_seq[:barcode_len], curr_seq[barcode_len:]
def primer_exceeds_mismatches(primer_seq, all_primer_seqs, max_primer_mm):
    """Return True if primer_seq has no acceptable match among the primers."""
    if primer_seq in all_primer_seqs:
        return False
    return not ok_mm_primer(primer_seq, all_primer_seqs, max_primer_mm)
def seq_exceeds_homopolymers(curr_seq, max_len=6):
    """Return True when curr_seq contains a homopolymer run longer than max_len."""
    return any(base * (max_len + 1) in curr_seq for base in 'ATGC')
def check_barcode(curr_barcode, barcode_type, valid_map,
                  attempt_correction=True, added_demultiplex_field=None,
                  curr_id=None):
    """Return whether barcode is valid, and attempt correction.

    Returns a 3-tuple (errors, barcode, corrected_bc): errors is False for
    a direct hit, True for a miss without correction, or the decoder's
    mismatch count after a correction attempt; barcode is the lookup key
    (possibly "<barcode>,<demultiplex>"); corrected_bc says whether a
    correction was attempted.
    """
    corrected_bc = False
    if added_demultiplex_field:
        added_demultiplex_lens =\
            set([len(bc.split(',')[1]) for bc in valid_map])
        # using set() will put in order of smallest to largest and removes
        # redundant lengths, converting to list to sort from largest to smallest
        added_demultiplex_lens =\
            [length for length in added_demultiplex_lens][::-1]
        # Handle specific case of run_prefix
        # Need to slice out size(s) of label that matches run prefix size(s)
        if added_demultiplex_field.upper() == "RUN_PREFIX":
            added_demultiplex =\
                [curr_id.split()[0][0:added_demultiplex_len] for \
                added_demultiplex_len in added_demultiplex_lens]
        else:
            # NOTE(review): if no label item starts with
            # added_demultiplex_field, added_demultiplex is never bound and
            # the loop below raises NameError — presumably upstream
            # validation guarantees the field is present; confirm.
            for label_item in curr_id.split():
                if label_item.startswith(added_demultiplex_field):
                    added_demultiplex = [label_item.split('=')[1]]
        # NOTE(review): all_bcs and all_added_demultiplex are computed but
        # never read in this function.
        all_bcs = [bc.split(',')[0] for bc in valid_map]
        all_added_demultiplex = [bc.split(',')[1] for bc in valid_map]
        for curr_added_demultiplex in added_demultiplex:
            bc_and_demultiplex = curr_barcode + "," + curr_added_demultiplex
            if bc_and_demultiplex in valid_map:
                return False, bc_and_demultiplex, corrected_bc
            elif attempt_correction == False:
                return True, curr_barcode, corrected_bc
    else:
        if curr_barcode in valid_map:
            return False, curr_barcode, corrected_bc
        elif attempt_correction == False:
            return True, curr_barcode, corrected_bc
    if barcode_type in BARCODE_TYPES:
        # Known barcode type: use its dedicated decoder.
        expect_len, curr_bc_fun = BARCODE_TYPES[barcode_type]
        barcode, num_errors = curr_bc_fun(curr_barcode, valid_map)
        corrected_bc = True
        if added_demultiplex_field:
            for curr_added_demultiplex in added_demultiplex:
                bc_and_demultiplex = barcode + "," + curr_added_demultiplex
                if bc_and_demultiplex in valid_map:
                    return num_errors, bc_and_demultiplex, corrected_bc
        else:
            return num_errors, barcode, corrected_bc
    else:
        # Otherwise barcode_type is expected to be a numeric length;
        # fall back to the generic correct_barcode decoder.
        try:
            expect_len, curr_bc_fun = int(barcode_type), correct_barcode
            barcode, num_errors = curr_bc_fun(curr_barcode, valid_map)
            corrected_bc = True
            if added_demultiplex_field:
                for curr_added_demultiplex in added_demultiplex:
                    bc_and_demultiplex = barcode + "," + curr_added_demultiplex
                    if bc_and_demultiplex in valid_map:
                        return num_errors, bc_and_demultiplex, corrected_bc
        except ValueError:
            raise ValueError, "Unsupported barcode type: %s" % barcode_type
    # Fall-through when a corrected barcode did not combine with any
    # demultiplex value: return the bare corrected barcode.
    return num_errors, barcode, corrected_bc
def make_histograms(raw_lengths, pre_lengths, post_lengths, binwidth=10):
    """Makes histogram data for raw, pre and post filtering lengths.

    Bin edges span from just below the minimum observed length to just
    above the maximum raw length, in steps of binwidth.
    Returns (raw_hist, pre_hist, post_hist, bin_edges).
    """
    if post_lengths:
        min_len = min([min(post_lengths), min(raw_lengths)])
    else:
        min_len = min(raw_lengths)
    max_len = max(raw_lengths)
    # Explicit floor division: identical to '/' on ints under Python 2,
    # and still correct under Python 3.
    floor = (min_len // binwidth) * binwidth
    ceil = ((max_len // binwidth) + 2) * binwidth
    bins = arange(floor, ceil, binwidth)
    raw_hist = histogram(raw_lengths, bins)[0]
    pre_hist = histogram(pre_lengths, bins)[0]
    post_hist, bin_edges = histogram(post_lengths, bins)
    return raw_hist, pre_hist, post_hist, bin_edges
class SeqQualBad(object):
    """Callable filter that flags bad seq/qual records, remembering bad ids."""

    def __init__(self, name, f):
        """Store the display name and predicate; start with no failures."""
        self.FailedIds = []
        self.Name = name
        self.F = f

    def __call__(self, id_, seq, qual):
        """Apply the predicate to (id_, seq, qual) and return its verdict.

        Note: ids of failing records accumulate in self.FailedIds."""
        verdict = self.F(id_, seq, qual)
        if verdict:
            self.FailedIds.append(id_)
        return verdict

    def __str__(self):
        """Tab-delimited name and count of failed ids."""
        return "%s\t%s" % (self.Name, len(self.FailedIds))
def qual_missing(id_, seq, qual):
    """Predicate: True when no quality scores accompany the record."""
    return qual is None
# Module-level filter instance that fails any record lacking quality scores.
QualMissing = SeqQualBad('Missing Qual Score', qual_missing)
def get_seq_lengths(seq_lengths, bc_counts):
    """Convenience wrapper for splitting lengths into all vs. good seqs.

    Good seqs are those not recorded under the None (bad barcode) or
    '#FAILED' keys of bc_counts.
    """
    bad_ids = set(bc_counts[None]).union(set(bc_counts['#FAILED']))
    good_ids = set(seq_lengths.keys()) - bad_ids
    good_lengths = map(seq_lengths.__getitem__, good_ids)
    return seq_lengths.values(), good_lengths
def check_window_qual_scores(qual_scores, window=50, min_average=25):
    """Check that all sliding windows have average qual score >= threshold.

    Returns (passed, idx): passed is True when every window met the
    threshold; idx is the window start index usable for truncation.
    """
    # Code from Jens Reeder, added 1-13-2010
    l = len(qual_scores)
    window = min(window, l)
    if window == 0:
        # Nothing to test. Return a (passed, index) pair — the original
        # returned a bare True here, which crashed callers that always
        # unpack two values.
        return True, 0
    # initialize with sum of first window
    window_score = sum(qual_scores[:window])
    idx = 0
    while (window_score / float(window) >= min_average
           and idx < l - window):
        # 'Move' window one position to the right
        window_score += qual_scores[idx + window] - qual_scores[idx]
        idx += 1
    if idx == l - window:
        # we processed all qual_scores, must be good
        # Return index for truncation purposes
        return True, idx
    else:
        return False, idx
def check_seqs(fasta_out, fasta_files, starting_ix, valid_map, qual_mappings,
               filters, barcode_len, keep_primer, keep_barcode, barcode_type,
               max_bc_errors, retain_unassigned_reads, attempt_bc_correction,
               primer_seqs_lens, all_primers, max_primer_mm, disable_primer_check,
               reverse_primers, rev_primers, qual_out, qual_score_window=0,
               discard_bad_windows=False, min_qual_score=25, min_seq_len=200,
               median_length_filtering=None, added_demultiplex_field=None,
               reverse_primer_mismatches=0, truncate_ambi_bases=False):
    """Checks fasta-format sequences and qual files for validity.

    Demultiplexes reads by barcode, applies the supplied quality filters,
    optionally strips barcodes/primers and truncates at reverse primers,
    low-quality windows or ambiguous bases, then writes passing reads to
    fasta_out (and qual_out when given). When median_length_filtering is
    set, a final pass discards reads whose length falls outside
    median +/- (MAD * median_length_filtering).

    Returns (log_out, seq_lengths, raw_seq_lengths, final_seq_lengths):
    formatted log lines plus length lists for all parsed, all passing,
    and all written sequences.
    """
    seq_lengths = {}
    # Record complete barcode + primer + sequence lengths
    raw_seq_lengths = {}
    # Record sequence lengths after all optional removal of components
    final_seq_lengths = {}
    bc_counts = defaultdict(list)
    curr_ix = starting_ix
    corr_ct = 0  # count of corrected barcodes
    # get the list of barcode lengths in reverse order
    barcode_length_order =\
        list(set([len(bc.split(',')[0]) for bc in valid_map]))
    barcode_length_order.sort()
    barcode_length_order = barcode_length_order[::-1]
    primer_mismatch_count = 0
    all_primers_lens = list(set(all_primers.values()))
    all_primers_lens.sort()
    # Counters for the various rejection/truncation reasons (logged later).
    reverse_primer_not_found = 0
    sliding_window_failed = 0
    trunc_ambi_base_counts = 0
    below_seq_min_after_trunc = 0
    below_seq_min_after_ambi_trunc = 0
    for fasta_in in fasta_files:
        for curr_id, curr_seq in MinimalFastaParser(fasta_in):
            curr_rid = curr_id.split()[0]
            curr_seq = upper(curr_seq)
            curr_len = len(curr_seq)
            curr_qual = qual_mappings.get(curr_rid, None)
            #if qual_out:
            #    curr_qual_out_score = \
            #     "%2.2f" % float(float(sum(curr_qual))/float(len(curr_qual)))
            seq_lengths[curr_rid] = curr_len
            failed = False
            for f in filters:
                failed = failed or f(curr_rid, curr_seq, curr_qual)
            if failed:  # if we failed any of the checks, bail out here
                bc_counts['#FAILED'].append(curr_rid)
                continue
            if barcode_type == 'variable_length':
                # Reset the raw_barcode, raw_seq, and barcode_len -- if
                # we don't match a barcode from the mapping file, we want
                # these values to be None
                raw_barcode, raw_seq, barcode_len = (None, None, None)
                curr_valid_map =\
                    [curr_bc.split(',')[0] for curr_bc in valid_map]
                # Iterate through the barcode length from longest to shortest
                for l in barcode_length_order:
                    # extract the current length barcode from the sequence
                    bc, seq = get_barcode(curr_seq, l)
                    # check if the sliced sequence corresponds to a valid
                    # barcode, and if so set raw_barcode, raw_seq, and
                    # barcode_len for use in the next steps
                    if bc in curr_valid_map:
                        raw_barcode, raw_seq = bc, seq
                        barcode_len = len(raw_barcode)
                        break
                # if we haven't found a valid barcode, log this sequence as
                # failing to match a barcode, and move on to the next sequence
                if not raw_barcode:
                    bc_counts['#FAILED'].append(curr_rid)
                    continue
            else:
                # Get the current barcode to look up the associated primer(s)
                raw_barcode, raw_seq = get_barcode(curr_seq, barcode_len)
            if not disable_primer_check:
                try:
                    current_primers = primer_seqs_lens[raw_barcode]
                    # In this case, all values will be the same, i.e. the length
                    # of the given primer, or degenerate variations thereof.
                    primer_len = current_primers.values()[0]
                    if primer_exceeds_mismatches(raw_seq[:primer_len],
                            current_primers, max_primer_mm):
                        bc_counts['#FAILED'].append(curr_rid)
                        primer_mismatch_count += 1
                        continue
                except KeyError:
                    # If the barcode read does not match any of those in the
                    # mapping file, the situation becomes more complicated. We do
                    # not know the length the sequence to slice out to compare to
                    # our primer sets, so, in ascending order of all the given
                    # primer lengths, a sequence will be sliced out and compared
                    # to the primer set.
                    current_primers = all_primers
                    found_match = False
                    for seq_slice_len in all_primers_lens:
                        if not(primer_exceeds_mismatches(raw_seq[:seq_slice_len],
                                current_primers, max_primer_mm)):
                            primer_len = seq_slice_len
                            found_match = True
                            break
                    if not found_match:
                        bc_counts['#FAILED'].append(curr_rid)
                        primer_mismatch_count += 1
                        continue
                except IndexError:
                    # Try to raise meaningful error if problem reading primers
                    raise IndexError, ('Error reading primer sequences. If '+\
                        'primers were purposefully not included in the mapping '+\
                        'file, disable usage with the -p option.')
            else:
                # Set primer length to zero if primers are disabled.
                primer_len = 0
            # split seqs
            cbc, cpr, cres = split_seq(curr_seq, barcode_len,
                primer_len)
            total_bc_primer_len = len(cbc) + len(cpr)
            # get current barcode
            try:
                bc_diffs, curr_bc, corrected_bc = \
                    check_barcode(cbc, barcode_type, valid_map.keys(),
                    attempt_bc_correction, added_demultiplex_field, curr_id)
                if bc_diffs > max_bc_errors:
                    raise ValueError, "Too many errors in barcode"
                corr_ct += bool(corrected_bc)
            except Exception, e:
                # Any barcode failure (unknown or uncorrectable) routes the
                # read to the unassigned (None) bucket.
                bc_counts[None].append(curr_rid)
                continue
            curr_samp_id = valid_map.get(curr_bc, 'Unassigned')
            new_id = "%s_%d" % (curr_samp_id, curr_ix)
            # check if writing out primer
            write_seq = cres
            if reverse_primers == "truncate_only":
                try:
                    rev_primer = rev_primers[curr_bc]
                    mm_tested = {}
                    for curr_rev_primer in rev_primer:
                        # Try to find lowest count of mismatches for all
                        # reverse primers
                        rev_primer_mm, rev_primer_index = \
                            local_align_primer_seq(curr_rev_primer, cres)
                        mm_tested[rev_primer_mm] = rev_primer_index
                    rev_primer_mm = min(mm_tested.keys())
                    rev_primer_index = mm_tested[rev_primer_mm]
                    if rev_primer_mm <= reverse_primer_mismatches:
                        write_seq = write_seq[0:rev_primer_index]
                        if qual_out:
                            curr_qual = curr_qual[0:barcode_len +
                                primer_len + rev_primer_index]
                    else:
                        reverse_primer_not_found += 1
                except KeyError:
                    # No reverse primer recorded for this barcode:
                    # leave the sequence untruncated.
                    pass
            elif reverse_primers == "truncate_remove":
                try:
                    rev_primer = rev_primers[curr_bc]
                    mm_tested = {}
                    for curr_rev_primer in rev_primer:
                        # Try to find lowest count of mismatches for all
                        # reverse primers
                        rev_primer_mm, rev_primer_index = \
                            local_align_primer_seq(curr_rev_primer, cres)
                        mm_tested[rev_primer_mm] = rev_primer_index
                    rev_primer_mm = min(mm_tested.keys())
                    rev_primer_index = mm_tested[rev_primer_mm]
                    if rev_primer_mm <= reverse_primer_mismatches:
                        write_seq = write_seq[0:rev_primer_index]
                        if qual_out:
                            curr_qual = curr_qual[0:barcode_len +
                                primer_len + rev_primer_index]
                    else:
                        # Unlike truncate_only, discard the read entirely.
                        reverse_primer_not_found += 1
                        write_seq = False
                except KeyError:
                    bc_counts['#FAILED'].append(curr_rid)
                    continue
            # Check for quality score windows, truncate or remove sequence
            # if poor window found. Previously tested whole sequence-now
            # testing the post barcode/primer removed sequence only.
            if qual_score_window:
                # NOTE(review): the window scan runs over the full
                # curr_qual (barcode+primer included) while the resulting
                # index truncates the barcode/primer-stripped write_seq —
                # confirm this offset is intended.
                passed_window_check, window_index =\
                    check_window_qual_scores(curr_qual, qual_score_window,
                        min_qual_score)
                # Throw out entire sequence if discard option True
                if discard_bad_windows and not passed_window_check:
                    sliding_window_failed += 1
                    write_seq = False
                # Otherwise truncate to index of bad window
                elif not discard_bad_windows and not passed_window_check:
                    sliding_window_failed += 1
                    write_seq = write_seq[0:window_index]
                    if qual_out:
                        curr_qual = curr_qual[0:barcode_len +
                            primer_len + window_index]
                    # Check for sequences that are too short after truncation
                    if len(write_seq) + total_bc_primer_len < min_seq_len:
                        write_seq = False
                        below_seq_min_after_trunc += 1
            if truncate_ambi_bases and write_seq:
                write_seq_ambi_ix = True
                # Skip if no "N" characters detected.
                try:
                    ambi_ix = write_seq.index("N")
                    write_seq = write_seq[0:ambi_ix]
                except ValueError:
                    write_seq_ambi_ix = False
                    pass
                if write_seq_ambi_ix:
                    # Discard if too short after truncation
                    if len(write_seq) + total_bc_primer_len < min_seq_len:
                        write_seq = False
                        below_seq_min_after_ambi_trunc += 1
                    else:
                        trunc_ambi_base_counts += 1
                        if qual_out:
                            curr_qual = curr_qual[0:barcode_len +
                                primer_len + ambi_ix]
            # Slice out regions of quality scores that correspond to the
            # written sequence, i.e., remove the barcodes/primers and reverse
            # primers if option is enabled.
            if qual_out:
                qual_barcode, qual_primer, qual_scores_out = \
                    split_seq(curr_qual, barcode_len, primer_len)
                # Convert to strings instead of numpy arrays, strip off brackets
                qual_barcode = format_qual_output(qual_barcode)
                qual_primer = format_qual_output(qual_primer)
                qual_scores_out = format_qual_output(qual_scores_out)
            if not write_seq:
                bc_counts['#FAILED'].append(curr_rid)
                continue
            if keep_primer:
                write_seq = cpr + write_seq
                if qual_out:
                    qual_scores_out = qual_primer + qual_scores_out
            if keep_barcode:
                write_seq = cbc + write_seq
                if qual_out:
                    qual_scores_out = qual_barcode + qual_scores_out
            # Record number of seqs associated with particular barcode.
            bc_counts[curr_bc].append(curr_rid)
            if retain_unassigned_reads and curr_samp_id == "Unassigned":
                fasta_out.write(
                    ">%s %s orig_bc=%s new_bc=%s bc_diffs=%s\n%s\n" %
                    (new_id, curr_rid, cbc, curr_bc, int(bc_diffs), write_seq))
                if qual_out:
                    qual_out.write(
                        ">%s %s orig_bc=%s new_bc=%s bc_diffs=%s\n%s" %
                        (new_id, curr_rid, cbc, curr_bc, int(bc_diffs),
                         qual_scores_out))
            elif not retain_unassigned_reads and curr_samp_id == "Unassigned":
                bc_counts['#FAILED'].append(curr_rid)
            else:
                fasta_out.write(
                    ">%s %s orig_bc=%s new_bc=%s bc_diffs=%s\n%s\n" %
                    (new_id, curr_rid, cbc, curr_bc, int(bc_diffs), write_seq))
                if qual_out:
                    qual_out.write(
                        ">%s %s orig_bc=%s new_bc=%s bc_diffs=%s\n%s" %
                        (new_id, curr_rid, cbc, curr_bc, int(bc_diffs),
                         qual_scores_out))
            curr_len = len(write_seq)
            #seq_lengths[curr_rid] = curr_len
            curr_ix += 1
            # Record the raw and written seq length of everything passing
            # filters
            raw_seq_lengths[curr_rid] = len(curr_seq)
            final_seq_lengths[curr_id] = curr_len
    if median_length_filtering:
        # Read original fasta file output to get sequence lengths
        fasta_out.close()
        fasta_out = open(fasta_out.name, "U")
        # Record sequence lengths for median/mad calculation
        sequence_lens = []
        for label, seq in MinimalFastaParser(fasta_out):
            sequence_lens.append(len(seq))
        '''# Create a temporary file to copy the contents of the fasta file, will
        # need to delete once operations complete.
        fasta_temp = open(fasta_out.name + "_tmp.fasta", "w")
        sequence_lens = []
        for label, seq in MinimalFastaParser(fasta_lens):
            sequence_lens.append(len(seq))
            fasta_temp.write(">%s\n%s\n" % (label, seq))
        fasta_temp.close()
        fasta_temp = open(fasta_out.name + "_tmp.fasta", "U")
        fasta_lens.close()
        # Overwrite seqs.fna with length filtered data
        fasta_out = open(fasta_out.name, "w")'''
        med_abs_dev, med_length = median_absolute_deviation(sequence_lens)
        # Acceptable length window: median +/- MAD * user-supplied factor.
        min_corrected_len = med_length - med_abs_dev *\
            float(median_length_filtering)
        max_corrected_len = med_length + med_abs_dev *\
            float(median_length_filtering)
        seqs_discarded_median = 0
        fasta_out.seek(0)
        final_written_lens = []
        # Create final seqs.fna
        final_fasta_out = open(fasta_out.name.replace('.tmp',''), "w")
        for label,seq in MinimalFastaParser(fasta_out):
            curr_len = len(seq)
            if curr_len < min_corrected_len or curr_len > max_corrected_len:
                seqs_discarded_median += 1
            else:
                final_fasta_out.write(">%s\n%s\n" % (label,seq))
                final_written_lens.append(len(seq))
        final_fasta_out.close()
        fasta_out.close()
        remove_files([fasta_out.name])
    else:
        min_corrected_len = 0
        max_corrected_len = 0
        seqs_discarded_median = 0
        # NOTE(review): 0 here vs a list in the other branch — downstream
        # formatting presumably handles both; confirm.
        final_written_lens = 0
        # Copy tmp seqs file to final seqs.fna file
        fasta_out.close()
        fasta_out = open(fasta_out.name, "U")
        # Create final seqs.fna
        final_fasta_out = open(fasta_out.name.replace('.tmp',''), "w")
        for label,seq in MinimalFastaParser(fasta_out):
            final_fasta_out.write(">%s\n%s\n" % (label,seq))
        final_fasta_out.close()
        fasta_out.close()
        remove_files([fasta_out.name])
    median_results = (median_length_filtering, min_corrected_len,
        max_corrected_len, seqs_discarded_median, final_written_lens)
    raw_seq_lengths = raw_seq_lengths.values()
    final_seq_lengths = final_seq_lengths.values()
    log_out = format_log(bc_counts, corr_ct, valid_map, seq_lengths, filters,
        retain_unassigned_reads, attempt_bc_correction, primer_mismatch_count,
        max_primer_mm, reverse_primers, reverse_primer_not_found,
        sliding_window_failed, below_seq_min_after_trunc, qual_score_window,
        discard_bad_windows, min_seq_len, raw_seq_lengths,
        final_seq_lengths, median_results, truncate_ambi_bases,
        below_seq_min_after_ambi_trunc, )
    #all_seq_lengths, good_seq_lengths = get_seq_lengths(seq_lengths, bc_counts)
    return log_out, seq_lengths.values(), raw_seq_lengths, final_seq_lengths
def format_qual_output(qual_array):
    """ Converts quality scores to text, 60 space-separated scores per line.

    Accepts any sequence of scores (list or numpy array) and returns a
    string where each full or partial group of 60 scores ends in a newline.
    """
    # Size of lines needed for proper quality score file format
    qual_line_size = 60
    # Build with join instead of quadratic string +=, and avoid shadowing
    # the builtin 'slice'; dead commented-out code removed.
    lines = []
    for start in range(0, len(qual_array), qual_line_size):
        segment = qual_array[start:start + qual_line_size]
        lines.append(" ".join(str(score) for score in segment) + "\n")
    return "".join(lines)
def format_log(bc_counts, corr_ct, valid_map, seq_lengths, filters,\
retain_unassigned_reads, attempt_bc_correction,
primer_mismatch_count, max_primer_mm,\
reverse_primers, reverse_primer_not_found, sliding_window_failed,
below_seq_min_after_trunc, qual_score_window,
discard_bad_windows, min_seq_len,
raw_seq_lengths, final_seq_lengths, median_results=(None),
truncate_ambi_bases=False, below_seq_min_after_ambi_trunc=0,
):
"""Makes log lines"""
log_out = []
all_seq_lengths, good_seq_lengths = get_seq_lengths(seq_lengths, bc_counts)
log_out.append("Number raw input seqs\t%d\n" % len(seq_lengths))
# append log data for median absolute deviation sequence length filtering
# if was performed.