This repository has been archived by the owner on Nov 9, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 269
/
filter_otus_by_sample.py
136 lines (110 loc) · 4.39 KB
/
filter_otus_by_sample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python
# file filter_otus_by_sample.py
__author__ = "Jesse Stombaugh"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Jesse Stombaugh"] # remember to add yourself
__license__ = "GPL"
__version__ = "1.9.0"
__maintainer__ = "Jesse Stombaugh"
__email__ = "jesse.stombaugh@colorado.edu"
from string import strip
import re
from skbio.alignment import SequenceCollection
from skbio.sequence import DNA
def filter_otus(otus, prefs):
"""filters the otus file based on which samples should be removed and
determines which sequences to remove"""
new_otus_list = []
# create an list containing the seqs to remove
for i in otus:
new_otus = []
for j in (otus[i]):
sample_seq = j.split('_')
if len(sample_seq) > 1:
new_name = ''.join(sample_seq[:-1])
else:
new_name = sample_seq[0]
is_sample = False
for sample_id in prefs:
if prefs[sample_id] == new_name:
is_sample = True
if is_sample:
pass
else:
new_otus.append('%s' % (j))
if new_otus:
new_otus_list.append((i, new_otus))
return new_otus_list
def filter_aln_by_otus(aln, prefs):
"""filters the representative set of seqs based on which samples should
be removed"""
filtered_seqs = []
removed_seqs = []
for j in range(aln.sequence_count()):
remove = False
aln_name = aln[j].id
stripped_aln_name = aln_name.split(' ')[0].split('_')
if len(stripped_aln_name) > 1:
new_aln_name = ''.join(stripped_aln_name[:-1])
else:
new_aln_name = stripped_aln_name[0]
for sample_id in prefs:
if prefs[sample_id] == new_aln_name:
remove = True
if remove:
removed_seqs.append((aln_name, str(aln[aln_name])))
else:
filtered_seqs.append((aln_name, str(aln[aln_name])))
return filtered_seqs, removed_seqs
def process_extract_samples(samples_to_extract):
"""Parses the samples to extract option from the command line"""
prefs = {}
if samples_to_extract:
samples = samples_to_extract.strip().strip("'").split(',')
for j, col in enumerate(samples):
key = str(j)
prefs[key] = {}
prefs[key] = col
return prefs
def filter_samples(prefs, data, dir_path='', filename=None):
"""processes the filtering of the otus file and representative seq set, then
writes filtered otus and filtered representative seq set files"""
aln = data['aln']
otus = data['otus']
# filter the otus file based on which samples to remove
new_otus_list = filter_otus(otus, prefs)
filtered_otus_output_filepath = '%s/%s_sfiltered_otus.txt' \
% (dir_path, filename)
filtered_otus_output_filepath = open(filtered_otus_output_filepath, 'w')
# Write out a new otus file
for key in (new_otus_list):
filtered_otus_output_filepath.write(key[0])
for j in key[1]:
filtered_otus_output_filepath.write('\t' + str(j))
filtered_otus_output_filepath.write('\n')
filtered_otus_output_filepath.close()
# filter seq set
filtered_seqs, removed_seqs = filter_aln_by_otus(aln, prefs)
# write a fasta containing list of sequences removed from
# representative set
if len(removed_seqs) > 0:
removed_seqs = SequenceCollection.from_fasta_records(
[(e[0], str(e[1])) for e in removed_seqs], DNA)
else:
raise ValueError(
'No sequences were removed. Did you specify the correct Sample ID?')
output_filepath2 = '%s/%s_sremoved.fasta' % (dir_path, filename)
output_file2 = open(output_filepath2, 'w')
output_file2.write(removed_seqs.to_fasta())
output_file2.close()
# write a fasta containing the filtered representative seqs
if len(filtered_seqs) > 0:
filtered_seqs = SequenceCollection.from_fasta_records(
[(e[0], str(e[1])) for e in filtered_seqs], DNA)
else:
raise ValueError(
'No sequences were remaining in the fasta file. Did you remove all Sample ID\'s?')
output_filepath = '%s/%s_sfiltered.fasta' % (dir_path, filename)
output_file = open(output_filepath, 'w')
output_file.write(filtered_seqs.to_fasta())
output_file.close()