This repository has been archived by the owner on Nov 9, 2023. It is now read-only.
/
denoise_postprocess.py
96 lines (74 loc) · 3.44 KB
/
denoise_postprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python
__author__ = "Jens Reeder"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Jens Reeder", "Rob Knight"]#remember to add yourself if you make changes
__license__ = "GPL"
__version__ = "1.5.0"
__maintainer__ = "Jens Reeder"
__email__ = "jens.reeder@gmail.com"
__status__ = "Release"
from optparse import OptionParser
from itertools import imap
from re import compile, search
from cogent import Sequence
from cogent.parse.fasta import MinimalFastaParser
from qiime.denoiser.utils import read_denoiser_mapping, sort_ids
def extract_read_to_sample_mapping(labels):
    """Extract a mapping from reads to sample_ids from split_libraries.

    labels: iterable of fasta header strings (without the leading '>')

    A fasta header from split_libraries looks like this
    >S160_1 E86FECS01DW5V4 orig_bc=CAGTACGATCTT new_bc=CAGTACGATCTT bc_diffs=0

    Returns a dict mapping read/flowgram id (second field) to
    sample_id (first field).

    Raises ValueError if a label does not contain two
    whitespace-separated fields.
    """
    sample_id_mapping = {}
    # compile once outside the loop; do not name the pattern 're',
    # which shadows the module name and confuses readers
    label_pattern = compile(r'(\S+) (\S+)')
    for label in labels:
        match = label_pattern.search(label)
        if match is None:
            # fail loudly with context instead of an opaque
            # AttributeError on a None match object
            raise ValueError(
                "Unparsable label in split_libraries FASTA file: %r" % label)
        sample_id = match.group(1)
        flowgram_id = match.group(2)
        sample_id_mapping[flowgram_id] = sample_id
    return sample_id_mapping
def post_process(fasta_fp, mapping_fp, denoised_seqs_fp,
                 otu_picker_otu_map_fp, out_dir):
    """Redirect function for compatibility with Qiime.

    fasta_fp: path to the split_libraries FASTA file
    mapping_fp: path to the denoiser cluster mapping
    denoised_seqs_fp: path to the denoised sequences FASTA file
    otu_picker_otu_map_fp: path to the OTU picker cluster map
    out_dir: output directory passed through to combine_mappings

    Opens the four input files, delegates to combine_mappings, and
    closes the handles afterwards instead of leaking them to the GC.
    """
    fasta_fh = open(fasta_fp)
    mapping_fh = open(mapping_fp)
    denoised_seqs_fh = open(denoised_seqs_fp)
    otu_picker_otu_map_fh = open(otu_picker_otu_map_fp)
    try:
        combine_mappings(fasta_fh, mapping_fh, denoised_seqs_fh,
                         otu_picker_otu_map_fh, out_dir)
    finally:
        for fh in (fasta_fh, mapping_fh, denoised_seqs_fh,
                   otu_picker_otu_map_fh):
            fh.close()
def combine_mappings(fasta_fh, mapping_fh, denoised_seqs_fh, otu_picker_otu_map_fh, out_dir):
"""Combine denoiser and OTU picker mapping file, replace flowgram IDs.
fasta_fh: a fasta file with labels as produced by Qiime's split_libraries.py
used to replace flowgram id with the unique se_sample_id
mapping_fh: The cluster mapping from the denoiser.py
denoised_seqs_fh: the Fasta output files from denoiser.py
otu_picker_map_fh: cluster map from otu picker on denoised_seqs_fh
out_dir: output directory
"""
#read in mapping from split_library file
labels = imap(lambda (a,b): a, MinimalFastaParser(fasta_fh))
#mapping from seq_id to sample_id
sample_id_mapping = extract_read_to_sample_mapping(labels)
denoiser_mapping = read_denoiser_mapping(mapping_fh)
#read in cd_hit otu map
# and write out combined otu_picker+denoiser map
otu_fh = open(out_dir+"/denoised_otu_map.txt","w")
for otu_line in otu_picker_otu_map_fh:
otu_split = otu_line.split()
otu = otu_split[0]
ids = otu_split[1:]
get_sample_id = sample_id_mapping.get
#concat lists
#make sure the biggest one is first for pick_repr
all_ids = sort_ids(ids, denoiser_mapping)
all_ids.extend(sum([denoiser_mapping[id] for id in ids], []))
try:
otu_fh.write("%s\t" % otu +
"\t".join(map(get_sample_id, all_ids))+"\n")
except TypeError:
#get returns Null if denoiser_mapping id not present in sample_id_mapping
print "Found id in denoiser output, which was not found in split_libraries "+\
"output FASTA file. Wrong file?"
exit()
fasta_out_fh = open(out_dir+"/denoised_all.fasta","w")
for label, seq in MinimalFastaParser(denoised_seqs_fh):
id = label.split()[0]
newlabel = "%s %s" %(sample_id_mapping[id], id)
fasta_out_fh.write(Sequence(name= newlabel, seq=seq).toFasta()+"\n")