/
sort.py
95 lines (76 loc) · 3.39 KB
/
sort.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, emperor development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE.md, distributed with this software.
# ----------------------------------------------------------------------------
from __future__ import division
from numpy import zeros
import re
def sort_taxa_table_by_pcoa_coords(coords_header, otu_table, otu_header):
    """Sort and match the samples in the otu table and in the coordinates data

    This function selects and reorders the columns of an otu table so that
    they follow the order of the sample ids in ``coords_header``. Sample ids
    that are only present in one of the two inputs are dropped.

    Parameters
    ----------
    coords_header: list of str
        sample ids that are present in principal coordinates data
    otu_table: numpy array
        numpy array with the data for an otu table, each column holds the data
        for the sample id at the same position in ``otu_header``
    otu_header: list of str
        sample ids present in the otu table

    Returns
    -------
    sorted_otu_headers: list of str
        sample ids that were present in both the coords_header and the
        otu_header lists, the order in this list matches the order of the
        coordinates data
    sorted_otu_table: numpy array
        otu table data with one column per sample id in the
        sorted_otu_headers list, in that same order. The array is created
        with ``numpy.zeros`` and therefore has that function's default dtype.
    """
    # map every sample id to the first column where it shows up in the otu
    # table (same column that list.index would return), this avoids a linear
    # search per sample id
    column_of = {}
    for column, sample_id in enumerate(otu_header):
        column_of.setdefault(sample_id, column)

    # the order of the ids is important hence iterate through the original
    # list and keep only the ids that are also present in the otu table
    sorted_otu_headers = [sample_id for sample_id in coords_header
                          if sample_id in column_of]

    # the size of the otu table can be pre-allocated for better memory usage
    sorted_otu_table = zeros([otu_table.shape[0], len(sorted_otu_headers)])

    # the destination column is the position among the *matching* ids, not
    # the position in coords_header; using the latter writes out of bounds as
    # soon as a coords sample id is missing from the otu table
    for destination, sample_id in enumerate(sorted_otu_headers):
        sorted_otu_table[:, destination] = otu_table[:, column_of[sample_id]]

    return sorted_otu_headers, sorted_otu_table
def sort_comparison_filenames(coord_fps):
    """Pass in a list of file names and sort them using the suffix

    Parameters
    ----------
    coord_fps: list of str
        The filenames with the format something_something_qX.txt where X is
        the index of the file.

    Returns
    -------
    list of str
        A sorted version of the list that was passed in where the strings are
        sorted according to the suffix they have, if the string doesn't have a
        suffix it will be added to the beginning of the list.
    """
    # any set of word characters followed by '_q', a number, a literal dot &
    # a txt extension at the end of the line. Take for example
    # bray_curtis_q1.txt or unifrac_q11.txt. The dot is escaped so that only
    # a real '.txt' extension is accepted. Compiled once here rather than on
    # every call to the key function.
    suffix_regex = re.compile(r'(\w+)_q([0-9]+)\.txt$')

    def _get_suffix(fp):
        """Gets the number in the suffix for a string using a regex"""
        tmatch = suffix_regex.search(fp)

        # if the regex doesn't match then put it at the beginning
        if tmatch is None:
            return -1.0

        return float(tmatch.group(2))

    # the key function retrieves the suffix number for the function to sort
    # according to its floating point representation i. e. the cast to float;
    # sorted already returns an empty list for an empty input
    return sorted(coord_fps, key=_get_suffix)