This repository has been archived by the owner on Nov 9, 2023. It is now read-only.
/
add_alpha_to_mapping_file.py
executable file
·129 lines (103 loc) · 5.19 KB
/
add_alpha_to_mapping_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python
# File created on 02 Nov 2012
from __future__ import division
__author__ = "Yoshiki Vazquez-Baeza"
__copyright__ = "Copyright 2011, The QIIME project"
__credits__ = ["Yoshiki Vazquez-Baeza", "Antonio Gonzalez-Pena"]
__license__ = "GPL"
__version__ = "1.6.0"
__maintainer__ = "Yoshiki Vazquez-Baeza"
__email__ = "yoshiki89@gmail.com"
__status__ = "Release"
from copy import deepcopy
from numpy import searchsorted
from qiime.stats import quantile
def add_alpha_diversity_values_to_mapping_file(metrics, alpha_sample_ids,
alpha_data, mapping_file_headers,
mapping_file_data, bins, method='equal',
missing_value_name='N/A'):
"""add 3 columns in the mapping file representing the alpha diversity data
Inputs:
metrics: list of alpha diversity metrics
alpha_sample_ids: list of sample identifiers in the alpha diversity data
alpha_data: alpha diversity data, as many columns as metrics and as many
rows as elments in alpha_sample_ids
mapping_file_headers: list of headers for the metadata mapping file
mapping_file_data: metadata mapping file data
bins: bins to classify the alpha diversity data
method: binning method selection, the options are 'equal' and 'quantile'.
'equal', will get you equally spaced limits and 'quantile' will assign the
limits using quantiles, using the selected number of bins.
missing_value_name: string to place for the sample ids in the mapping file
but not in the alpha diversity data
Output:
mapping_file_headers: modified headers mapping file headers, including three
new columns per metric i. e. for chao1 the new fields would be:
'chao1_alpha', 'chao1_alpha_norm' and 'chao1_alpha_bin'
mapping_file_data: extended data including the alpha diversity values,
normalized values and bins
"""
norm = lambda x, x_min, x_max: (x-x_min)/(x_max-x_min)
# data will be modified and returned so get your own copy
new_mapping_file_data = deepcopy(mapping_file_data)
new_mapping_file_headers = deepcopy(mapping_file_headers)
# regular levels assigned based on equally spaced bins
overall_probs = [i/bins for i in range(1, bins)]
# if we are using the average method the levels are equal to the probs list
if method == 'equal':
levels = overall_probs
for index, metric in enumerate(metrics):
# get the alpha diversity value for the metric being evaluated
data = [[row[index]] for row in alpha_data]
metric_max = max(data)[0]
metric_min = min(data)[0]
# add headers for each metric
new_mapping_file_headers.append('{0}_alpha'.format(metric))
new_mapping_file_headers.append('{0}_normalized_alpha'.format(metric))
new_mapping_file_headers.append('{0}_alpha_label'.format(metric))
# when using the quantile method the levels change depending on the
# metric being used; hence the calculation and normalization of the data
if method == 'quantile':
levels = quantile([norm(element[0], metric_min, metric_max)
for element in data], overall_probs)
# get the normalized value of diversity and the tag for each value
for value in data:
norm_value = norm(value[0], metric_min, metric_max)
value.append(norm_value)
value.append(_get_level(norm_value, levels, 'bin'))
# iterate using the mapping file instead of using the alpha diversity
# data because more often that you will like you will have more samples
# in the mapping file than in your downstream analysis data
for row in new_mapping_file_data:
try:
data_index = alpha_sample_ids.index(row[0])
# data fields should be strings
row.extend(map(str, data[data_index]))
except ValueError:
row.extend([missing_value_name, missing_value_name,
missing_value_name])
return new_mapping_file_data, new_mapping_file_headers
def _get_level(value, levels, prefix=None):
"""accommodate a value into the 'levels' list; return a string or an integer
Input:
value: normalized value to assign a level to, must be between 0 and 1
levels: sorted edges to check at which level does 'value' fall into place
Output:
output: (str) if a prefix is provided a string of the form: prefix_2_of_5 is
returned, where 2 is the level assigned to the passed value and 5 is the
number of levels specified. (int) If a prefix is not provided the level of
the value is returned as an integer
"""
assert value <= 1 and value >= 0, "The value must be between 0 and 1"
check = [i for i in range(0, len(levels)) if levels[i] == value]
# apply a special rule for the values that are equal to an edge
if len(check):
value_level = check[0] + 2
# if it is not a special case just use searchsorted
else:
value_level = searchsorted(levels, value)+1
if prefix != None:
output = '{0}_{1}_of_{2}'.format(prefix, value_level, len(levels)+1)
else:
output = value_level
return output