This repository has been archived by the owner on Nov 9, 2023. It is now read-only.
/
relatedness_library.py
161 lines (142 loc) · 6.92 KB
/
relatedness_library.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env python
# File created on 02 May 2012
from __future__ import division
__author__ = "William Van Treuren"
__copyright__ = "Copyright 2011, The QIIME project"
__credits__ = ["William Van Treuren"]
__license__ = "GPL"
__version__ = "1.6.0"
__maintainer__ = "William Van Treuren"
__email__ = "wdwvt1@gmail.com"
__status__ = "Release"
from numpy.random import shuffle
from numpy import std, mean, array, allclose
from copy import deepcopy
"""NOTE: calculates NRI/NTI according to the formula used by Phylocom 4.2,3.41
rather than Webb 2002 or Webb 2000. Uses a 'null model 2' -- chooses the
random distance matrices without replacment from the full available pool of
distance matrices. See Phylocom manual for details."""
# NRI
def nri(dist_mat, all_ids, group_ids, iters=1000):
"""calculates nri (net relatedness index) of an otu or sample grouping
inputs:
datamtx - 2d array that has distance between otus or samples
all_ids - list of strs/floats/ints that corresponds to the rows or cols
of the datamtx
group_ids - list of strs/floats/ints that corresponds to the rows or
cols of the datamtx that you want to group and compare against the
entire matrix.
iters - int, number of random draws used to calculate the distance to
compare the group_ids distance against.
NOTE: calculates NRI according to the formula used by Phylocom 4.2,3.41
rather than Webb 2002 or Webb 2000. Uses a 'null model 2' -- chooses the
random distance matrices without replacment from the full available pool of
distance matrices. See Phylocom manual for details.
"""
if len(group_ids) < 2:
raise ValueError('there is no standard deviation of distances with ' +\
'less than 2 taxa in taxa ids')
if dist_mat.sum() == 0.0:
raise ValueError('the input dist_mat has no data. its sum is 0.')
mn_x_obs = \
mpd(take_distmat_data(dist_mat, all_ids, group_ids))
mn_x_n, sd_x_n = \
mpd_mean_sd(dist_mat, all_ids, len(group_ids), iters)
if allclose(sd_x_n,0.0) == True:
raise ValueError("The std of the random samples was zero within 10^-8 of"+\
" 0.0. This is likely due to a phylogeny which has equal distances"+\
" between all tips.")
return -1.0*((mn_x_obs-mn_x_n)/sd_x_n)
def take_random_ids(all_ids, num_to_take):
"""takes num_to_take ids from shuffled list of all_ids"""
if len(all_ids) < num_to_take:
raise ValueError('trying to take too many ids from all ids')
l = deepcopy(all_ids)
shuffle(l)
return l[:num_to_take]
def mpd(datamtx):
"""calculates mpd (mean phylogenetic distance)"""
dists = []
rows,cols = datamtx.shape
for r in range(rows):
for c in range(r+1,rows): # avoid d(i,i). d(i,i)=0 if datamtx calculated with distance metric
dists.append(datamtx[r][c])
m = mean(dists)
return float(m)
def mpd_mean_sd(dist_mat, all_ids, num_to_take, iters):
"""calculates mean and std for random dist mats based on iter randomizations
"""
means = []
for i in range(iters):
ids_to_take = take_random_ids(all_ids, num_to_take)
r_dist_mat = take_distmat_data(dist_mat, all_ids, ids_to_take)
i_mean = mpd(r_dist_mat)
means.append(i_mean)
# calculate the standard deviation of the means of the distances rather than
# the standard deviation of the distances themselves. The forumula from
# Webb 2002 seems to calculate the standard deviation of the distances
# but based on tests of Phylocom and the Phylocom manual, Phylocom
# computes the standard deviations of the means of the iters number of
# random distance matrices.
return float(mean(means)), float(std(means))
def take_distmat_data(dist_mat, all_ids, ids_to_keep):
"""takes data from the dist_mat based on index of ids_to_keep in all_ids"""
# ids could be given in any order, must sort to prevent choosing
# wrong values at later steps. all_ids must be given in order of the
# dist_mat array they correspond to
indices = sorted([all_ids.index(i) for i in ids_to_keep])
new_dist_mat = dist_mat.take(indices,axis=0) #remove rows
new_dist_mat = new_dist_mat.take(indices,axis=1) #remove cols
return new_dist_mat
# NTI
def nti(datamtx, all_ids, group_ids, iters=1000):
"""calculates nti (nearest taxon index) of an otu or sample grouping
inputs:
datamtx - 2d array that has distance between otus or samples
all_ids - list of strs/floats/ints that corresponds to the rows or cols
of the datamtx
group_ids - list of strs/floats/ints that corresponds to the rows or
cols of the datamtx that you want to group and compare against the
entire matrix.
iters - int, number of random draws used to calculate the distance to
compare the group_ids distance against.
NOTE: calculates NTI according to the formula used by Phylocom 4.2,3.41
rather than Webb 2002 or Webb 2000. Uses a 'null model 2' -- chooses the
random distance matrices without replacment from the full available pool of
distance matrices. See Phylocom manual for details.
"""
if len(group_ids) < 2:
raise ValueError('there is no standard deviation of distances with ' +\
'less than 2 taxa in the taxa ids')
if datamtx.sum() == 0.0:
raise ValueError('the input dist_mat has no data. its sum is 0.')
mn_y_obs = \
mntd(take_distmat_data(datamtx, all_ids, group_ids))
mn_y_n, sd_y_n = \
mntd_mean_sd(datamtx, all_ids, len(group_ids), iters)
# for debugging, to check against phylocom
# print mn_y_obs, mn_y_n, sd_y_n
# WARNING: the std deviation is being rounded
if allclose(sd_y_n,0.0) == True:
raise ValueError("The std of the random samples was zero within 10^-8 of"+\
" 0.0. This is likely due to a phylogeny which has equal distances"+\
" between all tips.")
return -1.*((mn_y_obs-mn_y_n)/sd_y_n)
def mntd(datamtx):
"""calculates mntd (mena nearest taxon distance) for a datamtx"""
# the MNTD (mean nearest taxon distance) requires choosing only the min
# value in each row, excluding d(i,i) vals which are dists to self (0.0)
m = 0.0
for i,row in enumerate(datamtx):
m += min(row.tolist()[:i]+row.tolist()[i+1:]) # exclude d(i,i). d(i,i)=0 bc datamatx calculated with metric
return m/float(i+1) #i+1 dists
def mntd_mean_sd(dist_mat, all_ids, num_to_take, iters):
"""calculates mean and std for random dist mats based on iter randomizations
num_to_take is the number of random rows/cols to take for a randomization"""
means = []
for i in range(iters):
ids_to_take = take_random_ids(all_ids, num_to_take)
r_dist_mat = take_distmat_data(dist_mat, all_ids, ids_to_take)
i_mean = mntd(r_dist_mat)
means.append(i_mean)
return float(mean(means)), float(std(means))