This repository has been archived by the owner on Nov 9, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 269
/
hierarchical_cluster.py
80 lines (63 loc) · 2.54 KB
/
hierarchical_cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python
__author__ = "Justin Kuczynski"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Justin Kuczynski"]
__license__ = "GPL"
__version__ = "1.9.0"
__maintainer__ = "Justin Kuczynski"
__email__ = "justinak@gmail.com"
"""runs upgma or nj on a distance matrix file, writes the resulting tree
with distances to specified output file
"""
import warnings
warnings.filterwarnings('ignore', 'Not using MPI as mpi4py not found')
from optparse import OptionParser
from qiime.parse import parse_distmat
from scipy.cluster.hierarchy import linkage
from skbio.tree import TreeNode
from skbio.stats.distance import DistanceMatrix
from skbio.tree import nj
import os.path
def multiple_file_upgma(input_dir, output_dir):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_names = os.listdir(input_dir)
file_names = [fname for fname in file_names if not fname.startswith('.')]
for fname in file_names:
base_fname, ext = os.path.splitext(fname)
single_file_upgma(os.path.join(input_dir, fname),
os.path.join(output_dir, 'upgma_' + base_fname + '.tre'))
def single_file_upgma(input_file, output_file):
# read in dist matrix
dist_mat = DistanceMatrix.read(input_file)
# SciPy uses average as UPGMA:
# http://docs.scipy.org/doc/scipy/reference/generated/
# scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
linkage_matrix = linkage(dist_mat.condensed_form(), method='average')
tree = TreeNode.from_linkage_matrix(linkage_matrix, dist_mat.ids)
# write output
f = open(output_file, 'w')
try:
f.write(tree.to_newick(with_distances=True))
except AttributeError:
if c is None:
raise RuntimeError("""input file %s did not make a UPGMA tree.
Ensure it has more than one sample present""" % (str(input_file),))
raise
f.close()
def single_file_nj(input_file, output_file):
dm = DistanceMatrix.read(input_file)
tree = nj(dm)
# write output
f = open(output_file, 'w')
f.write(tree.to_newick(with_distances=True))
f.close()
def multiple_file_nj(input_dir, output_dir):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
file_names = os.listdir(input_dir)
file_names = [fname for fname in file_names if not fname.startswith('.')]
for fname in file_names:
base_fname, ext = os.path.splitext(fname)
single_file_nj(os.path.join(input_dir, fname),
os.path.join(output_dir, 'nj_' + base_fname + '.tre'))