Skip to content

Commit

Permalink
Merge a7b9094 into 1f3bb5a
Browse files Browse the repository at this point in the history
  • Loading branch information
jairideout committed Feb 11, 2015
2 parents 1f3bb5a + a7b9094 commit 6e46452
Show file tree
Hide file tree
Showing 3 changed files with 321 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

### Features
* Modified ``skbio.stats.distance.pwmantel`` to accept a list of filepaths. This is useful as it allows for a smaller amount of memory consumption as it only loads two matrices at a time as opposed to requiring that all distance matrices are loaded into memory.
* Added ``heatmap`` method to ``skbio.alignment.Alignment`` for creating a heatmap from an alignment ([#765](https://github.com/biocore/scikit-bio/issues/765)).

### Bug fixes
* Fixed floating point precision bugs in ``Alignment.position_frequencies``, ``Alignment.position_entropies``, ``Alignment.omit_gap_positions``, ``Alignment.omit_gap_sequences``, ``BiologicalSequence.k_word_frequencies``, and ``SequenceCollection.k_word_frequencies`` ([#801](https://github.com/biocore/scikit-bio/issues/801)).
Expand Down
191 changes: 191 additions & 0 deletions skbio/alignment/_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@

import numpy as np
from scipy.stats import entropy
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt

from skbio._base import SkbioObject
from skbio.stats.distance import DistanceMatrix
Expand Down Expand Up @@ -1662,6 +1665,194 @@ def to_phylip(self, map_labels=False, label_prefix=""):

return '\n'.join(result), new_id_to_old_id

def heatmap(self, value_map,
legend_labels=('Minimum', 'Median', 'Maximum'), fig_size=None,
cmap=None, sequence_order=None):
"""Plot alignment as a heatmap.
Plot the alignment as a heatmap, coloring heatmap cells using the
character mapping in `value_map`. The x-axis is labeled by majority
consensus, and the y-axis is labeled by sequence IDs (first, last, and
every third sequence ID are displayed).
Parameters
----------
value_map : dict, collections.defaultdict
Dictionary mapping characters in the alignment to numeric values.
``KeyErrors`` are not caught, so all possible values should be in
`value_map`, or it should be a ``collections.defaultdict`` which
can, for example, default to ``nan``.
legend_labels : iterable, optional
Labels for the minimum, median, and maximum values in the legend.
Must be an iterable of exactly three strings. Mininum, median, and
maximum values are computed from the values in `value_map`. Only
values that are actually used to perform a character-to-value
mapping are included when computing the minimum, median, and
maximum. ``nan`` values are ignored.
fig_size : tuple, optional
Size of the figure in inches. If ``None``, defaults to figure size
specified in the user's matplotlibrc file.
cmap : matplotlib colormap, optional
See here for choices:
http://matplotlib.org/examples/color/colormaps_reference.html
If ``None``, defaults to the colormap specified in the user's
matplotlibrc file.
sequence_order : iterable, optional
The order, from top-to-bottom, that the sequences should be
plotted in. Must be an iterable containing all sequence IDs in the
alignment. If ``None``, sequences are plotted as they are ordered
in the alignment.
Returns
-------
matplotlib.figure.Figure
Figure containing the heatmap and legend of the plotted alignment.
Raises
------
KeyError
If a sequence character in the alignment is not in `value_map` and
`value_map` is not a ``collections.defaultdict``.
ValueError
If `sequence_order` isn't the same set of sequence IDs in the
alignment, or if `sequence_order` contains duplicate sequence IDs.
See Also
--------
majority_consensus
References
----------
.. [1] http://www.genome.jp/aaindex/
.. [2] http://www.genome.jp/dbget-bin/www_bget?aaindex:ARGP820101
Examples
--------
.. plot::
Let's visualize an alignment of protein sequences as a heatmap, with
amino acids colored by their hydrophobicity.
Load the AAIndex database [1]_ hydrophobicity index values [2]_:
>>> from collections import defaultdict
>>> import numpy as np
>>> hydrophobicity_idx = defaultdict(lambda: np.nan)
>>> hydrophobicity_idx.update({'A': 0.61, 'L': 1.53, 'R': 0.60,
... 'K': 1.15, 'N': 0.06, 'M': 1.18,
... 'D': 0.46, 'F': 2.02, 'C': 1.07,
... 'P': 1.95, 'Q': 0., 'S': 0.05,
... 'E': 0.47, 'T': 0.05, 'G': 0.07,
... 'W': 2.65, 'H': 0.61, 'Y': 1.88,
... 'I': 2.22, 'V': 1.32})
Define an alignment of protein sequences:
>>> from skbio import Alignment, Protein
>>> sequences = [Protein('VHLTPEEKSAVTALWGKVNVDEV--', id="seq1"),
... Protein('-GLSDGEWQLVLKVWGKVEGDLPGH', id="seq2"),
... Protein('-GLSDGEWQLVLKVWGKVETDITGH', id="seq3"),
... Protein('-VLSEGEWQLVLHVWAKVEADIAGH', id="seq4")]
>>> aln = Alignment(sequences)
Plot the alignment as a heatmap:
>>> fig = aln.heatmap(hydrophobicity_idx, fig_size=(15, 10),
... legend_labels=['Hydrophilic', 'Medium',
... 'Hydrophobic'],
... cmap='Greens')
"""
if len(legend_labels) != 3:
raise ValueError(
"`legend_labels` must contain exactly three labels.")

if sequence_order is not None:
sequence_order_set = set(sequence_order)
if len(sequence_order) != len(sequence_order_set):
raise ValueError(
"Found duplicate sequence ID(s) in `sequence_order`.")
if sequence_order_set != set(self.ids()):
raise ValueError(
"`sequence_order` must contain the same set of sequence "
"IDs in the alignment.")
else:
sequence_order = self.ids()

mtx, min_val, median_val, max_val = self._alignment_to_heatmap_matrix(
value_map, sequence_order)

# heatmap code derived from the matplotlib gallery:
# http://matplotlib.org/examples/pylab_examples/
# colorbar_tick_labelling_demo.html
fig, ax = plt.subplots()
if fig_size is not None:
fig.set_size_inches(fig_size)

cax = ax.imshow(mtx, interpolation='nearest', cmap=cmap)

sequence_count = self.sequence_count()
y_ticks = [0] + list(range(3, sequence_count - 3, 3))
if sequence_count > 1:
# we have more than one sequence, so add the index of the last
# sequence
y_ticks += [sequence_count - 1]
y_tick_labels = [sequence_order[tick] for tick in y_ticks]
ax.set_yticks(y_ticks)
ax.set_yticklabels(y_tick_labels)

ax.set_xticks(range(self.sequence_length()))
ax.set_xticklabels(self.majority_consensus(), size=7)

# add colorbar legend
cbar = fig.colorbar(cax, ticks=[min_val, median_val, max_val],
orientation='horizontal')
cbar.ax.set_xticklabels(legend_labels)

return fig

def _alignment_to_heatmap_matrix(self, value_map, sequence_order):
"""Convert alignment to numeric matrix for heatmap plotting."""
if self.is_empty():
raise AlignmentError(
"Cannot create heatmap from an empty alignment.")

sequence_length = self.sequence_length()
if sequence_length < 1:
raise AlignmentError(
"Cannot create heatmap from an alignment without any "
"positions.")

# track the values that are actually used -- this can differ from the
# values in `value_map` (e.g., if `value_map` is a superset or a
# defaultdict). we want to use these values to compute the min, median,
# and max for the colorbar legend labels
values = {}
mtx = np.empty([self.sequence_count(), sequence_length])
for row_idx, sequence_id in enumerate(sequence_order):
seq = str(self[sequence_id])
for col_idx in range(sequence_length):
char = seq[col_idx]
value = value_map[char]
mtx[row_idx][col_idx] = value
values[char] = value

# remove nans as this can mess up the median calculation below.
# numpy.nanmedian was added in 1.9.0, so consider updating this code
# if/when we start supporting numpy >= 1.9.0. scipy.stats.nanmedian is
# deprecated so we can't use that either.
#
# modified from http://stackoverflow.com/a/26475626/3776794
#
# note: we need to explicitly create a list from the dict's values in
# order to have this work in Python 3. Using values.values() directly,
# or with future.utils.viewvalues, creates a numpy array of object
# dtype, which makes np.isnan fail.
values = np.asarray(list(values.values()))
values = values[~np.isnan(values)]

return mtx, min(values), np.median(values), max(values)

def _validate_lengths(self):
"""Return ``True`` if all sequences same length, ``False`` otherwise
"""
Expand Down
129 changes: 129 additions & 0 deletions skbio/alignment/tests/test_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
import numpy as np
import numpy.testing as npt
from scipy.spatial.distance import hamming
import matplotlib as mpl
mpl.use('Agg')

from skbio import (NucleotideSequence, DNASequence, RNASequence, DNA, RNA,
DistanceMatrix, Alignment, SequenceCollection)
Expand Down Expand Up @@ -826,6 +828,133 @@ def test_to_phylip_no_positions(self):
with self.assertRaises(SequenceCollectionError):
npt.assert_warns(DeprecationWarning, a.to_phylip)

def check_heatmap_sanity(self, fig, exp_x_tick_labels, exp_y_tick_labels,
exp_legend_labels):
"""Helper method for testing basic heatmap figure properties."""
axes = fig.get_axes()
self.assertEqual(len(axes), 2)

ax, axc = axes

x_tick_labels = [e.get_text() for e in ax.get_xticklabels()]
self.assertEqual(x_tick_labels, exp_x_tick_labels)

y_tick_labels = [e.get_text() for e in ax.get_yticklabels()]
self.assertEqual(y_tick_labels, exp_y_tick_labels)

legend_labels = [e.get_text() for e in axc.get_xticklabels()]
self.assertEqual(legend_labels, exp_legend_labels)

def test_heatmap_empty(self):
# no seqs
aln = Alignment([])
value_map = {'A': 0.1, 'C': 1.5, 'U': -0.42, 'G': 0.55, 'T': 99}
with self.assertRaises(AlignmentError):
aln.heatmap(value_map)

# no positions
aln = Alignment([DNA(''), RNA('', id="s2")])
with self.assertRaises(AlignmentError):
aln.heatmap(value_map)

def test_heatmap_1_seq(self):
aln = Alignment([RNA('AACGU', id="seq1")])
value_map = {'A': 0.1, 'C': 1.5, 'U': -0.42, 'G': 0.55, 'T': 99}
fig = aln.heatmap(value_map)

self.check_heatmap_sanity(
fig, ['A', 'A', 'C', 'G', 'U'], ['seq1'],
['Minimum', 'Median', 'Maximum'])

def test_heatmap_2_seqs(self):
aln = Alignment([RNA('AACGU', id="seq1"),
RNA('AACGU', id="seq2")])
value_map = {'A': 0.1, 'C': 1.5, 'U': -0.42, 'G': 0.55, 'T': 99}
fig = aln.heatmap(value_map)

self.check_heatmap_sanity(
fig, ['A', 'A', 'C', 'G', 'U'], ['seq1', 'seq2'],
['Minimum', 'Median', 'Maximum'])

def test_heatmap_3_seqs(self):
aln = Alignment([DNA('AACCCGT', id="seq1"),
DNA('ACCCGGT', id="seq2"),
DNA('ACCCGGT', id="seq3")])
value_map = {'A': 0.61, 'C': 1.07, 'T': 0.05, 'G': 0.07}
fig = aln.heatmap(value_map)

self.check_heatmap_sanity(
fig, ['A', 'C', 'C', 'C', 'G', 'G', 'T'], ['seq1', 'seq3'],
['Minimum', 'Median', 'Maximum'])

def test_heatmap_with_non_defaults(self):
aln = Alignment([DNA('AGTCGGT', id="seq1"),
DNA('CAACGGA', id="seq2"),
DNA('AACCTCT', id="seq3"),
DNA('TACTCGT', id="seq4")])
value_map = {'A': 0.61, 'C': 1.07, 'T': 0.05, 'G': 0.07}
labels = ['a', 'b', 'c']
fig = aln.heatmap(
value_map, legend_labels=labels, fig_size=(42, 22), cmap='Blues',
sequence_order=('seq4', 'seq3', 'seq1', 'seq2'))

self.check_heatmap_sanity(fig, ['A', 'A', 'C', 'C', 'G', 'G', 'T'],
['seq4', 'seq2'], labels)
self.assertEqual(fig.get_figwidth(), 42.0)
self.assertEqual(fig.get_figheight(), 22.0)

def test_heatmap_invalid_legend_labels(self):
with self.assertRaises(ValueError):
self.a1.heatmap({}, legend_labels=['a', 'b', 'c', 'd'])

def test_heatmap_invalid_sequence_order(self):
# duplicate ids
with self.assertRaises(ValueError):
self.a1.heatmap({}, sequence_order=['d1', 'd2', 'd1'])

# provided set of ids doesn't match alignment's set of ids
with self.assertRaises(ValueError):
self.a1.heatmap({}, sequence_order=['d2', 'd3', 'd0'])

def test_heatmap_missing_character_in_value_map(self):
with self.assertRaises(KeyError):
self.a1.heatmap({})

def test_alignment_to_heatmap_matrix(self):
aln = Alignment([DNA('ACTG', id='d1'),
DNA('A.-G', id='d2'),
DNA('TC-G', id='d3')])
value_map = defaultdict(lambda: np.nan)
value_map.update({'A': 42, 'C': 10.5, 'T': 22.1, 'G': -7.789,
'U': -999.9, 'Z': 42000})
exp_min = -7.789
exp_median = 16.3
exp_max = 42

# sequence order is same as what's in the alignment
mtx, min_val, median_val, max_val = \
aln._alignment_to_heatmap_matrix(value_map, ['d1', 'd2', 'd3'])

exp_mtx = np.array([[42., 10.5, 22.1, -7.789],
[42., np.nan, np.nan, -7.789],
[22.1, 10.5, np.nan, -7.789]])
npt.assert_array_equal(mtx, exp_mtx)
self.assertEqual(min_val, exp_min)
self.assertAlmostEqual(median_val, exp_median)
self.assertEqual(max_val, exp_max)

# sequence order is different from what's in the alignment
mtx, min_val, median_val, max_val = \
aln._alignment_to_heatmap_matrix(value_map, ['d3', 'd1', 'd2'])

exp_mtx = np.array([[22.1, 10.5, np.nan, -7.789],
[42., 10.5, 22.1, -7.789],
[42., np.nan, np.nan, -7.789]])
npt.assert_array_equal(mtx, exp_mtx)
self.assertEqual(min_val, exp_min)
self.assertAlmostEqual(median_val, exp_median)
self.assertEqual(max_val, exp_max)

def test_validate_lengths(self):
self.assertTrue(self.a1._validate_lengths())
self.assertTrue(self.a2._validate_lengths())
Expand Down

0 comments on commit 6e46452

Please sign in to comment.