-
Notifications
You must be signed in to change notification settings - Fork 2
/
pca.py
267 lines (205 loc) · 7.57 KB
/
pca.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
# -*- coding: utf-8 -*-
"""
Utility functions for running principal components analysis and plotting the
results.
See also the examples at:
- http://nbviewer.ipython.org/github/alimanfoo/anhima/blob/master/examples/pca.ipynb
""" # noqa
from __future__ import division, print_function, absolute_import
# third party dependencies
import numpy as np
import matplotlib.pyplot as plt
import sklearn.decomposition
def pca(gn, n_components=10, whiten=False):
"""Perform a principal components analysis of genotypes, treating each
variant as a feature.
Parameters
----------
gn : array_like, shape (`n_variants`, `n_samples`)
A 2-dimensional array where each element is a genotype call coded as
a single integer counting the number of non-reference alleles.
n_components : int, None or string
Number of components to keep. If `n_components` is None all
components are kept: ``n_components == min(n_samples, n_features)``. If
`n_components` == 'mle', Minka's MLE is used to guess the dimension. If
0 < `n_components` < 1, select the number of components such that the
amount of variance that needs to be explained is greater than the
percentage specified by `n_components`.
whiten : bool
When True (False by default) the components vectors are divided by
n_samples times singular values to ensure uncorrelated outputs with
unit component-wise variances.
Returns
-------
model : ``sklearn.decomposition.PCA``
The fitted model.
coords : ndarray, shape (`n_samples`, `n_components`)
The result of fitting the model with `genotypes` and applying
dimensionality reduction to `genotypes`.
See Also
--------
sklearn.decomposition.PCA, anhima.ld.ld_prune_pairwise
Notes
-----
The :func:`anhima.ld.ld_prune_pairwise` can be used to obtain a set of
variants in approximate linkage equilibrium prior to running PCA.
"""
# check inputs
gn = np.asarray(gn)
assert gn.ndim == 2
# transpose because sklearn expects data as (n_samples, n_features)
m = gn.T
# set up PCA
model = sklearn.decomposition.PCA(n_components=n_components, whiten=whiten,
copy=True)
# fit the model and apply dimensionality reduction
coords = model.fit_transform(m)
return model, coords
def plot_coords(model, coords, pcx=1, pcy=2, ax=None, colors='b', sizes=20,
labels=None, scatter_kwargs=None, annotate_kwargs=None):
"""Scatter plot of transformed coordinates from principal components
analysis.
Parameters
----------
model : ``sklearn.decomposition.PCA``
The fitted model.
coords : ndarray, shape (`n_samples`, `n_components`)
The transformed coordinates.
pcx : int, optional
The principal component to plot on the X axis. N.B., this is
one-based, so `1` is the first principal component, `2` is the second
component, etc.
pcy : int, optional
The principal component to plot on the Y axis. N.B., this is
one-based, so `1` is the first principal component, `2` is the second
component, etc.
ax : axes, optional
The axes on which to draw. If not provided, a new figure will be
created.
colors : color or sequence of color, optional
Can be a single color format string, or a sequence of color
specifications of length `n_samples`.
sizes : scalar or array_like, shape (`n_samples`), optional
Size in points^2.
labels : sequence of strings
If provided, will be used to label points in the plot.
scatter_kwargs : dict-like
Additional keyword arguments passed through to `plt.scatter`.
annotate_kwargs : dict-like
Additional keyword arguments passed through to `plt.annotate` when
labelling points.
Returns
-------
ax : axes
The axes on which the plot was drawn.
"""
# check inputs
coords = np.asarray(coords)
assert coords.ndim == 2
# set up axes
if ax is None:
# make a square figure
x = plt.rcParams['figure.figsize'][0]
fig, ax = plt.subplots(figsize=(x, x))
# obtain X and Y data, N.B., `pcx` and `pcy` are 1-based
x = coords[:, pcx-1]
y = coords[:, pcy-1]
# plot points
if scatter_kwargs is None:
scatter_kwargs = dict()
ax.scatter(x, y, c=colors, s=sizes, **scatter_kwargs)
# label points
if labels is not None:
if annotate_kwargs is None:
annotate_kwargs = dict()
annotate_kwargs.setdefault('xycoords', 'data')
annotate_kwargs.setdefault('xytext', (3, 3))
annotate_kwargs.setdefault('textcoords', 'offset points')
for l, lx, ly in zip(labels, x, y):
if l is not None:
ax.annotate(str(l), xy=(lx, ly), **annotate_kwargs)
# tidy up
ax.set_xlabel('PC%s (%.2f%%)' %
(pcx, model.explained_variance_ratio_[pcx-1] * 100))
ax.set_ylabel('PC%s (%.2f%%)' %
(pcy, model.explained_variance_ratio_[pcy-1] * 100))
return ax
def plot_variance_explained(model, bar_kwargs=None, ax=None):
"""
Parameters
----------
model : ``sklearn.decomposition.PCA``
The fitted model.
bar_kwargs : dict-like, optional
Additional keyword arguments passed through to ``ax.bar()``.
ax : axes, optional
The axes on which to draw. If not provided, a new figure will be
created.
Returns
-------
ax : axes
The axes on which the plot was drawn.
"""
# set up axes
if ax is None:
fig, ax = plt.subplots()
# how many components are available
# coordinates for bar
y = model.explained_variance_ratio_ * 100 # express as percent
n = len(y)
x = np.arange(n)
# plot bar
if bar_kwargs is None:
bar_kwargs = dict()
bar_kwargs.setdefault('width', 1)
ax.bar(x, y, **bar_kwargs)
# tidy up
ax.set_xticks(x+.5)
ax.set_xticklabels(range(1, n+1))
ax.set_xlabel('principal component')
ax.set_ylabel('% variance explained')
return ax
def plot_loadings(model, pc=1, pos=None, plot_kwargs=None, ax=None):
"""
Plot loadings for the given principal component.
Parameters
----------
model : ``sklearn.decomposition.PCA``
The fitted model.
pc : int, optional
The principal component to plot loadings for. N.B., this is
one-based, so `1` is the first principal component, `2` is the second
component, etc.
pos : array_like, int, optional
An array of variant positions to use for the X axis, If not given,
variant index will be used for the X axis.
plot_kwargs : dict-like, optional
Additional keyword arguments passed through to ``ax.plot()``.
ax : axes, optional
The axes on which to draw. If not provided, a new figure will be
created.
Returns
-------
ax : axes
The axes on which the plot was drawn.
"""
# set up axes
if ax is None:
x = plt.rcParams['figure.figsize'][0]
fig = plt.figure(figsize=(x, x//3))
ax = fig.add_subplot(111)
# obtain loadings
y = model.components_[pc-1, :]
# plot them
if plot_kwargs is None:
plot_kwargs = dict()
if pos is not None:
assert len(y) == len(pos)
ax.plot(pos, y, **plot_kwargs)
ax.set_xlabel('position')
ax.set_xlim(min(pos), max(pos))
else:
ax.plot(y, **plot_kwargs)
ax.set_xlabel('variant')
ax.set_xlim(0, len(y))
return ax