Skip to content

Commit

Permalink
ENH: Allow seed to be set in differential evolution
Browse files Browse the repository at this point in the history
Allow seed to be set in differential evolution
Allow end user to choose algorithm
Set seed in all tests for reproducible results
Small cleanups for style
  • Loading branch information
bashtage committed Sep 10, 2018
1 parent fbd77c1 commit 0876cc7
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 51 deletions.
2 changes: 2 additions & 0 deletions lint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ if [ "$LINT" == true ]; then
statsmodels/info.py \
statsmodels/resampling/ \
statsmodels/interface/ \
statsmodels/graphics/functional.py \
statsmodels/graphics/tests/test_functional.py \
statsmodels/iolib/smpickle.py \
statsmodels.iolib/tests/test_pickle.py \
statsmodels/graphics/tsaplots.py \
Expand Down
84 changes: 56 additions & 28 deletions statsmodels/graphics/functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import numpy as np
from statsmodels.compat.scipy import factorial
try:
from scipy.optimize import differential_evolution
from scipy.optimize import differential_evolution, brute, fmin
have_de_optim = True
except ImportError:
from scipy.optimize import brute, fmin
Expand Down Expand Up @@ -61,7 +61,8 @@ def __repr__(self):


def _inverse_transform(pca, data):
"""Inverse transform on PCA.
"""
Inverse transform on PCA.
Use PCA's `project` method by temporary replacing its factors with
`data`.
Expand Down Expand Up @@ -126,7 +127,8 @@ def _curve_constrained(x, idx, sign, band, pca, ks_gaussian):


def _min_max_band(args):
"""Min and max values at `idx`.
"""
Min and max values at `idx`.
Global optimization to find the extrema per component.
Expand All @@ -151,14 +153,14 @@ def _min_max_band(args):
``(max, min)`` curve values at `idx`
"""
idx, (band, pca, bounds, ks_gaussian) = args
if have_de_optim:
idx, (band, pca, bounds, ks_gaussian, use_brute, seed) = args
if have_de_optim and not use_brute:
max_ = differential_evolution(_curve_constrained, bounds=bounds,
args=(idx, -1, band, pca, ks_gaussian),
maxiter=7).x
maxiter=7, seed=seed).x
min_ = differential_evolution(_curve_constrained, bounds=bounds,
args=(idx, 1, band, pca, ks_gaussian),
maxiter=7).x
maxiter=7, seed=seed).x
else:
max_ = brute(_curve_constrained, ranges=bounds, finish=fmin,
args=(idx, -1, band, pca, ks_gaussian))
Expand All @@ -172,7 +174,7 @@ def _min_max_band(args):


def hdrboxplot(data, ncomp=2, alpha=None, threshold=0.95, bw=None,
xdata=None, labels=None, ax=None):
xdata=None, labels=None, ax=None, use_brute=False, seed=None):
"""
High Density Region boxplot
Expand Down Expand Up @@ -210,6 +212,13 @@ def hdrboxplot(data, ncomp=2, alpha=None, threshold=0.95, bw=None,
ax : Matplotlib AxesSubplot instance, optional
If given, this subplot is used to plot in instead of a new figure being
created.
use_brute : bool
Use the brute force optimizer instead of the default differential
evolution to find the curves. Default is False.
seed : {None, int, np.random.RandomState}
Seed value to pass to scipy.optimize.differential_evolution. Can be an
integer or RandomState instance. If None, then the default RandomState
provided by np.random is used.
Returns
-------
Expand Down Expand Up @@ -254,8 +263,8 @@ def hdrboxplot(data, ncomp=2, alpha=None, threshold=0.95, bw=None,
unlikely to be similar to the other curves.
Using a kernel smoothing technique, the probability density function (PDF)
of the multivariate space can be recovered. From this PDF, it is possible to
compute the density probability linked to the cluster of points and plot
of the multivariate space can be recovered. From this PDF, it is possible
to compute the density probability linked to the cluster of points and plot
its contours.
Finally, using these contours, the different quantiles can be extracted
Expand Down Expand Up @@ -347,9 +356,9 @@ def hdrboxplot(data, ncomp=2, alpha=None, threshold=0.95, bw=None,
for i in range(n_quantiles)]

# Find mean, outliers curves
if have_de_optim:
if have_de_optim and not use_brute:
median = differential_evolution(lambda x: - ks_gaussian.pdf(x),
bounds=bounds, maxiter=5).x
bounds=bounds, maxiter=5, seed=seed).x
else:
median = brute(lambda x: - ks_gaussian.pdf(x),
ranges=bounds, finish=fmin)
Expand All @@ -360,8 +369,9 @@ def hdrboxplot(data, ncomp=2, alpha=None, threshold=0.95, bw=None,

# Find HDR given some quantiles

def _band_quantiles(band):
"""Find extreme curves for a quantile band.
def _band_quantiles(band, use_brute=use_brute, seed=seed):
"""
Find extreme curves for a quantile band.
From the `band` of quantiles, the associated PDF extrema values
are computed. If `min_alpha` is not provided (single quantile value),
Expand All @@ -376,6 +386,14 @@ def _band_quantiles(band):
----------
band : array_like
alpha values ``(max_alpha, min_alpha)`` ex: ``[0.9, 0.5]``
use_brute : bool
Use the brute force optimizer instead of the default differential
evolution to find the curves. Default is False.
seed : {None, int, np.random.RandomState}
Seed value to pass to scipy.optimize.differential_evolution. Can
be an integer or RandomState instance. If None, then the default
RandomState provided by np.random is used.
Returns
-------
Expand All @@ -392,7 +410,8 @@ def _band_quantiles(band):

pool = Pool()
data = zip(range(dim), itertools.repeat((band, pca,
bounds, ks_gaussian)))
bounds, ks_gaussian,
seed, use_brute)))
band_quantiles = pool.map(_min_max_band, data)
pool.terminate()
pool.close()
Expand All @@ -403,16 +422,18 @@ def _band_quantiles(band):

extra_alpha = [i for i in alpha
if 0.5 != i and 0.9 != i and threshold != i]
if extra_alpha != []:
extra_quantiles = [y for x in extra_alpha
for y in _band_quantiles([x])]
if len(extra_alpha) > 0:
extra_quantiles = []
for x in extra_alpha:
for y in _band_quantiles([x], use_brute=use_brute, seed=seed):
extra_quantiles.append(y)
else:
extra_quantiles = []

# Inverse transform from n-variate plot to dataset dataset's shape
median = _inverse_transform(pca, median)[0]
hdr_90 = _band_quantiles([0.9, 0.5])
hdr_50 = _band_quantiles([0.5])
hdr_90 = _band_quantiles([0.9, 0.5], use_brute=use_brute, seed=seed)
hdr_50 = _band_quantiles([0.5], use_brute=use_brute, seed=seed)

hdr_res = HdrResults({
"median": median,
Expand Down Expand Up @@ -440,9 +461,11 @@ def _band_quantiles(band):

if len(outliers) != 0:
for ii, outlier in enumerate(outliers):
label = str(labels_outlier[ii]) if labels_outlier is not None else 'Outliers'
ax.plot(xdata, outlier,
ls='--', alpha=0.7, label=label)
if labels_outlier is None:
label = 'Outliers'
else:
label = str(labels_outlier[ii])
ax.plot(xdata, outlier, ls='--', alpha=0.7, label=label)

handles, labels = ax.get_legend_handles_labels()

Expand All @@ -467,8 +490,9 @@ def _band_quantiles(band):


def fboxplot(data, xdata=None, labels=None, depth=None, method='MBD',
wfactor=1.5, ax=None, plot_opts={}):
"""Plot functional boxplot.
wfactor=1.5, ax=None, plot_opts=None):
"""
Plot functional boxplot.
A functional boxplot is the analog of a boxplot for functional data.
Functional data is any type of data that varies over a continuum, i.e.
Expand Down Expand Up @@ -586,6 +610,7 @@ def fboxplot(data, xdata=None, labels=None, depth=None, method='MBD',
"""
fig, ax = utils.create_mpl_ax(ax)

plot_opts = {} if plot_opts is None else plot_opts
if plot_opts.get('cmap_outliers') is None:
from matplotlib.cm import rainbow_r
plot_opts['cmap_outliers'] = rainbow_r
Expand Down Expand Up @@ -620,7 +645,8 @@ def fboxplot(data, xdata=None, labels=None, depth=None, method='MBD',
ix_outliers = []
ix_nonout = []
for ii in range(data.shape[0]):
if np.any(data[ii, :] > upper_fence) or np.any(data[ii, :] < lower_fence):
if (np.any(data[ii, :] > upper_fence) or
np.any(data[ii, :] < lower_fence)):
ix_outliers.append(ii)
else:
ix_nonout.append(ii)
Expand Down Expand Up @@ -661,7 +687,8 @@ def fboxplot(data, xdata=None, labels=None, depth=None, method='MBD',

def rainbowplot(data, xdata=None, depth=None, method='MBD', ax=None,
cmap=None):
"""Create a rainbow plot for a set of curves.
"""
Create a rainbow plot for a set of curves.
A rainbow plot contains line plots of all curves in the dataset, colored in
order of functional depth. The median curve is shown in black.
Expand Down Expand Up @@ -766,7 +793,8 @@ def rainbowplot(data, xdata=None, depth=None, method='MBD', ax=None,


def banddepth(data, method='MBD'):
"""Calculate the band depth for a set of functional curves.
"""
Calculate the band depth for a set of functional curves.
Band depth is an order statistic for functional data (see `fboxplot`), with
a higher band depth indicating larger "centrality". In analog to scalar
Expand Down
57 changes: 34 additions & 23 deletions statsmodels/graphics/tests/test_functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

try:
import matplotlib.pyplot as plt
import matplotlib
have_matplotlib = True
except ImportError:
have_matplotlib = False
Expand All @@ -24,8 +23,7 @@

@pytest.mark.skipif(not have_matplotlib, reason='matplotlib not available')
def test_hdr_basic(close_figures):
fig, hdr = hdrboxplot(data, labels=labels)
print(hdr)
_, hdr = hdrboxplot(data, labels=labels, seed=12345)

assert len(hdr.extra_quantiles) == 0

Expand Down Expand Up @@ -54,18 +52,31 @@ def test_hdr_basic(close_figures):

assert_almost_equal(quant, quant_t, decimal=0)

labels_pos = np.all(np.in1d(data, hdr.outliers).reshape(data.shape), axis=1)
labels_pos = np.all(np.in1d(data, hdr.outliers).reshape(data.shape),
axis=1)
outliers = labels[labels_pos]
assert_equal([1982, 1983, 1997, 1998], outliers)
assert_equal(labels[hdr.outliers_idx], outliers)


@pytest.mark.skipif(not have_matplotlib, reason='matplotlib not available')
def test_hdr_basic_brute(close_figures):
_, hdr = hdrboxplot(data, labels=labels, use_brute=True)

assert len(hdr.extra_quantiles) == 0

median_t = [24.247, 25.625, 25.964, 24.999, 23.648, 22.302,
21.231, 20.366, 20.168, 20.434, 21.111, 22.299]

assert_almost_equal(hdr.median, median_t, decimal=2)


@pytest.mark.skipif(not have_matplotlib, reason='matplotlib not available')
def test_hdr_plot(close_figures):
fig = plt.figure()
ax = fig.add_subplot(111)

fig, res = hdrboxplot(data, labels=labels.tolist(), ax=ax, threshold=1)
hdrboxplot(data, labels=labels.tolist(), ax=ax, threshold=1, seed=12345)

ax.set_xlabel("Month of the year")
ax.set_ylabel("Sea surface temperature (C)")
Expand All @@ -76,7 +87,7 @@ def test_hdr_plot(close_figures):

@pytest.mark.skipif(not have_matplotlib, reason='matplotlib not available')
def test_hdr_alpha(close_figures):
fig, hdr = hdrboxplot(data, alpha=[0.7])
_, hdr = hdrboxplot(data, alpha=[0.7], seed=12345)
extra_quant_t = np.vstack([[25.1, 26.5, 27.0, 26.4, 25.4, 24.1,
23.0, 22.0, 21.7, 22.1, 22.7, 23.8],
[23.4, 24.8, 25.0, 23.9, 22.4, 21.1,
Expand All @@ -86,7 +97,7 @@ def test_hdr_alpha(close_figures):

@pytest.mark.skipif(not have_matplotlib, reason='matplotlib not available')
def test_hdr_multiple_alpha(close_figures):
fig, hdr = hdrboxplot(data, alpha=[0.4, 0.92])
_, hdr = hdrboxplot(data, alpha=[0.4, 0.92], seed=12345)
extra_quant_t = [[25.712, 27.052, 27.711, 27.200,
26.162, 24.833, 23.639, 22.378,
22.250, 22.640, 23.472, 24.649],
Expand All @@ -99,30 +110,32 @@ def test_hdr_multiple_alpha(close_figures):
[23.873, 25.371, 25.667, 24.644,
23.177, 21.923, 20.791, 20.015,
19.697, 19.951, 20.622, 21.858]]
assert_almost_equal(hdr.extra_quantiles, np.vstack(extra_quant_t), decimal=0)
assert_almost_equal(hdr.extra_quantiles, np.vstack(extra_quant_t),
decimal=0)


@pytest.mark.skipif(not have_matplotlib, reason='matplotlib not available')
def test_hdr_threshold(close_figures):
fig, hdr = hdrboxplot(data, alpha=[0.8], threshold=0.93)
labels_pos = np.all(np.in1d(data, hdr.outliers).reshape(data.shape), axis=1)
_, hdr = hdrboxplot(data, alpha=[0.8], threshold=0.93, seed=12345)
labels_pos = np.all(np.in1d(data, hdr.outliers).reshape(data.shape),
axis=1)
outliers = labels[labels_pos]
assert_equal([1968, 1982, 1983, 1997, 1998], outliers)


@pytest.mark.skipif(not have_matplotlib, reason='matplotlib not available')
def test_hdr_bw(close_figures):
fig, hdr = hdrboxplot(data, bw='cv_ml')
_, hdr = hdrboxplot(data, bw='cv_ml', seed=12345)
median_t = [24.25, 25.64, 25.99, 25.04, 23.71, 22.38,
21.31, 20.44, 20.24, 20.51, 21.19, 22.38]
assert_almost_equal(hdr.median, median_t, decimal=2)


@pytest.mark.skipif(not have_matplotlib, reason='matplotlib not available')
def test_hdr_ncomp(close_figures):
fig, hdr = hdrboxplot(data, ncomp=3)
_, hdr = hdrboxplot(data, ncomp=3, seed=12345)
median_t = [24.33, 25.71, 26.04, 25.08, 23.74, 22.40,
21.32, 20.45, 20.25, 20.53, 21.2 , 22.39]
21.32, 20.45, 20.25, 20.53, 21.20, 22.39]
assert_almost_equal(hdr.median, median_t, decimal=2)


Expand All @@ -138,14 +151,14 @@ def test_banddepth_BD2():
expected_depth = [0.5, 5./6, 5./6, 0.5]
assert_almost_equal(depth, expected_depth)

## Plot to visualize why we expect this output
#fig = plt.figure()
#ax = fig.add_subplot(111)
#for ii, yy in enumerate([y1, y2, y3, y4]):
# Plot to visualize why we expect this output
# fig = plt.figure()
# ax = fig.add_subplot(111)
# for ii, yy in enumerate([y1, y2, y3, y4]):
# ax.plot(xx, yy, label="y%s" % ii)

#ax.legend()
#plt.close(fig)
# ax.legend()
# plt.close(fig)


def test_banddepth_MBD():
Expand Down Expand Up @@ -174,16 +187,14 @@ def harmfunc(t):
b2i = (0.15 - 0.1) * np.random.random() + 0.1

func = (1 - ci) * (a1i * np.sin(t) + a2i * np.cos(t)) + \
ci * (b1i * np.sin(t) + b2i * np.cos(t))
ci * (b1i * np.sin(t) + b2i * np.cos(t))

return func

np.random.seed(1234567)
# Some basic test data, Model 6 from Sun and Genton.
t = np.linspace(0, 2 * np.pi, 250)
data = []
for ii in range(20):
data.append(harmfunc(t))
data = [harmfunc(t) for _ in range(20)]

# fboxplot test
fig = plt.figure()
Expand Down

0 comments on commit 0876cc7

Please sign in to comment.