Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Regressor generalization #101

Merged
merged 9 commits into from
Jun 14, 2016
62 changes: 26 additions & 36 deletions src/plotypus/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@
import dill
from argparse import ArgumentError, ArgumentParser, SUPPRESS
from pandas import read_table
from sklearn.linear_model import (LassoCV, LassoLarsCV, LassoLarsIC,
LinearRegression, RidgeCV, ElasticNetCV)
import sklearn.linear_model
from sklearn.grid_search import GridSearchCV
from matplotlib import rc_params_from_file
from collections import ChainMap
Expand All @@ -20,8 +19,9 @@
plot_periodogram)
import plotypus
from plotypus.preprocessing import Fourier
from plotypus.utils import (colvec, mad, make_sure_path_exists, pmap,
valid_basename, verbose_print)
from plotypus.utils import (colvec, import_name, mad, make_sure_path_exists,
pmap, strlist_to_dict, valid_basename,
verbose_print)
from plotypus.resources import matplotlibrc

import pkg_resources # part of setuptools
Expand Down Expand Up @@ -60,6 +60,7 @@ def get_args():
parallel_group = parser.add_argument_group('Parallel')
period_group = parser.add_argument_group('Periodogram')
fourier_group = parser.add_argument_group('Fourier')
regressor_group = parser.add_argument_group('Regressor')
outlier_group = parser.add_argument_group('Outlier Detection')
# regression_group = parser.add_argument_group('Regression')

Expand Down Expand Up @@ -276,12 +277,6 @@ def get_args():
default=(2, 20), metavar=('MIN', 'MAX'),
help='range of degrees of fourier fits to use '
'(default = 2 20)')
fourier_group.add_argument('-r', '--regressor',
choices=['LassoCV', 'LassoLarsCV', 'LassoLarsIC', 'OLS', 'RidgeCV',
'ElasticNetCV'],
default='LassoLarsIC',
help='type of regressor to use '
'(default = "Lasso")')
fourier_group.add_argument('--selector',
choices=['Baart', 'GridSearch'],
default='GridSearch',
Expand All @@ -292,15 +287,23 @@ def get_args():
help='form of Fourier series to use in coefficient output, '
'does not affect the fit '
'(default = "cos")')
fourier_group.add_argument('--max-iter', type=int,
default=1000, metavar='N',
help='maximum number of iterations in the regularization path '
'(default = 1000)')
fourier_group.add_argument('--regularization-cv', type=int,
default=None, metavar='N',
help='number of folds used in regularization regularization_cv '
'validation '
'(default = 3)')

## Regressor Group #######################################################

fourier_group.add_argument('-r', '--regressor', type=import_name,
default=sklearn.linear_model.LassoLarsIC,
help='type of regressor to use, loads any Python object named like '
'*module.submodule.etc.object_name*, though it must behave like a '
'scikit-learn regressor '
'(default = "sklearn.linear_model.LassoLarsIC")')
regressor_group.add_argument('--regressor-options', type=str, nargs='+',
default=[], metavar="KEY VALUE",
help='list of key value pairs to pass to regressor object. '
'accepted keys depend on regressor. '
'values which form valid Python literals (e.g. 2, True, [1,2]) '
'are all parsed to their obvious type, or left as strings '
'otherwise '
'(default = None)')

## Outlier Group #########################################################

Expand All @@ -327,22 +330,9 @@ def get_args():
fail_on_error=True)
plotypus.lightcurve.matplotlib.rcParams = rcParams

regressor_choices = {
"LassoCV" : LassoCV(max_iter=args.max_iter,
cv=args.regularization_cv,
fit_intercept=False),
"LassoLarsCV" : LassoLarsCV(max_iter=args.max_iter,
cv=args.regularization_cv,
fit_intercept=False),
"LassoLarsIC" : LassoLarsIC(max_iter=args.max_iter,
fit_intercept=False),
"OLS" : LinearRegression(fit_intercept=False),
"RidgeCV" : RidgeCV(cv=args.regularization_cv,
fit_intercept=False),
"ElasticNetCV" : ElasticNetCV(max_iter=args.max_iter,
cv=args.regularization_cv,
fit_intercept=False)
}
# parse regressor (TODO: and selector) options into a dict
regressor_options = strlist_to_dict(args.regressor_options)

selector_choices = {
"Baart" : None,
"GridSearch" : GridSearchCV
Expand All @@ -363,7 +353,7 @@ def get_args():
}
args.scoring = scoring_choices[args.scoring]

args.regressor = regressor_choices[args.regressor]
args.regressor = args.regressor(**regressor_options)
Selector = selector_choices[args.selector] or GridSearchCV
args.periodogram = periodogram_choices[args.periodogram]
args.sigma_clipping = sigma_clipping_choices[args.sigma_clipping]
Expand Down
70 changes: 43 additions & 27 deletions src/plotypus/lightcurve.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,20 +40,20 @@
]


def make_predictor(regressor=LassoLarsIC(fit_intercept=False),
def make_predictor(regressor=LassoLarsIC(),
Selector=GridSearchCV, fourier_degree=(2, 25),
selector_processes=1,
use_baart=False, scoring='r2', scoring_cv=3,
**kwargs):
"""make_predictor(regressor=LassoLarsIC(fit_intercept=False), Selector=GridSearchCV, fourier_degree=(2, 25), selector_processes=1, use_baart=False, scoring='r2', scoring_cv=3, **kwargs)
"""make_predictor(regressor=LassoLarsIC(), Selector=GridSearchCV, fourier_degree=(2, 25), selector_processes=1, use_baart=False, scoring='r2', scoring_cv=3, **kwargs)

Makes a predictor object for use in :func:`get_lightcurve`.

**Parameters**

regressor : object with "fit" and "transform" methods, optional
Regression object used for solving Fourier matrix
(default ``sklearn.linear_model.LassoLarsIC(fit_intercept=False)``).
(default ``sklearn.linear_model.LassoLarsIC()``).
Selector : class with "fit" and "predict" methods, optional
Model selection class used for finding the best fit
(default :class:`sklearn.grid_search.GridSearchCV`).
Expand All @@ -77,7 +77,6 @@ def make_predictor(regressor=LassoLarsIC(fit_intercept=False),
out : object with "fit" and "predict" methods
The created predictor object.
"""
#
fourier = Fourier(degree_range=fourier_degree, regressor=regressor) \
if use_baart else Fourier()
pipeline = Pipeline([('Fourier', fourier),
Expand Down Expand Up @@ -247,14 +246,27 @@ def get_lightcurve(data, copy=False, name=None,
verbose_print("{}: finding period".format(name),
operation="period", verbosity=verbosity)
_period, pgram = find_period(signal,
min_period, max_period,
coarse_precision, fine_precision,
periodogram, period_processes,
output_periodogram=output_periodogram)
min_period, max_period,
coarse_precision, fine_precision,
periodogram, period_processes,
output_periodogram=output_periodogram)
# provide period to predictor object
# note: Depending on selector used, parameter may be at different level
# of nesting. If selector is a true selector, the "try" block
# should work, and if it is simply a Pipeline, the "except" block
# should work instead.
#
# Some refactoring could avoid this issue altogether.
try:
predictor.set_params(estimator__Fourier__period=_period)
except ValueError:
predictor.set_params(Fourier__period=_period)

verbose_print("{}: using period {}".format(name, _period),
operation="period", verbosity=verbosity)
phase, mag, *err = rephase(signal, _period).T

time, mag, *err = signal.T
phase = rephase(signal, _period)[:,0]

# TODO ###
# Generalize number of bins to function parameter ``coverage_bins``, which
Expand All @@ -276,15 +288,15 @@ def get_lightcurve(data, copy=False, name=None,
# Predict light curve
with warnings.catch_warnings(record=True) as w:
try:
predictor = predictor.fit(colvec(phase), mag)
predictor = predictor.fit(colvec(time), mag)
except Warning:
# not sure if this should be only in verbose mode
print(name, w, file=stderr)
return

# Reject outliers and repeat the process if there are any
if sigma:
outliers = find_outliers(rephase(data.data, _period), predictor,
outliers = find_outliers(data.data, predictor,
sigma, sigma_clipping)
num_outliers = np.sum(outliers[:,0])
if num_outliers == 0 or \
Expand All @@ -298,29 +310,32 @@ def get_lightcurve(data, copy=False, name=None,
verbosity=verbosity)
data.mask = np.ma.mask_or(data.mask, outliers)

# replace *phase*, *mag*, and *err* with the newly masked values in *data*
phase, mag, *err = data.T
# replace *time*, *mag*, and *err* with the newly masked values in *data*
time, mag, *err = data.T
err = err[0] if len(err) == 1 else np.repeat(0, np.size(time))

# Build predicted light curve and residuals
lightcurve = predictor.predict(colvec(phases))
residuals = prediction_residuals(phase, mag, predictor)
lightcurve = predictor.predict(colvec(_period * phases))
residuals = prediction_residuals(time, mag, predictor)
# determine phase shift for max light, if a specific shift was not provided
if shift is None:
arg_max_light = lightcurve.argmin()
lightcurve = np.concatenate((lightcurve[arg_max_light:],
lightcurve[:arg_max_light]))
lightcurve[:arg_max_light]))
shift = arg_max_light/len(phases)
# shift observed light curve to max light
phase = rephase(data.data, _period, shift).T[0]
phase = rephase(data, _period, shift).T[0]
# use rephased phase points from *data* in residuals
residuals = np.column_stack((data.T[0], phase, data.T[1], residuals,
data.T[2]))
data.T[0] = phase

# Grab the coefficients from the model
coefficients = predictor.named_steps['Regressor'].coef_ \
residuals = np.ma.column_stack((time, phase, mag, residuals, err))
data[:,0] = phase
# grab the regressor used in the model
regressor = predictor.named_steps['Regressor'] \
if isinstance(predictor, Pipeline) \
else predictor.best_estimator_.named_steps['Regressor'].coef_
else predictor.best_estimator_.named_steps['Regressor']
# Grab the coefficients from the model
coefficients = regressor.coef_
intercept = regressor.intercept_
coefficients = np.insert(coefficients, 0, intercept)

# compute R^2 and MSE if they haven't already been
# (one or zero have been computed, depending on the predictor)
Expand All @@ -332,9 +347,10 @@ def get_score(scoring):
if hasattr(predictor, 'best_score_') and predictor.scoring == scoring:
return predictor.best_score_
else:
return cross_val_score(estimator, colvec(phase), mag,
cv=scoring_cv, scoring=scoring,
n_jobs=scoring_processes).mean()
return cross_val_score(estimator,
colvec(time.data[~time.mask]), mag.data[~mag.mask],
cv=scoring_cv, scoring=scoring,
n_jobs=scoring_processes).mean()

## Formatting Coefficients ##
# convert to phase-shifted form (A_k, Phi_k) if necessary
Expand Down
2 changes: 1 addition & 1 deletion src/plotypus/periodogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def rephase(data, period=1.0, shift=0.0, col=0, copy=True):
Array containing the rephased *data*.
"""
rephased = np.ma.array(data, copy=copy)
rephased[:, col] = get_phase(rephased[:, col], period, shift)
rephased.data[:, col] = get_phase(rephased.data[:, col], period, shift)

return rephased

Expand Down
Loading