Feature generation refactor (closes gh-32) #70

Merged Oct 1, 2015 · 9 commits
Changes from 8 commits
14 changes: 7 additions & 7 deletions mltsp/Flask/flask_app.py
@@ -42,7 +42,7 @@
from .. import predict_class as predict
from .. import build_model

-all_available_features_list = cfg.features_list + cfg.features_list_science
+all_available_features_list = cfg.features_list_obs + cfg.features_list_science

# Flask initialization
app = Flask(__name__, static_folder=None)
@@ -1621,7 +1621,7 @@ def get_list_of_available_features():

def get_list_of_available_features_set2():
"""Return list of additional built-in time-series features."""
-return sorted([feat for feat in cfg.features_list if feat not in
+return sorted([feat for feat in cfg.features_list_obs if feat not in
cfg.ignore_feats_list_science])


@@ -2157,10 +2157,10 @@ def verifyNewScript():
try:
test_results = cft.verify_new_script(script_fpath=scriptfile_path)
ks = []
-for thisone in test_results:
-    for k, v in thisone.items():
-        if k not in ks:
-            ks.append(k)
+# for thisone in test_results:
+for k, v in test_results.items():
+    if k not in ks:
+        ks.append(k)
res_str = ""
for k in ks:
res_str += (("<input type='checkbox' value='%s' "
@@ -2664,7 +2664,7 @@ def featurizationPage(

"""
projkey = project_name_to_key(project_name)
-if already_featurized == True and zipfile_name is None:
+if already_featurized and zipfile_name is None:
# User is uploading pre-featurized data, without time-series data
features_filename = headerfile_name
features_filepath = os.path.join(
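A side note on the reworked key-collection loop in verifyNewScript above: dict keys are already unique, so if cft.verify_new_script now returns a plain dict (which is what the rewritten loop suggests), the block could arguably shrink further. A minimal sketch under that assumption:

```python
# Assumes test_results is a plain dict (hypothetical simplification):
# its keys are already unique, so the membership check is redundant.
ks = list(test_results.keys())
```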
1 change: 1 addition & 0 deletions mltsp/TCP/Algorithms/fitcurve/lightcurve.py
@@ -1,3 +1,4 @@
+# TODO remove?
#!/usr/bin/env python

## Filename: lightcurve.py
1 change: 1 addition & 0 deletions mltsp/TCP/Algorithms/fitcurve/lomb_scargle_refine.py
@@ -1,3 +1,4 @@
+# TODO duplicate?
from numpy import empty,pi,sqrt,sin,cos,dot,where,arange,arctan2,array,diag,ix_,log10,outer,hstack,log,round,zeros
from scipy import weave
from lomb_scargle import lprob2sigma
@@ -101,6 +101,7 @@ def specific_obj(self,output):
pass
def extract(self): # delegates the actual implementation to subclasses
pass
+# TODO why try / except KeyError?
def set_names(self,where):
""" prepares the most commonly used inputs for easy access """
try:
Binary file not shown.
@@ -35,7 +35,7 @@
# # # 20100912 disabled due to lomb() call# from second_lomb_extractor import second_lomb_extractor
# # # 20100912 disabled due to lomb() call# from frequency_ratio_extractor import ratio21, ratio31, ratio32
#from example_extractor import example_extractor
-#from n_points_extractor import n_points_extractor
+from .n_points_extractor import n_points_extractor

#from position_intermediate_extractor import position_intermediate_extractor
#20101207#from galb_extractor import galb_extractor
@@ -12,6 +12,7 @@
from scipy import stats
import numpy

+# TODO scipy.stats.scoreatpercentile is deprecated; should use numpy.percentile
class amplitude_extractor(FeatureExtractor):
""" Returns the half the difference between the maximum magnitude and the minimum magnitude.
Note this will also work for data that is given in terms of flux. So in a sense, it's
@@ -33,6 +34,8 @@ def extract(self):

return amplitude/2.0

+# TODO what is up with this assumed_unit stuff? this seems well-defined for any units
+# in general, should we assume things are log-scale?
class percent_amplitude_extractor(FeatureExtractor):
""" Returns the largest percentage difference between the maximum
magnitude and the minimum magnitude relative to the median.
@@ -43,6 +46,7 @@ class percent_amplitude_extractor(FeatureExtractor):
extname = 'percent_amplitude' #extractor's name
assumed_unit = "mag" # 20100518: actually, I think a unitless percentage is returned.
def extract(self):
+# TODO is there really any reason to call these extractors instead of just Python functions...
maxm = self.fetch_extr('max') # maximum magnitude
minm = self.fetch_extr('min') # minimum magnitude
medd = self.fetch_extr("median")
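On the scoreatpercentile TODO above: numpy.percentile should be a drop-in replacement; both interpolate linearly between order statistics by default. A quick sketch with made-up data:

```python
import numpy as np
from scipy import stats

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])

# The deprecated scipy call and its numpy replacement agree.
assert np.isclose(stats.scoreatpercentile(x, 90), np.percentile(x, 90))
```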
@@ -26,6 +26,8 @@ def extract(self):
m = self.flux_data
n = len(t)

+# TODO could be more flexible here; do we always want to subtract the full LS fit?
+# maybe just the polynomial fit? looks like that's what the paper does
# get the LS model residuals (for principal frequency)
lomb_dict = self.fetch_extr('lomb_scargle')
model = lomb_dict.get('freq1_model',[])
@@ -1,5 +1,6 @@
from ..FeatureExtractor import FeatureExtractor

+# TODO simplify
class beyond1std_extractor(FeatureExtractor):
""" calculates the percentage of points that lie beyond one standard deviation from the weighted mean """
active = True
@@ -16,5 +17,6 @@ def extract(self):
except:
self.ex_error(text="beyond1std_extractor")
return ret_val
+# TODO why
def x_devs(self): # how many deviations ?
self.devs = 1.0
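On the "TODO simplify" above: the extractor's logic should reduce to a few numpy lines. A standalone sketch, assuming inverse-variance weights for the weighted mean (the weighting convention is not shown in this diff):

```python
import numpy as np

def beyond1std(mags, errs):
    """Fraction of points more than one std from the weighted mean (sketch)."""
    w = 1.0 / errs**2                    # assumed weighting convention
    wmean = np.average(mags, weights=w)  # weighted mean per the docstring
    std = np.std(mags)                   # plain std of the magnitudes (assumption)
    return np.mean(np.abs(mags - wmean) > std)
```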
@@ -6,6 +6,7 @@
except:
pass

+# TODO I think this is all pretty unnecessary, just use weighted least squares
class ChiSquare(object): #gives extractors the ability to calculate chi squares
def chi_square_sum(self,y,f,x=None,rms=None):
""" inputs: [y]-data (array) [f]unction (function), [x]-axis [rms] noise (array)"""
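On the weighted-least-squares TODO: the chi-square sums this mixin computes by hand are exactly what a weighted fit minimizes, e.g. via numpy.polyfit's w argument. A sketch with synthetic data (the 1/rms weighting is numpy's convention for Gaussian errors):

```python
import numpy as np

x = np.linspace(0.0, 1.0, 50)                   # hypothetical data
rms = np.full_like(x, 0.1)
y = 1.0 + 2.0 * x + np.random.normal(0.0, rms)

# Weighted linear fit: minimizes sum(((y - (a + b*x)) / rms)**2),
# i.e. the same chi-square sum that chi_square_sum computes.
b, a = np.polyfit(x, y, 1, w=1.0 / rms)
chi2 = np.sum(((y - (a + b * x)) / rms) ** 2)
```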
@@ -38,7 +38,8 @@
pass
import pprint

-import numpy
+import numpy as np
+# TODO use namespace
from numpy import *

from scipy import random
@@ -172,7 +173,7 @@ def make_psd_plot(self, psd=None, srcid=None, freqin=None):
try:
from matplotlib import pyplot as plt
### Plot the PSD(freq)
-psd_array = numpy.array(psd)
+psd_array = np.array(psd)
#import matplotlib
#matplotlib.use('PNG')
#import matplotlib.pyplot as plt
@@ -181,7 +182,7 @@

fig = plt.figure(figsize=(5,3), dpi=100)
ax2 = fig.add_subplot(211)
-ax2.plot(freqin, numpy.log10(psd_array))
+ax2.plot(freqin, np.log10(psd_array))
ax2.set_ylim(0,3)
ax2.set_yticks([3,2,1,0])
ax2.set_ylabel("Log10(psd)")
@@ -191,7 +192,7 @@
fontsize=10)
ax2.set_title(str(srcid) + " Non-prewhitened, Power Spectral Density", fontsize=9)
ax3 = fig.add_subplot(212)
-ax3.plot(numpy.log10(freqin), numpy.log10(psd_array))
+ax3.plot(np.log10(freqin), np.log10(psd_array))
ax3.set_ylim(-3,3)
ax3.set_xlim(-3,1)
ax3.set_yticks([3,2,1,0,-1,-2,-3])
@@ -251,6 +252,7 @@ def model_neg(t):
max_4_a = fmin(model_f, min_3_a + 0.01)[0]

try:
+# TODO is this wrong? seems like it should be a minus
out_dict['model_phi1_phi2'] = (min_3_a - max_2_a) / (max_4_a / min_3_a)
out_dict['model_min_delta_mags'] = abs(model_f(min_1_a) - model_f(min_3_a))
out_dict['model_max_delta_mags'] = abs(model_f(max_2_a) - model_f(max_4_a))
@@ -269,8 +271,8 @@

ax2.plot(t_plot, modl, 'go')

-ax2.plot([max_2_a, max_4_a], numpy.array([model_f(max_2_a), model_f(max_4_a)]) + y_median + 0.5, 'yo')
-ax2.plot([min_1_a, min_3_a], numpy.array([model_f(min_1_a), model_f(min_3_a)]) + y_median + 0.5, 'ko')
+ax2.plot([max_2_a, max_4_a], np.array([model_f(max_2_a), model_f(max_4_a)]) + y_median + 0.5, 'yo')
+ax2.plot([min_1_a, min_3_a], np.array([model_f(min_1_a), model_f(min_3_a)]) + y_median + 0.5, 'ko')

ax2.set_title("srcid=%d P=%f min_1=%f max_2=%f min_3=%f max_4=%f" % (srcid, 1. / freq1_freq, min_1_a, max_2_a, min_3_a, max_4_a), fontsize=9)
plt.savefig("/tmp/ggg.png")
@@ -306,6 +308,7 @@ def lomb_code(self, y, dy, x, sys_err=0.05, srcid=0):
#alias_std = std( x-x.round() )

Xmax = x.max()
+# TODO why? seems arbitrary; applies to this whole section...
f0 = 1./Xmax
df = 0.8/Xmax # 20120202 : 0.1/Xmax
fe = 33. #pre 20120126: 10. # 25
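On the "seems arbitrary" TODO just above: f0 and df do follow the usual Lomb-Scargle grid conventions, even if the constants are hand-tuned. A sketch of the reasoning (the oversampling factor and fe are survey-specific choices, not derived):

```python
import numpy as np

x = np.sort(np.random.uniform(0.0, 100.0, 200))  # hypothetical observation times

Xmax = x.max()     # total baseline of the light curve
f0 = 1.0 / Xmax    # lowest frequency: one cycle over the full baseline
df = 0.8 / Xmax    # grid step below 1/baseline, i.e. slight oversampling
fe = 33.0          # highest frequency searched (hand-tuned cutoff)
numf = int((fe - f0) / df)
freqs = f0 + df * np.arange(numf)
```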
@@ -329,7 +332,7 @@
for i in range(num_freq_comps):
if (i==0):
psd,res = lombr(x,ytest,dy0,f0,df,numf, tone_control=tone_control,
-        lambda0_range=lambda0_range, nharm=nharm, detrend_order=1)
+                        lambda0_range=lambda0_range, nharm=nharm, detrend_order=1)
### I think it still makes sense to set these here, even though freq1 may be replaced by another non-alias freq. This is because these are parameters that are derived from the first prewhitening application:
out_dict['lambda'] = res['lambda0'] # 20120206 added
out_dict['chi0'] = res['chi0']
@@ -367,14 +370,15 @@ def lomb_code(self, y, dy, x, sys_err=0.05, srcid=0):
freq_dict['harmonics_time_offset'] = res['time0']
freq_dict['harmonics_y_offset'] = res['cn0'] # 20110429: disable since it was previously mean subtracted and not useful, and not mean subtracted is avg-mag and essentially survey biased # out_dict['cn0']

+# TODO what is going on here? also, decouple
### Here we check for "1-day" aliases in ASAS / Deboss sources
dstr_alias = []
dstr_all = ["freq%i" % (i + 1) for i in range(num_freq_comps)]
### 20120223 co:
#for dstr in dstr_all:
# period = 1./out_dict[dstr]['frequency']
# if (((period >= 0.93) and (period <= 1.07) and
-# (out_dict[dstr]['signif'] < (3.771221/numpy.power(numpy.abs(period - 1.), 0.25) + 3.293027))) or
+# (out_dict[dstr]['signif'] < (3.771221/np.power(np.abs(period - 1.), 0.25) + 3.293027))) or
# ((period >= 0.485) and (period <= 0.515) and (out_dict[dstr]['signif'] < 10.0)) or
# ((period >= 0.325833333) and (period <= 0.340833333) and (out_dict[dstr]['signif'] < 8.0))):
# dstr_alias.append(dstr) # this frequency has a "1 day" alias (or 0.5 or 0.33
@@ -406,7 +410,7 @@ def lomb_code(self, y, dy, x, sys_err=0.05, srcid=0):
for a in alias:
if ((period >= a['p_low']) and
(period <= a['p_high']) and
-(out_dict[dstr]['signif'] < (a['alpha_1']/numpy.power(numpy.abs(period - a['per']), 0.25) + a['alpha_2']))):
+(out_dict[dstr]['signif'] < (a['alpha_1']/np.power(np.abs(period - a['per']), 0.25) + a['alpha_2']))):
dstr_alias.append(dstr) # this frequency has a "1 day" alias (or 0.5 or 0.33
break # only need to do this once per period, if an alias is found.

@@ -448,34 +452,34 @@ def lomb_code(self, y, dy, x, sys_err=0.05, srcid=0):
tups = list(zip(t_2per_fold, y))#, range(len(t_2per_fold)))
tups.sort()
t_2fold, m_2fold = zip(*tups) #So: m_2fold[30] == y[i_fold[30]]
-m_2fold_array = numpy.array(m_2fold)
-sumsqr_diff_folded = numpy.sum((m_2fold_array[1:] - m_2fold_array[:-1])**2)
-sumsqr_diff_unfold = numpy.sum((y[1:] - y[:-1])**2)
+m_2fold_array = np.array(m_2fold)
+sumsqr_diff_folded = np.sum((m_2fold_array[1:] - m_2fold_array[:-1])**2)
+sumsqr_diff_unfold = np.sum((y[1:] - y[:-1])**2)
p2p_scatter_2praw = sumsqr_diff_folded / sumsqr_diff_unfold
out_dict['p2p_scatter_2praw'] = p2p_scatter_2praw

-mad = numpy.median(numpy.abs(y - median(y)))
-out_dict['p2p_scatter_over_mad'] = numpy.median(numpy.abs(y[1:] - y[:-1])) / mad
+mad = np.median(np.abs(y - median(y)))
+out_dict['p2p_scatter_over_mad'] = np.median(np.abs(y[1:] - y[:-1])) / mad

### eta feature from arXiv 1101.3316 Kim QSO paper:
-out_dict['p2p_ssqr_diff_over_var'] = sumsqr_diff_unfold / ((len(y) - 1) * numpy.var(y))
+out_dict['p2p_ssqr_diff_over_var'] = sumsqr_diff_unfold / ((len(y) - 1) * np.var(y))

t_1per_fold = x % (1./out_dict['freq1']['frequency'])
tups = list(zip(t_1per_fold, y))#, range(len(t_2per_fold)))
tups.sort()
t_1fold, m_1fold = zip(*tups) #So: m_1fold[30] == y[i_fold[30]]
-m_1fold_array = numpy.array(m_1fold)
+m_1fold_array = np.array(m_1fold)
out_dict['p2p_scatter_pfold_over_mad'] = \
-numpy.median(numpy.abs(m_1fold_array[1:] - m_1fold_array[:-1])) / mad
+np.median(np.abs(m_1fold_array[1:] - m_1fold_array[:-1])) / mad

######################## # # #
### This section is used to calculate Dubath (10. Percentile90:2P/P)
### Which requires regenerating a model using 2P where P is the original found period
### NOTE: this essentially runs everything a second time, so makes feature
### generation take roughly twice as long.

-model_vals = numpy.zeros(len(y))
-#all_model_vals = numpy.zeros(len(y))
+model_vals = np.zeros(len(y))
+#all_model_vals = np.zeros(len(y))
freq_2p = out_dict['freq1']['frequency'] * 0.5
ytest_2p=1.*y # makes a copy of the array

@@ -496,8 +500,8 @@
#all_model_vals += res['model']
ytest_2p -= res['model']

-out_dict['medperc90_2p_p'] = scoreatpercentile(numpy.abs(ytest_2p), 90) / \
-    scoreatpercentile(numpy.abs(ytest), 90)
+out_dict['medperc90_2p_p'] = scoreatpercentile(np.abs(ytest_2p), 90) / \
+    scoreatpercentile(np.abs(ytest), 90)

some_feats = self.get_2P_modeled_features(x=x, y=y, freq1_freq=out_dict['freq1']['frequency'], srcid=srcid, ls_dict=out_dict)
out_dict.update(some_feats)
Expand All @@ -516,11 +520,15 @@ def lomb_code(self, y, dy, x, sys_err=0.05, srcid=0):

t_2per_fold = x % (1/freq_2p)
tups = list(zip(t_2per_fold, model_vals))
+# TODO why?
tups.sort()
t_2fold, m_2fold = zip(*tups)
-t_2fold_array = numpy.array(t_2fold)
-m_2fold_array = numpy.array(m_2fold)
+t_2fold_array = np.array(t_2fold)
+m_2fold_array = np.array(m_2fold)
slopes = (m_2fold_array[1:] - m_2fold_array[:-1]) / (t_2fold_array[1:] - t_2fold_array[:-1])
+# TODO why not just get the (almost) steepest positive/negative slopes directly?
+#      wrong for strictly increasing/decreasing
+# TODO why is this here instead of another extractor...
out_dict['fold2P_slope_10percentile'] = scoreatpercentile(slopes,10) # this gets the steepest negative slope?
out_dict['fold2P_slope_90percentile'] = scoreatpercentile(slopes,90) # this gets the steepest positive slope?
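On the slope TODOs above: a plausible reason for the 10th/90th percentiles rather than min/max is robustness; a single pair of adjacent points across a phase gap can produce an absurd slope. A sketch contrasting the two readings (made-up folded model):

```python
import numpy as np

t = np.sort(np.random.uniform(0.0, 1.0, 100))  # folded times (hypothetical)
m = np.sin(2.0 * np.pi * t)                    # folded model values

slopes = np.diff(m) / np.diff(t)

robust_neg = np.percentile(slopes, 10)  # ~steepest negative slope, outlier-resistant
robust_pos = np.percentile(slopes, 90)  # ~steepest positive slope
extremes = slopes.min(), slopes.max()   # direct extremes, noise-sensitive
```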

@@ -658,9 +666,9 @@ def get_source_arrays(self, source_id):

src_dict = {}
src_dict['src_id'] = results[0][0]
-src_dict['t'] = numpy.array(t_list)
-src_dict['m'] = numpy.array(m_list)
-src_dict['m_err'] = numpy.array(merr_list)
+src_dict['t'] = np.array(t_list)
+src_dict['m'] = np.array(m_list)
+src_dict['m_err'] = np.array(merr_list)
#src_dict['m_err'] = [] #/@/ Justin changed 20090804
return src_dict

@@ -722,9 +730,9 @@ def get_source_arrays__dotastro(self, source_id):

src_dict = {}
src_dict['src_id'] = source_id
-src_dict['t'] = numpy.array(t_list)
-src_dict['m'] = numpy.array(m_list)
-src_dict['m_err'] = numpy.array(merr_list)
+src_dict['t'] = np.array(t_list)
+src_dict['m'] = np.array(m_list)
+src_dict['m_err'] = np.array(merr_list)
#src_dict['m_err'] = [] #/@/ Justin changed 20090804
return src_dict

@@ -814,9 +822,9 @@ def using_features_generate_resampled(self, src_dict):
# plot_period = 1.0 / freq_list[0]
# x_axis = arange(0,plot_period, .01)
#
-# y_axis= amp_list[0]*sin(2*numpy.pi*freq_list[0]*(x_axis-rel_phase[0]))+\
-#        amp_list[1]*sin(2*numpy.pi*freq_list[1]*(x_axis-rel_phase[1]))+\
-#        amp_list[2]*sin(2*numpy.pi*freq_list[2]*(x_axis-rel_phase[2]))
+# y_axis= amp_list[0]*sin(2*np.pi*freq_list[0]*(x_axis-rel_phase[0]))+\
+#        amp_list[1]*sin(2*np.pi*freq_list[1]*(x_axis-rel_phase[1]))+\
+#        amp_list[2]*sin(2*np.pi*freq_list[2]*(x_axis-rel_phase[2]))
#
# feature_resampled_dict = {'feature_resampled':{'t':x_axis, 'm':y_axis,
# 'color':self.pars['color_feature_resampled']}}
@@ -845,16 +853,16 @@ def using_features_generate_resampled(self, src_dict):
m_offset_kludge = amp_kludge/2. + min(src_dict['m'])

# 20090720: dstarr replaces the following with something 4-6 lines below
-#y_axis= (amp_list[0]*sin(2*numpy.pi*freq_list[0]*(x_axis-rel_phase[0]))+\
-#        amp_list[1]*sin(2*numpy.pi*freq_list[1]*(x_axis-rel_phase[1]))+\
-#        amp_list[2]*sin(2*numpy.pi*freq_list[2]*(x_axis-rel_phase[2])))
-y_axis= (amp_list[0]*sin(2*numpy.pi*freq_list[0]*(x_axis)))+\
-        amp_list[1]*sin(2*numpy.pi*freq_list[1]*(x_axis) + rel_phase[1])+\
-        amp_list[2]*sin(2*numpy.pi*freq_list[2]*(x_axis) + rel_phase[2])
-
-y_axis= (amp_list[0]*sin(2*numpy.pi*freq_list[0]*(x_axis) + rel_phase[0]))+\
-        amp_list[1]*sin(2*numpy.pi*freq_list[1]*(x_axis) + rel_phase[1])+\
-        amp_list[2]*sin(2*numpy.pi*freq_list[2]*(x_axis) + rel_phase[2])
+#y_axis= (amp_list[0]*sin(2*np.pi*freq_list[0]*(x_axis-rel_phase[0]))+\
+#        amp_list[1]*sin(2*np.pi*freq_list[1]*(x_axis-rel_phase[1]))+\
+#        amp_list[2]*sin(2*np.pi*freq_list[2]*(x_axis-rel_phase[2])))
+y_axis= (amp_list[0]*sin(2*np.pi*freq_list[0]*(x_axis)))+\
+        amp_list[1]*sin(2*np.pi*freq_list[1]*(x_axis) + rel_phase[1])+\
+        amp_list[2]*sin(2*np.pi*freq_list[2]*(x_axis) + rel_phase[2])
+
+y_axis= (amp_list[0]*sin(2*np.pi*freq_list[0]*(x_axis) + rel_phase[0]))+\
+        amp_list[1]*sin(2*np.pi*freq_list[1]*(x_axis) + rel_phase[1])+\
+        amp_list[2]*sin(2*np.pi*freq_list[2]*(x_axis) + rel_phase[2])

amp_sampled_m = max(y_axis) - min(y_axis)
amp_sampled_offset = amp_sampled_m / 2. + min(y_axis)
@@ -1,5 +1,6 @@
from numpy import ones,sqrt

+# TODO don't see any reason to use this instead of polyfit from numpy
def linfit(x,y,dy=[]):
"""
m = a+b*x
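On the polyfit TODO: numpy.polyfit covers both forms of linfit, with the optional dy mapping onto polyfit's w argument (w = 1/dy for Gaussian errors). A sketch with hypothetical data:

```python
import numpy as np

x = np.linspace(0.0, 10.0, 30)
dy = np.full_like(x, 0.2)
y = 0.5 + 1.5 * x + np.random.normal(0.0, dy)

# polyfit returns coefficients highest degree first: [b, a] for m = a + b*x.
b, a = np.polyfit(x, y, 1)                  # unweighted, as linfit(x, y)
b_w, a_w = np.polyfit(x, y, 1, w=1.0 / dy)  # weighted, as linfit(x, y, dy)
```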