Feature generation refactor (closes gh-32) #70

Merged Oct 1, 2015 · 9 commits
Changes from 8 commits
14 changes: 7 additions & 7 deletions mltsp/Flask/flask_app.py
@@ -42,7 +42,7 @@
from .. import predict_class as predict
from .. import build_model

-all_available_features_list = cfg.features_list + cfg.features_list_science
+all_available_features_list = cfg.features_list_obs + cfg.features_list_science

# Flask initialization
app = Flask(__name__, static_folder=None)
@@ -1621,7 +1621,7 @@ def get_list_of_available_features():

def get_list_of_available_features_set2():
"""Return list of additional built-in time-series features."""
-return sorted([feat for feat in cfg.features_list if feat not in
+return sorted([feat for feat in cfg.features_list_obs if feat not in
cfg.ignore_feats_list_science])


@@ -2157,10 +2157,10 @@ def verifyNewScript():
try:
test_results = cft.verify_new_script(script_fpath=scriptfile_path)
ks = []
-for thisone in test_results:
-    for k, v in thisone.items():
-        if k not in ks:
-            ks.append(k)
+# for thisone in test_results:
+for k, v in test_results.items():
+    if k not in ks:
+        ks.append(k)
res_str = ""
for k in ks:
res_str += (("<input type='checkbox' value='%s' "
@@ -2664,7 +2664,7 @@ def featurizationPage(

"""
projkey = project_name_to_key(project_name)
-if already_featurized == True and zipfile_name is None:
+if already_featurized and zipfile_name is None:
# User is uploading pre-featurized data, without time-series data
features_filename = headerfile_name
features_filepath = os.path.join(
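A side note on the reworked key-collection loop in verifyNewScript above: dict keys are already unique, so if cft.verify_new_script now returns a plain dict (which is what the rewritten loop suggests), the block could arguably shrink further. A minimal sketch under that assumption:

```python
# Assumes test_results is a plain dict (hypothetical simplification):
# its keys are already unique, so the membership check is redundant.
ks = list(test_results.keys())
```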
1 change: 1 addition & 0 deletions mltsp/TCP/Algorithms/fitcurve/lightcurve.py
@@ -1,3 +1,4 @@
+# TODO remove?
#!/usr/bin/env python

## Filename: lightcurve.py
1 change: 1 addition & 0 deletions mltsp/TCP/Algorithms/fitcurve/lomb_scargle_refine.py
@@ -1,3 +1,4 @@
+# TODO duplicate?
from numpy import empty,pi,sqrt,sin,cos,dot,where,arange,arctan2,array,diag,ix_,log10,outer,hstack,log,round,zeros
from scipy import weave
from lomb_scargle import lprob2sigma
@@ -101,6 +101,7 @@ def specific_obj(self,output):
pass
def extract(self): # delegates the actual implementation to subclasses
pass
+# TODO why try / except KeyError?
def set_names(self,where):
""" prepares the most commonly used inputs for easy access """
try:
Binary file not shown.
@@ -35,7 +35,7 @@
# # # 20100912 disabled due to lomb() call# from second_lomb_extractor import second_lomb_extractor
# # # 20100912 disabled due to lomb() call# from frequency_ratio_extractor import ratio21, ratio31, ratio32
#from example_extractor import example_extractor
-#from n_points_extractor import n_points_extractor
+from .n_points_extractor import n_points_extractor

#from position_intermediate_extractor import position_intermediate_extractor
#20101207#from galb_extractor import galb_extractor
@@ -12,6 +12,7 @@
from scipy import stats
import numpy

+# TODO scipy.stats.scoreatpercentile is deprecated; should use numpy.percentile
class amplitude_extractor(FeatureExtractor):
""" Returns the half the difference between the maximum magnitude and the minimum magnitude.
Note this will also work for data that is given in terms of flux. So in a sense, it's
@@ -33,6 +34,8 @@ def extract(self):

return amplitude/2.0

+# TODO what is up with this assumed_unit stuff? this seems well-defined for any units
+# in general, should we assume things are log-scale?
class percent_amplitude_extractor(FeatureExtractor):
""" Returns the largest percentage difference between the maximum
magnitude and the minimum magnitude relative to the median.
@@ -43,6 +46,7 @@ class percent_amplitude_extractor(FeatureExtractor):
extname = 'percent_amplitude' #extractor's name
assumed_unit = "mag" # 20100518: actually, I think a unitless percentage is returned.
def extract(self):
+# TODO is there really any reason to call these extractors instead of just Python functions...
maxm = self.fetch_extr('max') # maximum magnitude
minm = self.fetch_extr('min') # minimum magnitude
medd = self.fetch_extr("median")
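On the scoreatpercentile TODO above: numpy.percentile should be a drop-in replacement; both interpolate linearly between order statistics by default. A quick sketch with made-up data:

```python
import numpy as np
from scipy import stats

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])

# The deprecated scipy call and its numpy replacement agree.
assert np.isclose(stats.scoreatpercentile(x, 90), np.percentile(x, 90))
```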
@@ -26,6 +26,8 @@ def extract(self):
m = self.flux_data
n = len(t)

+# TODO could be more flexible here; do we always want to subtract the full LS fit?
+# maybe just the polynomial fit? looks like that's what the paper does
# get the LS model residuals (for principal frequency)
lomb_dict = self.fetch_extr('lomb_scargle')
model = lomb_dict.get('freq1_model',[])
@@ -1,5 +1,6 @@
from ..FeatureExtractor import FeatureExtractor

+# TODO simplify
class beyond1std_extractor(FeatureExtractor):
""" calculates the percentage of points that lie beyond one standard deviation from the weighted mean """
active = True
@@ -16,5 +17,6 @@ def extract(self):
except:
self.ex_error(text="beyond1std_extractor")
return ret_val
+# TODO why
def x_devs(self): # how many deviations ?
self.devs = 1.0
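On the "TODO simplify" above: the extractor's logic should reduce to a few numpy lines. A standalone sketch, assuming inverse-variance weights for the weighted mean (the weighting convention is not shown in this diff):

```python
import numpy as np

def beyond1std(mags, errs):
    """Fraction of points more than one std from the weighted mean (sketch)."""
    w = 1.0 / errs**2                    # assumed weighting convention
    wmean = np.average(mags, weights=w)  # weighted mean per the docstring
    std = np.std(mags)                   # plain std of the magnitudes (assumption)
    return np.mean(np.abs(mags - wmean) > std)
```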
@@ -6,6 +6,7 @@
except:
pass

+# TODO I think this is all pretty unnecessary, just use weighted least squares
class ChiSquare(object): #gives extractors the ability to calculate chi squares
def chi_square_sum(self,y,f,x=None,rms=None):
""" inputs: [y]-data (array) [f]unction (function), [x]-axis [rms] noise (array)"""
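On the weighted-least-squares TODO: the chi-square sums this mixin computes by hand are exactly what a weighted fit minimizes, e.g. via numpy.polyfit's w argument. A sketch with synthetic data (the 1/rms weighting is numpy's convention for Gaussian errors):

```python
import numpy as np

x = np.linspace(0.0, 1.0, 50)                   # hypothetical data
rms = np.full_like(x, 0.1)
y = 1.0 + 2.0 * x + np.random.normal(0.0, rms)

# Weighted linear fit: minimizes sum(((y - (a + b*x)) / rms)**2),
# i.e. the same chi-square sum that chi_square_sum computes.
b, a = np.polyfit(x, y, 1, w=1.0 / rms)
chi2 = np.sum(((y - (a + b * x)) / rms) ** 2)
```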
@@ -38,7 +38,8 @@
pass
import pprint

-import numpy
+import numpy as np
+# TODO use namespace
from numpy import *

from scipy import random
@@ -172,7 +173,7 @@ def make_psd_plot(self, psd=None, srcid=None, freqin=None):
try:
from matplotlib import pyplot as plt
### Plot the PSD(freq)
-psd_array = numpy.array(psd)
+psd_array = np.array(psd)
#import matplotlib
#matplotlib.use('PNG')
#import matplotlib.pyplot as plt
@@ -181,7 +182,7 @@

fig = plt.figure(figsize=(5,3), dpi=100)
ax2 = fig.add_subplot(211)
-ax2.plot(freqin, numpy.log10(psd_array))
+ax2.plot(freqin, np.log10(psd_array))
ax2.set_ylim(0,3)
ax2.set_yticks([3,2,1,0])
ax2.set_ylabel("Log10(psd)")
@@ -191,7 +192,7 @@
fontsize=10)
ax2.set_title(str(srcid) + " Non-prewhitened, Power Spectral Density", fontsize=9)
ax3 = fig.add_subplot(212)
-ax3.plot(numpy.log10(freqin), numpy.log10(psd_array))
+ax3.plot(np.log10(freqin), np.log10(psd_array))
ax3.set_ylim(-3,3)
ax3.set_xlim(-3,1)
ax3.set_yticks([3,2,1,0,-1,-2,-3])
@@ -251,6 +252,7 @@ def model_neg(t):
max_4_a = fmin(model_f, min_3_a + 0.01)[0]

try:
+# TODO is this wrong? seems like it should be a minus
out_dict['model_phi1_phi2'] = (min_3_a - max_2_a) / (max_4_a / min_3_a)
out_dict['model_min_delta_mags'] = abs(model_f(min_1_a) - model_f(min_3_a))
out_dict['model_max_delta_mags'] = abs(model_f(max_2_a) - model_f(max_4_a))
@@ -269,8 +271,8 @@

ax2.plot(t_plot, modl, 'go')

-ax2.plot([max_2_a, max_4_a], numpy.array([model_f(max_2_a), model_f(max_4_a)]) + y_median + 0.5, 'yo')
-ax2.plot([min_1_a, min_3_a], numpy.array([model_f(min_1_a), model_f(min_3_a)]) + y_median + 0.5, 'ko')
+ax2.plot([max_2_a, max_4_a], np.array([model_f(max_2_a), model_f(max_4_a)]) + y_median + 0.5, 'yo')
+ax2.plot([min_1_a, min_3_a], np.array([model_f(min_1_a), model_f(min_3_a)]) + y_median + 0.5, 'ko')

ax2.set_title("srcid=%d P=%f min_1=%f max_2=%f min_3=%f max_4=%f" % (srcid, 1. / freq1_freq, min_1_a, max_2_a, min_3_a, max_4_a), fontsize=9)
plt.savefig("/tmp/ggg.png")
@@ -306,6 +308,7 @@ def lomb_code(self, y, dy, x, sys_err=0.05, srcid=0):
#alias_std = std( x-x.round() )

Xmax = x.max()
+# TODO why? seems arbitrary; applies to this whole section...
f0 = 1./Xmax
df = 0.8/Xmax # 20120202 : 0.1/Xmax
fe = 33. #pre 20120126: 10. # 25
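On the "seems arbitrary" TODO just above: f0 and df do follow the usual Lomb-Scargle grid conventions, even if the constants are hand-tuned. A sketch of the reasoning (the oversampling factor and fe are survey-specific choices, not derived):

```python
import numpy as np

x = np.sort(np.random.uniform(0.0, 100.0, 200))  # hypothetical observation times

Xmax = x.max()     # total baseline of the light curve
f0 = 1.0 / Xmax    # lowest frequency: one cycle over the full baseline
df = 0.8 / Xmax    # grid step below 1/baseline, i.e. slight oversampling
fe = 33.0          # highest frequency searched (hand-tuned cutoff)
numf = int((fe - f0) / df)
freqs = f0 + df * np.arange(numf)
```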
@@ -329,7 +332,7 @@
for i in range(num_freq_comps):
if (i==0):
psd,res = lombr(x,ytest,dy0,f0,df,numf, tone_control=tone_control,
-        lambda0_range=lambda0_range, nharm=nharm, detrend_order=1)
+                        lambda0_range=lambda0_range, nharm=nharm, detrend_order=1)
### I think it still makes sense to set these here, even though freq1 may be replaced by another non-alias freq. This is because these are parameters that are derived from the first prewhitening application:
out_dict['lambda'] = res['lambda0'] # 20120206 added
out_dict['chi0'] = res['chi0']
@@ -367,14 +370,15 @@ def lomb_code(self, y, dy, x, sys_err=0.05, srcid=0):
freq_dict['harmonics_time_offset'] = res['time0']
freq_dict['harmonics_y_offset'] = res['cn0'] # 20110429: disable since it was previously mean subtracted and not useful, and not mean subtracted is avg-mag and essentially survey biased # out_dict['cn0']

+# TODO what is going on here? also, decouple
### Here we check for "1-day" aliases in ASAS / Deboss sources
dstr_alias = []
dstr_all = ["freq%i" % (i + 1) for i in range(num_freq_comps)]
### 20120223 co:
#for dstr in dstr_all:
# period = 1./out_dict[dstr]['frequency']
# if (((period >= 0.93) and (period <= 1.07) and
-# (out_dict[dstr]['signif'] < (3.771221/numpy.power(numpy.abs(period - 1.), 0.25) + 3.293027))) or
+# (out_dict[dstr]['signif'] < (3.771221/np.power(np.abs(period - 1.), 0.25) + 3.293027))) or
# ((period >= 0.485) and (period <= 0.515) and (out_dict[dstr]['signif'] < 10.0)) or
# ((period >= 0.325833333) and (period <= 0.340833333) and (out_dict[dstr]['signif'] < 8.0))):
# dstr_alias.append(dstr) # this frequency has a "1 day" alias (or 0.5 or 0.33
@@ -406,7 +410,7 @@ def lomb_code(self, y, dy, x, sys_err=0.05, srcid=0):
for a in alias:
if ((period >= a['p_low']) and
(period <= a['p_high']) and
-(out_dict[dstr]['signif'] < (a['alpha_1']/numpy.power(numpy.abs(period - a['per']), 0.25) + a['alpha_2']))):
+(out_dict[dstr]['signif'] < (a['alpha_1']/np.power(np.abs(period - a['per']), 0.25) + a['alpha_2']))):
dstr_alias.append(dstr) # this frequency has a "1 day" alias (or 0.5 or 0.33
break # only need to do this once per period, if an alias is found.

@@ -448,34 +452,34 @@ def lomb_code(self, y, dy, x, sys_err=0.05, srcid=0):
tups = list(zip(t_2per_fold, y))#, range(len(t_2per_fold)))
tups.sort()
t_2fold, m_2fold = zip(*tups) #So: m_2fold[30] == y[i_fold[30]]
-m_2fold_array = numpy.array(m_2fold)
-sumsqr_diff_folded = numpy.sum((m_2fold_array[1:] - m_2fold_array[:-1])**2)
-sumsqr_diff_unfold = numpy.sum((y[1:] - y[:-1])**2)
+m_2fold_array = np.array(m_2fold)
+sumsqr_diff_folded = np.sum((m_2fold_array[1:] - m_2fold_array[:-1])**2)
+sumsqr_diff_unfold = np.sum((y[1:] - y[:-1])**2)
p2p_scatter_2praw = sumsqr_diff_folded / sumsqr_diff_unfold
out_dict['p2p_scatter_2praw'] = p2p_scatter_2praw

-mad = numpy.median(numpy.abs(y - median(y)))
-out_dict['p2p_scatter_over_mad'] = numpy.median(numpy.abs(y[1:] - y[:-1])) / mad
+mad = np.median(np.abs(y - median(y)))
+out_dict['p2p_scatter_over_mad'] = np.median(np.abs(y[1:] - y[:-1])) / mad

### eta feature from arXiv 1101.3316 Kim QSO paper:
-out_dict['p2p_ssqr_diff_over_var'] = sumsqr_diff_unfold / ((len(y) - 1) * numpy.var(y))
+out_dict['p2p_ssqr_diff_over_var'] = sumsqr_diff_unfold / ((len(y) - 1) * np.var(y))

t_1per_fold = x % (1./out_dict['freq1']['frequency'])
tups = list(zip(t_1per_fold, y))#, range(len(t_2per_fold)))
tups.sort()
t_1fold, m_1fold = zip(*tups) #So: m_1fold[30] == y[i_fold[30]]
-m_1fold_array = numpy.array(m_1fold)
+m_1fold_array = np.array(m_1fold)
out_dict['p2p_scatter_pfold_over_mad'] = \
-numpy.median(numpy.abs(m_1fold_array[1:] - m_1fold_array[:-1])) / mad
+np.median(np.abs(m_1fold_array[1:] - m_1fold_array[:-1])) / mad

######################## # # #
### This section is used to calculate Dubath (10. Percentile90:2P/P)
### Which requires regenerating a model using 2P where P is the original found period
### NOTE: this essentially runs everything a second time, so makes feature
### generation take roughly twice as long.

-model_vals = numpy.zeros(len(y))
-#all_model_vals = numpy.zeros(len(y))
+model_vals = np.zeros(len(y))
+#all_model_vals = np.zeros(len(y))
freq_2p = out_dict['freq1']['frequency'] * 0.5
ytest_2p=1.*y # makes a copy of the array

@@ -496,8 +500,8 @@
#all_model_vals += res['model']
ytest_2p -= res['model']

-out_dict['medperc90_2p_p'] = scoreatpercentile(numpy.abs(ytest_2p), 90) / \
-    scoreatpercentile(numpy.abs(ytest), 90)
+out_dict['medperc90_2p_p'] = scoreatpercentile(np.abs(ytest_2p), 90) / \
+    scoreatpercentile(np.abs(ytest), 90)

some_feats = self.get_2P_modeled_features(x=x, y=y, freq1_freq=out_dict['freq1']['frequency'], srcid=srcid, ls_dict=out_dict)
out_dict.update(some_feats)
Expand All @@ -516,11 +520,15 @@ def lomb_code(self, y, dy, x, sys_err=0.05, srcid=0):

t_2per_fold = x % (1/freq_2p)
tups = list(zip(t_2per_fold, model_vals))
+# TODO why?
tups.sort()
t_2fold, m_2fold = zip(*tups)
-t_2fold_array = numpy.array(t_2fold)
-m_2fold_array = numpy.array(m_2fold)
+t_2fold_array = np.array(t_2fold)
+m_2fold_array = np.array(m_2fold)
slopes = (m_2fold_array[1:] - m_2fold_array[:-1]) / (t_2fold_array[1:] - t_2fold_array[:-1])
+# TODO why not just get the (almost) steepest positive/negative slopes directly?
+#      wrong for strictly increasing/decreasing
+# TODO why is this here instead of another extractor...
out_dict['fold2P_slope_10percentile'] = scoreatpercentile(slopes,10) # this gets the steepest negative slope?
out_dict['fold2P_slope_90percentile'] = scoreatpercentile(slopes,90) # this gets the steepest positive slope?
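On the slope TODOs above: a plausible reason for the 10th/90th percentiles rather than min/max is robustness; a single pair of adjacent points across a phase gap can produce an absurd slope. A sketch contrasting the two readings (made-up folded model):

```python
import numpy as np

t = np.sort(np.random.uniform(0.0, 1.0, 100))  # folded times (hypothetical)
m = np.sin(2.0 * np.pi * t)                    # folded model values

slopes = np.diff(m) / np.diff(t)

robust_neg = np.percentile(slopes, 10)  # ~steepest negative slope, outlier-resistant
robust_pos = np.percentile(slopes, 90)  # ~steepest positive slope
extremes = slopes.min(), slopes.max()   # direct extremes, noise-sensitive
```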

@@ -658,9 +666,9 @@ def get_source_arrays(self, source_id):

src_dict = {}
src_dict['src_id'] = results[0][0]
-src_dict['t'] = numpy.array(t_list)
-src_dict['m'] = numpy.array(m_list)
-src_dict['m_err'] = numpy.array(merr_list)
+src_dict['t'] = np.array(t_list)
+src_dict['m'] = np.array(m_list)
+src_dict['m_err'] = np.array(merr_list)
#src_dict['m_err'] = [] #/@/ Justin changed 20090804
return src_dict

@@ -722,9 +730,9 @@ def get_source_arrays__dotastro(self, source_id):

src_dict = {}
src_dict['src_id'] = source_id
-src_dict['t'] = numpy.array(t_list)
-src_dict['m'] = numpy.array(m_list)
-src_dict['m_err'] = numpy.array(merr_list)
+src_dict['t'] = np.array(t_list)
+src_dict['m'] = np.array(m_list)
+src_dict['m_err'] = np.array(merr_list)
#src_dict['m_err'] = [] #/@/ Justin changed 20090804
return src_dict

@@ -814,9 +822,9 @@ def using_features_generate_resampled(self, src_dict):
# plot_period = 1.0 / freq_list[0]
# x_axis = arange(0,plot_period, .01)
#
-# y_axis= amp_list[0]*sin(2*numpy.pi*freq_list[0]*(x_axis-rel_phase[0]))+\
-#        amp_list[1]*sin(2*numpy.pi*freq_list[1]*(x_axis-rel_phase[1]))+\
-#        amp_list[2]*sin(2*numpy.pi*freq_list[2]*(x_axis-rel_phase[2]))
+# y_axis= amp_list[0]*sin(2*np.pi*freq_list[0]*(x_axis-rel_phase[0]))+\
+#        amp_list[1]*sin(2*np.pi*freq_list[1]*(x_axis-rel_phase[1]))+\
+#        amp_list[2]*sin(2*np.pi*freq_list[2]*(x_axis-rel_phase[2]))
#
# feature_resampled_dict = {'feature_resampled':{'t':x_axis, 'm':y_axis,
# 'color':self.pars['color_feature_resampled']}}
@@ -845,16 +853,16 @@ def using_features_generate_resampled(self, src_dict):
m_offset_kludge = amp_kludge/2. + min(src_dict['m'])

# 20090720: dstarr replaces the following with something 4-6 lines below
-#y_axis= (amp_list[0]*sin(2*numpy.pi*freq_list[0]*(x_axis-rel_phase[0]))+\
-#        amp_list[1]*sin(2*numpy.pi*freq_list[1]*(x_axis-rel_phase[1]))+\
-#        amp_list[2]*sin(2*numpy.pi*freq_list[2]*(x_axis-rel_phase[2])))
-y_axis= (amp_list[0]*sin(2*numpy.pi*freq_list[0]*(x_axis)))+\
-        amp_list[1]*sin(2*numpy.pi*freq_list[1]*(x_axis) + rel_phase[1])+\
-        amp_list[2]*sin(2*numpy.pi*freq_list[2]*(x_axis) + rel_phase[2])
-
-y_axis= (amp_list[0]*sin(2*numpy.pi*freq_list[0]*(x_axis) + rel_phase[0]))+\
-        amp_list[1]*sin(2*numpy.pi*freq_list[1]*(x_axis) + rel_phase[1])+\
-        amp_list[2]*sin(2*numpy.pi*freq_list[2]*(x_axis) + rel_phase[2])
+#y_axis= (amp_list[0]*sin(2*np.pi*freq_list[0]*(x_axis-rel_phase[0]))+\
+#        amp_list[1]*sin(2*np.pi*freq_list[1]*(x_axis-rel_phase[1]))+\
+#        amp_list[2]*sin(2*np.pi*freq_list[2]*(x_axis-rel_phase[2])))
+y_axis= (amp_list[0]*sin(2*np.pi*freq_list[0]*(x_axis)))+\
+        amp_list[1]*sin(2*np.pi*freq_list[1]*(x_axis) + rel_phase[1])+\
+        amp_list[2]*sin(2*np.pi*freq_list[2]*(x_axis) + rel_phase[2])
+
+y_axis= (amp_list[0]*sin(2*np.pi*freq_list[0]*(x_axis) + rel_phase[0]))+\
+        amp_list[1]*sin(2*np.pi*freq_list[1]*(x_axis) + rel_phase[1])+\
+        amp_list[2]*sin(2*np.pi*freq_list[2]*(x_axis) + rel_phase[2])

amp_sampled_m = max(y_axis) - min(y_axis)
amp_sampled_offset = amp_sampled_m / 2. + min(y_axis)
@@ -1,5 +1,6 @@
from numpy import ones,sqrt

+# TODO don't see any reason to use this instead of polyfit from numpy
def linfit(x,y,dy=[]):
"""
m = a+b*x
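On the polyfit TODO: numpy.polyfit covers both forms of linfit, with the optional dy mapping onto polyfit's w argument (w = 1/dy for Gaussian errors). A sketch with hypothetical data:

```python
import numpy as np

x = np.linspace(0.0, 10.0, 30)
dy = np.full_like(x, 0.2)
y = 0.5 + 1.5 * x + np.random.normal(0.0, dy)

# polyfit returns coefficients highest degree first: [b, a] for m = a + b*x.
b, a = np.polyfit(x, y, 1)                  # unweighted, as linfit(x, y)
b_w, a_w = np.polyfit(x, y, 1, w=1.0 / dy)  # weighted, as linfit(x, y, dy)
```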