Commit f76e839: Remove old import + other style changes
Tidying up: remove import of deprecated `lc_tools`, change some test
data, remove extraneous comments, add a couple of docstrings.
bnaul committed Oct 1, 2015
1 parent 7c54921 commit f76e839
Showing 11 changed files with 97 additions and 50 deletions.
1 change: 0 additions & 1 deletion mltsp/featurize.py
@@ -12,7 +12,6 @@
import numpy as np

from . import cfg
from . import lc_tools
from . import custom_feature_tools as cft
from . import util
from . import custom_exceptions
17 changes: 13 additions & 4 deletions mltsp/obs_feature_tools.py
@@ -1,8 +1,7 @@
import copy
import numpy as np
import scipy.stats as stats
from . import cfg
from dask.async import get_sync as dget
import dask.async


def double_to_single_step(cads):
@@ -83,12 +82,22 @@ def generate_obs_features(t, m, e, features_to_compute=cfg.features_list_obs):
Parameters
----------
t : array_like
Array containing time values.
m : array_like
Array containing data values.
e : array_like
Array containing measurement error values.
features_to_compute : list
Optional list containing names of desired features.

Returns
-------
dict
Dictionary containing generated time series features.
"""
features_to_compute = [f for f in features_to_compute if f in
cfg.features_list_obs]
@@ -147,5 +156,5 @@ def generate_obs_features(t, m, e, features_to_compute=cfg.features_list_obs):
# Do not execute in parallel; parallelization has already taken place at
# the level of time series, so we compute features for a single time series
# in serial.
values = dget(feature_graph, features_to_compute)
values = dask.async.get_sync(feature_graph, features_to_compute)
return dict(zip(features_to_compute, values))
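
[Editor's note] Both here and in mltsp/science_feature_tools.py below, the aliased import (dget) is replaced with an explicit, qualified call. For readers unfamiliar with the API: dask.async.get_sync is the single-threaded scheduler in 2015-era dask (the module was renamed dask.local in later releases). A minimal sketch of how it evaluates a task graph, assuming that old module layout:

import dask.async

def add(a, b):
    return a + b

# A dask graph is a plain dict mapping keys to values or task tuples.
feature_graph = {'x': 1, 'y': (add, 'x', 10), 'z': (add, 'y', 100)}

# get_sync resolves dependencies and runs every task serially in the
# calling thread, returning one result per requested key.
values = dask.async.get_sync(feature_graph, ['y', 'z'])
print(dict(zip(['y', 'z'], values)))  # {'y': 11, 'z': 111}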
2 changes: 0 additions & 2 deletions mltsp/predict_class.py
@@ -12,9 +12,7 @@

from . import cfg
from . import custom_exceptions
from . import lc_tools
from . import custom_feature_tools as cft
from . import util
from .celery_tasks import pred_featurize_single


8 changes: 5 additions & 3 deletions mltsp/science_feature_tools.py
@@ -1,7 +1,7 @@
import numpy as np
import cfg
import science_features as sf
from dask.async import get_sync as dget
import dask.async


def generate_science_features(t, m, e, features_to_compute=cfg.features_list_science):
@@ -18,12 +18,14 @@ def generate_science_features(t, m, e, features_to_compute=cfg.features_list_science):
e : array_like
Array containing measurement error values.
features_to_compute : list
Optional list containing names of desired features.

Returns
-------
dict
Dictionary containing newly-generated features. Keys are
feature names, values are feature values (floats).
"""
features_to_compute = [f for f in features_to_compute if f in
cfg.features_list_science]
@@ -121,5 +123,5 @@ def generate_science_features(t, m, e, features_to_compute=cfg.features_list_science):
# Do not execute in parallel; parallelization has already taken place at
# the level of time series, so we compute features for a single time series
# in serial.
values = dget(feature_graph, features_to_compute)
values = dask.async.get_sync(feature_graph, features_to_compute)
return dict(zip(features_to_compute, values))
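
[Editor's note] The tail of generate_science_features mirrors generate_obs_features: a graph keyed by feature name is evaluated serially, then zipped back into a dict. A sketch of that shape, using hypothetical stand-in feature functions (the real ones live in mltsp/science_features and are not shown here):

import numpy as np
import dask.async

def amplitude(m):
    # Half the peak-to-peak range; illustrative stand-in only.
    return (np.max(m) - np.min(m)) / 2.0

def median_absolute_deviation(m):
    return np.median(np.abs(m - np.median(m)))

m = np.random.normal(1, 1, 50)
feature_graph = {'amplitude': (amplitude, m),
                 'median_absolute_deviation': (median_absolute_deviation, m)}
features_to_compute = ['amplitude', 'median_absolute_deviation']

values = dask.async.get_sync(feature_graph, features_to_compute)
print(dict(zip(features_to_compute, values)))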
19 changes: 2 additions & 17 deletions mltsp/science_features/tests/test_science_features.py
@@ -263,15 +263,13 @@ def test_lomb_scargle_regular_single_freq():

# Only test the first (true) frequency; the rest correspond to noise
for j in range(1, NUM_HARMONICS):
# TODO why is this what 'relative phase' means?
npt.assert_allclose(phase*j*(-1**j),
all_lomb['freq1_rel_phase{}'.format(j+1)], rtol=1e-2, atol=1e-2)

# Frequency ratio not relevant since there is only one frequency; only test amplitude/signif
for i in [2,3]:
npt.assert_allclose(0., all_lomb['freq_amplitude_ratio_{}1'.format(i)], atol=1e-3)

# TODO make significance test more precise
npt.assert_array_less(10., all_lomb['freq1_signif'])

# Only one frequency, so this should explain basically all the variance
@@ -325,7 +323,6 @@ def test_lomb_scargle_irregular_single_freq():
npt.assert_allclose(phase*j*(-1**j),
all_lomb['freq1_rel_phase{}'.format(j+1)], rtol=1e-1, atol=1e-1)

# TODO make significance test more precise
npt.assert_array_less(10., all_lomb['freq1_signif'])

# Only one frequency, so this should explain basically all the variance
@@ -399,13 +396,7 @@ def test_lomb_scargle_regular_multi_freq():
npt.assert_allclose(amplitudes[i-1,0] / amplitudes[0,0],
all_lomb['freq_amplitude_ratio_{}1'.format(i)], atol=2e-2)

# TODO make significance test more precise
npt.assert_array_less(10., all_lomb['freq1_signif'])
"""
e_name = 'freq_signif_ratio_{}1_extractor'.format(i)
e = getattr(extractors, e_name)()
npt.assert_allclose(0., all_lomb, atol=1e-3)
"""


def test_lomb_scargle_irregular_multi_freq():
@@ -437,13 +428,7 @@ def test_lomb_scargle_irregular_multi_freq():
npt.assert_allclose(frequencies[i-1] / frequencies[0],
all_lomb['freq_frequency_ratio_{}1'.format(i)], atol=5e-2)

# TODO make significance test more precise
npt.assert_array_less(10., all_lomb['freq1_signif'])
"""
e_name = 'freq_signif_ratio_{}1_extractor'.format(i)
e = getattr(extractors, e_name)()
npt.assert_allclose(0., all_lomb, atol=1e-3)
"""


def test_max():
@@ -453,7 +438,7 @@ def test_max():
npt.assert_equal(f.values()[0], max(values))


# TODO this returns the index of the biggest slope...seems wrong
# TODO uncomment when feature is fixed
#def test_max_slope():
# """Test maximum slope feature, which finds the INDEX of the largest slope."""
# times, values, errors = irregular_random()
@@ -470,7 +455,7 @@ def test_median_absolute_deviation():
np.median(values))))


# TODO should replace with commented version once sign problems fixed
def test_percent_close_to_median():
"""Test feature which finds the percentage of points near the median value."""
times, values, errors = irregular_random()
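[Editor's note] One subtlety in the relative-phase assertions above: in Python, -1**j parses as -(1**j), which is always -1, so the expected value phase*j*(-1**j) reduces to -phase*j for every j. If an alternating sign was intended, the factor would need parentheses, ((-1)**j):

>>> [-1**j for j in range(1, 4)]
[-1, -1, -1]
>>> [(-1)**j for j in range(1, 4)]
[-1, 1, -1]
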
8 changes: 4 additions & 4 deletions mltsp/tests/data/test_features.csv
@@ -1,4 +1,4 @@
meta1,meta2,meta3,std_err
0.180734306909,0.548427238218,0.187956237253,0.00540072367701
0.196072341892,1.17178931753,0.174802803661,0.00665375480667
0.558093146298,0.265003093326,0.10984235246,0.00811408738542
meta1,meta2,meta3,std_err,amplitude
0.180734306909,0.548427238218,0.187956237253,0.00540072367701,0.0
0.196072341892,1.17178931753,0.174802803661,0.00665375480667,0.0
0.558093146298,0.265003093326,0.10984235246,0.00811408738542,0.0
8 changes: 4 additions & 4 deletions mltsp/tests/data/test_features_wcust.csv
@@ -1,4 +1,4 @@
meta1,meta2,meta3,std_err,f
0.180734306909,0.548427238218,0.187956237253,0.3622,0.3
0.196072341892,1.17178931753,0.174802803661,0.116672,0.2
0.558093146298,0.265003093326,0.10984235246,1.3343,1.4
meta1,meta2,meta3,std_err,amplitude,f
0.180734306909,0.548427238218,0.187956237253,0.3622,0.0,0.3
0.196072341892,1.17178931753,0.174802803661,0.116672,0.0,0.2
0.558093146298,0.265003093326,0.10984235246,1.3343,0.0,1.4
6 changes: 3 additions & 3 deletions mltsp/tests/data/test_features_with_classes.csv
@@ -1,4 +1,4 @@
class,meta1,meta2,meta3,std_err
class1,0.180734306909,0.548427238218,0.187956237253,0.00540072367701
class2,0.196072341892,1.17178931753,0.174802803661,0.00665375480667
class3,0.558093146298,0.265003093326,0.10984235246,0.00811408738542
class1,0.180734306909,0.548427238218,0.187956237253
class2,0.196072341892,1.17178931753,0.174802803661
class3,0.558093146298,0.265003093326,0.10984235246
20 changes: 10 additions & 10 deletions mltsp/tests/test_flask_app.py
@@ -1465,7 +1465,7 @@ def test_featurize_proc(self):
"asas_training_subset_classes_with_metadata.dat"),
zipfile_path=pjoin(cfg.UPLOAD_FOLDER,
"asas_training_subset.tar.gz"),
features_to_use=["std_err"],
features_to_use=["std_err", "amplitude"],
featureset_key="TEST01", is_test=True, email_user=False,
already_featurized=False,
custom_script_path=pjoin(cfg.UPLOAD_FOLDER,
@@ -1540,7 +1540,8 @@ def test_prediction_proc(self):
r.table("features").insert({"id": "TEMP_TEST01",
"name": "TEMP_TEST01",
"projkey": "TEMP_TEST01",
"featlist": ["std_err"]}).run(conn)
"featlist": ["std_err",
"amplitude"]}).run(conn)
r.table("projects").insert({"id": "TEMP_TEST01",
"name": "TEMP_TEST01"}).run(conn)
r.table("predictions").insert({"id": "TEMP_TEST01"}).run(conn)
@@ -1554,8 +1555,7 @@
entry = r.table("predictions").get("TEMP_TEST01").run(conn)
pred_results_list_dict = entry
assert(pred_results_list_dict["pred_results_list_dict"]
["TESTRUN_215153"][0][0] in ['Beta_Lyrae',
'Herbig_AEBE'])
["TESTRUN_215153"][0][0] in ['Beta_Lyrae', 'Herbig_AEBE'])

assert all(key in pred_results_list_dict for key in \
("ts_data_dict", "features_dict"))
@@ -2148,7 +2148,7 @@ def test_upload_data_featurize(self):
'featureset_name': 'abc123',
'featureset_project_name_select': 'abc123',
'sep': ',',
'features_selected': ['std_err'],
'features_selected': ['std_err', 'amplitude'],
'custom_script_tested': 'yes',
'custom_feat_script_file':
(open(pjoin(DATA_DIR, "testfeature1.py")),
@@ -2180,7 +2180,7 @@
"%s_features.csv" % new_key))
cols = df.columns
values = df.values
npt.assert_array_equal(sorted(cols), ["f", "std_err"])
npt.assert_array_equal(sorted(cols), ["amplitude", "f", "std_err"])
fpaths = []
for fpath in [
pjoin(cfg.FEATURES_FOLDER, "%s_features.csv" % new_key),
@@ -2235,7 +2235,7 @@ def test_upload_data_featurize_no_custom(self):
'featureset_name': 'abc123',
'featureset_project_name_select': 'abc123',
'sep': ',',
'features_selected': ['std_err'],
'features_selected': ['std_err', 'amplitude'],
'custom_script_tested': "no",
'is_test': 'True'})
res_dict = json.loads(rv.data)
@@ -2262,7 +2262,7 @@
"%s_features.csv" % new_key))
cols = df.columns
values = df.values
npt.assert_array_equal(sorted(cols), ["std_err"])
npt.assert_array_equal(sorted(cols), ["amplitude", "std_err"])
fpaths = []
for fpath in [
pjoin(cfg.FEATURES_FOLDER, "%s_features.csv" % new_key),
@@ -2386,7 +2386,7 @@ def test_featurization_page_already_featurized(self):
rv = fa.featurizationPage(
featureset_name="abc123", project_name="abc123",
headerfile_name=headerfile_name, zipfile_name=None,
sep=",", featlist=["std_err"], is_test=True,
sep=",", featlist=["std_err", "amplitude"], is_test=True,
email_user=False, already_featurized=True,
custom_script_path=custom_script_path)
res_dict = json.loads(rv.data)
@@ -2413,7 +2413,7 @@
cols = df.columns
values = df.values
npt.assert_array_equal(sorted(cols), ["meta1", "meta2", "meta3",
"std_err"])
"std_err", "amplitude"])
fpaths = []
for fpath in [
pjoin(cfg.FEATURES_FOLDER, "%s_features.csv" % new_key),
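[Editor's note] A note on the sorted(cols) assertions above: sorted() orders column names alphabetically, so "amplitude" precedes "meta1" and "std_err". The first two expected lists follow that order, but the last one appends "amplitude" after "std_err", which assert_array_equal would report as a mismatch against an alphabetically sorted column list:

>>> sorted(["meta1", "meta2", "meta3", "std_err", "amplitude"])
['amplitude', 'meta1', 'meta2', 'meta3', 'std_err']
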
53 changes: 53 additions & 0 deletions mltsp/tests/test_obs_features.py
@@ -0,0 +1,53 @@
from mltsp import obs_feature_tools as oft
import itertools

import numpy as np
import numpy.testing as npt


def irregular_random(seed=0, size=50):
"""Generate random test data at irregularly-sampled times."""
state = np.random.RandomState(seed)
times = np.sort(state.uniform(0, 10, size))
values = state.normal(1, 1, size)
errors = state.exponential(0.1, size)
return times, values, errors


def test_delta_t_hist():
"""Test histogram of all time lags."""
times, values, errors = irregular_random()
delta_ts = [pair[1] - pair[0] for pair in itertools.combinations(times, 2)]
nbins = 50
bins = np.linspace(0, max(times) - min(times), nbins+1)
npt.assert_allclose(oft.delta_t_hist(times, nbins), np.histogram(delta_ts,
bins=bins)[0])


def test_normalize_hist():
"""Test normalization of histogram."""
times, values, errors = irregular_random()
delta_ts = [pair[1] - pair[0] for pair in itertools.combinations(times, 2)]
nbins = 50
bins = np.linspace(0, max(times) - min(times), nbins+1)
nhist = oft.normalize_hist(oft.delta_t_hist(times, nbins), max(times) -
min(times))
npt.assert_allclose(nhist, np.histogram(delta_ts,
bins=bins, density=True)[0])

def test_find_sorted_peaks():
"""Test peak-finding algorithm."""
x = np.array([0,5,3,1]) # Single peak
npt.assert_allclose(oft.find_sorted_peaks(x), np.array([[1,5]]))

x = np.array([0,5,3,6,1]) # Multiple peaks
npt.assert_allclose(oft.find_sorted_peaks(x), np.array([[3,6],[1,5]]))

x = np.array([3,1,3]) # End-points can be peaks
npt.assert_allclose(oft.find_sorted_peaks(x), np.array([[0,3],[2,3]]))

x = np.array([0,3,3,3,0]) # In case of ties, peak is left-most point
npt.assert_allclose(oft.find_sorted_peaks(x), np.array([[1,3]]))

x = np.array([0,3,3,5,0]) # Tie is a peak only if greater than next value
npt.assert_allclose(oft.find_sorted_peaks(x), np.array([[3,5]]))
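
[Editor's note] The new tests pin down three obs_feature_tools helpers. A hypothetical reconstruction consistent with the assertions above, for readers without the source at hand (the real implementations live in mltsp/obs_feature_tools.py and may differ in detail):

import itertools

import numpy as np

def delta_t_hist(t, nbins=50):
    # Histogram of all pairwise lags t[j] - t[i] (j > i), binned over
    # [0, max(t) - min(t)].
    lags = np.array([b - a for a, b in itertools.combinations(t, 2)])
    bins = np.linspace(0, max(t) - min(t), nbins + 1)
    return np.histogram(lags, bins=bins)[0]

def normalize_hist(hist, total_time):
    # Convert counts to a density, matching np.histogram(..., density=True).
    bin_width = total_time / len(hist)
    return hist / (bin_width * hist.sum())

def find_sorted_peaks(x):
    # Return [index, value] pairs for local maxima, sorted by value
    # (descending). A plateau counts once, at its left-most index, and only
    # if strictly higher than both neighbors; end points may be peaks.
    x = np.asarray(x)
    n = len(x)
    peaks = []
    i = 0
    while i < n:
        j = i
        while j + 1 < n and x[j + 1] == x[i]:  # skip across a plateau
            j += 1
        left_ok = i == 0 or x[i - 1] < x[i]
        right_ok = j == n - 1 or x[j + 1] < x[i]
        if left_ok and right_ok:
            peaks.append([i, x[i]])
        i = j + 1
    peaks.sort(key=lambda p: -p[1])  # stable: equal peaks stay in index order
    return np.array(peaks)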
5 changes: 3 additions & 2 deletions mltsp/tests/test_predict.py
@@ -27,7 +27,8 @@ def test_determine_feats_used():
pjoin(DATA_PATH, "test_%s" % suffix),
pjoin(cfg.FEATURES_FOLDER, "TEST001_%s" % suffix))
feats_used = pred.determine_feats_used("TEST001")
npt.assert_array_equal(feats_used, ["meta1", "meta2", "meta3", "std_err"])
npt.assert_array_equal(feats_used, ["meta1", "meta2", "meta3",
                                    "std_err", "amplitude"])

for fname in ["TEST001_features.csv", "TEST001_classes.npy"]:
os.remove(pjoin(cfg.FEATURES_FOLDER, fname))
@@ -135,7 +136,7 @@ def test_do_model_predictions():
pjoin(cfg.FEATURES_FOLDER, "TEST001_%s" % suffix))
featset_key = "TEST001"
model_type = "RF"
features_to_use = ["std_err", "avg_err", "med_err", "n_epochs"]
features_to_use = ["std_err", "avg_err", "med_err", "n_epochs", "amplitude"]
data_dict = pred.featurize_tsdata(
pjoin(DATA_PATH, "dotastro_215153.dat"),
"TEST001",