Remove old import + other style changes

Tidying up: remove the import of the deprecated `lc_tools` module, add a couple of tests for `obs_feature_tools`, change some test data, remove extraneous comments, add a couple of docstrings, and change the `test_flask_app` database teardown procedure.
cesium-ml · Oct 1, 2015 · 2660c11 · 2660c11
1 parent 7c54921
commit 2660c11
Show file tree

Hide file tree

Showing 12 changed files with 153 additions and 217 deletions.
diff --git a/mltsp/custom_feature_tools.py b/mltsp/custom_feature_tools.py
@@ -541,22 +541,26 @@ def generate_custom_features(custom_script_path, t, m, e,
 
     Parameters
     ----------
+    t : array_like
+        Array containing time values.
+
+    m : array_like
+        Array containing data values.
+
+    e : array_like
+        Array containing measurement error values.
+
     custom_script_path : str
         Path to custom features script.
-    ts_data : list OR tuple, optional
-        List (or tuple) of lists (or tuples) containing time,
-        measurement (and optionally associated error values) data.
-        Defaults to None. If None, path_to_csv must not be None,
-        otherwise raises an Exception.
+
     features_already_known : dict, optional
-        List of dicts containing any meta-features associated with
-        provided time-series data. Defaults to [].
+        Dict containing any meta-features associated with provided time-series
+        data. Defaults to {}.
 
     Returns
     -------
-    list of dict
-        List of dictionaries containing newly-generated features.
-
+    dict
+        Dictionary containing newly-generated features.
     """
     if "t" not in features_already_known:
         features_already_known['t'] = t

diff --git a/mltsp/featurize.py b/mltsp/featurize.py
@@ -12,7 +12,6 @@
 import numpy as np
 
 from . import cfg
-from . import lc_tools
 from . import custom_feature_tools as cft
 from . import util
 from . import custom_exceptions
@@ -146,7 +145,6 @@ def generate_features(headerfile_path, zipfile_path, features_to_use,
                       custom_script_path, is_test, already_featurized,
                       in_docker_container):
     """Generate features for provided time-series data."""
-    all_features_list = cfg.features_list_obs[:] + cfg.features_list_science[:]
     if already_featurized:
         # Read in features from CSV file
         objects = parse_prefeaturized_csv_data(headerfile_path)

diff --git a/mltsp/obs_feature_tools.py b/mltsp/obs_feature_tools.py
@@ -1,8 +1,7 @@
-import copy
 import numpy as np
 import scipy.stats as stats
 from . import cfg
-from dask.async import get_sync as dget
+import dask.async
 
 
 def double_to_single_step(cads):
@@ -83,12 +82,22 @@ def generate_obs_features(t, m, e, features_to_compute=cfg.features_list_obs):
 
     Parameters
     ----------
+    t : array_like
+        Array containing time values.
+
+    m : array_like
+        Array containing data values.
+
+    e : array_like
+        Array containing measurement error values.
+
+    features_to_compute : list, optional
+        Names of desired features. Defaults to `cfg.features_list_obs`.
 
     Returns
     -------
     dict
         Dictionary containing generated time series features.
-
     """
     features_to_compute = [f for f in features_to_compute if f in
                            cfg.features_list_obs]
@@ -147,5 +156,5 @@ def generate_obs_features(t, m, e, features_to_compute=cfg.features_list_obs):
     # Do not execute in parallel; parallelization has already taken place at
     # the level of time series, so we compute features for a single time series
     # in serial.
-    values = dget(feature_graph, features_to_compute)
+    values = dask.async.get_sync(feature_graph, features_to_compute)
     return dict(zip(features_to_compute, values))
diff --git a/mltsp/predict_class.py b/mltsp/predict_class.py
@@ -12,9 +12,7 @@
 
 from . import cfg
 from . import custom_exceptions
-from . import lc_tools
 from . import custom_feature_tools as cft
-from . import util
 from .celery_tasks import pred_featurize_single
 
 

diff --git a/mltsp/science_feature_tools.py b/mltsp/science_feature_tools.py
@@ -1,10 +1,11 @@
 import numpy as np
 import cfg
 import science_features as sf
-from dask.async import get_sync as dget
+import dask.async
 
 
-def generate_science_features(t, m, e, features_to_compute=cfg.features_list_science):
+def generate_science_features(t, m, e,
+                              features_to_compute=cfg.features_list_science):
     """Generate science features for provided time series data.
 
     Parameters
@@ -18,12 +19,14 @@ def generate_science_features(t, m, e, features_to_compute=cfg.features_list_sci
     e : array_like
         Array containing measurement error values.
 
+    features_to_compute : list, optional
+        Names of desired features. Defaults to `cfg.features_list_science`.
+
     Returns
     -------
     dict
         Dictionary containing newly-generated features. Keys are
         feature names, values are feature values (floats).
-
     """
     features_to_compute = [f for f in features_to_compute if f in
                            cfg.features_list_science]
@@ -43,7 +46,8 @@ def generate_science_features(t, m, e, features_to_compute=cfg.features_list_sci
        'percent_amplitude': (sf.percent_amplitude, m),
        'percent_beyond_1_std': (sf.percent_beyond_1_std, m, e),
        'percent_close_to_median': (sf.percent_close_to_median, m),
-       'percent_difference_flux_percentile': (sf.percent_difference_flux_percentile, m),
+       'percent_difference_flux_percentile': (
+           sf.percent_difference_flux_percentile, m),
        'skew': (sf.skew, m),
        'std': (sf.std, m),
        'stetson_j': (sf.stetson_j, m),
@@ -114,12 +118,13 @@ def generate_science_features(t, m, e, features_to_compute=cfg.features_list_sci
         'p2p_model': (sf.p2p_model, t, m, 'freq1_freq'),
         'p2p_scatter_2praw': (sf.get_p2p_scatter_2praw, 'p2p_model'),
         'p2p_scatter_over_mad': (sf.get_p2p_scatter_over_mad, 'p2p_model'),
-        'p2p_scatter_pfold_over_mad': (sf.get_p2p_scatter_pfold_over_mad, 'p2p_model'),
+        'p2p_scatter_pfold_over_mad': (sf.get_p2p_scatter_pfold_over_mad,
+                                       'p2p_model'),
         'p2p_ssqr_diff_over_var': (sf.get_p2p_ssqr_diff_over_var, 'p2p_model'),
    }
 
     # Do not execute in parallel; parallelization has already taken place at
     # the level of time series, so we compute features for a single time series
     # in serial.
-    values = dget(feature_graph, features_to_compute)
+    values = dask.async.get_sync(feature_graph, features_to_compute)
     return dict(zip(features_to_compute, values))
diff --git a/mltsp/science_features/tests/test_science_features.py b/mltsp/science_features/tests/test_science_features.py
@@ -263,15 +263,13 @@ def test_lomb_scargle_regular_single_freq():
 
     # Only test the first (true) frequency; the rest correspond to noise
     for j in range(1, NUM_HARMONICS):
-        # TODO why is this what 'relative phase' means?
         npt.assert_allclose(phase*j*(-1**j),
             all_lomb['freq1_rel_phase{}'.format(j+1)], rtol=1e-2, atol=1e-2)
 
     # Frequency ratio not relevant since there is only; only test amplitude/signif
     for i in [2,3]:
         npt.assert_allclose(0., all_lomb['freq_amplitude_ratio_{}1'.format(i)], atol=1e-3)
 
-    # TODO make significance test more precise
     npt.assert_array_less(10., all_lomb['freq1_signif'])
 
     # Only one frequency, so this should explain basically all the variance
@@ -325,7 +323,6 @@ def test_lomb_scargle_irregular_single_freq():
             npt.assert_allclose(phase*j*(-1**j),
                 all_lomb['freq1_rel_phase{}'.format(j+1)], rtol=1e-1, atol=1e-1)
 
-    # TODO make significance test more precise
     npt.assert_array_less(10., all_lomb['freq1_signif'])
 
     # Only one frequency, so this should explain basically all the variance
@@ -399,13 +396,7 @@ def test_lomb_scargle_regular_multi_freq():
         npt.assert_allclose(amplitudes[i-1,0] / amplitudes[0,0],
                 all_lomb['freq_amplitude_ratio_{}1'.format(i)], atol=2e-2)
 
-    # TODO make significance test more precise
     npt.assert_array_less(10., all_lomb['freq1_signif'])
-    """
-    e_name = 'freq_signif_ratio_{}1_extractor'.format(i)
-    e = getattr(extractors, e_name)()
-    npt.assert_allclose(0., all_lomb, atol=1e-3)
-    """
 
 
 def test_lomb_scargle_irregular_multi_freq():
@@ -437,13 +428,7 @@ def test_lomb_scargle_irregular_multi_freq():
         npt.assert_allclose(frequencies[i-1] / frequencies[0],
                 all_lomb['freq_frequency_ratio_{}1'.format(i)], atol=5e-2)
 
-    # TODO make significance test more precise
     npt.assert_array_less(10., all_lomb['freq1_signif'])
-"""
-    e_name = 'freq_signif_ratio_{}1_extractor'.format(i)
-    e = getattr(extractors, e_name)()
-    npt.assert_allclose(0., all_lomb, atol=1e-3)
-"""
 
 
 def test_max():
@@ -453,7 +438,7 @@ def test_max():
     npt.assert_equal(f.values()[0], max(values))
 
 
-# TODO this returns the index of the biggest slope...seems wrong
+# TODO uncomment when feature is fixed
 #def test_max_slope():
 #    """Test maximum slope feature, which finds the INDEX of the largest slope."""
 #    times, values, errors = irregular_random()
@@ -470,7 +455,7 @@ def test_median_absolute_deviation():
         np.median(values))))
 
 
-    # TODO should replace with commented version once sign problems fixed
+# TODO should replace with commented version once sign problems fixed
 def test_percent_close_to_median():
     """Test feature which finds the percentage of points near the median value."""
     times, values, errors = irregular_random()

diff --git a/mltsp/tests/data/test_features.csv b/mltsp/tests/data/test_features.csv
@@ -1,4 +1,4 @@
-meta1,meta2,meta3,std_err
-0.180734306909,0.548427238218,0.187956237253,0.00540072367701
-0.196072341892,1.17178931753,0.174802803661,0.00665375480667
-0.558093146298,0.265003093326,0.10984235246,0.00811408738542
+meta1,meta2,meta3,std_err,amplitude
+0.180734306909,0.548427238218,0.187956237253,0.00540072367701,2.0
+0.196072341892,1.17178931753,0.174802803661,0.00665375480667,2.5
+0.558093146298,0.265003093326,0.10984235246,0.00811408738542,3.0
diff --git a/mltsp/tests/data/test_features_wcust.csv b/mltsp/tests/data/test_features_wcust.csv
@@ -1,4 +1,4 @@
-meta1,meta2,meta3,std_err,f
-0.180734306909,0.548427238218,0.187956237253,0.3622,0.3
-0.196072341892,1.17178931753,0.174802803661,0.116672,0.2
-0.558093146298,0.265003093326,0.10984235246,1.3343,1.4
+meta1,meta2,meta3,std_err,amplitude,f
+0.180734306909,0.548427238218,0.187956237253,0.3622,2.0,0.3
+0.196072341892,1.17178931753,0.174802803661,0.116672,2.5,0.2
+0.558093146298,0.265003093326,0.10984235246,1.3343,3.0,1.4
diff --git a/mltsp/tests/data/test_features_with_classes.csv b/mltsp/tests/data/test_features_with_classes.csv
@@ -1,4 +1,4 @@
-class,meta1,meta2,meta3,std_err
-class1,0.180734306909,0.548427238218,0.187956237253,0.00540072367701
-class2,0.196072341892,1.17178931753,0.174802803661,0.00665375480667
-class3,0.558093146298,0.265003093326,0.10984235246,0.00811408738542
+class,meta1,meta2,meta3,std_err,amplitude
+class1,0.180734306909,0.548427238218,0.187956237253,0.5,2.0
+class2,0.196072341892,1.17178931753,0.174802803661,0.23,2.5
+class3,0.558093146298,0.265003093326,0.10984235246,1.20,3.0