Improve performance (#121)
* added driftbif data set submodule

* added notebook with fc runtimes

* marked entropy features as computationally expensive

* do not calculate features with high computational cost by default

* omit driftbif_datasets.py in .coveragerc

* made doctests Python 3 compatible
MaxBenChrist committed Dec 19, 2016
1 parent 9ea0421 commit 8a586ee
Showing 7 changed files with 678 additions and 2 deletions.
1 change: 1 addition & 0 deletions .coveragerc
@@ -11,6 +11,7 @@ omit = tsfresh/utilities/profiling.py
     tsfresh/convenience/__init__.py
     tsfresh/examples/har_dataset.py
     tsfresh/examples/robot_execution_failures.py
+    tsfresh/examples/driftbif_datasets.py

 [report]
 # Regexes for lines to exclude from consideration
525 changes: 525 additions & 0 deletions notebooks/compare-runtimes-of-feature-calculators.ipynb

Large diffs are not rendered by default.
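The notebook itself is not shown here, but per the commit message its purpose is to compare feature-calculator runtimes. A minimal sketch of such a micro-benchmark (illustrative only, not taken from the notebook; sample_entropy and approximate_entropy use the signatures visible in the diffs below, and mean is assumed to exist as a cheap baseline calculator):

import timeit

import numpy as np

from tsfresh.feature_extraction import feature_calculators as fc

x = np.random.randn(1000)  # one synthetic time series

# Time a cheap aggregate against the two entropy features marked as expensive below.
for name, call in [("mean", lambda: fc.mean(x)),
                   ("sample_entropy", lambda: fc.sample_entropy(x)),
                   ("approximate_entropy", lambda: fc.approximate_entropy(x, m=2, r=0.3))]:
    print("{:>20s}: {:.4f} s per call".format(name, timeit.timeit(call, number=3) / 3))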

2 changes: 1 addition & 1 deletion tests/feature_extraction/test_settings.py
@@ -58,7 +58,7 @@ def test_default_calculates_all_features(self):
         """
         settings = FeatureExtractionSettings()
         all_feature_calculators = [name for name, func in feature_calculators.__dict__.items()
-                                   if hasattr(func, "fctype")]
+                                   if hasattr(func, "fctype") and not hasattr(func, "high_comp_cost")]

         for calculator in all_feature_calculators:
             self.assertIn(calculator, settings.name_to_param,
3 changes: 2 additions & 1 deletion tsfresh/examples/__init__.py
@@ -5,4 +5,5 @@
 """
 from __future__ import absolute_import
 from .robot_execution_failures import load_robot_execution_failures, download_robot_execution_failures
-from .har_dataset import download_har_dataset, load_har_classes, load_har_dataset
+from .har_dataset import download_har_dataset, load_har_classes, load_har_dataset
+from .driftbif_datasets import load_driftbif
141 changes: 141 additions & 0 deletions tsfresh/examples/driftbif_datasets.py
@@ -0,0 +1,141 @@
# -*- coding: utf-8 -*-
# This file as well as the whole tsfresh package are licenced under the MIT licence (see the LICENCE.txt)
# Maximilian Christ (maximilianchrist.com), Blue Yonder Gmbh, 2016

# Thanks to Andreas Kempa-Liehr for providing this snippet

import pandas as pd
import numpy as np

# todo: add possibility to extract data set for regression problem (estimation of tau parameter)
# todo: add unit test

class velocity(object):
    """
    Simulates the velocity of a dissipative soliton (a kind of self-organized particle).

    label 0 means tau <= 1/0.3: dissipative soliton with Brownian motion (purely noise driven)
    label 1 means tau > 1/0.3: dissipative soliton with active Brownian motion (intrinsic velocity with overlaid noise)

    References
    ----------
    .. [6] Andreas Kempa-Liehr (2013, pp. 159-170)
        Dynamics of Dissipative Solitons.
        Dissipative Solitons in Reaction Diffusion Systems.
        Springer: Berlin

    >>> ds = velocity(tau=3.5)  # dissipative soliton with equilibrium velocity 1.5e-3
    >>> print(ds.label)  # discriminates: before or beyond the drift bifurcation?
    1
    >>> print(ds.deterministic)  # equilibrium velocity
    0.0015191090506254991
    >>> v = ds.simulate(20000)  # simulate a velocity time series of 20000 steps, disturbed by Gaussian white noise
    """

    def __init__(self, tau=2.87, kappa_3=0.3, Q=1950.0, R=3e-4, delta_t=0.005):
        """
        :param tau: time-scale constant
        :type tau: float
        :param kappa_3: feedback of the fast inhibitor
        :type kappa_3: float
        :param Q: shape parameter of the dissipative soliton
        :type Q: float
        :param R: noise amplitude
        :type R: float
        :param delta_t: temporal discretization
        :type delta_t: float
        """
        # todo: improve description of constants
        # todo: add start seed

        self.delta_t = delta_t
        self.a = self.delta_t * kappa_3 ** 2 * (tau - 1.0 / kappa_3)
        self.b = self.delta_t * Q / kappa_3
        self.label = int(tau > 1.0 / kappa_3)
        self.c = np.sqrt(self.delta_t) * R

        if tau <= 1.0 / kappa_3:
            self.deterministic = 0.0
        else:
            self.deterministic = kappa_3 ** 1.5 * np.sqrt((tau - 1.0 / kappa_3) / Q)

    def __call__(self, v):
        """
        Applies the deterministic dynamics (no noise term) to a velocity vector for one time step.

        :param v: velocity vector
        :type v: numpy.ndarray
        :return: velocity after one deterministic time step
        :rtype: numpy.ndarray
        """
        return v * (1.0 + self.a - self.b * np.dot(v, v))

    def simulate(self, N, v0=np.zeros(2)):
        """
        :param N: number of time steps
        :type N: int
        :param v0: initial velocity
        :type v0: numpy.ndarray
        :return: simulated velocity time series of shape (N, 2)
        :rtype: numpy.ndarray
        """
        v = [v0]  # the first value is the initial condition
        n = N - 1  # because the initial condition is returned, only (N - 1) time steps are computed
        gamma = np.random.randn(n, 2)
        for i in range(n):
            v.append(self.__call__(v[i]) + self.c * gamma[i])
        return np.array(v)


def load_driftbif(n, l):
    """
    Creates and loads the drift bifurcation dataset.

    :param n: number of different samples
    :type n: int
    :param l: length of the time series
    :type l: int
    :return: X, y. Time series container and target vector
    :rtype X: pandas.DataFrame
    :rtype y: pandas.Series
    """

    # todo: add ratio of classes
    # todo: add start seed
    # todo: draw tau randomly from the range [2, 4] so we really get a random dataset
    # todo: add a variable for the number of dimensions

    m = 2  # number of different time series for each sample
    ids = np.repeat(range(n), l * m)
    dimensions = list(np.repeat(range(m), l)) * n

    labels = list()
    values = list()

    ls_tau = np.linspace(2.87, 3.8, n).tolist()

    for tau in ls_tau:
        ds = velocity(tau=tau)
        labels.append(ds.label)
        values.append(ds.simulate(l).transpose().flatten())

    # delta_t is identical for every simulated soliton, so the time axis can be built once
    time = np.stack([ds.delta_t * np.arange(l)] * n * m).flatten()

    df = pd.DataFrame({'id': ids, "time": time, "value": np.stack(values).flatten(), "dimension": dimensions})
    y = pd.Series(labels)
    y.index = range(n)

    return df, y
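For illustration, a minimal usage sketch of the loader defined above (the function, its column names, and its return types come from this file; everything else is assumption):

from tsfresh.examples import load_driftbif

# 100 labelled samples, each consisting of two velocity time series of length 500
X, y = load_driftbif(n=100, l=500)

print(sorted(X.columns))  # ['dimension', 'id', 'time', 'value']
print(y.value_counts())   # class balance of the binary target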
2 changes: 2 additions & 0 deletions tsfresh/feature_extraction/feature_calculators.py
@@ -1045,6 +1045,7 @@ def binned_entropy(x, max_bins):

 # todo - include latex formula
 # todo - check if vectorizable
+@set_property("high_comp_cost", True)
 @set_property("fctype", "aggregate")
 def sample_entropy(x):
     """
@@ -1200,6 +1201,7 @@ def range_count(x, min, max):


 @set_property("fctype", "aggregate_with_parameters")
+@set_property("high_comp_cost", True)
 def approximate_entropy(x, m, r):
     """
     Implements a vectorized Approximate entropy algorithm.
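set_property itself is not part of this diff; a plausible minimal implementation of such a decorator, consistent with the hasattr checks this commit relies on (a sketch, not necessarily tsfresh's actual code):

def set_property(key, value):
    # Attach a metadata attribute to the decorated function so that
    # callers can later inspect it via hasattr/getattr.
    def decorate_func(func):
        setattr(func, key, value)
        return func
    return decorate_func


@set_property("high_comp_cost", True)
def my_feature(x):  # hypothetical calculator for demonstration
    return sum(x)


assert hasattr(my_feature, "high_comp_cost")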
6 changes: 6 additions & 0 deletions tsfresh/feature_extraction/settings.py
@@ -19,6 +19,7 @@
 import numpy as np
 from tsfresh.feature_extraction import feature_calculators
 from multiprocessing import cpu_count
+import six


 # todo: this classes' docstrings are not completely up-to-date
@@ -102,6 +103,11 @@ def __init__(self, calculate_all_features=True):
             "approximate_entropy": [{"m": 2, "r": r} for r in [.1, .3, .5, .7, .9]]
         })

+        # drop all features with high computational costs
+        for fname, f in six.iteritems(feature_calculators.__dict__):
+            if hasattr(f, "high_comp_cost"):
+                del self.name_to_param[fname]
+
         # default None means one process per cpu
         n_cores = int(os.getenv("NUMBER_OF_CPUS") or cpu_count())
         self.n_processes = max(1, n_cores//2)
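With this change the default settings silently skip both entropy calculators. A hedged sketch of how a user could opt back in by restoring their name_to_param entries (the approximate_entropy parameters mirror the defaults visible in the hunk above; mapping the parameterless sample_entropy to None is an assumption based on how other aggregate calculators appear to be registered):

from tsfresh.feature_extraction.settings import FeatureExtractionSettings

settings = FeatureExtractionSettings()
settings.name_to_param.update({
    "sample_entropy": None,  # assumed convention for parameterless aggregates
    "approximate_entropy": [{"m": 2, "r": r} for r in [.1, .3, .5, .7, .9]],
})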
