Commit b879a24

Bug fixing: removing id fields from anomaly filter to generate anomalies dataset

mmerce committed Jun 25, 2015
1 parent 5e985b3 commit b879a24
Showing 7 changed files with 83 additions and 7 deletions.
6 changes: 6 additions & 0 deletions HISTORY.rst
@@ -3,6 +3,12 @@
History
-------

4.1.6 (2015-06-25)
~~~~~~~~~~~~~~~~~~

- Fixing bug: Removing id fields from the filter to select the anomalies listed
in the Anomaly object from the origin dataset.

4.1.5 (2015-06-06)
~~~~~~~~~~~~~~~~~~

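For context: Anomaly.anomalies_filter() builds a Flatline predicate that picks the top-anomaly rows out of the origin dataset. After this fix, fields listed in the detector's id_fields contribute no clauses, so a hypothetical filter (field ids and values invented, with "000000" as an id field) looks like:

    (or (and (= (f "000001") 5.1) (= (f "000002") "Iris-setosa"))
        (and (missing? "000001") (= (f "000002") "Iris-virginica")))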
2 changes: 1 addition & 1 deletion bigml/__init__.py
@@ -1 +1 @@
__version__ = '4.1.5'
__version__ = '4.1.6'
10 changes: 8 additions & 2 deletions bigml/anomaly.py
@@ -41,6 +41,7 @@
LOGGER = logging.getLogger('BigML')

import math
import json

from bigml.api import FINISHED
from bigml.api import (BigML, get_anomaly_id, get_status)
@@ -72,6 +73,7 @@ def __init__(self, anomaly, api=None):
self.expected_mean_depth = None
self.iforest = None
self.top_anomalies = None
self.id_fields = []
if not (isinstance(anomaly, dict) and 'resource' in anomaly and
anomaly['resource'] is not None):
if api is None:
@@ -90,6 +92,7 @@ def __init__(self, anomaly, api=None):
anomaly = anomaly['object']
self.sample_size = anomaly.get('sample_size')
self.input_fields = anomaly.get('input_fields')
self.id_fields = anomaly.get('id_fields', [])
if 'model' in anomaly and isinstance(anomaly['model'], dict):
ModelFields.__init__(self, anomaly['model'].get('fields'))
if ('top_anomalies' in anomaly['model'] and
@@ -166,15 +169,18 @@ def anomalies_filter(self, include=True):
row = anomaly.get('row', [])
for index in range(len(row)):
field_id = self.input_fields[index]
if field_id in self.id_fields:
continue
value = row[index]
if value is None:
filter_rules.append('(missing? "%s")' % field_id)
else:
if (self.fields[field_id]["optype"]
in ["categorical", "text"]):
value = '"%s"' % value
value = json.dumps(value)
filter_rules.append('(= (f "%s") %s)' % (field_id, value))
anomaly_filters.append("(and %s)" % " ".join(filter_rules))
if filter_rules:
anomaly_filters.append("(and %s)" % " ".join(filter_rules))

anomalies_filter = " ".join(anomaly_filters)
if include:
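A minimal, self-contained sketch of the fixed filter-building logic above (not the library code itself; field ids, optypes and rows are invented):

    import json

    input_fields = ["000000", "000001", "000004"]
    id_fields = ["000000"]
    fields = {"000001": {"optype": "numeric"},
              "000004": {"optype": "categorical"}}
    top_anomalies = [{"row": [1, 5.1, "Iris-setosa"]},
                     {"row": [2, None, "Iris-virginica"]}]

    anomaly_filters = []
    for anomaly in top_anomalies:
        filter_rules = []
        row = anomaly.get("row", [])
        for index, value in enumerate(row):
            field_id = input_fields[index]
            if field_id in id_fields:
                # the fix: id fields add no clause to the filter
                continue
            if value is None:
                filter_rules.append('(missing? "%s")' % field_id)
            elif fields[field_id]["optype"] in ["categorical", "text"]:
                # json.dumps quotes and escapes string values safely
                filter_rules.append('(= (f "%s") %s)'
                                    % (field_id, json.dumps(value)))
            else:
                filter_rules.append('(= (f "%s") %s)' % (field_id, value))
        if filter_rules:
            anomaly_filters.append("(and %s)" % " ".join(filter_rules))

    print "(or %s)" % " ".join(anomaly_filters)

The include flag in anomalies_filter(include=True) wraps this expression in (or ...) to select the listed rows; presumably the expression is negated instead when include is False.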
1 change: 1 addition & 0 deletions bigml/ensemble.py
@@ -263,6 +263,7 @@ def predict(self, input_data, by_name=True, method=PLURALITY_CODE,
if median:
for prediction in votes.predictions:
prediction['prediction'] = prediction['median']
print votes.predictions
return votes.combine(method=method, with_confidence=with_confidence,
add_confidence=add_confidence,
add_distribution=add_distribution,
3 changes: 2 additions & 1 deletion bigml/resourcehandler.py
@@ -63,7 +63,8 @@
CENTROID_RE = re.compile(r'^%s/%s$' % (CENTROID_PATH, ID_PATTERN))
BATCH_CENTROID_RE = re.compile(r'^%s/%s$' % (BATCH_CENTROID_PATH,
ID_PATTERN))
ANOMALY_RE = re.compile(r'^%s/%s$' % (ANOMALY_PATH, ID_PATTERN))
ANOMALY_RE = re.compile(r'^(public/)?%s/%s$|^shared/%s/%s$' % (
ANOMALY_PATH, ID_PATTERN, ANOMALY_PATH, SHARED_PATTERN))
ANOMALY_SCORE_RE = re.compile(r'^%s/%s$' % (ANOMALY_SCORE_PATH, ID_PATTERN))
BATCH_ANOMALY_SCORE_RE = re.compile(r'^%s/%s$' % (BATCH_ANOMALY_SCORE_PATH,
ID_PATTERN))
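The widened ANOMALY_RE now also accepts public and shared anomaly ids. A hedged sketch with stand-in constants (the module's real ANOMALY_PATH, ID_PATTERN and SHARED_PATTERN are defined elsewhere in resourcehandler.py, and the ids below are invented):

    import re

    ANOMALY_PATH = 'anomaly'
    ID_PATTERN = '[a-f0-9]{24}'
    SHARED_PATTERN = '[a-zA-Z0-9]{24,30}'
    ANOMALY_RE = re.compile(r'^(public/)?%s/%s$|^shared/%s/%s$' % (
        ANOMALY_PATH, ID_PATTERN, ANOMALY_PATH, SHARED_PATTERN))

    for resource_id in ['anomaly/537b9e06c8db63366c000000',
                        'public/anomaly/537b9e06c8db63366c000000',
                        'shared/anomaly/myAnomalySharedKey12345678',
                        'model/537b9e06c8db63366c000000']:
        print resource_id, bool(ANOMALY_RE.match(resource_id))

The first three ids match; the last one does not.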
25 changes: 25 additions & 0 deletions bigml/tests/create_anomaly_steps.py
@@ -11,6 +11,7 @@
from bigml.api import FINISHED
from bigml.api import FAULTY
from bigml.api import get_status
from bigml.anomaly import Anomaly

#@step(r'I check the anomaly detector stems from the original dataset list')
def i_check_anomaly_datasets_and_datasets_ids(step):
@@ -51,6 +52,17 @@ def i_create_an_anomaly_from_dataset(step):
world.anomaly = resource['object']
world.anomalies.append(resource['resource'])

#@step(r'I create an anomaly detector with (\d+) anomalies from a dataset$')
def i_create_an_anomaly_with_top_n_from_dataset(step, top_n):
dataset = world.dataset.get('resource')
resource = world.api.create_anomaly(
dataset, {'seed': 'BigML', 'top_n': int(top_n)})
world.status = resource['code']
assert world.status == HTTP_CREATED, "Expected: %s, found: %s" % (
HTTP_CREATED, world.status)
world.location = resource['location']
world.anomaly = resource['object']
world.anomalies.append(resource['resource'])

#@step(r'I create an anomaly detector from a dataset list$')
def i_create_an_anomaly_from_dataset_list(step):
@@ -77,3 +89,16 @@ def wait_until_anomaly_status_code_is(step, code1, code2, secs):
#@step(r'I wait until the anomaly detector is ready less than (\d+)')
def the_anomaly_is_finished_in_less_than(step, secs):
wait_until_anomaly_status_code_is(step, FINISHED, FAULTY, secs)

#@step(r'I create a dataset with only the anomalies')
def create_dataset_with_anomalies(step):
local_anomalies = Anomaly(world.anomaly['resource'])
world.dataset = world.api.create_dataset(
world.dataset['resource'],
{"lisp_filter": local_anomalies.anomalies_filter()})
world.datasets.append(world.dataset['resource'])

#@step(r'I check that the dataset has (\d+) rows')
def the_dataset_has_n_rows(step, rows):
assert world.dataset['rows'] == int(rows), "Expected: %s, found: %s" % (
rows, world.dataset['rows'])
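
Outside the test harness, the new create_dataset_with_anomalies step boils down to this pattern (a hypothetical sketch: the resource ids are invented and BigML credentials are assumed to come from the environment):

    from bigml.api import BigML
    from bigml.anomaly import Anomaly

    api = BigML()
    local_anomaly = Anomaly('anomaly/537b9e06c8db63366c000000', api=api)
    anomalous_dataset = api.create_dataset(
        'dataset/537b9e06c8db63366c000001',
        {"lisp_filter": local_anomaly.anomalies_filter()})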
43 changes: 40 additions & 3 deletions bigml/tests/test_18_create_anomaly.py
@@ -26,7 +26,7 @@
import create_multimodel_steps as mm_create

class TestAnomaly(object):

def test_scenario1(self):
"""
@@ -47,8 +47,8 @@ def test_scenario1(self):
And I check the anomaly detector stems from the original dataset list
Examples:
| data | time_1 | time_2 | time_3 | time_4 |
| ../data/tiny_kdd.csv | 40 | 40 | 40 | 100 |
| data | time_1 | time_2 | time_3 | time_4 |
| ../data/tiny_kdd.csv | 40 | 40 | 80 | 100 |
"""
print self.test_scenario1.__doc__
examples = [
@@ -73,3 +73,40 @@ def test_scenario1(self):
anomaly_create.the_anomaly_is_finished_in_less_than(self,
example[4])
anomaly_create.i_check_anomaly_datasets_and_datasets_ids(self)

def test_scenario2(self):
"""
Scenario: Successfully creating an anomaly detector from a dataset and generating the anomalous dataset:
Given I create a data source uploading a "<data>" file
And I wait until the source is ready less than <time_1> secs
And I create a dataset
And I wait until the dataset is ready less than <time_2> secs
Then I create an anomaly detector with <rows> anomalies from a dataset
And I wait until the anomaly detector is ready less than <time_3> secs
And I create a dataset with only the anomalies
And I wait until the dataset is ready less than <time_4> secs
And I check that the dataset has <rows> rows
Examples:
| data | time_1 | time_2 | time_3 | time_4 | rows |
| ../data/iris_anomalous.csv | 40 | 40 | 80 | 40 | 1 |
"""
print self.test_scenario2.__doc__
examples = [
['data/iris_anomalous.csv', '40', '40', '80', '40', '1']]
for example in examples:
print "\nTesting with:\n", example
source_create.i_upload_a_file(self, example[0])
source_create.the_source_is_finished(self, example[1])
dataset_create.i_create_a_dataset(self)
dataset_create.the_dataset_is_finished_in_less_than(self,
example[2])
anomaly_create.i_create_an_anomaly_with_top_n_from_dataset(
self, example[5])
anomaly_create.the_anomaly_is_finished_in_less_than(self,
example[3])
anomaly_create.create_dataset_with_anomalies(self)
dataset_create.the_dataset_is_finished_in_less_than(self,
example[4])
anomaly_create.the_dataset_has_n_rows(self, example[5])
