Commit b879a24

Bug fixing: removing id fields from anomaly filter to generate anomalies dataset

mmerce committed Jun 25, 2015
1 parent 5e985b3 commit b879a24
Showing 7 changed files with 83 additions and 7 deletions.
6 changes: 6 additions & 0 deletions HISTORY.rst
@@ -3,6 +3,12 @@
History
-------

4.1.6 (2015-06-25)
~~~~~~~~~~~~~~~~~~

- Fixing bug: Removing id fields from the filter to select the anomalies listed
in the Anomaly object from the origin dataset.

4.1.5 (2015-06-06)
~~~~~~~~~~~~~~~~~~

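For context: Anomaly.anomalies_filter() builds a Flatline predicate that picks the top-anomaly rows out of the origin dataset. After this fix, fields listed in the detector's id_fields contribute no clauses, so a hypothetical filter (field ids and values invented, with "000000" as an id field) looks like:

    (or (and (= (f "000001") 5.1) (= (f "000002") "Iris-setosa"))
        (and (missing? "000001") (= (f "000002") "Iris-virginica")))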
2 changes: 1 addition & 1 deletion bigml/__init__.py
@@ -1 +1 @@
__version__ = '4.1.5'
__version__ = '4.1.6'
10 changes: 8 additions & 2 deletions bigml/anomaly.py
@@ -41,6 +41,7 @@
LOGGER = logging.getLogger('BigML')

import math
import json

from bigml.api import FINISHED
from bigml.api import (BigML, get_anomaly_id, get_status)
@@ -72,6 +73,7 @@ def __init__(self, anomaly, api=None):
self.expected_mean_depth = None
self.iforest = None
self.top_anomalies = None
self.id_fields = []
if not (isinstance(anomaly, dict) and 'resource' in anomaly and
anomaly['resource'] is not None):
if api is None:
@@ -90,6 +92,7 @@ def __init__(self, anomaly, api=None):
anomaly = anomaly['object']
self.sample_size = anomaly.get('sample_size')
self.input_fields = anomaly.get('input_fields')
self.id_fields = anomaly.get('id_fields', [])
if 'model' in anomaly and isinstance(anomaly['model'], dict):
ModelFields.__init__(self, anomaly['model'].get('fields'))
if ('top_anomalies' in anomaly['model'] and
@@ -166,15 +169,18 @@ def anomalies_filter(self, include=True):
row = anomaly.get('row', [])
for index in range(len(row)):
field_id = self.input_fields[index]
if field_id in self.id_fields:
continue
value = row[index]
if value is None:
filter_rules.append('(missing? "%s")' % field_id)
else:
if (self.fields[field_id]["optype"]
in ["categorical", "text"]):
value = '"%s"' % value
value = json.dumps(value)
filter_rules.append('(= (f "%s") %s)' % (field_id, value))
anomaly_filters.append("(and %s)" % " ".join(filter_rules))
if filter_rules:
anomaly_filters.append("(and %s)" % " ".join(filter_rules))

anomalies_filter = " ".join(anomaly_filters)
if include:
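A minimal, self-contained sketch of the fixed filter-building logic above (not the library code itself; field ids, optypes and rows are invented):

    import json

    input_fields = ["000000", "000001", "000004"]
    id_fields = ["000000"]
    fields = {"000001": {"optype": "numeric"},
              "000004": {"optype": "categorical"}}
    top_anomalies = [{"row": [1, 5.1, "Iris-setosa"]},
                     {"row": [2, None, "Iris-virginica"]}]

    anomaly_filters = []
    for anomaly in top_anomalies:
        filter_rules = []
        row = anomaly.get("row", [])
        for index, value in enumerate(row):
            field_id = input_fields[index]
            if field_id in id_fields:
                # the fix: id fields add no clause to the filter
                continue
            if value is None:
                filter_rules.append('(missing? "%s")' % field_id)
            elif fields[field_id]["optype"] in ["categorical", "text"]:
                # json.dumps quotes and escapes string values safely
                filter_rules.append('(= (f "%s") %s)'
                                    % (field_id, json.dumps(value)))
            else:
                filter_rules.append('(= (f "%s") %s)' % (field_id, value))
        if filter_rules:
            anomaly_filters.append("(and %s)" % " ".join(filter_rules))

    print "(or %s)" % " ".join(anomaly_filters)

The include flag in anomalies_filter(include=True) wraps this expression in (or ...) to select the listed rows; presumably the expression is negated instead when include is False.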
1 change: 1 addition & 0 deletions bigml/ensemble.py
@@ -263,6 +263,7 @@ def predict(self, input_data, by_name=True, method=PLURALITY_CODE,
if median:
for prediction in votes.predictions:
prediction['prediction'] = prediction['median']
print votes.predictions
return votes.combine(method=method, with_confidence=with_confidence,
add_confidence=add_confidence,
add_distribution=add_distribution,
3 changes: 2 additions & 1 deletion bigml/resourcehandler.py
@@ -63,7 +63,8 @@
CENTROID_RE = re.compile(r'^%s/%s$' % (CENTROID_PATH, ID_PATTERN))
BATCH_CENTROID_RE = re.compile(r'^%s/%s$' % (BATCH_CENTROID_PATH,
ID_PATTERN))
ANOMALY_RE = re.compile(r'^%s/%s$' % (ANOMALY_PATH, ID_PATTERN))
ANOMALY_RE = re.compile(r'^(public/)?%s/%s$|^shared/%s/%s$' % (
ANOMALY_PATH, ID_PATTERN, ANOMALY_PATH, SHARED_PATTERN))
ANOMALY_SCORE_RE = re.compile(r'^%s/%s$' % (ANOMALY_SCORE_PATH, ID_PATTERN))
BATCH_ANOMALY_SCORE_RE = re.compile(r'^%s/%s$' % (BATCH_ANOMALY_SCORE_PATH,
ID_PATTERN))
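The widened ANOMALY_RE now also accepts public and shared anomaly ids. A hedged sketch with stand-in constants (the module's real ANOMALY_PATH, ID_PATTERN and SHARED_PATTERN are defined elsewhere in resourcehandler.py, and the ids below are invented):

    import re

    ANOMALY_PATH = 'anomaly'
    ID_PATTERN = '[a-f0-9]{24}'
    SHARED_PATTERN = '[a-zA-Z0-9]{24,30}'
    ANOMALY_RE = re.compile(r'^(public/)?%s/%s$|^shared/%s/%s$' % (
        ANOMALY_PATH, ID_PATTERN, ANOMALY_PATH, SHARED_PATTERN))

    for resource_id in ['anomaly/537b9e06c8db63366c000000',
                        'public/anomaly/537b9e06c8db63366c000000',
                        'shared/anomaly/myAnomalySharedKey12345678',
                        'model/537b9e06c8db63366c000000']:
        print resource_id, bool(ANOMALY_RE.match(resource_id))

The first three ids match; the last one does not.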
25 changes: 25 additions & 0 deletions bigml/tests/create_anomaly_steps.py
@@ -11,6 +11,7 @@
from bigml.api import FINISHED
from bigml.api import FAULTY
from bigml.api import get_status
from bigml.anomaly import Anomaly

#@step(r'I check the anomaly detector stems from the original dataset list')
def i_check_anomaly_datasets_and_datasets_ids(step):
@@ -51,6 +52,17 @@ def i_create_an_anomaly_from_dataset(step):
world.anomaly = resource['object']
world.anomalies.append(resource['resource'])

#@step(r'I create an anomaly detector with (\d+) anomalies from a dataset$')
def i_create_an_anomaly_with_top_n_from_dataset(step, top_n):
dataset = world.dataset.get('resource')
resource = world.api.create_anomaly(
dataset, {'seed': 'BigML', 'top_n': int(top_n)})
world.status = resource['code']
assert world.status == HTTP_CREATED, "Expected: %s, found: %s" % (
HTTP_CREATED, world.status)
world.location = resource['location']
world.anomaly = resource['object']
world.anomalies.append(resource['resource'])

#@step(r'I create an anomaly detector from a dataset list$')
def i_create_an_anomaly_from_dataset_list(step):
@@ -77,3 +89,16 @@ def wait_until_anomaly_status_code_is(step, code1, code2, secs):
#@step(r'I wait until the anomaly detector is ready less than (\d+)')
def the_anomaly_is_finished_in_less_than(step, secs):
wait_until_anomaly_status_code_is(step, FINISHED, FAULTY, secs)

#@step(r'I create a dataset with only the anomalies')
def create_dataset_with_anomalies(step):
local_anomalies = Anomaly(world.anomaly['resource'])
world.dataset = world.api.create_dataset(
world.dataset['resource'],
{"lisp_filter": local_anomalies.anomalies_filter()})
world.datasets.append(world.dataset['resource'])

#@step(r'I check that the dataset has (\d+) rows')
def the_dataset_has_n_rows(step, rows):
assert world.dataset['rows'] == int(rows), "Expected: %s, found: %s" % (
rows, world.dataset['rows'])
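
Outside the test harness, the new create_dataset_with_anomalies step boils down to this pattern (a hypothetical sketch: the resource ids are invented and BigML credentials are assumed to come from the environment):

    from bigml.api import BigML
    from bigml.anomaly import Anomaly

    api = BigML()
    local_anomaly = Anomaly('anomaly/537b9e06c8db63366c000000', api=api)
    anomalous_dataset = api.create_dataset(
        'dataset/537b9e06c8db63366c000001',
        {"lisp_filter": local_anomaly.anomalies_filter()})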
43 changes: 40 additions & 3 deletions bigml/tests/test_18_create_anomaly.py
@@ -26,7 +26,7 @@
import create_multimodel_steps as mm_create

class TestAnomaly(object):

def test_scenario1(self):
"""
@@ -47,8 +47,8 @@ def test_scenario1(self):
And I check the anomaly detector stems from the original dataset list
Examples:
| data | time_1 | time_2 | time_3 | time_4 |
| ../data/tiny_kdd.csv | 40 | 40 | 40 | 100 |
| data | time_1 | time_2 | time_3 | time_4 |
| ../data/tiny_kdd.csv | 40 | 40 | 80 | 100 |
"""
print self.test_scenario1.__doc__
examples = [
@@ -73,3 +73,40 @@ def test_scenario1(self):
anomaly_create.the_anomaly_is_finished_in_less_than(self,
example[4])
anomaly_create.i_check_anomaly_datasets_and_datasets_ids(self)

def test_scenario2(self):
"""
Scenario: Successfully creating an anomaly detector from a dataset and generating the anomalous dataset:
Given I create a data source uploading a "<data>" file
And I wait until the source is ready less than <time_1> secs
And I create a dataset
And I wait until the dataset is ready less than <time_2> secs
Then I create an anomaly detector with <rows> anomalies from a dataset
And I wait until the anomaly detector is ready less than <time_3> secs
And I create a dataset with only the anomalies
And I wait until the dataset is ready less than <time_4> secs
And I check that the dataset has <rows> rows
Examples:
| data | time_1 | time_2 | time_3 | time_4 | rows |
| ../data/iris_anomalous.csv | 40 | 40 | 80 | 40 | 1 |
"""
print self.test_scenario2.__doc__
examples = [
['data/iris_anomalous.csv', '40', '40', '80', '40', '1']]
for example in examples:
print "\nTesting with:\n", example
source_create.i_upload_a_file(self, example[0])
source_create.the_source_is_finished(self, example[1])
dataset_create.i_create_a_dataset(self)
dataset_create.the_dataset_is_finished_in_less_than(self,
example[2])
anomaly_create.i_create_an_anomaly_with_top_n_from_dataset(
self, example[5])
anomaly_create.the_anomaly_is_finished_in_less_than(self,
example[3])
anomaly_create.create_dataset_with_anomalies(self)
dataset_create.the_dataset_is_finished_in_less_than(self,
example[4])
anomaly_create.the_dataset_has_n_rows(self, example[5])
