Adding bigmler anomaly as a new subcommand
mmerce committed Oct 18, 2014
1 parent d4ab287 commit abd70a2
Showing 27 changed files with 1,618 additions and 103 deletions.
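
The new subcommand is handled by the anomaly_dispatcher entry point added in bigmler/anomaly/dispatcher.py (shown further down). A minimal sketch of how it could be driven programmatically, assuming BigMLer's usual --train, --test and --output options also apply to bigmler anomaly; BigML credentials are expected in the BIGML_USERNAME and BIGML_API_KEY environment variables:

from bigmler.anomaly.dispatcher import anomaly_dispatcher

# Roughly equivalent to: bigmler anomaly --train data.csv --test new_data.csv
# Builds an anomaly detector from data.csv, then writes one score per row of
# new_data.csv to the given output file (anomaly_scores.csv under the output
# directory when --output is omitted).
anomaly_dispatcher(args=["--train", "data.csv",
                         "--test", "new_data.csv",
                         "--output", "my_dir/anomaly_scores.csv"])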
6 changes: 6 additions & 0 deletions HISTORY.rst
@@ -3,6 +3,12 @@
History
-------

1.14.0 (2014-10-19)
~~~~~~~~~~~~~~~~~~~

- Adding bigmler anomaly as a new subcommand to generate anomaly detectors,
  anomaly scores and batch anomaly scores.

1.13.3 (2014-10-13)
~~~~~~~~~~~~~~~~~~~

2 changes: 1 addition & 1 deletion bigmler/__init__.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

-__version__ = '1.13.3'
+__version__ = '1.14.0'

if __name__ == '__main__':
import sys
Empty file added bigmler/anomaly/__init__.py
283 changes: 283 additions & 0 deletions bigmler/anomaly/dispatcher.py
@@ -0,0 +1,283 @@
# -*- coding: utf-8 -*-
#
# Copyright 2014 BigML
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

"""BigMLer - anomaly subcommand processing dispatching
"""
from __future__ import absolute_import

import sys
import os

import bigml.api
import bigmler.utils as u
import bigmler.resources as r
import bigmler.processing.args as a
import bigmler.processing.sources as ps
import bigmler.processing.datasets as pd
import bigmler.processing.anomalies as pa

from bigmler.defaults import DEFAULTS_FILE
#from bigmler.centroid import centroid, remote_centroid
from bigmler.anomaly_score import anomaly_score, remote_anomaly_score
from bigmler.reports import clear_reports, upload_reports
from bigmler.command import Command, StoredCommand
from bigmler.dispatcher import (SESSIONS_LOG, command_handling,
clear_log_files,
has_test, has_train)

COMMAND_LOG = u".bigmler_anomaly"
DIRS_LOG = u".bigmler_anomaly_dir_stack"
LOG_FILES = [COMMAND_LOG, DIRS_LOG, u.NEW_DIRS_LOG]
MINIMUM_MODEL = "full=false"
DEFAULT_OUTPUT = u"anomaly_scores.csv"


def anomaly_dispatcher(args=sys.argv[1:]):
"""Parses command line and calls the different processing functions
"""

# If --clear-logs the log files are cleared
if "--clear-logs" in args:
clear_log_files(LOG_FILES)

command = command_handling(args, COMMAND_LOG)

# Parses command line arguments.
command_args = a.parse_and_check(command)
resume = command_args.resume
if command_args.resume:
# Keep the debug option if set
debug = command_args.debug
# Restore the args of the call to resume from the command log file
stored_command = StoredCommand(args, COMMAND_LOG, DIRS_LOG)
command = Command(None, stored_command=stored_command)
# Logs the issued command and the resumed command
session_file = os.path.join(stored_command.output_dir, SESSIONS_LOG)
stored_command.log_command(session_file=session_file)
# Parses resumed arguments.
command_args = a.parse_and_check(command)
if command_args.predictions is None:
command_args.predictions = os.path.join(stored_command.output_dir,
DEFAULT_OUTPUT)
else:
if command_args.output_dir is None:
command_args.output_dir = a.NOW
if command_args.predictions is None:
command_args.predictions = os.path.join(command_args.output_dir,
DEFAULT_OUTPUT)
if len(os.path.dirname(command_args.predictions).strip()) == 0:
command_args.predictions = os.path.join(command_args.output_dir,
command_args.predictions)
directory = u.check_dir(command_args.predictions)
session_file = os.path.join(directory, SESSIONS_LOG)
u.log_message(command.command + "\n", log_file=session_file)
try:
defaults_file = open(DEFAULTS_FILE, 'r')
contents = defaults_file.read()
defaults_file.close()
defaults_copy = open(os.path.join(directory, DEFAULTS_FILE),
'w', 0)
defaults_copy.write(contents)
defaults_copy.close()
except IOError:
pass
u.sys_log_message(u"%s\n" % os.path.abspath(directory),
log_file=DIRS_LOG)

# Creates the corresponding api instance
if resume and debug:
command_args.debug = True
api = a.get_api_instance(command_args, u.check_dir(session_file))

# Selects the action to perform
if (has_train(command_args) or has_test(command_args)):
output_args = a.get_output_args(api, command_args, resume)
a.transform_args(command_args, command.flags, api,
command.user_defaults)
compute_output(**output_args)
u.log_message("_" * 80 + "\n", log_file=session_file)


def compute_output(api, args):
""" Creates one or more models using the `training_set` or uses the ids
of previously created BigML models to make predictions for the `test_set`.
"""
source = None
dataset = None
anomaly = None
anomalies = None
fields = None
# no multi-label support at present

# variables from command-line options
resume = args.resume_
anomaly_ids = args.anomaly_ids_
output = args.predictions
# there's only one anomaly detector to be generated at present
args.max_parallel_anomalies = 1
# anomalies cannot be published yet.
args.public_anomaly = False

# It is compulsory to have a description to publish either datasets or
# anomalies
if (not args.description_ and (args.public_anomaly or
args.public_dataset)):
sys.exit("You should provide a description to publish.")

# When using --new-fields, it is compulsory to specify also a dataset
# id
if args.new_fields and not args.dataset:
sys.exit("To use --new-fields you must also provide a dataset id"
" to generate the new dataset from it.")

path = u.check_dir(output)
session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
csv_properties = {}
# If logging is required set the file for logging
log = None
if args.log_file:
u.check_dir(args.log_file)
log = args.log_file
# If --clear-logs the log files are cleared
clear_log_files([log])

source, resume, csv_properties, fields = ps.source_processing(
api, args, resume,
csv_properties=csv_properties,
session_file=session_file, path=path, log=log)

datasets, resume, csv_properties, fields = pd.dataset_processing(
source, api, args, resume,
fields=fields,
csv_properties=csv_properties,
session_file=session_file, path=path, log=log)
if datasets:
dataset = datasets[0]
if args.to_csv is not None:
resume = pd.export_dataset(dataset, api, args, resume,
session_file=session_file, path=path)

# If test_split is used, split the dataset into a training and a test dataset
# according to the given split
if args.test_split > 0:
dataset, test_dataset, resume = pd.split_processing(
dataset, api, args, resume,
session_file=session_file, path=path, log=log)
datasets[0] = dataset

# If multi-dataset flag is on, generate a new dataset from the given
# list of datasets
if args.multi_dataset:
dataset, resume = pd.create_new_dataset(
datasets, api, args, resume, fields=fields,
session_file=session_file, path=path, log=log)
datasets = [dataset]

# Check if the dataset has a generators file associated with it, and
# generate a new dataset with the specified field structure
if args.new_fields:
dataset, resume = pd.create_new_dataset(
dataset, api, args, resume, fields=fields,
session_file=session_file, path=path, log=log)
datasets[0] = dataset

anomalies, anomaly_ids, resume = pa.anomalies_processing(
datasets, anomalies, anomaly_ids, api, args, resume, fields=fields,
session_file=session_file, path=path, log=log)
if anomalies:
anomaly = anomalies[0]

# We update the anomaly's public state if needed
if anomaly:
if isinstance(anomaly, basestring):
if not has_test(args):
query_string = MINIMUM_MODEL
else:
query_string = ''
anomaly = u.check_resource(anomaly, query_string=query_string,
api=api)
anomalies[0] = anomaly
if (args.public_anomaly or
(args.shared_flag and r.shared_changed(args.shared, anomaly))):
anomaly_args = {}
if args.shared_flag and r.shared_changed(args.shared, anomaly):
anomaly_args.update(shared=args.shared)
if args.public_anomaly:
anomaly_args.update(r.set_publish_anomaly_args(args))
if anomaly_args:
anomaly = r.update_anomaly(anomaly, anomaly_args, args,
api=api, path=path,
session_file=session_file)
anomalies[0] = anomaly

# We get the fields of the anomaly detector if we haven't got
# them yet and need them
if anomaly and args.test_set:
fields = pa.get_anomaly_fields(anomaly, csv_properties, args)

# If predicting
if anomalies and has_test(args):
test_dataset = None

# Remote anomaly scores: scores are computed as batch anomaly scores
# in bigml.com except when the --no-batch flag is set
if args.remote and not args.no_batch:
# create test source from file
test_name = "%s - test" % args.name
if args.test_source is None:
(test_source,
resume,
csv_properties,
test_fields) = ps.test_source_processing(
api, args, resume, name=test_name,
session_file=session_file, path=path, log=log)
else:
test_source_id = bigml.api.get_source_id(args.test_source)
test_source = api.check_resource(test_source_id,
api.get_source)
if args.test_dataset is None:
# create test dataset from test source
dataset_args = r.set_basic_dataset_args(args, name=test_name)
test_dataset, resume = pd.alternative_dataset_processing(
test_source, "test", dataset_args, api, args,
resume, session_file=session_file, path=path, log=log)
else:
test_dataset_id = bigml.api.get_dataset_id(args.test_dataset)
test_dataset = api.check_resource(test_dataset_id,
api.get_dataset)
test_fields = pd.get_fields_structure(test_dataset,
csv_properties)
batch_anomaly_score_args = r.set_batch_anomaly_score_args(
args, fields=fields,
dataset_fields=test_fields)

remote_anomaly_score(anomaly, test_dataset,
batch_anomaly_score_args, args,
api, resume, prediction_file=output,
session_file=session_file, path=path, log=log)

else:
anomaly_score(anomalies, fields, args, session_file=session_file)

u.print_generated_files(path, log_file=session_file,
verbosity=args.verbosity)
if args.reports:
clear_reports(path)
if args.upload:
upload_reports(args.reports, path)
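
As a quick note on the scoring branch above: when a test set is present, the scores are created as a batch anomaly score on bigml.com whenever --remote is set and --no-batch is not, and through the anomaly_score path otherwise. A sketch of both invocations, under the same flag assumptions as the first example above:

from bigmler.anomaly.dispatcher import anomaly_dispatcher

# Default path: scores are produced through the anomaly_score helper.
anomaly_dispatcher(args=["--train", "data.csv", "--test", "new_data.csv"])

# Remote path: a batch anomaly score resource is created on bigml.com and its
# result is written to the predictions file (see remote_anomaly_score above).
anomaly_dispatcher(args=["--train", "data.csv", "--test", "new_data.csv",
                         "--remote"])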