In [119]:
# Real life data

import logging
import threading
import itertools
import json
import os
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits.mplot3d import axes3d
import seaborn as seabornInstance
from sqlalchemy import Column, Integer, String, Float, DateTime, Boolean, func
from iotfunctions import base
from iotfunctions import bif
from iotfunctions import entity
from iotfunctions import metadata
from iotfunctions.metadata import EntityType
from iotfunctions.db import Database
from iotfunctions.enginelog import EngineLogging
from iotfunctions import estimator
from iotfunctions.ui import (UISingle, UIMultiItem, UIFunctionOutSingle,
                 UISingleItem, UIFunctionOutMulti, UIMulti, UIExpression,
                 UIText, UIStatusFlag, UIParameters)
from mmfunctions.anomaly import (SaliencybasedGeneralizedAnomalyScore, SpectralAnomalyScore,
                 FFTbasedGeneralizedAnomalyScore, KMeansAnomalyScore)
import datetime as dt
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score, r2_score

import scipy as sp
import scipy.fftpack
import skimage as ski

from skimage import util as skiutil # for nifty windowing
import pyod as pyod
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
from pyod.utils.example import visualize
from pyod.models.knn import KNN
from pyod.models.iforest import IForest
# render matplotlib figures inline, below the cell that produced them
%matplotlib inline
from pandas.plotting import register_matplotlib_converters
# let matplotlib handle pandas datetime types on plot axes
register_matplotlib_converters()

# send log output of the imported packages to the console at INFO level
EngineLogging.configure_console_logging(logging.INFO)

#### Train a 2-layered LSTM in Watson Machine Learning

 
Telemanom ([Detecting Spacecraft Anomalies Using LSTMs and Nonparametric Dynamic Thresholding](https://arxiv.org/pdf/1802.04431.pdf))


Let's find out what ML libraries are supported in WML.


In [146]:
# make sure to downgrade to sklearn 0.22.2 (no >= 0.23)
from watson_machine_learning_client import WatsonMachineLearningAPIClient

# Credentials live in two local JSON files: one for Watson Machine Learning,
# one for Cloud Object Storage.
with open('credentials_wml.json', encoding='utf-8') as wml_file:
    wml_credentials = json.load(wml_file)

with open('credentials_cos.json', encoding='utf-8') as cos_file:
    cos_credentials = json.load(cos_file)

# --- Watson Machine Learning ---
wml_url = wml_credentials['url']
wml_instance_id = wml_credentials['instance_id']
wml_apikey = wml_credentials['apikey']

wml_data_source_type = 's3'

# --- Cloud Object Storage ---
cos_endpoint = cos_credentials['endpoints']
cos_apikey = cos_credentials['apikey']
cos_hmac_keys = cos_credentials['cos_hmac_keys']
cos_access_key = cos_hmac_keys['access_key_id']
cos_secret_key = cos_hmac_keys['secret_access_key']
# 'https://s3.eu.cloud-object-storage.appdomain.cloud'

# training input and training results share one bucket
cos_input_bucket = 'githubanalyzer-donotdelete-pr-b9xa3kxotzh5in'
cos_output_bucket = 'githubanalyzer-donotdelete-pr-b9xa3kxotzh5in'

# connect to WML and list the runtimes it offers
client = WatsonMachineLearningAPIClient(wml_credentials)
rep_list = client.runtimes.list(limit=4000)

--------------------------  --------------------------  ------------------------  --------
GUID                        NAME                        CREATED                   PLATFORM
do_12.10                    do_12.10                    2020-03-20T04:19:17.471Z  do
xgboost_0.90-py3.6          xgboost_0.90-py3.6          2020-03-20T04:19:03.205Z  python
scikit-learn_0.22-py3.6     scikit-learn_0.22-py3.6     2020-03-20T04:18:53.589Z  python
spark-mllib_2.4             spark-mllib_2.4             2020-02-06T09:30:35.538Z  spark
tensorflow_1.15-py3.6       tensorflow_1.15-py3.6       2020-02-06T09:30:30.574Z  python
pytorch-onnx_1.2-py3.6      pytorch-onnx_1.2-py3.6      2020-02-06T09:29:58.456Z  python
pytorch-onnx_1.2-py3.6-edt  pytorch-onnx_1.2-py3.6-edt  2020-02-06T09:29:54.031Z  python
tensorflow_1.14-py3.6       tensorflow_1.14-py3.6       2019-10-23T09:54:32.847Z  python
pytorch-onnx_1.1-py3.6      pytorch-onnx_1.1-py3.6      2019-10-23T09:54:02.251Z  python
pytorch-onnx_1.1-py3.6

### First step 

Apparently Keras is missing from this list. However, Telemanom is built on Keras, so we have to port it to either TensorFlow, PyTorch, MXNet, Caffe or Theano.

I opted for PyTorch for skill-building purposes and ported Telemanom to it.



<small>
    
```
    
class LSTM_2L(nn.Module):
    """Stacked LSTM followed by a linear layer that maps the last time step to predictions.

    Args:
        n_features: number of input features per time step.
        hidden_dims: one entry per stacked LSTM layer; defaults to [80, 80].
            nn.LSTM uses a single hidden size for all of its layers, so all
            entries must be equal - the length sets the depth, the value the width.
        seq_length: length of the input sequences (stored, not used by the layers).
        batch_size: batch size the stored initial hidden state is built for.
        n_predictions: number of values predicted per input sequence.
        dropout: dropout probability passed to nn.LSTM.

    Raises:
        ValueError: if hidden_dims is empty or its entries differ.
    """

    def __init__(self, n_features = 1, hidden_dims = None, seq_length = 250,
                 batch_size = 64, n_predictions = 10, dropout = 0.3):
        super(LSTM_2L, self).__init__()
        # a list literal as default would be shared between all instances
        if hidden_dims is None:
            hidden_dims = [80, 80]
        if len(hidden_dims) == 0 or any(dim != hidden_dims[0] for dim in hidden_dims):
            raise ValueError(
                'hidden_dims must be a non-empty list of identical sizes, got {}'.format(hidden_dims))
        print ('LSTM_2L', n_features, hidden_dims, seq_length, batch_size, n_predictions, dropout)

        self.n_features = n_features
        self.hidden_dims = hidden_dims
        self.seq_length = seq_length
        self.num_layers = len(self.hidden_dims)
        self.batch_size = batch_size

        self.lstm1 = nn.LSTM(
            input_size = self.n_features,
            hidden_size = self.hidden_dims[0],
            batch_first = True,
            dropout = dropout,
            # must agree with the first dimension of the hidden state built below
            num_layers = self.num_layers)

        # the LSTM emits hidden_dims[0] features per time step
        self.linear = nn.Linear(self.hidden_dims[0], n_predictions)
        self.init_hidden_state()

    def _new_hidden_state(self, batch_size):
        """Return a randomly initialised (h_0, c_0) pair for the given batch size."""
        return (
            torch.randn(self.num_layers, batch_size, self.hidden_dims[0]), #.to(self.device),
            torch.randn(self.num_layers, batch_size, self.hidden_dims[0]), #.to(self.device),
            )

    def init_hidden_state(self):
        """(Re)sample the stored initial hidden state for the configured batch size."""
        self.hidden = self._new_hidden_state(self.batch_size)

    def forward(self, sequences):
        """Predict n_predictions values for each sequence.

        Args:
            sequences: tensor of shape (batch, seq_len, n_features), batch first.

        Returns:
            Tensor of shape (batch, n_predictions).
        """
        batch_size, _, _ = sequences.size()  # batch first

        if batch_size == self.batch_size:
            hidden = self.hidden
        else:
            # e.g. the last, smaller batch of an epoch: the stored state does not fit,
            # so sample a matching one instead of failing inside nn.LSTM
            hidden = self._new_hidden_state(batch_size)

        lstm1_out, _ = self.lstm1(sequences, hidden)

        # only the output of the final time step feeds the prediction layer
        last_time_step = lstm1_out[:,-1,:]

        y_pred = self.linear(last_time_step)

        return y_pred
 ```
</small>

In [101]:
# part of mmfunctions
import numpy as np
import pandas as pd 

import telemanom
from telemanom.helpers import Config
from telemanom.errors import Errors
import telemanom.helpers as helpers
from telemanom.channel import Channel
from telemanom.modeling import Model

conf = Config("./telemanom/config.yaml")

# Overrides for this notebook, written to the raw dictionary ...
conf.dictionary.update({'l_s': 250, 'epochs': 80, 'dropout': 0.2})

# ... and to the attributes of the config object
conf.l_s = 250
conf.epochs = 80    # max
conf.dropout = 0.2
conf.batch_size = 512
conf.lstm_batch_size=64

In [105]:
#
# Define structure for local data
#              telemanom supports multiple channels to reflect spacecraft sensors, we only need a single one now
#
# name of the single channel used in this notebook
device="Armstarknew"
chan = Channel(conf, device)
# presumably creates the working directories for this run id below ./telemanom - verify in telemanom.helpers
helpers.make_dirs(conf.use_id, conf, "./telemanom")
print(chan)
# NOTE(review): a bare expression that is not the last line of a cell displays nothing;
# move `conf` to the end of the cell if the config is meant to be shown
conf

# load data

# whitespace-separated numeric text files, attached to the channel as train and test sets
chan.train = np.loadtxt('./telemanom/wml_train.csv')
chan.test = np.loadtxt('./telemanom/wml_test.csv')



Channel:Channel


#### The following steps replay the code in wml_telemanom.py

We skip the next few cells unless we want to initiate a local training run.

In [106]:
# producing overlapping windows of length 260 for lookback (250) and prediction (10)
# the `train` flag tells the channel which of the two data sets is being shaped
chan.shape_data(chan.train, train=True)
chan.shape_data(chan.test, train=False)

2020-07-27 12:28:39,117 - telemanom - INFO - FFT channel: False


2020-07-27T12:28:39.117 INFO telemanom.shape_data FFT channel: False
(129300, 2)


2020-07-27 12:28:39,563 - telemanom - INFO - FFT channel: False


2020-07-27T12:28:39.563 INFO telemanom.shape_data FFT channel: False
(129195, 2)


In [107]:
# init the Python double stacked LSTM model
# arguments: config, run id, channel, working directory
# NOTE(review): the meaning of the trailing `False` is not visible here - confirm in telemanom.modeling.Model
model = Model(conf, conf.use_id, chan, "./telemanom", False)

LSTM_2L 2 [80, 80] 250 64 10 0.2
init hidden state
Hidden dimension 0:  2 64 80
Hidden dimension 1:  2 64 80
input shape:  (None, 2)


In [109]:
import torch

# Re-use previously trained weights when they can be loaded; otherwise train locally
# and cache the result for the next run.
trained_model_path = "./mytrainedpytorchmodel"
try:
    model.model.load_state_dict(torch.load(trained_model_path))
except Exception as load_error:
    # Falling back to training is deliberate, but say why: a missing file is expected
    # on the first run, while a corrupt or incompatible checkpoint should not be
    # replaced by a fresh training run without anyone noticing.
    logging.getLogger(__name__).warning(
        "could not load %s (%s: %s) - training a new model",
        trained_model_path, type(load_error).__name__, load_error)
    # drink a coffee - training takes roughly 30 minutes
    model.train_new(chan)
    torch.save(model.model.state_dict(), trained_model_path)
else:
    # loaded weights are only used for inference
    model.model.eval()

# no training run - we've already spent CPU cycles last week
#

In [110]:
# NOTE(review): `state_dict` is referenced but not called, so the cell displays the
# bound method's repr (which embeds the module summary) rather than the weights
model.model.state_dict

<bound method Module.state_dict of LSTM_2L(
  (lstm1): LSTM(2, 80, num_layers=2, batch_first=True, dropout=0.2)
  (linear): Linear(in_features=80, out_features=10, bias=True)
)>

In [111]:
from IPython.display import display, Markdown

# Remind the reader of the manual upload step and name the bucket it has to go to.
upload_reminder = 'Make sure you have uploaded the code in mmfunctions/telemanom as zip file to COS bucket'
Markdown('<strong>{}</strong><br/>{}'.format(upload_reminder, cos_input_bucket))

<strong>Make sure you have uploaded the code in mmfunctions/telemanom as zip file to COS bucket</strong><br/>githubanalyzer-donotdelete-pr-b9xa3kxotzh5in

In [139]:
# Zip the code in the ./telemanom subdirectory first

import subprocess  # no longer needed by this cell; kept in case other cells rely on it

# Check for the archive with the standard library instead of spawning a shell for `ls`
# and treating any exception as "not found".
wml_zip_path = "./telemanom/wml_model.zip"
if os.path.isfile(wml_zip_path):
    # same text the `ls` call used to produce: the path on one line, the verdict on the next
    output = wml_zip_path + '\n' + 'found - good'
else:
    output = 'Not found - do it now'

Markdown('<strong>{}</strong><br/>'.format(output))

<strong>./telemanom/wml_model.zip
found - good</strong><br/>

In [162]:
# check whether we have uploaded the code
!s3cmd --access_key {cos_access_key} --secret_key {cos_secret_key} \
--access_token {cos_apikey} --host s3.eu.cloud-object-storage.appdomain.cloud --host-bucket=s3.eu.cloud-object-storage.appdomain.cloud \
ls s3://githubanalyzer-donotdelete-pr-b9xa3kxotzh5in

                          DIR  s3://githubanalyzer-donotdelete-pr-b9xa3kxotzh5in/_wml_checkpoints/
                          DIR  s3://githubanalyzer-donotdelete-pr-b9xa3kxotzh5in/bf25a013-c8e9-4501-a8af-bd5a3bbc22a6/
                          DIR  s3://githubanalyzer-donotdelete-pr-b9xa3kxotzh5in/notebook/
                          DIR  s3://githubanalyzer-donotdelete-pr-b9xa3kxotzh5in/training-h9VOfZVMR/
2020-07-24 16:54      2661699  s3://githubanalyzer-donotdelete-pr-b9xa3kxotzh5in/wml_model.zip


In [164]:

# --- training code and how to start it ---
wml_train_code = './telemanom/wml_model.zip'        # where this notebook finds the code
model_code = wml_train_code
wml_execution_command = 'python3 wml_telemanom.py'  # command to start training

# --- framework and runtime ---
wml_framework_name, wml_framework_version = 'pytorch', '1.1'   # we run on pytorch 1.1
wml_runtime, wml_runtime_version = 'python', '3.6'             # and python 3.6
# sdk level: "<framework version>-py<python version>"
wml_runtime_version_v4 = '-py'.join([wml_framework_version, wml_runtime_version])

# --- names of the WML artifacts ---
wml_run_definition = 'wml-pytorch-definition'  # dummy name
wml_run_name = 'wml-pytorch-run'               # more dummy
wml_model_name = 'wml-tensorflow-miregal'      # even more dummy

# --- compute ---
wml_compute_name = 'k80'   # free tier machine type
wml_compute_nodes = '1'    # free tier
wml_compute_nodes_v4 = int(wml_compute_nodes)


In [165]:
# display the local path of the training code archive
wml_train_code

'./telemanom/wml_model.zip'

In [167]:
#
# define library meta data for our training code
#
library_meta_names = client.runtimes.LibraryMetaNames

# framework (and the versions of it) the library is meant for
library_platform = {"name": wml_framework_name, "versions": [wml_framework_version]}

lib_meta = {
    library_meta_names.NAME: wml_run_definition,
    library_meta_names.VERSION: wml_framework_version,
    library_meta_names.FILEPATH: model_code,
    library_meta_names.PLATFORM: library_platform
}

In [51]:
#
# do we have a library with that name defined ?
#   delete it first and then store the new updated library
#
library_details = client.runtimes.get_library_details()

# first stored library carrying our name, if there is one
stale_library = next(
    (candidate for candidate in library_details['resources']
     if candidate['entity']['name'] == wml_run_definition),
    None)

if stale_library is not None:
    # Delete library if exist because we cannot update model_code
    uid = client.runtimes.get_library_uid(stale_library)
    print ('delete ', stale_library)
    client.repository.delete(uid)

# upload the training code and remember the uid of the stored library
custom_library_details = client.runtimes.store_library(lib_meta)
custom_library_uid = client.runtimes.get_library_uid(custom_library_details)


In [53]:
#
# define a pipeline with a single entry (node) for the training run
#  we could add more node for scaling/normalizing, imputation, feature extraction, "you name it"
#

# where the training node runs
compute_spec = {
    "name": wml_compute_name,            # specify where to run it (not that I have a choice)
    "nodes": wml_compute_nodes_v4
}

# the one and only node: start the training command from the uploaded library
training_node = {
    "id": "training",
    "type": "model_node",
    "op": "dl_train",
    "runtime_ref": wml_run_name,
    "inputs": [],
    "outputs": [],
    "parameters": {
        "name": "tf-mnist",
        "description": wml_run_definition,
        "command": wml_execution_command,
        "training_lib_href": "/v4/libraries/"+custom_library_uid,
        "compute": compute_spec
    }
}

# runtime the node refers to through "runtime_ref"
runtime_spec = {
    "id": wml_run_name,
    "name": wml_framework_name,         # run it on a pytorch image
    "version": wml_runtime_version_v4
}

doc = {
    "doc_type": "pipeline",
    "version": "2.0",
    "primary_pipeline": wml_framework_name,
    "pipelines": [{
        "id": wml_framework_name,
        "runtime_ref": "hybrid",
        "nodes": [training_node]
    }],
    "runtimes": [runtime_spec]
}

# put it in metadata object
# NOTE(review): this rebinds `metadata`, which the first cell imports as the iotfunctions module
metadata = {
    client.repository.PipelineMetaNames.NAME: wml_run_name,
    client.repository.PipelineMetaNames.DOCUMENT: doc
}

# and create the pipeline
stored_pipeline = client.repository.store_pipeline(meta_props=metadata)
pipeline_id = client.pipelines.get_uid(stored_pipeline)


In [55]:
# this is my pipeline now
# (the stored definition as returned by the service, displayed as the cell output)
client.pipelines.get_details(pipeline_id)

{'metadata': {'name': 'wml-pytorch-run',
  'guid': '871b686e-b077-4005-a0ee-28e87f3facab',
  'rev': '00d0f17c-cb83-4401-9d66-585c106562e0',
  'id': '871b686e-b077-4005-a0ee-28e87f3facab',
  'modified_at': '2020-07-24T16:37:37.663Z',
  'created_at': '2020-07-24T16:37:37.596Z',
  'href': '/v4/pipelines/871b686e-b077-4005-a0ee-28e87f3facab?rev=00d0f17c-cb83-4401-9d66-585c106562e0'},
 'entity': {'space': {'id': '88740b60-6b2f-4f74-b6d8-20528d14db8b',
   'href': '/v4/spaces/88740b60-6b2f-4f74-b6d8-20528d14db8b'},
  'name': 'wml-pytorch-run',
  'document': {'doc_type': 'pipeline',
   'version': '2.0',
   'pipelines': [{'id': 'pytorch',
     'runtime_ref': 'hybrid',
     'nodes': [{'outputs': [],
       'id': 'training',
       'inputs': [],
       'type': 'model_node',
       'parameters': {'name': 'tf-mnist',
        'description': 'wml-pytorch-definition',
        'compute': {'name': 'k80', 'nodes': 1},
        'command': 'python3 wml_telemanom.py',
        'training_lib_href': '/v4/librar

In [70]:
# start the training run for v4

def _redact_secrets(value, secret_keys=("access_key_id", "secret_access_key")):
    """Return a copy of a nested dict/list structure with the values of secret_keys masked."""
    if isinstance(value, dict):
        return {key: ("***REDACTED***" if key in secret_keys else _redact_secrets(item, secret_keys))
                for key, item in value.items()}
    if isinstance(value, list):
        return [_redact_secrets(item, secret_keys) for item in value]
    return value

# both references point to the same object storage; each gets its own copy of the connection
cos_connection = {
    "endpoint_url": cos_endpoint,
    "access_key_id": cos_access_key,
    "secret_access_key": cos_secret_key
}

metadata = {
    # where the training results are written to
    client.training.ConfigurationMetaNames.TRAINING_RESULTS_REFERENCE: {
        "name": "training-results-reference_name",
        "connection": dict(cos_connection),
        "location": {
            "bucket": cos_output_bucket
        },
        "type": wml_data_source_type
    },
    # where the training input is read from
    client.training.ConfigurationMetaNames.TRAINING_DATA_REFERENCES:[{
        "name": "training_input_data",
        "type": wml_data_source_type,
        "connection": dict(cos_connection),
        "location": {
            "bucket": cos_input_bucket
        }
    }],
    client.training.ConfigurationMetaNames.PIPELINE_UID: pipeline_id
}

training_id = client.training.get_uid(client.training.run(meta_props=metadata))
# the training details echo the COS connection including the HMAC keys -
# mask them so they do not end up in the saved notebook output
print("training_id", _redact_secrets(client.training.get_details(training_id)))
print("get status", client.training.get_status(training_id))


training_id {'metadata': {'created_at': '2020-07-24T16:55:55.904Z', 'guid': 'bf25a013-c8e9-4501-a8af-bd5a3bbc22a6', 'href': '/v4/trainings/bf25a013-c8e9-4501-a8af-bd5a3bbc22a6', 'id': 'bf25a013-c8e9-4501-a8af-bd5a3bbc22a6', 'space_id': '88740b60-6b2f-4f74-b6d8-20528d14db8b'}, 'entity': {'pipeline': {'href': '/v4/pipelines/871b686e-b077-4005-a0ee-28e87f3facab', 'id': '871b686e-b077-4005-a0ee-28e87f3facab'}, 'results_reference': {'location': {'training': 'bf25a013-c8e9-4501-a8af-bd5a3bbc22a6', 'pipeline_model': 'bf25a013-c8e9-4501-a8af-bd5a3bbc22a6/pipeline-model.json', 'training_status': 'bf25a013-c8e9-4501-a8af-bd5a3bbc22a6/training-status.json', 'pipeline': 'bf25a013-c8e9-4501-a8af-bd5a3bbc22a6/pipeline.json', 'bucket': 'githubanalyzer-donotdelete-pr-b9xa3kxotzh5in', 'assets_path': 'bf25a013-c8e9-4501-a8af-bd5a3bbc22a6/assets'}, 'type': 's3', 'connection': {'access_key_id': '***REDACTED***', 'secret_access_key': '***REDACTED***', 'en

In [88]:
# fetch the details of the training run started above
run_details = client.training.get_details(training_id)
# the monitoring calls below refer to the run by this name
run_uid = training_id

# print logs

# the log monitor announces itself with a banner naming the run (see the output of this cell)
client.training.monitor_logs(run_uid)
client.training.monitor_metrics(run_uid)

# should not have run after restarting the notebook 



##########################################################################

Log monitor started for training run: bf25a013-c8e9-4501-a8af-bd5a3bbc22a6

##########################################################################


ERROR - Cannot find pipeline_model.json in the bucket bf25a013-c8e9-4501-a8af-bd5a3bbc22a6


UnboundLocalError: local variable 'pipeline_model' referenced before assignment

In [87]:
# ask the service for the current state of the training run;
# the bare `status` on the last line displays it as the cell output
status = client.training.get_status(run_uid)
status

{'failure': {'trace': '328d08cef1309a8c129566b260007dab',
  'errors': [{'code': 'dl_job_failed (C201)',
    'message': 'Learner process crashed (C201) with exit code (1), please check the job logs for more information',
    'more_info': 'http://watson-ml-api.mybluemix.net/'}]},
 'message': {'text': 'Node training: 1', 'level': 'info'},
 'running_at': '2020-07-24T16:56:26.203Z',
 'state': 'failed'}

#### loading the logs from COS

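Silly me: I forgot to import `sys` in `wml_telemanom.py`, so the script crashed on `sys.stdout.flush()` (line 61) and the run was reported as failed.

Fortunately, model training had already succeeded by then and the model had been stored in COS. Phew.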

<small>

```
...
Batch  1611
Batch  1612
After batch  1612 0.002384878075476655
[1] Training loss: 0.002384878075476655 	 Validation loss: 0.0014487287297119242 
Training complete...
Model saved in file: /mnt/results/githubanalyzer-donotdelete-pr-b9xa3kxotzh5in/training-h9VOfZVMR/model
['_submitted_code', 'learner-1', 'model', 'training-log.txt']
/mnt/results/githubanalyzer-donotdelete-pr-b9xa3kxotzh5in/training-h9VOfZVMR
Traceback (most recent call last):
  File "wml_telemanom.py", line 61, in <module>
    sys.stdout.flush()
NameError: name 'sys' is not defined
```
    
</small>
    


#### Training parameters

```
loss_metric: 'mse'    # minimize mean square error
optimizer: 'adam'     # sort of adaptive stochastic gradient descent
validation_split: 0.2 # 20% of the data is used for validating (val_loss)
dropout: 0.3          # ditch 30% of the LSTM's results when minimizing the loss function to avoid overfitting
lstm_batch_size: 64   # number of training samples evaluated per optimizer step to update the model’s parameters

patience: 10          # try at least 10 times to decrease val_loss by ...
min_delta: 0.0003     # ... at least min_delta, else stop, so we get at least 'patience' epochs
epochs: 35            # no more than 35 passes through the entire training dataset.

l_s: 250              # lookback: num previous timesteps provided to model to predict future values
n_predictions: 10     # number of steps ahead to predict
```

This is defined in `telemanom/config.yaml`
<br>