# Executando um fluxo com dataset sintético
> Para demonstração, foi utlizado apenas um dataset sintético com números pequenos de dados, pois o jupter notebook irá rodar apenas num núcleo do processador, aumentando consideravelmente o tempo do experimento

## Importando as bibliotecas usadas

In [51]:
import numpy as np
from pyod.models.abod import ABOD
from pyod.models.lof import LOF

## Criando uma classe para obter o dataset de acordo com o experimento

In [52]:
import pandas as pd
import numpy as np


class DataGenerator:
    def __init__(self, dataset=1):
        self.dataset = dataset

    def generate(self):
        if self.dataset == 1:
            return self.florest_Cover()
        elif self.dataset == 2:
            return self.gauss()
        elif self.dataset == 3:
            return self.tao()
        else:
            raise AttributeError("Choose a valid dataset!")

    def florest_Cover(self):
        path = "dataset/covtypeNorm.csv"
        data = pd.read_csv(path, header=None).iloc[:5000, :]
        cl2 = data.loc[data[54] == 2]
        cl4 = data.loc[data[54] == 4]
        cl2.drop(cl2.columns[54], axis=1, inplace=True)
        cl4.drop(cl4.columns[54], axis=1, inplace=True)
        inliers = cl2
        outliers = cl4
        return inliers.values, outliers.values

    def tao(self):
        number_outlier = 100
        path = "dataset/tao.csv"
        data = pd.read_csv(path, header=None).iloc[:3000, :]
        outliers = np.random.uniform(low=80, high=160, size=(number_outlier, 4))
        return data.values, outliers

    def gauss(self):
        number_inlier = 10000
        number_outlier = 10
        offset = 10

        np.random.seed(42)
        data_1 = 10 * np.random.randn(number_inlier // 2, 2) - offset
        data_2 = 10 * np.random.randn(number_inlier // 2, 2) + offset
        inliers = np.r_[data_1, data_2]
        outliers = np.random.uniform(low=-60, high=60, size=(number_outlier, 2))
        return inliers, outliers


## Criando arquitetura para simular uma stream de dados

In [53]:
import time
import numpy as np


class Streamer:

    def on_start(self, data):
        raise NotImplementedError("Please Implement this method")

    def on_receive(self, data):
        raise NotImplementedError("Please Implement this method")

    def on_finish(self):
        raise NotImplementedError("Please Implement this method")

    def __init__(self, data):
        self.data = data
        self.stats = {}

    def run(self, data_stream, window_size=.1, slide=.05):
        start = time.time()
        self.on_start(self.data)
        self.data_stream = data_stream
        absolute_size = int(len(data_stream) * window_size)
        absolute_slide = int(absolute_size * slide)
        initial_index = 0
        final_index = absolute_size

        # Se o tamanho absoluto for próximo de zero, gera um loop infinito
        if absolute_size == 0:
            raise AttributeError("Window size is too small")

        if absolute_slide == 0:
            raise AttributeError("Slide is too small")

        while final_index <= len(data_stream):
            if len(data_stream) <= final_index + absolute_slide:
                final_index = len(data_stream)
            window = data_stream[initial_index:final_index]
            self.on_receive(window, initial_index, final_index)
            initial_index += absolute_slide
            final_index += absolute_slide
        end = time.time()
        self.on_finish()
        self.stats["time"] = end - start


class OutlierStream(Streamer):

    def __init__(self, inliers, outliers):
        self.ground_truth = []
        [self.ground_truth.append(1) for outlier in outliers]
        [self.ground_truth.append(0) for inline in inliers]
        data_total = np.concatenate((outliers, inliers), axis=0)
        Streamer.__init__(self, data_total)
        self.predictions = np.zeros(len(data_total))

    def on_receive(self, data_window, initial_index, final_index):
        y_pred = self.predict_model(data_window)
        self.predictions[initial_index:final_index] += y_pred
        self.update_model(data_window)

    def on_finish(self):
        self.summary(self.ground_truth, self.predictions)

    def on_start(self, data):
        self.train_model(data)

    def train_model(self, data):
        raise NotImplementedError("Please Implement this method")

    def update_model(self, data):
        raise NotImplementedError("Please Implement this method")

    def predict_model(self, data):
        raise NotImplementedError("Please Implement this method")

    def summary(self, predictions):
        raise NotImplementedError("Please Implement this method")

## Criando o modelo base para experimentos

In [61]:
from streamer import OutlierStream
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from scipy import stats


class TemplateOutlier(OutlierStream):

    def __init__(self, inliers, outliers, model):
        data_total = np.concatenate((inliers, outliers), axis=0)
        self.data_total = data_total
        self.outliers = outliers
        self.inliers = inliers

        OutlierStream.__init__(self, inliers, outliers)
        self.model = model

    def train_model(self, data):
        self.model.fit(data)
        scores_pred = self.model.decision_function(data) * -1
        self.threshold = stats.scoreatpercentile(scores_pred, 100 * 0.006)

    def update_model(self, data):
        return None

    def predict_model(self, data):
        return self.model.predict(data)

    def summary(self, ground_truth, predictions, is_plot=False):

        predictions = list(map(lambda x: 1 if x > 0 else 0, predictions))

        acc = accuracy_score(predictions, ground_truth)
        precision = precision_score(predictions, ground_truth)
        recall = recall_score(predictions, ground_truth)
        f1 = f1_score(predictions, ground_truth)

        self.stats["f1"] = f1
        self.stats["recall"] = recall
        self.stats["precision"] = precision
        self.stats["accuracy"] = acc

        print(confusion_matrix(predictions, ground_truth))

        if is_plot:
            self._plot()

    def _plot(self):

        xx, yy = np.meshgrid(np.linspace(-70, 70, 100), np.linspace(-70, 70, 100))
        Z = self.model.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
        Z = Z.reshape(xx.shape)

        subplot = plt.subplot(1, 1, 1)
        subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), self.threshold, 7), cmap=plt.cm.Blues_r)
        subplot.contourf(xx, yy, Z, levels=[self.threshold, Z.max()], colors='orange')

        a = subplot.contour(xx, yy, Z, levels=[self.threshold], linewidths=2, colors='red')
        subplot.contourf(xx, yy, Z, levels=[self.threshold, Z.max()], colors='orange')
        b = subplot.scatter(self.outliers[:, 0], self.outliers[:, 1], c='red', s=12, edgecolor='k')
        c = subplot.scatter(self.inliers[:, 0], self.inliers[:, 1], c='white', s=12, edgecolor='k')
        subplot.axis('tight')
        subplot.legend(
            [a.collections[0], b, c],
            ['Borda da funcao', 'Outliers', 'Inliers'],
            loc='lower right')
        subplot.set_xlim((-70, 70))
        subplot.set_ylim((-70, 70))
        plt.suptitle("Angular based outlier detection")
        plt.show()


## Criando classe para modelo ABOD e LOF

In [62]:
from streamer_outlier import TemplateOutlier
from pyod.models.abod import ABOD
class AngularBasedOutlier(TemplateOutlier):

    def __init__(self, inliers, outliers):
        self.model = ABOD(n_neighbors=20, contamination=0.001)
        TemplateOutlier.__init__(self, inliers, outliers, self.model)

In [63]:
from streamer_outlier import TemplateOutlier
from pyod.models.lof import LOF

class LOFOutlier(TemplateOutlier):

    def __init__(self, inliers, outliers):
        self.model = LOF(n_neighbors=20, contamination=0.005)
        TemplateOutlier.__init__(self, inliers, outliers, self.model)

## Carregando dataset

In [64]:
data_gen = DataGenerator(2)
inliers, outliers = data_gen.generate()

## Rodando o experimento

In [65]:
model = LOFOutlier(inliers, outliers)

In [66]:
model.run(np.r_[outliers, inliers], window_size=.4, slide=.3)

[[9962    4]
 [  38    6]]
Acuracia: 0.995804195804
Precision: 0.6
Recall: 0.136363636364
F1: 0.222222222222


## Exibindo resultados

In [60]:
model.stats

{'accuracy': 0.9958041958041958,
 'f1': 0.22222222222222218,
 'precision': 0.6,
 'recall': 0.13636363636363635,
 'time': 0.1896820068359375}