In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Wildcard import of the project's helper notebook.
# NOTE(review): benchmark_snorkel, used further down, is not defined in this
# notebook, so it presumably comes from this import -- confirm.
from ipynb.fs.full.Cert_Aux_Functions2 import *

# pandas
import pandas as pd

# datetime and timedelta, to check for time gaps in the prepared datasets
from datetime import datetime, timedelta

# os
import os

## scikit-learn
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, FunctionTransformer
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

# One Class SVM
# https://scikit-learn.org/stable/auto_examples/svm/plot_oneclass.html
from sklearn import svm
from sklearn import linear_model

# For the figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d

# NumPy
import numpy as np

# Hyper-parameter search
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

# Silences every warning for the rest of the session (this also hides
# pandas/scikit-learn warnings that may point at real problems).
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Logon + HTTP + USB + Device dataset, ordered chronologically with a fresh
# 0..n-1 index (so index labels coincide with row positions below).
# NOTE(security): read_pickle can execute arbitrary code while loading; only
# open pickle files that come from a trusted source.
df_lhud_1hora = (
    pd.read_pickle("df_lhud_1hora_file.pkl")
    .sort_values('date', ascending=True)
    .reset_index(drop=True)
)

## Training window: rows dated up to 2010-05-02
df_lhud_1hora4m = df_lhud_1hora[df_lhud_1hora['date'] <= '2010-05-02']

# Last row label of the training window / of the window ending 2010-09-02.
train_index = df_lhud_1hora4m.index[-1]
test_index = df_lhud_1hora[df_lhud_1hora['date'] <= '2010-09-02'].index[-1]

## Test set
# NOTE(review): the slice starts AT train_index (the last training row) and
# stops BEFORE test_index, so it shares one row with the training window and
# leaves out the last row dated <= 2010-09-02. It is kept unchanged because
# the alignment expected by labels_g_pd / benchmark_snorkel is not visible
# here -- confirm before switching to iloc[train_index + 1:test_index + 1].
# .copy() makes this an independent frame, so adding columns to it later
# never acts on a slice of df_lhud_1hora.
df_lhud_1hora_test = df_lhud_1hora.iloc[train_index:test_index].copy()

### Transformations

# Numeric feature names for this dataset
lhud_numeric_features = ['logon', 'logoff', 'down', 'up', 'vis',
                         'conn', 'disc', 'trm', 'frm', 'open',
                         'write', 'copy', 'delete']

# Numeric features are min-max scaled; the scaler is fit on the training
# window only (see fit_transform below).
numeric_transformer = MinMaxScaler()

# Fixed category lists so the one-hot layout does not depend on which values
# happen to occur in a given split.
hour_categories = np.arange(0, 24)
dow_categories = np.arange(0, 7)
user_categories = df_lhud_1hora4m['user'].unique()

# Categorical features shared by all datasets
categorical_features = ['hour', 'dow']
categorical_transformer = OneHotEncoder(
    categories=[hour_categories, dow_categories]
)

# Users are one-hot encoded against the users seen in the training window.
# handle_unknown='ignore' encodes a user that first appears after the
# training window as an all-zero vector instead of raising in transform();
# output is unchanged whenever every user is already known.
user_feature = ['user']
user_transformer = OneHotEncoder(
    categories=[user_categories],
    handle_unknown='ignore',
)

lhud_preprocessor = ColumnTransformer(
    transformers=[
        ('lhud_num', numeric_transformer, lhud_numeric_features),
        ('lhud_cat', categorical_transformer, categorical_features),
        ('lhud_user', user_transformer, user_feature),
    ])

# Fit on the training window, then apply the fitted transform to the test
# slice and to the full dataset.
columns = lhud_numeric_features + categorical_features + user_feature
trans_lhud_4m = lhud_preprocessor.fit_transform(df_lhud_1hora4m[columns])
trans_lhud_test = lhud_preprocessor.transform(df_lhud_1hora_test[columns])
trans_lhud = lhud_preprocessor.transform(df_lhud_1hora[columns])

In [None]:
### Load the labels predicted by Snorkel
labels_g_pd = pd.read_hdf("labels_g_pd.hdf", key='df')

# Recode column 0 into the 'anom' column: value 1 becomes -1, anything else 1.
labels_g_pd['anom'] = np.where(labels_g_pd[0].eq(1), -1, 1)

# Display the shapes of the anom == -1 rows and of the anom == 1 rows.
tuple(labels_g_pd.loc[labels_g_pd['anom'].eq(flag)].shape for flag in (-1, 1))

In [None]:
def objective(space):
    """Hyperopt objective: fit an SGDOneClassSVM and score it on the test slice.

    The model is fit on ``trans_lhud_4m``, applied to ``trans_lhud_test``, and
    its predictions are evaluated with ``benchmark_snorkel`` against
    ``labels_g_pd``.

    Parameters
    ----------
    space : dict
        Sampled hyper-parameters with the keys ``'nu_s'``, ``'fiti_s'``,
        ``'maxi_s'``, ``'to_s'`` and ``'lr_s'``; they are passed to the
        estimator as ``nu``, ``fit_intercept``, ``max_iter``, ``tol`` and
        ``learning_rate`` respectively.

    Returns
    -------
    dict
        ``'loss'`` is the negated recall (so minimising it maximises recall),
        ``'status'`` is ``STATUS_OK``; the sampled ``space``, the fitted
        ``'model'`` and the remaining metrics are included as extra entries.

    Side effects
    ------------
    Writes one row into the module-level ``scores_df`` at label ``index``,
    increments the module-level ``index`` counter and prints ``space`` with
    the recall. The shared ``df_lhud_1hora_test`` frame is NOT modified.

    Notes
    -----
    Depends on the notebook-level names ``trans_lhud_4m``, ``trans_lhud_test``,
    ``df_lhud_1hora_test``, ``labels_g_pd``, ``scores_df``, ``index`` and
    ``benchmark_snorkel``.
    NOTE(review): no ``random_state`` is passed to the estimator, so repeated
    runs with the same ``space`` may give different scores -- consider
    seeding it.
    """
    global index

    clf_SGDOC = linear_model.SGDOneClassSVM(
        nu=space['nu_s'],
        fit_intercept=space['fiti_s'],
        max_iter=space['maxi_s'],
        tol=space['to_s'],
        learning_rate=space['lr_s'],
    )
    clf_SGDOC.fit(trans_lhud_4m)

    pred1 = clf_SGDOC.predict(trans_lhud_test)
    dec1 = clf_SGDOC.decision_function(trans_lhud_test)

    # Build a NEW frame carrying the scores and predictions. Assigning the
    # columns on df_lhud_1hora_test itself would mutate the shared test frame
    # on every trial.
    anomalyScoresSGDOC = df_lhud_1hora_test.assign(scores=dec1, anom=pred1)

    # Unpacked in the order of the scores_df metric columns: accuracy,
    # precision, recall, F1, then cm (presumably a 2x2 confusion matrix, since
    # it is flattened to 4 values below -- confirm) and ROC-AUC.
    a, p, r, f, cm, auc_sc = benchmark_snorkel(labels_g_pd, anomalyScoresSGDOC)

    # One results row per trial; dtype=object keeps the dict and the flattened
    # matrix as single cells. The first cell is the trial counter.
    scores_df.loc[index, :] = np.array(
        [index, space, a, p, r, f, auc_sc, np.reshape(cm, (4))],
        dtype=object,
    )
    index = index + 1

    print(space, r)

    return {'loss': -r, 'status': STATUS_OK, 'space': space,
            'model': clf_SGDOC, 'f1_score': f, 'auc_sc': auc_sc,
            'precision': p, 'recall': r, 'c_matrix': cm}

# Search space handed to hyperopt. Every entry is an hp.choice; nu, the
# intercept flag and the learning-rate schedule have a single option each, so
# only max_iter ('maxi_s') and tol ('to_s') actually vary between trials.
space = dict(
    nu_s=hp.choice('nu_s', [0.01]),
    fiti_s=hp.choice('fiti_s', [True]),
    maxi_s=hp.choice('maxi_s', np.arange(10000, 100000, 10000)),
    to_s=hp.choice('to_s', [1e-3, 5e-4, 1e-4, 5e-5, 1e-5]),
    lr_s=hp.choice('lr_s', ["optimal"]),
)

In [None]:
# Fresh results table and row counter; objective() fills one row per trial
# (the "Model" column receives the trial counter).
index = 0
scores_df = pd.DataFrame(
    columns=[
        "Model", "Params", "Accuracy", "Precision",
        "Recall", "F1-Score", "ROC-AUC", "CM",
    ]
)

# Run 20 TPE-guided evaluations of objective() over the search space.
trials = Trials()
best = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials,
)

In [None]:
## Best model first: rank the trials by Recall, then by ROC-AUC (both descending)
scores_df.sort_values(by=['Recall', 'ROC-AUC'], ascending=False)