In [1]:
import os
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt



from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, cross_val_predict, train_test_split
from sklearn.metrics import (roc_auc_score, confusion_matrix, precision_score,
                             recall_score, roc_curve, precision_recall_curve,auc)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# from utils import plot_confusion_matrix

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [None]:
# Directory holding the per-file feature CSVs.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
dir_path = '/Users/alisa95/Desktop/FEATURES-2014-2015/'
# os.listdir returns entries in arbitrary order, so sort FIRST and only then take
# every 4th name; slicing the unsorted listing selects a subset that is not
# reproducible across machines/filesystems.
names_list = sorted(os.listdir(dir_path))[::4]
# Read the selected CSVs (parsing 'timestamp' as datetimes) and stack them into
# one frame with a fresh 0..n-1 index.
sequences = pd.concat(
    [pd.read_csv(os.path.join(dir_path, name), parse_dates=['timestamp'])
     for name in names_list],
    ignore_index=True,
)

In [None]:
# Number of CSV file names selected for loading.
len(names_list)

In [None]:
# Sorting: order rows by host, then process, then timestamp (all ascending).
sequences = sequences.sort_values(by=['host', 'process', 'timestamp'])

In [None]:
# Pull out the target column, then remove it together with the identifier and
# time columns so that only feature columns remain in `sequences`.
non_feature_columns = ['isAnomaly', 'host', 'process', 'timestamp']
labels = sequences.loc[:, 'isAnomaly']
sequences = sequences.drop(columns=non_feature_columns)

In [16]:
# Toy data: a 5x3 feature frame holding 1..15 in row-major order (columns a, b, c)
# and one 0/1 label per row.
# NOTE(review): this rebinds `sequences` and `labels`, replacing whatever earlier
# cells built — looks like debugging scaffolding; confirm before a real run.
toy_values = np.arange(1, 16).reshape(5, 3)
sequences = pd.DataFrame(toy_values, columns=list('abc'))
labels = pd.Series(np.array([0, 0, 1, 1, 0]))

In [17]:
# Window length: each sample is built from the current row plus the
# (window_size - 1) rows before it.
window_size = 1
# Rule that turns the per-row labels inside a window into one window label:
#   0 - anomalous if at least one row is anomalous
#   1 - anomalous only if every row is anomalous
#   2 - anomalous if more than half of the rows are anomalous
window_method = 1

if window_size < 1:
    raise ValueError(f'window_size must be >= 1, got {window_size}')
if window_method not in (0, 1, 2):
    raise ValueError(f'window_method must be 0, 1 or 2, got {window_method}')

# Collect every lag first and concatenate once instead of growing a frame
# inside the loop.
# NOTE(review): shift() moves rows by position, so a window can span rows that
# belong to different hosts/processes if the data holds several — confirm this
# is acceptable.
lagged_features = []
lagged_labels = {}
for i in range(window_size):
    shifted = sequences.shift(periods=i)
    # Lag 0 keeps the original column names; older lags get a suffix so that
    # column names stay unique after concatenation.
    lagged_features.append(shifted if i == 0 else shifted.add_suffix(f'_lag{i}'))
    lagged_labels[f'shifted_{i}'] = labels.shift(periods=i)
window_sequences = pd.concat(lagged_features, axis=1)
window_labels = pd.DataFrame(lagged_labels)

# Keep only rows that are complete in BOTH frames. Dropping NaNs from each frame
# separately could remove different rows and misalign features and labels.
complete_rows = window_sequences.notna().all(axis=1) & window_labels.notna().all(axis=1)
sequences = window_sequences.loc[complete_rows]
window_labels = window_labels.loc[complete_rows]

# Count anomalous rows per window (assumes row labels are 0/1 — TODO confirm).
num_columns = window_labels.shape[1]
anomalous_per_window = window_labels.sum(axis=1)
if window_method == 0:
    is_anomalous = anomalous_per_window != 0
elif window_method == 1:
    is_anomalous = anomalous_per_window == num_columns
else:
    is_anomalous = anomalous_per_window > num_columns / 2

# One 0/1 label per window. The aggregated result is now actually bound to
# `labels` (previously `labels` stayed a frame of shifted columns). The
# `sum_lables` name is kept as spelled in case other cells refer to it.
sum_lables = is_anomalous.astype(int)
labels = sum_lables

In [None]:
%%time
forest = RandomForestClassifier(n_estimators=60, n_jobs=-1, random_state=1488)
forest_labels = cross_val_predict(forest, sequences, labels, cv=4,
                                  method='predict_proba')[:, 1]

In [None]:
# Precision and recall when scores above 0.02 count as positive predictions.
flagged = forest_labels > 0.02
forest_precision = precision_score(labels, flagged)
forest_recall = recall_score(labels, flagged)
print("precision_score: ", forest_precision, "\nrecall_score: ", forest_recall)
# Area under the ROC curve, computed from the raw scores.
forest_roc_auc = roc_auc_score(labels, forest_labels)
print("roc_auc_score: ", forest_roc_auc)
# ROC curve: false-positive rate on x, true-positive rate on y.
fpr, tpr, _ = roc_curve(labels, forest_labels)
plt.plot(fpr, tpr)
plt.xlabel('FPR')
plt.ylabel('TPR')

In [None]:
# Precision-recall curve of the forest scores (recall on x, precision on y)
# and the area under that curve.
pr, rc, _ = precision_recall_curve(labels, forest_labels)
prc_auc = auc(rc, pr)
plt.plot(rc, pr)
plt.xlabel('rc')
plt.ylabel('pr')
print("prc_auc_score: ", prc_auc)

In [None]:
# Sweep decision thresholds over the forest scores and print, for each one, the
# raw confusion matrix, its row-normalised form, and precision/recall.
for i in np.arange(0.00, 0.2, 0.0001):
    print('threshold:', i)
    # A row is predicted positive when its score EXCEEDS the threshold — the same
    # convention as `forest_labels > 0.02` used elsewhere in this notebook. The
    # previous `<` comparison flagged the low-score rows instead.
    predicted = forest_labels > i
    x = confusion_matrix(labels, predicted).astype(float)
    print(x)
    # Normalise each row (true class) so that it sums to 1.
    x /= x.sum(axis=1)[:, None]
    print(x)
    # float() works whether the metric returns a Python float or a NumPy scalar.
    print("pr: ", float(precision_score(labels, predicted)), " ; rc: ", float(recall_score(labels, predicted)))
    print('\n')

In [None]:
%%time
scaler = StandardScaler(copy=False)
logreg = LogisticRegression(solver='lbfgs', n_jobs=1)
logreg_pipe = Pipeline([('scaler', scaler), ('logreg', logreg)])
logreg_labels = cross_val_predict(logreg_pipe, sequences, labels,
                                  cv=4, method='predict_proba')[:, 1]

In [None]:
%%time
boosting = LGBMClassifier(max_depth=7, n_estimators=80, n_jobs=-1)
boosting_labels = cross_val_predict(boosting, sequences, labels, cv=4,
                                    method='predict_proba')[:, 1]

In [None]:
# forest_labels holds class probabilities, but confusion_matrix needs discrete
# predictions; binarise at 0.5 first (adjust the cut-off if another operating
# point is wanted).
confusion_matrix(labels, (forest_labels > 0.5).astype(int))

In [None]:
# precision_score / recall_score need discrete predictions, so binarise the
# forest probabilities at 0.5 first (adjust the cut-off if needed).
forest_hard = (forest_labels > 0.5).astype(int)
precision_score(labels, forest_hard), recall_score(labels, forest_hard)

In [None]:
# boosting_labels holds class probabilities, but confusion_matrix needs discrete
# predictions; binarise at 0.5 first (adjust the cut-off if needed).
confusion_matrix(labels, (boosting_labels > 0.5).astype(int))

In [None]:
# precision_score / recall_score need discrete predictions, so binarise the
# boosting probabilities at 0.5 first (adjust the cut-off if needed).
boosting_hard = (boosting_labels > 0.5).astype(int)
precision_score(labels, boosting_hard), recall_score(labels, boosting_hard)

In [None]:
# logreg_labels holds class probabilities, but confusion_matrix needs discrete
# predictions; binarise at 0.5 first (adjust the cut-off if needed).
confusion_matrix(labels, (logreg_labels > 0.5).astype(int))

In [None]:
# precision_score / recall_score need discrete predictions, so binarise the
# logistic-regression probabilities at 0.5 first (adjust the cut-off if needed).
logreg_hard = (logreg_labels > 0.5).astype(int)
precision_score(labels, logreg_hard), recall_score(labels, logreg_hard)

In [None]:
# Normalised confusion-matrix figure for the random forest, saved as a PNG.
class_names = np.array(["Healthy", "Anomaly"])
np.set_printoptions(precision=2)
# NOTE(review): plot_confusion_matrix is not defined in this notebook — its
# import from `utils` is commented out in the import cell; restore it before
# running this cell.
# Binarise the probabilities at 0.5 before casting: astype(int) on raw
# probabilities truncates every value below 1.0 to 0.
plot_confusion_matrix(labels.astype(int), (forest_labels > 0.5).astype(int),
    classes=class_names, normalize=True, title="Forest Confusion Matrix")
plt.tight_layout()
# savefig does not create missing directories.
os.makedirs("images", exist_ok=True)
plt.savefig("images/forest_confusion_matrix.png", dpi=140)

In [None]:
# Normalised confusion-matrix figure for the boosting model, saved as a PNG.
class_names = np.array(["Healthy", "Anomaly"])
np.set_printoptions(precision=2)
# NOTE(review): plot_confusion_matrix is not defined in this notebook — its
# import from `utils` is commented out in the import cell; restore it before
# running this cell.
# Binarise the probabilities at 0.5 before casting: astype(int) on raw
# probabilities truncates every value below 1.0 to 0.
plot_confusion_matrix(labels.astype(int), (boosting_labels > 0.5).astype(int),
    classes=class_names, normalize=True, title="Boosting Confusion Matrix")
plt.tight_layout()
# savefig does not create missing directories.
os.makedirs("images", exist_ok=True)
plt.savefig("images/boosting_confusion_matrix.png", dpi=140)

In [None]:
# Normalised confusion-matrix figure for logistic regression, saved as a PNG.
class_names = np.array(["Healthy", "Anomaly"])
np.set_printoptions(precision=2)
# NOTE(review): plot_confusion_matrix is not defined in this notebook — its
# import from `utils` is commented out in the import cell; restore it before
# running this cell.
# Binarise the probabilities at 0.5 before casting: astype(int) on raw
# probabilities truncates every value below 1.0 to 0.
plot_confusion_matrix(labels.astype(int), (logreg_labels > 0.5).astype(int),
    classes=class_names, normalize=True, title="Logreg Confusion Matrix")
plt.tight_layout()
# savefig does not create missing directories.
os.makedirs("images", exist_ok=True)
plt.savefig("images/logreg_confusion_matrix.png", dpi=140)