# Keras Log Classification

This notebook adapts the scikit-learn log classification notebook to Keras, replacing the classifier with a neural network running on the TensorFlow backend.

We still use scikit-learn for supporting tasks: binarising the labels, splitting the data into training and test sets, and computing the evaluation metrics.

In [1]:
import os
import glob
import shutil
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Activation, Dense, Dropout
from keras.optimizers import Adam

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

Using TensorFlow backend.


In [2]:
def copy_data(src_file_path, dst_file_path, min_size=10000):
    """Copy sufficiently large ``*.log`` files from one directory to another.

    Args:
        src_file_path: Directory searched (non-recursively) for ``*.log`` files.
        dst_file_path: Directory that receives the copies. It is created,
            together with any missing parent directories, if it does not exist.
        min_size: Size threshold in bytes. Only files strictly larger than
            this value are copied. Defaults to 10000.
    """
    # makedirs (rather than mkdir) so a nested destination such as
    # "data/raw" works even when "data" does not exist yet.
    if not os.path.isdir(dst_file_path):
        os.makedirs(dst_file_path)

    for logfile in glob.glob(os.path.join(src_file_path, "*.log")):
        if os.path.getsize(logfile) > min_size:
            # basename/join instead of split('/') and string concatenation so
            # the file name is extracted correctly on every platform.
            target = os.path.join(dst_file_path, os.path.basename(logfile))
            shutil.copyfile(logfile, target)

In [3]:
def read_data(logfile_path):
    """Load every ``*.log`` file in a directory into one labelled DataFrame.

    Each non-blank line of each file becomes one row. Files are processed in
    sorted name order so the row order is reproducible between runs.

    Args:
        logfile_path: Directory searched (non-recursively) for ``*.log`` files.

    Returns:
        pandas.DataFrame with a 0..n-1 index and two columns: ``data`` (the
        line text without its line terminator) and ``type`` (the base name of
        the file the line came from). If no lines are found the frame is
        empty but still has both columns.
    """
    # Function-scope import: ``io`` is not among the notebook's top-level
    # imports. io.open accepts encoding/errors on both Python 2 and 3.
    import io

    lines = []
    types = []
    for logfile in sorted(glob.glob(os.path.join(logfile_path, "*.log"))):
        log_type = os.path.basename(logfile)
        # Lines are read directly rather than through pd.read_csv(sep="\n"):
        # recent pandas releases reject a newline separator, and CSV parsing
        # would also apply quoting and type inference to free-form log text.
        # Undecodable bytes are replaced instead of aborting the whole load.
        with io.open(logfile, encoding="utf-8", errors="replace") as handle:
            for raw_line in handle:
                line = raw_line.rstrip("\r\n")
                if line.strip():  # skip empty / whitespace-only lines
                    lines.append(line)
                    types.append(log_type)

    # Build the frame once; growing a DataFrame with .append() inside the
    # loop is quadratic, and DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame({"data": lines, "type": types}, columns=["data", "type"])

In [4]:
def prepare_data(text, labels):
    """Turn raw log lines and their labels into numeric training arrays.

    Args:
        text: Collection of strings; used both to fit the Keras ``Tokenizer``
            and to build the feature matrix.
        labels: Collection of class labels, one per entry in ``text``.

    Returns:
        Tuple ``(X, y)`` where ``X`` is the matrix returned by
        ``Tokenizer.texts_to_matrix(..., mode='tfidf')`` and ``y`` is the
        output of a ``LabelBinarizer`` fitted on ``labels``.
    """
    # Features: fit the vocabulary, then weight each line's tokens with TF-IDF.
    vectoriser = Tokenizer()
    vectoriser.fit_on_texts(text)
    features = vectoriser.texts_to_matrix(text, mode='tfidf')

    # Targets: fit and transform the labels in a single step.
    targets = LabelBinarizer().fit_transform(labels)

    return features, targets

In [5]:
def build_nn(input_size, hidden_size, num_classes, dropout):
    """Assemble the feed-forward classifier and print its summary.

    The stack is Dense -> ReLU -> Dropout -> Dense -> softmax.

    Args:
        input_size: Length of each input vector.
        hidden_size: Number of units in the first Dense layer.
        num_classes: Number of units in the final Dense layer.
        dropout: Rate handed to the ``Dropout`` layer.

    Returns:
        The (uncompiled) ``Sequential`` model.
    """
    layer_stack = [
        Dense(hidden_size, input_shape=(input_size,)),
        Activation('relu'),
        Dropout(dropout),
        Dense(num_classes),
        Activation('softmax'),
    ]
    classifier = Sequential(layer_stack)
    classifier.summary()

    return classifier

In [6]:
def train(X_train, y_train, criterion, optimiser, batch_size, num_epochs,
          model=None, validation_split=0.1):
    """Compile and fit a Keras model, then return it.

    Args:
        X_train: Training features passed to ``fit``.
        y_train: Training targets passed to ``fit``.
        criterion: Loss passed to ``compile``.
        optimiser: Optimiser passed to ``compile``.
        batch_size: Batch size passed to ``fit``.
        num_epochs: Number of epochs passed to ``fit``.
        model: Model to train. When omitted, the notebook-level ``network``
            variable is used, which is how this function originally behaved.
        validation_split: Fraction of the training data that ``fit`` holds
            out for validation. Defaults to 0.1.

    Returns:
        The same model object that was trained.
    """
    if model is None:
        # Backward-compatible fallback for existing calls that rely on the
        # global; pass ``model=...`` explicitly to avoid the hidden state.
        model = network

    model.compile(loss=criterion,
                  optimizer=optimiser,
                  metrics=['accuracy'])

    model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=num_epochs,
              verbose=1,
              validation_split=validation_split)

    return model

In [7]:
def report(actual, predictions):
    """Print a confusion matrix, a per-class report and the overall accuracy.

    Args:
        actual: True labels; converted to a NumPy array before scoring.
        predictions: Predicted labels, in the same order as ``actual``.
    """
    print("\033[1m Performance Report \033[0m\033[50m\n")

    actual = np.array(actual)

    print(confusion_matrix(actual, predictions))
    # print("") rather than a bare `print`: on Python 3 a bare `print` only
    # references the function and emits nothing, so the blank separator line
    # was silently lost. print("") emits a blank line on Python 2 and 3 alike.
    print("")
    print(classification_report(actual, predictions))
    print("Accuracy: " + str(round(accuracy_score(actual, predictions),2)))
    print("")

In [8]:
# Where the raw logs are read from, and the local working directory that
# receives the copies used for training.
source_data_dir = "/var/log"
data_dir = "data"

# Copy the qualifying log files locally, then load them into one DataFrame
# with a 'data' column (line text) and a 'type' column (source file name).
copy_data(source_data_dir, data_dir)
log_collection = read_data(data_dir)

# Vectorise the lines and binarise the labels, then hold out 20% of the rows
# for testing. The fixed random_state keeps the split reproducible.
X, y = prepare_data(log_collection['data'], log_collection['type'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Hyperparameters
# Network shape: input and output widths are taken from the prepared arrays.
input_size = X_train.shape[1] # number of feature columns, i.e. the vocabulary size
hidden_size = 512
num_classes = y_train.shape[1]
dropout = 0.3

# Optimisation settings used by the optimiser and training cells.
num_epochs = 5
batch_size = 32
learning_rate = 0.0005

In [10]:
# Build the classifier (build_nn also prints the layer summary shown below)
# and choose the loss and optimiser that train() passes to compile().
network = build_nn(input_size, hidden_size, num_classes, dropout)
criterion = 'categorical_crossentropy'
# NOTE(review): `lr` is the keyword this notebook was written against; newer
# Keras releases name it `learning_rate` - confirm against the installed version.
optimiser = Adam(lr=learning_rate)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               5273600   
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 3591      
_________________________________________________________________
activation_2 (Activation)    (None, 7)                 0         
Total params: 5,277,191
Trainable params: 5,277,191
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Training
# The model is not passed as an argument: train() operates on the
# notebook-level `network` built in the previous cell and returns it.
model = train(X_train, y_train, criterion, optimiser, batch_size, num_epochs)

Train on 63326 samples, validate on 7037 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
# Prediction
# np.unique returns the distinct labels in sorted order; this cell assumes
# that is also the column order of the binarised targets from prepare_data,
# so an argmax index maps straight back to a file type.
file_types = np.unique(log_collection['type'])
predictions = model.predict(np.array(X_test))
# Loop variables are deliberately not named `y`: reusing that name shadows the
# notebook-level label matrix, and on Python 2 (where list-comprehension
# variables leak into the enclosing scope) would overwrite it.
predicted_labels = [file_types[np.argmax(class_probs)] for class_probs in predictions]
actual_labels = [file_types[np.argmax(target_row)] for target_row in y_test]

In [13]:
# Reporting
# Print the confusion matrix, per-class metrics and overall accuracy for the
# held-out test rows.
report(actual_labels, predicted_labels)

[1m Performance Report [0m[50m

[[5854    0    0    0    0    0    0]
 [   0  302    0    0    0    0    0]
 [   0    0   46    0    0    0    0]
 [   0    0    0 3588   19    0    0]
 [   0    0    0  207 2700    0    0]
 [   0    0    0    0    0  926    1]
 [   0    0    0    0    0    1 3947]]

                                   precision    recall  f1-score   support

                 corecaptured.log       1.00      1.00      1.00      5854
                    fsck_apfs.log       1.00      1.00      1.00       302
                     fsck_hfs.log       1.00      1.00      1.00        46
                      install.log       0.95      0.99      0.97      3607
                       system.log       0.99      0.93      0.96      2907
wifi-11-07-2018__13:38:02.923.log       1.00      1.00      1.00       927
                         wifi.log       1.00      1.00      1.00      3948

                        micro avg       0.99      0.99      0.99     17591
                    