# Pytorch Log Classification

This notebook adapts the Scikit Learn Log Classification notebook for Pytorch by building a neural network.

We still use Scikit Learn for some functions, like labelling data and providing metrics.

In [1]:
import os
import glob
import shutil
import numpy as np
import pandas as pd

from tqdm import tqdm_notebook

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
def copy_data(src_file_path, dst_file_path):
    if not os.path.exists(dst_file_path):
        os.mkdir(dst_file_path)
    for logfile in glob.glob(src_file_path + "/*.log"):
        if os.stat(logfile)[6] > 10000:
            logfile_name = logfile.split('/')[-1]
            shutil.copyfile(logfile, dst_file_path + "/" + logfile_name)

In [3]:
def read_data(logfile_path):
    log_collection = pd.DataFrame()
    logs = pd.DataFrame()
    logfiles = glob.glob(logfile_path + "/*.log") # Get list of log files
    for logfile in logfiles:
        logs = pd.read_csv(logfile, sep="\n", header=None, names=['data'])
        logs['type'] = logfile.split('/')[-1]
        # Add log file data and type to log collection
        log_collection = log_collection.append(logs)

    # Remove empty lines
    log_collection = log_collection.dropna()
    # Reset the index
    log_collection = log_collection.reset_index(drop=True)
    
    return log_collection

In [4]:
def prepare_data(text, labels):
    tfidf_transformer = TfidfVectorizer()
    X = tfidf_transformer.fit_transform(text).toarray()
        
    encoder = LabelBinarizer()
    encoder.fit(labels)
    y = encoder.transform(labels)
    
    return X, y

In [5]:
def get_batch(X_train, y_train, i, batch_size):
    data = X_train[(i*batch_size):((i*batch_size)+batch_size)]
    labels = y_train[(i*batch_size):((i*batch_size)+batch_size)]
     
    return np.array(data), np.array(labels)

In [6]:
def train(X_train, y_train, num_epochs, batch_size):
    for epoch in tqdm_notebook(range(num_epochs)):
        total_batches = int(len(X_train) / batch_size)
        # Loop over all batches
        for i in tqdm_notebook(range(total_batches)):
            X_batch, y_batch = get_batch(X_train, y_train, i, batch_size)
            data = Variable(torch.FloatTensor(X_batch))
            labels = Variable(torch.LongTensor(y_batch))
            labels = torch.max(labels, 1)[1]

            optimiser.zero_grad()
            outputs = network(data)
            loss = criterion(outputs, labels)
            loss.backward()
            optimiser.step()

        print ('Epoch [%d/%d], Loss: %.4f' %(epoch+1, num_epochs, loss.data[0]))

In [7]:
def report(actual, predictions):
    print("\033[1m Performance Report \033[0m\033[50m\n")
    
    actual = np.array(actual)
    
    print(confusion_matrix(actual, predictions))
    print
    print(classification_report(actual, predictions))
    print("Accuracy: " + str(round(accuracy_score(actual, predictions),2)))
    print

In [8]:
class NN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, dropout):
        super(NN, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, num_classes),
        )

    def forward(self, input):
        return self.main(input)

In [9]:
source_data_dir = "/var/log"
data_dir = "data"

copy_data(source_data_dir, data_dir)
log_collection = read_data(data_dir)

X, y = prepare_data(log_collection['data'], log_collection['type'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Hyperparameters
input_size = X_train.shape[1] # this is the vocab size
hidden_size = 512
num_classes = y_train.shape[1] 
dropout = 0.3

num_epochs = 5
batch_size = 32
learning_rate = 0.0005

In [11]:
network = NN(input_size, hidden_size, num_classes, dropout)
criterion = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(network.parameters(), lr=learning_rate)

In [12]:
train(X_train, y_train, num_epochs, batch_size)

HBox(children=(IntProgress(value=0, max=5), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=2199), HTML(value=u'')))

Epoch [1/5], Loss: 0.0280




HBox(children=(IntProgress(value=0, max=2199), HTML(value=u'')))

Epoch [2/5], Loss: 0.0143


HBox(children=(IntProgress(value=0, max=2199), HTML(value=u'')))

Epoch [3/5], Loss: 0.0228


HBox(children=(IntProgress(value=0, max=2199), HTML(value=u'')))

Epoch [4/5], Loss: 0.0248


HBox(children=(IntProgress(value=0, max=2199), HTML(value=u'')))

Epoch [5/5], Loss: 0.0236



In [13]:
# Predict test labels
test_inputs = Variable(torch.from_numpy(X_test).float())
predicted = network.forward(test_inputs)
predicted_classes = [ np.argmax(p) for p in predicted.detach().numpy() ]

file_types = np.unique(log_collection['type'])
predicted_labels = [ file_types[p] for p in predicted_classes]
actual_labels = [ file_types[np.argmax(y)] for y in y_test]

In [14]:
# Report
report(actual_labels, predicted_labels)

[1m Performance Report [0m[50m

[[5872    0    0    0    0    0    0]
 [   0  327    0    0    0    0    0]
 [   0    0   46    0    0    0    0]
 [   0    0    0 3493   29    0    0]
 [   0    0    0  203 2712    0    0]
 [   0    0    0    0    0  926    1]
 [   0    0    0    0    0    4 3981]]

                                   precision    recall  f1-score   support

                 corecaptured.log       1.00      1.00      1.00      5872
                    fsck_apfs.log       1.00      1.00      1.00       327
                     fsck_hfs.log       1.00      1.00      1.00        46
                      install.log       0.95      0.99      0.97      3522
                       system.log       0.99      0.93      0.96      2915
wifi-11-07-2018__13:38:02.923.log       1.00      1.00      1.00       927
                         wifi.log       1.00      1.00      1.00      3985

                        micro avg       0.99      0.99      0.99     17594
                    