# Scikit Learn Log Classification

This is an updated version of an earlier [log classification script](https://github.com/opencredo/log-classifier). The purpose of that version was to show that machine learning could be performed using mostly native Python functions together with Scikit Learn. The purpose in the present case is to show that the same can be accomplished more simply when the right tools are used for the job. The metrics section has been expanded as well, to give more insight into the successes and failures of the model.

The main additions are:
    1. Collect logs from within the script
    2. Use Pandas and Numpy for data management
    3. Add more metrics

In [1]:
import os
import glob
import shutil
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn import svm, naive_bayes, linear_model, tree, ensemble, neighbors, semi_supervised, neural_network, discriminant_analysis
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
def copy_data(src_file_path, dst_file_path):
    if not os.path.exists(dst_file_path):
        os.mkdir(dst_file_path)
    for logfile in glob.glob(src_file_path + "/*.log"):
        if os.stat(logfile)[6] > 10000:
            logfile_name = logfile.split('/')[-1]
            shutil.copyfile(logfile, dst_file_path + "/" + logfile_name)

In [3]:
def read_data(logfile_path):
    log_collection = pd.DataFrame()
    logs = pd.DataFrame()
    logfiles = glob.glob(logfile_path + "/*.log") # Get list of log files
    for logfile in logfiles:
        logs = pd.read_csv(logfile, sep="\n", header=None, names=['data'])
        logs['type'] = logfile.split('/')[-1]
        # Add log file data and type to log collection
        log_collection = log_collection.append(logs)
    
    # Remove empty lines
    log_collection = log_collection.dropna()

    return log_collection

In [4]:
def train(algorithm, X_train, y_train):
    model = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', algorithm)])
    model.fit(X_train, y_train)
    return model

In [17]:
def report(classifier, actual, predictions):
    print("\033[1m" + classifier + "\033[0m\033[50m\n")
    
    actual = np.array(actual)
    
    print(confusion_matrix(actual, predictions))
    print
    print(classification_report(actual, predictions))
    print("Accuracy: " + str(round(accuracy_score(actual, predictions),2)))
    print

In [6]:
algorithms = [
#    svm.SVC(kernel='linear', C = 1.0),   # QUITE SLOW
    linear_model.SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None),
    naive_bayes.MultinomialNB(),
    naive_bayes.BernoulliNB(),
    tree.DecisionTreeClassifier(max_depth=1000),
    tree.ExtraTreeClassifier(),
    ensemble.ExtraTreesClassifier(),
    svm.LinearSVC(),
#    linear_model.LogisticRegressionCV(multi_class='multinomial'),   # A BIT SLOW
#    neural_network.MLPClassifier(),   # VERY SLOW
    neighbors.NearestCentroid(),
    ensemble.RandomForestClassifier(),
    linear_model.RidgeClassifier(),
]

In [7]:
source_data_dir = "/var/log"
data_dir = "data"

copy_data(source_data_dir, data_dir)
log_collection = read_data(data_dir)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(log_collection['data'], log_collection['type'], test_size=0.2, random_state=42)

In [18]:
# Provide Report for all algorithms
for algorithm in algorithms:
    model = train(algorithm, X_train, y_train)
    predictions = model.predict(X_test)
    report((str(algorithm).split('(')[0]), y_test, predictions)

[1mSGDClassifier[0m[50m

[[2536    0    0    0    0    0    0    0    0    0]
 [   0  131    0    0    0    0    0    0    0    0]
 [   0    0 1569  172    0    0    0    0    0    0]
 [   0    0   39 1277    0    0    0    3    0    0]
 [   0    0    0    0 1091    0    0    0    0    0]
 [   0    0    0    0    0    0  116    0    0    0]
 [   0    0    0    0    0    0  948    0    0    1]
 [   0    0    0    0    0    0    0  919    0    1]
 [   0    0    0    0    0    0    0    0  761    1]
 [   0    0    0    0    3    0    1    0    4 1714]]

                                   precision    recall  f1-score   support

                 corecaptured.log       1.00      1.00      1.00      2536
                    fsck_apfs.log       1.00      1.00      1.00       131
                      install.log       0.98      0.90      0.94      1741
                       system.log       0.88      0.97      0.92      1319
wifi-08-23-2018__12:54:38.121.log       1.00      1.00      1.00

[1mLinearSVC[0m[50m

[[2536    0    0    0    0    0    0    0    0    0]
 [   0  131    0    0    0    0    0    0    0    0]
 [   0    0 1737    3    0    0    0    0    0    1]
 [   0    0  125 1194    0    0    0    0    0    0]
 [   0    0    0    0 1090    0    0    0    0    1]
 [   0    0    0    0    0   33   82    0    0    1]
 [   0    0    0    0    0   87  861    0    0    1]
 [   0    0    0    0    0    0    0  919    0    1]
 [   0    0    0    0    0    0    0    0  761    1]
 [   0    0    0    0    3    0    0    0    2 1717]]

                                   precision    recall  f1-score   support

                 corecaptured.log       1.00      1.00      1.00      2536
                    fsck_apfs.log       1.00      1.00      1.00       131
                      install.log       0.93      1.00      0.96      1741
                       system.log       1.00      0.91      0.95      1319
wifi-08-23-2018__12:54:38.121.log       1.00      1.00      1.00    