In [3]:
import pandas as pd
import json
import sys
sys.path.append('../')

import os
import gc
import pandas as pd
import numpy as np
from logparser import Spell, Drain
import argparse
from tqdm import tqdm

tqdm.pandas()
pd.options.mode.chained_assignment = None

PAD = 0
UNK = 1
START = 2

data_dir = os.path.expanduser("../Loghub-BGL/")
output_dir = os.path.expanduser("../Loghub-BGL/processed/")
log_file = "BGL.log"


# In the first column of the log, "-" indicates non-alert messages while others are alert messages.
def count_anomaly():
    total_size = 0
    normal_size = 0
    with open(data_dir + log_file, encoding="utf8") as f:
        for line in f:
            total_size += 1
            if line.split(' ',1)[0] == '-':
                normal_size += 1
    print("total size {}, abnormal size {}".format(total_size, total_size - normal_size))


def deeplog_file_generator(filename, df, features):
    with open(filename, 'w') as f:
        for _, row in df.iterrows():
            for val in zip(*row[features]):
                f.write(','.join([str(v) for v in val]) + ' ')
            f.write('\n')


def parse_log(input_dir, output_dir, log_file, parser_type):
    log_format = '<Label> <Id> <Date> <Code1> <Time> <Code2> <Component1> <Component2> <Level> <Content>'
    regex = [
        r'(0x)[0-9a-fA-F]+', #hexadecimal
        r'\d+.\d+.\d+.\d+',
        # r'/\w+( )$'
        r'\d+'
    ]
    keep_para = False
    if parser_type == "drain":
        # the hyper parameter is set according to http://jmzhu.logpai.com/pub/pjhe_icws2017.pdf
        st = 0.3  # Similarity threshold
        depth = 3  # Depth of all leaf nodes
        parser = Drain.LogParser(log_format, indir=data_dir, outdir=output_dir, depth=depth, st=st, rex=regex, keep_para=keep_para)
        parser.parse(log_file)
    elif parser_type == "spell":
        tau = 0.55
        parser = Spell.LogParser(indir=data_dir, outdir=output_dir, log_format=log_format, tau=tau, rex=regex, keep_para=keep_para)
        parser.parse(log_file)


def sliding_window(raw_data, para):
    """
    split logs into sliding windows/session
    :param raw_data: dataframe columns=[timestamp, label, eventid, time duration]
    :param para:{window_size: seconds, step_size: seconds}
    :return: dataframe columns=[eventids, time durations, label]
    """
    log_size = raw_data.shape[0]
    label_data, time_data = raw_data.iloc[:, 1], raw_data.iloc[:, 0]
    logkey_data, deltaT_data = raw_data.iloc[:, 2], raw_data.iloc[:, 3]
    new_data = []
    start_end_index_pair = set()

    start_time = time_data[0]
    end_time = start_time + para["window_size"]
    start_index = 0
    end_index = 0

    # get the first start, end index, end time
    for cur_time in time_data:
        if cur_time < end_time:
            end_index += 1
        else:
            break

    start_end_index_pair.add(tuple([start_index, end_index]))

    # move the start and end index until next sliding window
    num_session = 1
    while end_index < log_size:
        start_time = start_time + para['step_size']
        end_time = start_time + para["window_size"]
        for i in range(start_index, log_size):
            if time_data[i] < start_time:
                i += 1
            else:
                break
        for j in range(end_index, log_size):
            if time_data[j] < end_time:
                j += 1
            else:
                break
        start_index = i
        end_index = j

        # when start_index == end_index, there is no value in the window
        if start_index != end_index:
            start_end_index_pair.add(tuple([start_index, end_index]))

        num_session += 1
        if num_session % 1000 == 0:
            print("process {} time window".format(num_session), end='\r')

    for (start_index, end_index) in start_end_index_pair:
        dt = deltaT_data[start_index: end_index].values
        dt[0] = 0
        new_data.append([
            time_data[start_index: end_index].values,
            max(label_data[start_index:end_index]),
            logkey_data[start_index: end_index].values,
            dt
        ])

    assert len(start_end_index_pair) == len(new_data)
    print('there are %d instances (sliding windows) in this dataset\n' % len(start_end_index_pair))
    return pd.DataFrame(new_data, columns=raw_data.columns)


##########
# Parser #
#########
parse_log(data_dir, output_dir, log_file, 'drain')

Parsing file: llnl_data/BGL.log
Total size after encoding is 4713493 4747963
Parsing done. [Time taken: 0:16:14.722308]


In [4]:
#########
# Count #
#########
count_anomaly()

##################
# Transformation #
##################
# mins
window_size = 5
step_size = 1
train_ratio = 0.4

df = pd.read_csv(f'{output_dir}{log_file}_structured.csv')

# data preprocess
df['datetime'] = pd.to_datetime(df['Time'], format='%Y-%m-%d-%H.%M.%S.%f')
df["Label"] = df["Label"].apply(lambda x: int(x != "-"))
df['timestamp'] = df["datetime"].values.astype(np.int64) // 10 ** 9
df['deltaT'] = df['datetime'].diff() / np.timedelta64(1, 's')
df['deltaT'].fillna(0)
# convert time to UTC timestamp
# df['deltaT'] = df['datetime'].apply(lambda t: (t - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s'))

# sampling with fixed window
# features = ["EventId", "deltaT"]
# target = "Label"
# deeplog_df = deeplog_df_transfer(df, features, target, "datetime", window_size=args.w)
# deeplog_df.dropna(subset=[target], inplace=True)

# sampling with sliding window
deeplog_df = sliding_window(df[["timestamp", "Label", "EventId", "deltaT"]],
                            para={"window_size": int(window_size)*60, "step_size": int(step_size) * 60}
                            )
deeplog_df

total size 4747963, abnormal size 348460
there are 37315 instances (sliding windows) in this dataset



Unnamed: 0,timestamp,Label,EventId,deltaT
0,"[1135617911, 1135617913, 1135617915, 113561791...",0,"[3aa50e45, 3aa50e45, 3aa50e45, 3aa50e45, 3aa50...","[0.0, 1.601077, 1.581271, 0.577114, 2.089764, ..."
1,"[1127138101, 1127138101, 1127138101, 112713810...",0,"[4983ff07, 4983ff07, 4983ff07, 4983ff07, 4983f...","[0.0, 0.13094, 0.073317, 0.157466, 0.121008, 0..."
2,"[1134118610, 1134118610, 1134118610, 113411861...",0,"[30b3b946, 8df7ac9e, a450c390, a450c390, 30b3b...","[0.0, 0.109872, 0.024515, 0.034294, 0.035576, ..."
3,"[1136277503, 1136277503, 1136277503, 113627750...",0,"[30b3b946, 8df7ac9e, 30b3b946, 8df7ac9e, 30b3b...","[0.0, 0.115572, 0.032737, 0.016155, 0.0, 0.112..."
4,[1120820379],0,[8a1ae52c],[0.0]
...,...,...,...,...
37310,"[1131198515, 1131198541, 1131198624, 113119869...",0,"[3aa50e45, 3aa50e45, 3aa50e45, 3aa50e45, 3aa50...","[0.0, 0.0, 0.0, 0.0, 0.0]"
37311,"[1131101810, 1131101810, 1131101810, 113110181...",0,"[1b700d02, 1b700d02, 1b700d02, 1b700d02, 1b700...","[0.0, 0.014491, 0.021685, 0.014192, 0.015735, ..."
37312,"[1134374630, 1134374630, 1134374630, 113437463...",0,"[8df7ac9e, 30b3b946, 8df7ac9e, 26c05abc, 26c05...","[0.0, 0.150302, 0.160659, 0.178033, 0.170768, ..."
37313,[1124338339],0,[ba77ab8e],[0.0]


In [5]:
#########
# Train #
#########
df_normal =deeplog_df[deeplog_df["Label"] == 0]
df_normal = df_normal.sample(frac=1, random_state=12).reset_index(drop=True) #shuffle
normal_len = len(df_normal)
train_len = int(normal_len * train_ratio)

train = df_normal[:train_len]
# deeplog_file_generator(os.path.join(output_dir,'train'), train, ["EventId", "deltaT"])
deeplog_file_generator(os.path.join(output_dir,'train'), train, ["EventId"])

print("training size {}".format(train_len))


###############
# Test Normal #
###############
test_normal = df_normal[train_len:]
deeplog_file_generator(os.path.join(output_dir, 'test_normal'), test_normal, ["EventId"])
print("test normal size {}".format(normal_len - train_len))

del df_normal
del train
del test_normal
gc.collect()

#################
# Test Abnormal #
#################
df_abnormal = deeplog_df[deeplog_df["Label"] == 1]
#df_abnormal["EventId"] = df_abnormal["EventId"].progress_apply(lambda e: event_index_map[e] if event_index_map.get(e) else UNK)
deeplog_file_generator(os.path.join(output_dir,'test_abnormal'), df_abnormal, ["EventId"])
print('test abnormal size {}'.format(len(df_abnormal)))

training size 13718
test normal size 20579
test abnormal size 3018


In [None]:
key_count_train = train['EventId'].apply(tuple).nunique()
key_count_test = deeplog_df['EventId'].apply(tuple).nunique()
print("Number of total unique log keys:", key_count_train + key_count_test)
print("Number of unique log keys in test set:", key_count_test)