In [4]:
import pandas as pd
import json
import sys
sys.path.append('../../')

import os
import gc
import pandas as pd
import numpy as np
from logparser import Spell, Drain
import argparse
from tqdm import tqdm

data_dir = os.path.expanduser("../Loghub-HPC/")
output_dir = os.path.expanduser("../Loghub-HPC/processed/")
log_file = "HPC.csv"

def deeplog_file_generator(filename, df, features):
    with open(filename, 'w') as f:
        for _, row in df.iterrows():
            for val in zip(*row[features]):
                f.write(','.join([str(v) for v in val]) + ' ')
            f.write('\n')


def parse_log(input_dir, output_dir, log_file, parser_type):
    log_format = '<LogId> <Node> <Component> <State> <Time> <Flag> <Content>'
    regex = [
        r'(0x)[0-9a-fA-F]+', #hexadecimal
        r'\d+.\d+.\d+.\d+',
        # r'/\w+( )$'
        r'\d+'
    ]
    keep_para = False
    if parser_type == "drain":
        # the hyper parameter is set according to http://jmzhu.logpai.com/pub/pjhe_icws2017.pdf
        st = 0.3  # Similarity threshold
        depth = 3  # Depth of all leaf nodes
        parser = Drain.LogParser(log_format, indir=data_dir, outdir=output_dir, depth=depth, st=st, rex=regex, keep_para=keep_para)
        parser.parse(log_file)
    elif parser_type == "spell":
        tau = 0.55
        parser = Spell.LogParser(indir=data_dir, outdir=output_dir, log_format=log_format, tau=tau, rex=regex, keep_para=keep_para)
        parser.parse(log_file)


def sliding_window(raw_data, para):
    """
    split logs into sliding windows/session
    :param raw_data: dataframe columns=[timestamp, label, eventid, time duration]
    :param para:{window_size: seconds, step_size: seconds}
    :return: dataframe columns=[eventids, time durations, label]
    """
    log_size = raw_data.shape[0]
    label_data, time_data = raw_data.iloc[:, 1], raw_data.iloc[:, 0]
    logkey_data, deltaT_data = raw_data.iloc[:, 2], raw_data.iloc[:, 3]
    new_data = []
    start_end_index_pair = set()

    start_time = time_data[0]
    end_time = start_time + para["window_size"]
    start_index = 0
    end_index = 0

    # get the first start, end index, end time
    for cur_time in time_data:
        if cur_time < end_time:
            end_index += 1
        else:
            break

    start_end_index_pair.add(tuple([start_index, end_index]))

    # move the start and end index until next sliding window
    num_session = 1
    while end_index < log_size:
        start_time = start_time + para['step_size']
        end_time = start_time + para["window_size"]
        for i in range(start_index, log_size):
            if time_data[i] < start_time:
                i += 1
            else:
                break
        for j in range(end_index, log_size):
            if time_data[j] < end_time:
                j += 1
            else:
                break
        start_index = i
        end_index = j

        # when start_index == end_index, there is no value in the window
        if start_index != end_index:
            start_end_index_pair.add(tuple([start_index, end_index]))

        num_session += 1
        if num_session % 1000 == 0:
            print("process {} time window".format(num_session), end='\r')

    for (start_index, end_index) in start_end_index_pair:
        dt = deltaT_data[start_index: end_index].values
        dt[0] = 0
        new_data.append([
            time_data[start_index: end_index].values,
            max(label_data[start_index:end_index]),
            logkey_data[start_index: end_index].values,
            dt
        ])

    assert len(start_end_index_pair) == len(new_data)
    print('there are %d instances (sliding windows) in this dataset\n' % len(start_end_index_pair))
    return pd.DataFrame(new_data, columns=raw_data.columns)


def hybrid_window(raw_data, para):
    """
    Split logs into hybrid windows (time-based and event-based)
    :param raw_data: dataframe columns=[timestamp, Label, EventId, deltaT]
    :param para: {time_window: max time window in seconds, event_window: max number of events in each window}
    :return: dataframe columns=[timestamp, Label, EventId, deltaT]
    """
    time_data, logkey_data, deltaT_data, label_data = (
        raw_data['timestamp'],
        raw_data['EventId'], 
        raw_data['deltaT'],
        raw_data['Label']    
    )
    new_data = []

    start_idx = 0
    while start_idx < len(raw_data):
        end_idx = start_idx + 1
        end_time = time_data[start_idx] + para['time_window']
        
        while (end_idx < len(raw_data) and 
               time_data[end_idx] < end_time and 
               end_idx - start_idx < para['event_window']):
            end_idx += 1
        
        dt = deltaT_data[start_idx:end_idx].values
        dt[0] = 0
        # Aggregate labels within the window by taking the maximum value
        label = int(label_data[start_idx:end_idx].max())
        new_data.append([
            time_data[start_idx:end_idx].values,
            label,
            logkey_data[start_idx:end_idx].values,
            dt
        ])
        start_idx = end_idx

    print(f'There are {len(new_data)} instances (hybrid windows) in this dataset\n')
    return pd.DataFrame(new_data, columns=raw_data.columns)

In [6]:
##########
# Parser #
##########
parse_log(data_dir, output_dir, log_file, 'drain')
df = pd.read_csv(f'{output_dir}{log_file}_structured.csv')

Parsing file: ../Loghub-HPC/HPC.csv
Total size after encoding is 432260 433440
Parsing done. [Time taken: 0:00:11.770759]


In [7]:
df

Unnamed: 0,LineId,LogId,Node,Component,State,Time,Flag,Content,EventId,EventTemplate
0,1,2557285,node-233,unix.hw,state_change.unavailable,1.072946e+09,1,Component State Change: Component \042alt0\042...,2bc64f63,Component State Change: Component <*> is in th...
1,2,2562603,node-233,unix.hw,state_change.unavailable,1.073551e+09,1,Component State Change: Component \042alt0\042...,2bc64f63,Component State Change: Component <*> is in th...
2,3,2561225,node-228,unix.hw,state_change.unavailable,1.073374e+09,1,Component State Change: Component \042alt0\042...,2bc64f63,Component State Change: Component <*> is in th...
3,4,2598209,node-ms0,unix.hw,state_change.unavailable,1.074291e+09,1,Component State Change: Component \042alt0\042...,2bc64f63,Component State Change: Component <*> is in th...
4,5,2598216,node-ms0,unix.hw,state_change.unavailable,1.074292e+09,1,Component State Change: Component \042alt0\042...,2bc64f63,Component State Change: Component <*> is in th...
...,...,...,...,...,...,...,...,...,...,...
432255,432256,461145,node-193,unix.hw,state_change.available,1.145552e+09,1,Component State Change: Component \042alt0\042...,2bc64f63,Component State Change: Component <*> is in th...
432256,432257,465915,node-129,unix.hw,state_change.available,1.145638e+09,1,Component State Change: Component \042alt0\042...,2bc64f63,Component State Change: Component <*> is in th...
432257,432258,465918,node-129,unix.hw,state_change.available,1.145638e+09,1,Component State Change: Component \042alt0\042...,2bc64f63,Component State Change: Component <*> is in th...
432258,432259,465921,node-129,unix.hw,state_change.available,1.145638e+09,1,Component State Change: Component \042alt0\042...,2bc64f63,Component State Change: Component <*> is in th...


In [8]:
##################
# Transformation #
##################
time_window = 15 # mins
event_window = 10 # count
train_ratio = 0.4

# data preprocess
df['datetime'] = pd.to_datetime(df['Time'], unit='s')
event_ids_to_flag = [
    'e7ede03f', 'e07f96ea', '3dff51fb', 'bd82b62d', '562e2cee', 'b02b9741', '74e8333a', '252f035f', 
    '683ade8b', '7e85bcb6', '5cdfd2f3', '7b80ea70', '0cc39f34', 'c0fcff27', 'a84ecf37', '20bdc066', 
    'b38fa951', '8c1777d1', 'dfc857c8', '6eff2f7f', '03b9852a'
]
df["Label"] = df["EventId"].apply(lambda x: int(x in event_ids_to_flag))
df['timestamp'] = df["datetime"].values.astype(np.int64) // 10 ** 9
df['deltaT'] = df['datetime'].diff() / np.timedelta64(1, 's')
df['deltaT'].fillna(0)

# sampling with hybrid window
deeplog_df = hybrid_window(df[["timestamp", "Label", "EventId", "deltaT"]],
                            para={"time_window": int(time_window)*60, "event_window": int(event_window)}
                            )

deeplog_df

There are 50640 instances (hybrid windows) in this dataset



Unnamed: 0,timestamp,Label,EventId,deltaT
0,[1072945921],0,[2bc64f63],[0.0]
1,"[1073550845, 1073373907]",0,"[2bc64f63, 2bc64f63]","[0.0, -176938.0]"
2,"[1074291273, 1074291570]",0,"[2bc64f63, 2bc64f63]","[0.0, 297.0]"
3,"[1074300563, 1074300563, 1074300563, 107430056...",0,"[2bc64f63, 2bc64f63, 2bc64f63, 2bc64f63, 2bc64...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"[1074300563, 1074300563, 1074300563, 107430056...",0,"[2bc64f63, 2bc64f63, 2bc64f63, 2bc64f63, 2bc64...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
50635,"[1145544900, 1145544901, 1145544908]",0,"[2bc64f63, 2bc64f63, 2bc64f63]","[0.0, 1.0, 7.0]"
50636,"[1145552100, 1145552106, 1145552116, 114555211...",0,"[2bc64f63, 2bc64f63, 2bc64f63, 2bc64f63, 2bc64...","[0.0, 6.0, 10.0, 3.0, 164.0, 1.0, 0.0, 0.0, 0...."
50637,"[1145552284, 1145552284, 1145552284, 114555228...",0,"[2bc64f63, 2bc64f63, 2bc64f63, 2bc64f63, 2bc64...","[0.0, 0.0, 0.0, 0.0, 6.0, 0.0, 0.0, 0.0, 0.0]"
50638,"[1145638127, 1145638148, 1145638242]",0,"[2bc64f63, 2bc64f63, 2bc64f63]","[0.0, 21.0, 94.0]"


In [9]:
#########
# Train #
#########
df_normal = deeplog_df[deeplog_df["Label"] == 0]
df_normal = df_normal.sample(frac=1, random_state=12).reset_index(drop=True) #shuffle
normal_len = len(df_normal)
train_len = int(normal_len * train_ratio)

train = df_normal[:train_len]
# deeplog_file_generator(os.path.join(output_dir,'train'), train, ["EventId", "deltaT"])
deeplog_file_generator(os.path.join(output_dir,'train'), train, ["EventId"])

print("training size {}".format(train_len))

###############
# Test Normal #
###############
test_normal = df_normal[train_len:]
deeplog_file_generator(os.path.join(output_dir, 'test_normal'), test_normal, ["EventId"])
print("test normal size {}".format(normal_len - train_len))

del df_normal
del train
del test_normal
gc.collect()

#################
# Test Abnormal #
#################
df_abnormal = deeplog_df[deeplog_df["Label"] == 1]
#df_abnormal["EventId"] = df_abnormal["EventId"].progress_apply(lambda e: event_index_map[e] if event_index_map.get(e) else UNK)
deeplog_file_generator(os.path.join(output_dir,'test_abnormal'), df_abnormal, ["EventId"])
print('test abnormal size {}'.format(len(df_abnormal)))


# test_df = df_train[train_len:]
# df_abnormal = test_df[test_df["Label"] == 1]
# deeplog_file_generator(os.path.join(output_dir, 'test_normal'), test_df, ["EventId"])
# print("test size {}".format(len(test_df)))
# print('test abnormal size {}'.format(len(df_abnormal)))

# df_train['SequenceLength'] = df_train['EventId'].apply(len)
# average_length = df_train['SequenceLength'].mean()
# print("average log sequence length:", round(average_length))

training size 17821
test normal size 26732
test abnormal size 6087


In [86]:
key_count_train = train['EventId'].apply(tuple).nunique()
key_count_test = test_df['EventId'].apply(tuple).nunique()
print("Number of total unique log keys:", key_count_train + key_count_test)
print("Number of unique log keys in test set:", key_count_test)

Number of total unique log keys: 10415
Number of unique log keys in test set: 6004
