In [1]:
import sys
sys.path.append('../')

import os
import pandas as pd
import numpy as np
from logparser import Spell, Drain
from tqdm import tqdm
#from logdeep.dataset.session import sliding_window
import glob
tqdm.pandas()
pd.options.mode.chained_assignment = None  # default='warn'


# In the first column of the log, "-" indicates non-alert messages while others are alert messages.
def count_anomaly(log_path):
    """Print the total number of lines in a log and how many are alerts.

    A line is a non-alert message when its first whitespace-separated field
    is "-"; any other first field marks an alert message. Blank lines have no
    first field and are counted as non-alert lines.

    Args:
        log_path: path of the raw log file. Bytes that cannot be decoded are
            dropped (``errors='ignore'``).

    Returns:
        None. The counts are printed as
        "total size <lines>, abnormal size <alerts>".
    """
    total_size = 0
    normal_size = 0
    with open(log_path, errors='ignore') as f:
        for line in f:
            total_size += 1
            # split() without a separator splits on runs of whitespace;
            # split('') is invalid and raises "ValueError: empty separator".
            fields = line.split()
            if not fields or fields[0] == '-':
                normal_size += 1
    print("total size {}, abnormal size {}".format(total_size, total_size - normal_size))


def deeplog_file_generator(filename, df, features):
    """Write one text line per DataFrame row in the DeepLog sequence format.

    For every row, the sequences stored in the `features` columns are zipped
    position by position. Each position becomes one token made of the
    comma-joined feature values, and every token is followed by a single
    space. Each row ends with a newline.

    Args:
        filename: path of the file to create (overwritten if it exists).
        df: DataFrame whose `features` columns hold iterables of values.
        features: list of column names to interleave.
    """
    with open(filename, 'w') as out:
        for _, record in df.iterrows():
            # One token per sequence position, e.g. "E1,0.5 " for two features.
            tokens = [
                ','.join(map(str, step)) + ' '
                for step in zip(*record[features])
            ]
            out.write(''.join(tokens))
            out.write('\n')




def sample_raw_data(data_file, output_file, sample_window_size, sample_step_size):
    """Copy the first `sample_window_size` lines of a raw log to a sample file.

    Lines are read from the start of `data_file` until `sample_window_size`
    lines have been collected or the file ends. A line counts as abnormal
    when its first whitespace-separated field is not "-". Blank lines have no
    first field and count as normal. The abnormal rate of the collected lines
    is printed, and the lines are written unchanged to `output_file`.

    Args:
        data_file: path of the raw log; undecodable bytes are dropped.
        output_file: path of the sample file to create (overwritten).
        sample_window_size: number of lines to keep.
        sample_step_size: a progress message is printed every this many lines.
    """
    sample_data = []
    labels = []

    # NOTE: the spirit dataset could start from the 2Mth line, as many abnormal
    # lines gather in the first 2M; this function always starts at line 1.
    with open(data_file, 'r', errors='ignore') as f:
        for idx, line in enumerate(f, start=1):
            fields = line.split()
            # A blank line has no label field; indexing [0] would raise IndexError.
            labels.append(bool(fields) and fields[0] != '-')
            sample_data.append(line)

            if len(labels) == sample_window_size:
                break

            if idx % sample_step_size == 0:
                print(f"Process {round(idx/sample_window_size * 100,4)} % raw data", end='\r')

    # Report the rate even when the file ended before the window was filled.
    if labels:
        abnormal_rate = sum(labels) / len(labels)
        print(f"{len(labels)} lines, abnormal rate {abnormal_rate}")

    with open(output_file, "w") as f:
        f.writelines(sample_data)

    print("Sampling done")




In [2]:
def parse_log(input_dir, output_dir, log_file, parser_type):
    """Parse a raw log file into structured events with Drain or Spell.

    Args:
        input_dir: directory that contains `log_file`.
        output_dir: directory handed to the parser as its output directory.
        log_file: name of the log file inside `input_dir`.
        parser_type: "drain" or "spell".

    Raises:
        ValueError: if `parser_type` is neither "drain" nor "spell".
    """
    #log_format = '<Label> <Id> <Date> <Admin> <Month> <Day> <Time> <AdminAddr> <Content>'
    log_format = '<Date> <Time>, <Type>             <Admin>  <Content>'
    # Patterns handed to the parser as `rex` (presumably variable parts to
    # mask before template extraction; confirm against the logparser package).
    regex = [
        r'(0x)[0-9a-fA-F]+',  # hexadecimal
        r'\d+\.\d+\.\d+\.\d+',  # dotted quad, e.g. an IPv4 address
        r'(?<=Warning: we failed to resolve data source name )[\w\s]+',
        r'\d+'  # any remaining number
    ]
    keep_para = False

    if parser_type == "drain":
        # the hyper parameter is set according to http://jmzhu.logpai.com/pub/pjhe_icws2017.pdf
        st = 0.3  # Similarity threshold
        depth = 3  # Depth of all leaf nodes

        # Drain is modified
        parser = Drain.LogParser(log_format,
                                 indir=input_dir,
                                 outdir=output_dir,
                                 depth=depth,
                                 st=st,
                                 rex=regex,
                                 keep_para=keep_para, maxChild=1000)
    elif parser_type == "spell":
        tau = 0.35
        # Use the input_dir argument: the notebook-level data_dir global may be
        # undefined or point at a different directory.
        parser = Spell.LogParser(indir=input_dir,
                                 outdir=output_dir,
                                 log_format=log_format,
                                 tau=tau,
                                 rex=regex,
                                 keep_para=keep_para)
    else:
        # Fail loudly instead of silently producing no structured output.
        raise ValueError(
            "unknown parser_type {!r}; expected 'drain' or 'spell'".format(parser_type))

    parser.parse(log_file)

In [8]:
## File parser: configuration, parsing, labelling and merging

#################
# Configuration #
#################
data_dir = os.path.expanduser("../dataset/Windows")
output_dir = "../output/windows/"
raw_log_file = "xaa.txt"
sample_log_file = "xaa.txt"
sample_window_size = 2*10**7
sample_step_size = 10**4
window_name = ''
log_file = sample_log_file

parser_type = 'drain'
# window and step sizes are in minutes
window_size = 1
step_size = 0.5
# Values >= 1 are used as an absolute number of training sequences by the
# train/test split cell; values < 1 as a fraction of the normal sequences.
train_ratio = 6000

########
# count anomaly (optional)
########
# count_anomaly(os.path.join(data_dir, log_file))
# sys.exit()

#########
# sample raw data (optional)
#########
# sample_raw_data(os.path.join(data_dir, raw_log_file), os.path.join(data_dir, sample_log_file),
#                 sample_window_size, sample_step_size)

# Every file in data_dir that has an extension is parsed. The list is sorted
# because glob returns files in arbitrary order.
file_list = sorted(
    os.path.basename(log_path)
    for log_path in glob.glob(os.path.join(data_dir, "*.*"))
    if os.path.isfile(log_path)
)
print(file_list)

##########
# Parser #
##########
for log_name in file_list:
    print("\nParsing", log_name)
    parse_log(data_dir, output_dir, log_name, parser_type)


##################
# Transformation #
##################
# Label every structured event: 1 when its content mentions "HRESULT", else 0.
for log_name in file_list:
    print("\nTransforming", log_name)
    df = pd.read_csv(os.path.join(output_dir, f"{log_name}_structured.csv"))
    df['Label'] = 0

    # na=False: rows with missing Content stay labelled 0 instead of producing
    # a NaN mask that .loc refuses to index with.
    df.loc[df['Content'].str.contains("HRESULT", na=False), "Label"] = 1
    # NOTE(review): the index is written as well, so the merged workbook below
    # contains an extra unnamed index column; pass index=False if unwanted.
    df.to_excel(os.path.join(output_dir, f"{log_name}_labeled.xlsx"))


#########
# Merge #
#########
# All labelled Excel files in output_dir (including ones from earlier runs).
labeled_files = sorted(glob.glob(os.path.join(output_dir, "*_labeled.xlsx")))

# Read each workbook into a DataFrame and concatenate them in one step.
# DataFrame.append was removed in pandas 2.0 and copied the data on every call.
excl_list = [pd.read_excel(labeled_path) for labeled_path in labeled_files]
excl_merged = pd.concat(excl_list, ignore_index=True) if excl_list else pd.DataFrame()

# Export the merged frame to a single Excel file.
excl_merged.to_excel('total_output.xlsx', index=False)
print("done!!")

['wrd00.txt', 'wrd01.txt', 'wrd02.txt', 'wrd03.txt', 'wrd04.txt', 'wrd05.txt', 'wrd06.txt', 'wrd07.txt']

Parsing wrd00.txt
Parsing file: ../dataset/Windows\wrd00.txt
Total size after encoding is 9995 10000
Parsing done. [Time taken: 0:00:00.618187]

Parsing wrd01.txt
Parsing file: ../dataset/Windows\wrd01.txt
Total size after encoding is 9999 10000
Parsing done. [Time taken: 0:00:00.588087]

Parsing wrd02.txt
Parsing file: ../dataset/Windows\wrd02.txt
Total size after encoding is 9972 10000
Parsing done. [Time taken: 0:00:00.700026]

Parsing wrd03.txt
Parsing file: ../dataset/Windows\wrd03.txt
Total size after encoding is 10000 10000
Parsing done. [Time taken: 0:00:00.719724]

Parsing wrd04.txt
Parsing file: ../dataset/Windows\wrd04.txt
Total size after encoding is 10000 10000
Parsing done. [Time taken: 0:00:00.694393]

Parsing wrd05.txt
Parsing file: ../dataset/Windows\wrd05.txt
Total size after encoding is 10000 10000
Parsing done. [Time taken: 0:00:00.728774]

Parsing wrd06.txt
Par

In [4]:



# Sampling with a sliding window is currently disabled: it relies on
# `sliding_window` from logdeep.dataset.session (whose import is commented out)
# and on a frame with "timestamp", "Label", "EventId" and "deltaT" columns.
#deeplog_df = sliding_window(df[["timestamp", "Label", "EventId", "deltaT"]],
#                            para={"window_size": float(window_size)*60, "step_size": float(step_size) * 60}
#                            )
#output_dir += window_name

# Everything below needs the windowed frame; fail with an actionable message
# instead of a bare NameError in the middle of the cell.
if 'deeplog_df' not in globals():
    raise NameError(
        "name 'deeplog_df' is not defined: run the sliding-window sampling step "
        "to build it before generating the train/test files")

#########
# Train #
#########
# Shuffle the normal sequences with a fixed seed so the split is reproducible.
df_normal = (
    deeplog_df[deeplog_df["Label"] == 0]
    .sample(frac=1, random_state=12)
    .reset_index(drop=True)
)
normal_len = len(df_normal)
# train_ratio >= 1 is an absolute number of sequences, otherwise a fraction.
train_len = int(train_ratio) if train_ratio >= 1 else int(normal_len * train_ratio)
# Never report more training sequences than exist; otherwise the printed
# training size is wrong and the test-normal size goes negative.
train_len = min(train_len, normal_len)

train = df_normal[:train_len]
deeplog_file_generator(os.path.join(output_dir,'train'), train, ["EventId"])
print("training size {}".format(train_len))


###############
# Test Normal #
###############
test_normal = df_normal[train_len:]
deeplog_file_generator(os.path.join(output_dir, 'test_normal'), test_normal, ["EventId"])
print("test normal size {}".format(normal_len - train_len))


#################
# Test Abnormal #
#################
df_abnormal = deeplog_df[deeplog_df["Label"] == 1]
deeplog_file_generator(os.path.join(output_dir,'test_abnormal'), df_abnormal, ["EventId"])
print('test abnormal size {}'.format(len(df_abnormal)))


NameError: name 'deeplog_df' is not defined

In [None]:
# Directory that holds the Excel files to merge.
# NOTE(review): hard-coded absolute local path; adjust for your machine.
path = "C:/downloads"

# Excel files in that directory, sorted so the merged row order is reproducible
# (glob returns files in arbitrary order).
file_list = sorted(glob.glob(path + "/*.xlsx"))

# Read every workbook into a DataFrame.
excl_list = [pd.read_excel(file) for file in file_list]

# Concatenate all frames in one step. DataFrame.append was removed in
# pandas 2.0 and copied the accumulated data on every call.
# An empty directory yields an empty frame, as before.
excl_merged = pd.concat(excl_list, ignore_index=True) if excl_list else pd.DataFrame()

# Export the merged frame to a single Excel file.
excl_merged.to_excel('total_food_sales.xlsx', index=False)