In [5]:
import sys
sys.path.append('../')

import os
import pandas as pd
import re
import numpy as np
from logparser import Spell, Drain
from tqdm import tqdm
#from logdeep.dataset.session import sliding_window
import glob
tqdm.pandas()
pd.options.mode.chained_assignment = None  # default='warn'


# In the first column of the log, "-" indicates non-alert messages while others are alert messages.
def count_anomaly(log_path):
    total_size = 0
    normal_size = 0
    with open(log_path, errors='ignore') as f:
        for line in f:
            total_size += 1
            if line.split('')[0] == '-':
                normal_size += 1
    print("total size {}, abnormal size {}".format(total_size, total_size - normal_size))


def deeplog_file_generator(filename, df, features):
    with open(filename, 'w') as f:
        for _, row in df.iterrows():
            for val in zip(*row[features]):
                f.write(','.join([str(v) for v in val]) + ' ')
            f.write('\n')




def sample_raw_data(data_file, output_file, sample_window_size, sample_step_size):
    # sample 1M by sliding window, abnormal rate is over 2%
    sample_data = []
    labels = []
    idx = 0

    # spirit dataset can start from the 2Mth line, as there are many abnormal lines gathering in the first 2M
    with open(data_file, 'r', errors='ignore') as f:
        for line in f:
            labels.append(line.split()[0] != '-')
            sample_data.append(line)

            if len(labels) == sample_window_size:
                abnormal_rate = sum(np.array(labels)) / len(labels)
                print(f"{idx + 1} lines, abnormal rate {abnormal_rate}")
                break

            idx += 1
            if idx % sample_step_size == 0:
                print(f"Process {round(idx/sample_window_size * 100,4)} % raw data", end='\r')

    with open(output_file, "w") as f:
        f.writelines(sample_data)

    print("Sampling done")




In [6]:
def parse_log(input_dir, output_dir, log_file, parser_type):
    #log_format = '<Label> <Id> <Date> <Admin> <Month> <Day> <Time> <AdminAddr> <Content>'
    log_format = '<Date> <Time>, <Type>             <Admin>  <Content>'
    regex = [
        r'(0x)[0-9a-fA-F]+',  # hexadecimal
        r'\d+\.\d+\.\d+\.\d+',
        r'(?<=Warning: we failed to resolve data source name )[\w\s]+',
        r'\d+'
    ]
    keep_para = False
    if parser_type == "drain":
        # the hyper parameter is set according to http://jmzhu.logpai.com/pub/pjhe_icws2017.pdf
        st = 0.3  # Similarity threshold
        depth = 3  # Depth of all leaf nodes

        # Drain is modified
        parser = Drain.LogParser(log_format,
                                 indir=input_dir,
                                 outdir=output_dir,
                                 depth=depth,
                                 st=st,
                                 rex=regex,
                                 keep_para=keep_para, maxChild=1000)
        parser.parse(log_file)

    elif parser_type == "spell":
        tau = 0.35
        parser = Spell.LogParser(indir=data_dir,
                                 outdir=output_dir,
                                 log_format=log_format,
                                 tau=tau,
                                 rex=regex,
                                 keep_para=keep_para)
        parser.parse(log_file)

In [4]:
##File parser

data_dir = os.path.expanduser("../dataset/Windows")
output_dir = "../output/windows/"
raw_log_file = "xaa.txt"
sample_log_file = "xaa.txt"
sample_window_size = 2*10**7
sample_step_size = 10**4
window_name = ''
log_file = sample_log_file

parser_type = 'drain'
#mins
window_size = 1
step_size = 0.5
train_ratio = 6000

hresult_dict = {'0x00000000': 1,'0x800f080d':2,'0x800f0805':3, '0x80070490':4, '0x80004005':5, '0x80070001':6, '0x80071a2d':7, '0x80070216':8, '0x80070bc2':9, '0x800f0816':10, '0x800f0806':11, '0x800f0902':12, '0x80070002':13}

def matchfunc(line):
    # Capture one-or-more characters of non-whitespace after the initial match
    match = re.search(r'HRESULT = (\S+)', line)

    # Did we find a match?
    if match:
        # Yes, process it
        weather = match.group(1)
        return(hresult_dict[weather])
    return(0)

########
# count anomaly
########
# count_anomaly(data_dir + log_file)
# sys.exit()

import glob
file_list = glob.glob(data_dir +"/*.txt")
#########
# sample raw data
#########
#sample_raw_data(data_dir+raw_log_file, data_dir+sample_log_file, sample_window_size, sample_step_size )

a = glob.glob(data_dir +"/*.*") 
file_list=[os.path.basename(list_item) for list_item in a]
print(file_list)
##########
# Parser #
#########
for file in file_list:
    print("\nParsing", file)
    parse_log(data_dir, output_dir, file, parser_type)

['wrd9632.txt', 'wrd9037.txt', 'wrd9700.txt', 'wrd9765.txt', 'wrd990156.txt', 'wrd990056.txt', 'wrd9451.txt', 'wrd9888.txt', 'wrd9656.txt', 'wrd9773.txt', 'wrd9541.txt', 'wrd9391.txt', 'wrd9841.txt', 'wrd9624.txt', 'wrd9117.txt', 'wrd990146.txt', 'wrd9335.txt', 'wrd990111.txt', 'wrd9251.txt', 'wrd9333.txt', 'wrd9158.txt', 'wrd990124.txt', 'wrd990060.txt', 'wrd9696.txt', 'wrd9567.txt', 'wrd02.txt', 'wrd59.txt', 'wrd9775.txt', 'wrd9481.txt', 'wrd9655.txt', 'wrd9606.txt', 'wrd9644.txt', 'wrd9326.txt', 'wrd9076.txt', 'wrd9460.txt', 'wrd9004.txt', 'wrd9790.txt', 'wrd9488.txt', 'wrd9548.txt', 'wrd9538.txt', 'wrd9679.txt', 'wrd9307.txt', 'wrd9837.txt', 'wrd9890.txt', 'wrd9688.txt', 'wrd03.txt', 'wrd9099.txt', 'wrd9337.txt', 'wrd9583.txt', 'wrd70.txt', 'wrd9025.txt', 'wrd9534.txt', 'wrd9370.txt', 'wrd990140.txt', 'wrd9755.txt', 'wrd990006.txt', 'wrd9240.txt', 'wrd990017.txt', 'wrd9636.txt', 'wrd9769.txt', 'wrd9631.txt', 'wrd9448.txt', 'wrd9554.txt', 'wrd9148.txt', 'wrd9342.txt', 'wrd9133.txt',

Total size after encoding is 100000 100000
Parsing done. [Time taken: 0:00:10.066822]

Parsing wrd9037.txt
Parsing file: ../dataset/Windows/wrd9037.txt
Total size after encoding is 99645 100000
Parsing done. [Time taken: 0:00:10.641708]

Parsing wrd9700.txt
Parsing file: ../dataset/Windows/wrd9700.txt
Total size after encoding is 99975 100000
Parsing done. [Time taken: 0:00:10.700493]

Parsing wrd9765.txt
Parsing file: ../dataset/Windows/wrd9765.txt
Total size after encoding is 99985 100000
Parsing done. [Time taken: 0:00:10.475468]

Parsing wrd990156.txt
Parsing file: ../dataset/Windows/wrd990156.txt
Total size after encoding is 8384 8388
Parsing done. [Time taken: 0:00:00.752175]

Parsing wrd990056.txt
Parsing file: ../dataset/Windows/wrd990056.txt
Total size after encoding is 99953 100000
Parsing done. [Time taken: 0:00:10.211905]

Parsing wrd9451.txt
Parsing file: ../dataset/Windows/wrd9451.txt
Total size after encoding is 99972 100000
Parsing done. [Time taken: 0:00:10.251246]

Pa

Parsing done. [Time taken: 0:00:20.938704]

Parsing wrd990006.txt
Parsing file: ../dataset/Windows/wrd990006.txt
Total size after encoding is 99975 100000
Parsing done. [Time taken: 0:00:20.979997]

Parsing wrd9240.txt
Parsing file: ../dataset/Windows/wrd9240.txt
Total size after encoding is 99952 100000
Parsing done. [Time taken: 0:00:21.497518]

Parsing wrd990017.txt
Parsing file: ../dataset/Windows/wrd990017.txt
Total size after encoding is 99969 100000
Parsing done. [Time taken: 0:00:20.965599]

Parsing wrd9636.txt
Parsing file: ../dataset/Windows/wrd9636.txt
Total size after encoding is 100000 100000
Parsing done. [Time taken: 0:00:20.254285]

Parsing wrd9769.txt
Parsing file: ../dataset/Windows/wrd9769.txt
Total size after encoding is 99947 100000
Parsing done. [Time taken: 0:00:21.454619]

Parsing wrd9631.txt
Parsing file: ../dataset/Windows/wrd9631.txt
Total size after encoding is 100000 100000
Parsing done. [Time taken: 0:00:20.420246]

Parsing wrd9448.txt
Parsing file: ../dat

Parsing done. [Time taken: 0:00:21.029594]

Parsing wrd9185.txt
Parsing file: ../dataset/Windows/wrd9185.txt
Total size after encoding is 99958 100000
Parsing done. [Time taken: 0:00:20.670309]

Parsing wrd00.txt
Parsing file: ../dataset/Windows/wrd00.txt
Total size after encoding is 99962 100000
Parsing done. [Time taken: 0:00:22.724953]

Parsing wrd9648.txt
Parsing file: ../dataset/Windows/wrd9648.txt
Total size after encoding is 99979 100000
Parsing done. [Time taken: 0:00:20.817781]

Parsing wrd9504.txt
Parsing file: ../dataset/Windows/wrd9504.txt
Total size after encoding is 99982 100000
Parsing done. [Time taken: 0:00:20.786314]

Parsing wrd9522.txt
Parsing file: ../dataset/Windows/wrd9522.txt
Total size after encoding is 99998 100000
Parsing done. [Time taken: 0:00:21.191744]

Parsing wrd9236.txt
Parsing file: ../dataset/Windows/wrd9236.txt
Total size after encoding is 99929 100000
Parsing done. [Time taken: 0:00:20.463628]

Parsing wrd9684.txt
Parsing file: ../dataset/Windows/w

Total size after encoding is 99956 100000
Parsing done. [Time taken: 0:00:20.795611]

Parsing wrd990090.txt
Parsing file: ../dataset/Windows/wrd990090.txt
Total size after encoding is 99986 100000
Parsing done. [Time taken: 0:00:20.927094]

Parsing wrd87.txt
Parsing file: ../dataset/Windows/wrd87.txt
Total size after encoding is 99839 100000
Parsing done. [Time taken: 0:00:21.011212]

Parsing wrd9551.txt
Parsing file: ../dataset/Windows/wrd9551.txt
Total size after encoding is 99972 100000
Parsing done. [Time taken: 0:00:20.700741]

Parsing wrd9116.txt
Parsing file: ../dataset/Windows/wrd9116.txt
Total size after encoding is 99975 100000
Parsing done. [Time taken: 0:00:21.030744]

Parsing wrd9210.txt
Parsing file: ../dataset/Windows/wrd9210.txt
Total size after encoding is 99972 100000
Parsing done. [Time taken: 0:00:20.844468]

Parsing wrd9681.txt
Parsing file: ../dataset/Windows/wrd9681.txt
Total size after encoding is 99980 100000
Parsing done. [Time taken: 0:00:20.841070]

Parsing 

Parsing done. [Time taken: 0:00:20.760457]

Parsing wrd9542.txt
Parsing file: ../dataset/Windows/wrd9542.txt
Total size after encoding is 99972 100000
Parsing done. [Time taken: 0:00:21.011457]

Parsing wrd9354.txt
Parsing file: ../dataset/Windows/wrd9354.txt
Total size after encoding is 100000 100000
Parsing done. [Time taken: 0:00:20.774465]

Parsing wrd9733.txt
Parsing file: ../dataset/Windows/wrd9733.txt
Total size after encoding is 99992 100000
Parsing done. [Time taken: 0:00:20.725145]

Parsing wrd990071.txt
Parsing file: ../dataset/Windows/wrd990071.txt
Total size after encoding is 99986 100000
Parsing done. [Time taken: 0:00:20.528065]

Parsing wrd990115.txt
Parsing file: ../dataset/Windows/wrd990115.txt
Total size after encoding is 99969 100000
Parsing done. [Time taken: 0:00:21.234907]

Parsing wrd9137.txt
Parsing file: ../dataset/Windows/wrd9137.txt
Total size after encoding is 99948 100000
Parsing done. [Time taken: 0:00:20.858417]

Parsing wrd9701.txt
Parsing file: ../data

Parsing done. [Time taken: 0:00:21.252539]

Parsing wrd990051.txt
Parsing file: ../dataset/Windows/wrd990051.txt
Total size after encoding is 99784 100006
Parsing done. [Time taken: 0:00:20.691794]

Parsing wrd9010.txt
Parsing file: ../dataset/Windows/wrd9010.txt
Total size after encoding is 99988 100000
Parsing done. [Time taken: 0:00:21.231854]

Parsing wrd990133.txt
Parsing file: ../dataset/Windows/wrd990133.txt
Total size after encoding is 99968 100000
Parsing done. [Time taken: 0:00:20.703870]

Parsing wrd9297.txt
Parsing file: ../dataset/Windows/wrd9297.txt
Total size after encoding is 99909 100002
Parsing done. [Time taken: 0:00:21.747958]

Parsing wrd9371.txt
Parsing file: ../dataset/Windows/wrd9371.txt
Total size after encoding is 99967 100000
Parsing done. [Time taken: 0:00:21.194275]

Parsing wrd9343.txt
Parsing file: ../dataset/Windows/wrd9343.txt
Total size after encoding is 99983 100000
Parsing done. [Time taken: 0:00:20.680509]

Parsing wrd9166.txt
Parsing file: ../datas

KeyboardInterrupt: 

In [None]:
##File parser

data_dir = os.path.expanduser("../dataset/Windows")
output_dir = "../output/windows/"
raw_log_file = "xaa.txt"
sample_log_file = "xaa.txt"
sample_window_size = 2*10**7
sample_step_size = 10**4
window_name = ''
log_file = sample_log_file

parser_type = 'drain'
#mins
window_size = 1
step_size = 0.5
train_ratio = 6000

hresult_dict = {'0x00000000': 1,'0x800f080d':2,'0x800f0805':3, '0x80070490':4, '0x80004005':5, '0x80070001':6, '0x80071a2d':7, '0x80070216':8, '0x80070bc2':9, '0x800f0816':10, '0x800f0806':11, '0x800f0902':12, '0x80070002':13, '0x80070013':14}

def matchfunc(line):
    # Capture one-or-more characters of non-whitespace after the initial match
    match = re.search(r'HRESULT = (\S+)', line)

    # Did we find a match?
    if match:
        # Yes, process it
        weather = match.group(1)
        return(hresult_dict[weather])
    return(0)

########
# count anomaly
########
# count_anomaly(data_dir + log_file)
# sys.exit()

import glob

a = glob.glob(output_dir +"/*_structured.csv") 
file_list=[os.path.basename(list_item) for list_item in a]

#print(file_list)


##################
# Transformation #
##################
for file in file_list:
    print("\nTransforming", file)
    df = pd.read_csv(f'{output_dir}{file}')
    df['Label'] = 0

    #df.loc[df['Content'].str.contains("HRESULT"), "Label"] = 1
    df['Label'] = df.apply(lambda row: matchfunc(row['Content']), axis = 1)
    df.to_excel(f'{output_dir}{file}_labeled.xlsx')  
    



# csv files in the path
file_list = glob.glob(output_dir + "*_labeled.xlsx")
 
# list of excel files we want to merge.
# pd.read_excel(file_path) reads the excel
# data into pandas dataframe.
excl_list = []
 
for file in file_list:
    excl_list.append(pd.read_excel(file))
 
# create a new dataframe to store the
# merged excel file.
excl_merged = pd.DataFrame()
 
for excl_file in excl_list:
     
    # appends the data into the excl_merged
    # dataframe.
    excl_merged = excl_merged.append(
      excl_file, ignore_index=True)
 
# exports the dataframe into excel file with
# specified name.
excl_merged.to_csv('total_output.csv', index=False)
print("done!!")


Transforming wrd9362.txt_structured.csv

Transforming wrd990006.txt_structured.csv

Transforming wrd56.txt_structured.csv

Transforming wrd990138.txt_structured.csv

Transforming wrd9856.txt_structured.csv

Transforming wrd9583.txt_structured.csv

Transforming wrd9035.txt_structured.csv

Transforming wrd9542.txt_structured.csv

Transforming wrd9141.txt_structured.csv

Transforming wrd9437.txt_structured.csv

Transforming wrd9198.txt_structured.csv

Transforming wrd78.txt_structured.csv

Transforming wrd9835.txt_structured.csv

Transforming wrd9781.txt_structured.csv

Transforming wrd9134.txt_structured.csv

Transforming wrd990071.txt_structured.csv

Transforming wrd9137.txt_structured.csv

Transforming wrd9240.txt_structured.csv

Transforming wrd14.txt_structured.csv

Transforming wrd9089.txt_structured.csv

Transforming wrd9576.txt_structured.csv

Transforming wrd9479.txt_structured.csv

Transforming wrd9581.txt_structured.csv

Transforming wrd9486.txt_structured.csv

Transforming wr

In [None]:
import glob
file_list = glob.glob(data_dir +"/*.txt")
#########
# sample raw data
#########
#sample_raw_data(data_dir+raw_log_file, data_dir+sample_log_file, sample_window_size, sample_step_size )

a = glob.glob(data_dir +"/*.*") 
file_list=[os.path.basename(list_item) for list_item in a]
print(file_list)
##########
# Parser #
#########
for file in file_list:
    print("\nParsing", file)
    parse_log(data_dir, output_dir, file, parser_type)


##################
# Transformation #
##################
for file in file_list:
    print("\nTransforming", file)
    df = pd.read_csv(f'{output_dir}{file}_structured.csv')
    df['Label'] = 0

    #df.loc[df['Content'].str.contains("HRESULT"), "Label"] = 1
    df['Label'] = df.apply(lambda row: matchfunc(row['Content']), axis = 1)
    df.to_excel(f'{output_dir}{file}_labeled.xlsx')  
    



# csv files in the path
file_list = glob.glob(output_dir + "*_labeled.xlsx")
 
# list of excel files we want to merge.
# pd.read_excel(file_path) reads the excel
# data into pandas dataframe.
excl_list = []
 
for file in file_list:
    excl_list.append(pd.read_excel(file))
 
# create a new dataframe to store the
# merged excel file.
excl_merged = pd.DataFrame()
 
for excl_file in excl_list:
     
    # appends the data into the excl_merged
    # dataframe.
    excl_merged = excl_merged.append(
      excl_file, ignore_index=True)
 
# exports the dataframe into excel file with
# specified name.
excl_merged.to_csv('total_output.csv', index=False)
print("done!!")

In [None]:
# specifying the path to csv files
path = "C:/downloads"
 
# csv files in the path
file_list = glob.glob(path + "/*.xlsx")
 
# list of excel files we want to merge.
# pd.read_excel(file_path) reads the excel
# data into pandas dataframe.
excl_list = []
 
for file in file_list:
    excl_list.append(pd.read_excel(file))
 
# create a new dataframe to store the
# merged excel file.
excl_merged = pd.DataFrame()
 
for excl_file in excl_list:
     
    # appends the data into the excl_merged
    # dataframe.
    excl_merged = excl_merged.append(
      excl_file, ignore_index=True)
 
# exports the dataframe into excel file with
# specified name.
excl_merged.to_excel('total_food_sales.xlsx', index=False)