In [1]:
import sys
sys.path.append('../')

import os
import pandas as pd
import re
import numpy as np
from logparser import Spell, Drain
from tqdm import tqdm
#from logdeep.dataset.session import sliding_window
import glob
tqdm.pandas()
pd.options.mode.chained_assignment = None  # default='warn'


# In the first column of the log, "-" indicates non-alert messages while others are alert messages.
def count_anomaly(log_path):
    """Print how many lines of a log file are alert (abnormal) messages.

    A line counts as normal when its first whitespace-separated token is
    ``-``; every other non-blank line counts as abnormal. Blank lines have
    no first token and are skipped, so they are not part of the total.

    Args:
        log_path: Path of the log file to scan. Bytes that cannot be decoded
            are ignored (``errors='ignore'``).

    Returns:
        None. The totals are printed to stdout.
    """
    total_size = 0
    normal_size = 0
    with open(log_path, errors='ignore') as f:
        for line in f:
            # str.split('') raises ValueError (empty separator), so split on
            # whitespace to get the first column.
            tokens = line.split()
            if not tokens:
                continue
            total_size += 1
            if tokens[0] == '-':
                normal_size += 1
    print("total size {}, abnormal size {}".format(total_size, total_size - normal_size))


def deeplog_file_generator(filename, df, features):
    """Write the selected columns of ``df`` to a text file, one row per line.

    For every row, the values found under ``features`` are zipped together
    position by position (so each of them is expected to be a sequence --
    TODO confirm against the caller). Each zipped group is written as its
    members joined by commas and followed by a single space; the row is then
    terminated with a newline.

    Args:
        filename: Path of the text file to create or overwrite.
        df: DataFrame whose rows are written.
        features: Column selector used to index each row.
    """
    with open(filename, 'w') as out:
        for _, record in df.iterrows():
            out.writelines(
                ','.join(map(str, group)) + ' '
                for group in zip(*record[features])
            )
            out.write('\n')




def sample_raw_data(data_file, output_file, sample_window_size, sample_step_size):
    """Copy the leading lines of a raw log into a sample file.

    Lines are read from ``data_file`` until ``sample_window_size`` lines have
    been collected (or the file ends). A line is labelled abnormal when its
    first whitespace-separated token is not ``-``. Once the window is full,
    the line count and the abnormal rate are printed. The collected lines are
    then written to ``output_file``.

    Note: a blank line has no first token and raises ``IndexError``.

    Args:
        data_file: Path of the raw log (undecodable bytes are ignored).
        output_file: Path of the sample file to create or overwrite.
        sample_window_size: Number of lines to collect before stopping.
        sample_step_size: A progress message is printed every this many lines.
    """
    # Author's note: the spirit dataset can start from the 2Mth line, as many
    # abnormal lines gather in the first 2M.
    kept_lines = []
    is_alert = []

    with open(data_file, 'r', errors='ignore') as src:
        for line_no, raw in enumerate(src, start=1):
            is_alert.append(raw.split()[0] != '-')
            kept_lines.append(raw)

            if len(is_alert) == sample_window_size:
                abnormal_rate = np.array(is_alert).sum() / len(is_alert)
                print(f"{line_no} lines, abnormal rate {abnormal_rate}")
                break

            if line_no % sample_step_size == 0:
                # '\r' keeps the progress report on a single console line.
                print(f"Process {round(line_no/sample_window_size * 100,4)} % raw data", end='\r')

    with open(output_file, "w") as dst:
        dst.writelines(kept_lines)

    print("Sampling done")




In [2]:
def parse_log(input_dir, output_dir, log_file, parser_type):
    """Parse one raw log file into structured output with Drain or Spell.

    Args:
        input_dir: Directory that contains ``log_file``.
        output_dir: Directory handed to the parser as its output location.
        log_file: Name of the log file inside ``input_dir``.
        parser_type: ``"drain"`` or ``"spell"``.

    Raises:
        ValueError: If ``parser_type`` is neither ``"drain"`` nor ``"spell"``.
            Previously such a value was silently ignored and nothing was
            parsed.
    """
    #log_format = '<Label> <Id> <Date> <Admin> <Month> <Day> <Time> <AdminAddr> <Content>'
    log_format = '<Date> <Time>, <Type>             <Admin>  <Content>'
    # Patterns handed to the parser as `rex` (presumably masked as variable
    # parts of a message -- confirm against the logparser implementation).
    regex = [
        r'(0x)[0-9a-fA-F]+',  # hexadecimal
        r'\d+\.\d+\.\d+\.\d+',  # dotted quad, e.g. an IPv4 address
        r'(?<=Warning: we failed to resolve data source name )[\w\s]+',
        r'\d+'  # any remaining run of digits
    ]
    keep_para = False
    if parser_type == "drain":
        # the hyper parameter is set according to http://jmzhu.logpai.com/pub/pjhe_icws2017.pdf
        st = 0.3  # Similarity threshold
        depth = 3  # Depth of all leaf nodes

        # Drain is modified
        parser = Drain.LogParser(log_format,
                                 indir=input_dir,
                                 outdir=output_dir,
                                 depth=depth,
                                 st=st,
                                 rex=regex,
                                 keep_para=keep_para, maxChild=1000)
    elif parser_type == "spell":
        tau = 0.35
        # Use the input_dir argument; the previous code read the notebook
        # global `data_dir` here and ignored the caller's directory.
        parser = Spell.LogParser(indir=input_dir,
                                 outdir=output_dir,
                                 log_format=log_format,
                                 tau=tau,
                                 rex=regex,
                                 keep_para=keep_para)
    else:
        raise ValueError(
            "unknown parser_type {!r}; expected 'drain' or 'spell'".format(parser_type))

    parser.parse(log_file)

In [None]:
## File parser configuration

# Where the raw logs are read from and where parser output is written.
data_dir = os.path.expanduser("../dataset/Windows")
output_dir = "../output/windows/"

# Raw / sampled log names and the sampling parameters.
raw_log_file = "xaa.txt"
sample_log_file = "xaa.txt"
log_file = sample_log_file
sample_window_size = 2 * 10 ** 7
sample_step_size = 10 ** 4
window_name = ''

parser_type = 'drain'

# Window settings; the original note marks these as minutes.
window_size = 1
step_size = 0.5
train_ratio = 6000

# HRESULT code (lowercase hex string) -> numeric label.
hresult_dict = {
    '0x00000000': 1,
    '0x800f080d': 2,
    '0x800f0805': 3,
    '0x80070490': 4,
    '0x80004005': 5,
    '0x80070001': 6,
    '0x80071a2d': 7,
    '0x80070216': 8,
    '0x80070bc2': 9,
    '0x800f0816': 10,
    '0x800f0806': 11,
    '0x800f0902': 12,
    '0x80070002': 13,
}

def matchfunc(line):
    """Return the numeric label of the HRESULT code mentioned in a log message.

    The message is searched for ``HRESULT = <code>``, where ``<code>`` is the
    run of non-whitespace characters after the equals sign. The code is
    lowercased (the keys of ``hresult_dict`` are lowercase) and looked up in
    ``hresult_dict``.

    Args:
        line: Log message text. Non-string values (for example the NaN that
            pandas yields for an empty cell) are treated as having no HRESULT.

    Returns:
        The label from ``hresult_dict``, or 0 when the message contains no
        ``HRESULT = ...`` fragment.

    Raises:
        KeyError: If the message carries a code that ``hresult_dict`` does
            not map.
    """
    # re.search raises TypeError on non-string input such as float NaN.
    if not isinstance(line, str):
        return 0

    # Capture one-or-more characters of non-whitespace after the initial match
    match = re.search(r'HRESULT = (\S+)', line)
    if match is None:
        return 0

    code = match.group(1).lower()
    return hresult_dict[code]

########
# count anomaly (optional)
########
# count_anomaly(os.path.join(data_dir, log_file))
# sys.exit()

#########
# sample raw data (optional)
#########
# sample_raw_data(os.path.join(data_dir, raw_log_file), os.path.join(data_dir, sample_log_file),
#                 sample_window_size, sample_step_size)

# Base names of every file in data_dir that has an extension. Sorted so the
# processing order does not depend on the file system.
file_list = sorted(
    os.path.basename(path) for path in glob.glob(os.path.join(data_dir, "*.*"))
)
print(file_list)

##########
# Parser #
##########
for file in file_list:
    print("\nParsing", file)
    parse_log(data_dir, output_dir, file, parser_type)


##################
# Transformation #
##################
# Label every structured row with the HRESULT class found in its Content and
# save one labeled workbook per input file.
for file in file_list:
    print("\nTransforming", file)
    df = pd.read_csv(os.path.join(output_dir, f'{file}_structured.csv'))

    #df.loc[df['Content'].str.contains("HRESULT"), "Label"] = 1
    df['Label'] = df['Content'].map(matchfunc)
    df.to_excel(os.path.join(output_dir, f'{file}_labeled.xlsx'))


#########
# Merge #
#########
# Labeled workbooks found in output_dir (this also picks up workbooks left
# over from earlier runs). NOTE: file_list is rebound to these paths.
file_list = sorted(glob.glob(os.path.join(output_dir, "*_labeled.xlsx")))

# Read every workbook into its own DataFrame.
excl_list = [pd.read_excel(file) for file in file_list]

# Concatenate once: DataFrame.append was removed in pandas 2.0 and appending
# in a loop re-copies the accumulated frame on every iteration. pd.concat
# rejects an empty list, so fall back to an empty frame in that case.
excl_merged = pd.concat(excl_list, ignore_index=True) if excl_list else pd.DataFrame()

# Export the merged data as a single CSV in the working directory.
excl_merged.to_csv('total_output.csv', index=False)
print("done!!")

['wrd00.txt', 'wrd01.txt', 'wrd02.txt', 'wrd03.txt', 'wrd04.txt', 'wrd05.txt', 'wrd06.txt', 'wrd07.txt', 'wrd08.txt', 'wrd09.txt', 'wrd10.txt', 'wrd11.txt', 'wrd12.txt', 'wrd13.txt', 'wrd14.txt', 'wrd15.txt', 'wrd16.txt', 'wrd17.txt', 'wrd18.txt', 'wrd19.txt', 'wrd20.txt', 'wrd21.txt', 'wrd22.txt', 'wrd23.txt', 'wrd24.txt', 'wrd25.txt', 'wrd26.txt', 'wrd27.txt', 'wrd28.txt', 'wrd29.txt', 'wrd30.txt', 'wrd31.txt', 'wrd32.txt', 'wrd33.txt', 'wrd34.txt', 'wrd35.txt', 'wrd36.txt', 'wrd37.txt', 'wrd38.txt', 'wrd39.txt', 'wrd40.txt', 'wrd41.txt', 'wrd42.txt', 'wrd43.txt', 'wrd44.txt', 'wrd45.txt', 'wrd46.txt', 'wrd47.txt', 'wrd48.txt', 'wrd49.txt', 'wrd50.txt', 'wrd51.txt', 'wrd52.txt', 'wrd53.txt', 'wrd54.txt', 'wrd55.txt', 'wrd56.txt', 'wrd57.txt', 'wrd58.txt', 'wrd59.txt', 'wrd60.txt', 'wrd61.txt', 'wrd62.txt', 'wrd63.txt', 'wrd64.txt', 'wrd65.txt', 'wrd66.txt', 'wrd67.txt', 'wrd68.txt', 'wrd69.txt', 'wrd70.txt', 'wrd71.txt', 'wrd72.txt', 'wrd73.txt', 'wrd74.txt', 'wrd75.txt', 'wrd76.txt'

Total size after encoding is 49966 50000
Parsing done. [Time taken: 0:00:04.744213]

Parsing wrd01.txt
Parsing file: ../dataset/Windows\wrd01.txt
Total size after encoding is 49996 50000
Parsing done. [Time taken: 0:00:04.703818]

Parsing wrd02.txt
Parsing file: ../dataset/Windows\wrd02.txt
Total size after encoding is 49992 50000
Parsing done. [Time taken: 0:00:05.284390]

Parsing wrd03.txt
Parsing file: ../dataset/Windows\wrd03.txt
Total size after encoding is 49996 50000
Parsing done. [Time taken: 0:00:05.114272]

Parsing wrd04.txt
Parsing file: ../dataset/Windows\wrd04.txt
Total size after encoding is 49997 50000
Parsing done. [Time taken: 0:00:05.123085]

Parsing wrd05.txt
Parsing file: ../dataset/Windows\wrd05.txt
Total size after encoding is 49972 50000
Parsing done. [Time taken: 0:00:04.605678]

Parsing wrd06.txt
Parsing file: ../dataset/Windows\wrd06.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.972535]

Parsing wrd07.txt
Parsing file: ../data

Parsing done. [Time taken: 0:00:05.038942]

Parsing wrd57.txt
Parsing file: ../dataset/Windows\wrd57.txt
Total size after encoding is 49769 50001
Parsing done. [Time taken: 0:00:05.451781]

Parsing wrd58.txt
Parsing file: ../dataset/Windows\wrd58.txt
Total size after encoding is 49996 50000
Parsing done. [Time taken: 0:00:05.413116]

Parsing wrd59.txt
Parsing file: ../dataset/Windows\wrd59.txt
Total size after encoding is 49991 50000
Parsing done. [Time taken: 0:00:05.437812]

Parsing wrd60.txt
Parsing file: ../dataset/Windows\wrd60.txt
Total size after encoding is 49997 50000
Parsing done. [Time taken: 0:00:05.582776]

Parsing wrd61.txt
Parsing file: ../dataset/Windows\wrd61.txt
Total size after encoding is 49997 50000
Parsing done. [Time taken: 0:00:05.104279]

Parsing wrd62.txt
Parsing file: ../dataset/Windows\wrd62.txt
Total size after encoding is 49972 50000
Parsing done. [Time taken: 0:00:04.768561]

Parsing wrd63.txt
Parsing file: ../dataset/Windows\wrd63.txt
Total size after en

Total size after encoding is 49990 50000
Parsing done. [Time taken: 0:00:04.831195]

Parsing wrd9023.txt
Parsing file: ../dataset/Windows\wrd9023.txt
Total size after encoding is 49990 50000
Parsing done. [Time taken: 0:00:04.461616]

Parsing wrd9024.txt
Parsing file: ../dataset/Windows\wrd9024.txt
Total size after encoding is 49986 50000
Parsing done. [Time taken: 0:00:04.699507]

Parsing wrd9025.txt
Parsing file: ../dataset/Windows\wrd9025.txt
Total size after encoding is 49433 50000
Parsing done. [Time taken: 0:00:04.740033]

Parsing wrd9026.txt
Parsing file: ../dataset/Windows\wrd9026.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.457858]

Parsing wrd9027.txt
Parsing file: ../dataset/Windows\wrd9027.txt
Total size after encoding is 49993 50000
Parsing done. [Time taken: 0:00:04.942032]

Parsing wrd9028.txt
Parsing file: ../dataset/Windows\wrd9028.txt
Total size after encoding is 49992 50000
Parsing done. [Time taken: 0:00:04.532752]

Parsing wrd9029

Total size after encoding is 49824 50000
Parsing done. [Time taken: 0:00:04.893371]

Parsing wrd9078.txt
Parsing file: ../dataset/Windows\wrd9078.txt
Total size after encoding is 49914 50000
Parsing done. [Time taken: 0:00:05.100779]

Parsing wrd9079.txt
Parsing file: ../dataset/Windows\wrd9079.txt
Total size after encoding is 49961 50000
Parsing done. [Time taken: 0:00:04.542570]

Parsing wrd9080.txt
Parsing file: ../dataset/Windows\wrd9080.txt
Total size after encoding is 49983 50000
Parsing done. [Time taken: 0:00:04.607986]

Parsing wrd9081.txt
Parsing file: ../dataset/Windows\wrd9081.txt
Total size after encoding is 49974 50000
Parsing done. [Time taken: 0:00:04.948330]

Parsing wrd9082.txt
Parsing file: ../dataset/Windows\wrd9082.txt
Total size after encoding is 49965 50000
Parsing done. [Time taken: 0:00:05.003887]

Parsing wrd9083.txt
Parsing file: ../dataset/Windows\wrd9083.txt
Total size after encoding is 49983 50000
Parsing done. [Time taken: 0:00:04.583667]

Parsing wrd9084

Total size after encoding is 49978 50000
Parsing done. [Time taken: 0:00:04.671578]

Parsing wrd9133.txt
Parsing file: ../dataset/Windows\wrd9133.txt
Total size after encoding is 49986 50000
Parsing done. [Time taken: 0:00:04.774338]

Parsing wrd9134.txt
Parsing file: ../dataset/Windows\wrd9134.txt
Total size after encoding is 49972 50000
Parsing done. [Time taken: 0:00:04.594102]

Parsing wrd9135.txt
Parsing file: ../dataset/Windows\wrd9135.txt
Total size after encoding is 49973 50000
Parsing done. [Time taken: 0:00:05.177626]

Parsing wrd9136.txt
Parsing file: ../dataset/Windows\wrd9136.txt
Total size after encoding is 49940 50000
Parsing done. [Time taken: 0:00:04.983436]

Parsing wrd9137.txt
Parsing file: ../dataset/Windows\wrd9137.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.000602]

Parsing wrd9138.txt
Parsing file: ../dataset/Windows\wrd9138.txt
Total size after encoding is 49963 50000
Parsing done. [Time taken: 0:00:04.990010]

Parsing wrd9139

Total size after encoding is 49985 50000
Parsing done. [Time taken: 0:00:05.008059]

Parsing wrd9188.txt
Parsing file: ../dataset/Windows\wrd9188.txt
Total size after encoding is 49972 50000
Parsing done. [Time taken: 0:00:04.605386]

Parsing wrd9189.txt
Parsing file: ../dataset/Windows\wrd9189.txt
Total size after encoding is 49945 50000
Parsing done. [Time taken: 0:00:04.660787]

Parsing wrd9190.txt
Parsing file: ../dataset/Windows\wrd9190.txt
Total size after encoding is 49908 50000
Parsing done. [Time taken: 0:00:05.564069]

Parsing wrd9191.txt
Parsing file: ../dataset/Windows\wrd9191.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.990674]

Parsing wrd9192.txt
Parsing file: ../dataset/Windows\wrd9192.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.670944]

Parsing wrd9193.txt
Parsing file: ../dataset/Windows\wrd9193.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.937958]

Parsing wrd9194

Total size after encoding is 49987 50000
Parsing done. [Time taken: 0:00:04.751094]

Parsing wrd9243.txt
Parsing file: ../dataset/Windows\wrd9243.txt
Total size after encoding is 49998 50000
Parsing done. [Time taken: 0:00:05.446218]

Parsing wrd9244.txt
Parsing file: ../dataset/Windows\wrd9244.txt
Total size after encoding is 49980 50000
Parsing done. [Time taken: 0:00:04.864510]

Parsing wrd9245.txt
Parsing file: ../dataset/Windows\wrd9245.txt
Total size after encoding is 49970 50000
Parsing done. [Time taken: 0:00:05.239984]

Parsing wrd9246.txt
Parsing file: ../dataset/Windows\wrd9246.txt
Total size after encoding is 46113 50000
Parsing done. [Time taken: 0:00:04.838814]

Parsing wrd9247.txt
Parsing file: ../dataset/Windows\wrd9247.txt
Total size after encoding is 47588 50000
Parsing done. [Time taken: 0:00:05.087821]

Parsing wrd9248.txt
Parsing file: ../dataset/Windows\wrd9248.txt
Total size after encoding is 49986 50000
Parsing done. [Time taken: 0:00:05.216145]

Parsing wrd9249

Total size after encoding is 49987 50000
Parsing done. [Time taken: 0:00:04.981762]

Parsing wrd9298.txt
Parsing file: ../dataset/Windows\wrd9298.txt
Total size after encoding is 49986 50000
Parsing done. [Time taken: 0:00:04.806186]

Parsing wrd9299.txt
Parsing file: ../dataset/Windows\wrd9299.txt
Total size after encoding is 49971 50000
Parsing done. [Time taken: 0:00:04.911077]

Parsing wrd9300.txt
Parsing file: ../dataset/Windows\wrd9300.txt
Total size after encoding is 49936 50000
Parsing done. [Time taken: 0:00:05.522946]

Parsing wrd9301.txt
Parsing file: ../dataset/Windows\wrd9301.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.766491]

Parsing wrd9302.txt
Parsing file: ../dataset/Windows\wrd9302.txt
Total size after encoding is 49978 50000
Parsing done. [Time taken: 0:00:05.108685]

Parsing wrd9303.txt
Parsing file: ../dataset/Windows\wrd9303.txt
Total size after encoding is 49990 50000
Parsing done. [Time taken: 0:00:04.594316]

Parsing wrd9304

Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.148148]

Parsing wrd9353.txt
Parsing file: ../dataset/Windows\wrd9353.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.945142]

Parsing wrd9354.txt
Parsing file: ../dataset/Windows\wrd9354.txt
Total size after encoding is 49997 50000
Parsing done. [Time taken: 0:00:05.335327]

Parsing wrd9355.txt
Parsing file: ../dataset/Windows\wrd9355.txt
Total size after encoding is 49999 50000
Parsing done. [Time taken: 0:00:05.574002]

Parsing wrd9356.txt
Parsing file: ../dataset/Windows\wrd9356.txt
Total size after encoding is 49991 50000
Parsing done. [Time taken: 0:00:05.072590]

Parsing wrd9357.txt
Parsing file: ../dataset/Windows\wrd9357.txt
Total size after encoding is 49998 50000
Parsing done. [Time taken: 0:00:05.004392]

Parsing wrd9358.txt
Parsing file: ../dataset/Windows\wrd9358.txt
Total size after encoding is 49993 50000
Parsing done. [Time taken: 0:00:05.030378]

Parsing wrd9359

Total size after encoding is 49976 50000
Parsing done. [Time taken: 0:00:04.941323]

Parsing wrd9408.txt
Parsing file: ../dataset/Windows\wrd9408.txt
Total size after encoding is 49984 50000
Parsing done. [Time taken: 0:00:04.699260]

Parsing wrd9409.txt
Parsing file: ../dataset/Windows\wrd9409.txt
Total size after encoding is 49979 50000
Parsing done. [Time taken: 0:00:04.562957]

Parsing wrd9410.txt
Parsing file: ../dataset/Windows\wrd9410.txt
Total size after encoding is 49988 50000
Parsing done. [Time taken: 0:00:04.622568]

Parsing wrd9411.txt
Parsing file: ../dataset/Windows\wrd9411.txt
Total size after encoding is 49984 50000
Parsing done. [Time taken: 0:00:05.088148]

Parsing wrd9412.txt
Parsing file: ../dataset/Windows\wrd9412.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.937749]

Parsing wrd9413.txt
Parsing file: ../dataset/Windows\wrd9413.txt
Total size after encoding is 49982 50000
Parsing done. [Time taken: 0:00:05.171552]

Parsing wrd9414

Total size after encoding is 49636 50000
Parsing done. [Time taken: 0:00:05.843170]

Parsing wrd9463.txt
Parsing file: ../dataset/Windows\wrd9463.txt
Total size after encoding is 49996 50000
Parsing done. [Time taken: 0:00:05.190053]

Parsing wrd9464.txt
Parsing file: ../dataset/Windows\wrd9464.txt
Total size after encoding is 49993 50000
Parsing done. [Time taken: 0:00:04.890581]

Parsing wrd9465.txt
Parsing file: ../dataset/Windows\wrd9465.txt
Total size after encoding is 49971 50000
Parsing done. [Time taken: 0:00:05.189472]

Parsing wrd9466.txt
Parsing file: ../dataset/Windows\wrd9466.txt
Total size after encoding is 49979 50000
Parsing done. [Time taken: 0:00:05.219202]

Parsing wrd9467.txt
Parsing file: ../dataset/Windows\wrd9467.txt
Total size after encoding is 49970 50000
Parsing done. [Time taken: 0:00:04.960459]

Parsing wrd9468.txt
Parsing file: ../dataset/Windows\wrd9468.txt
Total size after encoding is 49993 50000
Parsing done. [Time taken: 0:00:04.937580]

Parsing wrd9469

Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.287348]

Parsing wrd9518.txt
Parsing file: ../dataset/Windows\wrd9518.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.967106]

Parsing wrd9519.txt
Parsing file: ../dataset/Windows\wrd9519.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.002733]

Parsing wrd9520.txt
Parsing file: ../dataset/Windows\wrd9520.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.370924]

Parsing wrd9521.txt
Parsing file: ../dataset/Windows\wrd9521.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.923291]

Parsing wrd9522.txt
Parsing file: ../dataset/Windows\wrd9522.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.898365]

Parsing wrd9523.txt
Parsing file: ../dataset/Windows\wrd9523.txt
Total size after encoding is 49930 50000
Parsing done. [Time taken: 0:00:05.360993]

Parsing wrd9524

Total size after encoding is 49980 50000
Parsing done. [Time taken: 0:00:04.672447]

Parsing wrd9573.txt
Parsing file: ../dataset/Windows\wrd9573.txt
Total size after encoding is 49991 50000
Parsing done. [Time taken: 0:00:04.674086]

Parsing wrd9574.txt
Parsing file: ../dataset/Windows\wrd9574.txt
Total size after encoding is 49982 50000
Parsing done. [Time taken: 0:00:05.030890]

Parsing wrd9575.txt
Parsing file: ../dataset/Windows\wrd9575.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.675116]

Parsing wrd9576.txt
Parsing file: ../dataset/Windows\wrd9576.txt
Total size after encoding is 49992 50000
Parsing done. [Time taken: 0:00:05.533786]

Parsing wrd9577.txt
Parsing file: ../dataset/Windows\wrd9577.txt
Total size after encoding is 49725 50006
Parsing done. [Time taken: 0:00:05.029641]

Parsing wrd9578.txt
Parsing file: ../dataset/Windows\wrd9578.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.128468]

Parsing wrd9579

Total size after encoding is 49990 50000
Parsing done. [Time taken: 0:00:05.533027]

Parsing wrd9628.txt
Parsing file: ../dataset/Windows\wrd9628.txt
Total size after encoding is 49982 50000
Parsing done. [Time taken: 0:00:05.353825]

Parsing wrd9629.txt
Parsing file: ../dataset/Windows\wrd9629.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.367786]

Parsing wrd9630.txt
Parsing file: ../dataset/Windows\wrd9630.txt
Total size after encoding is 49992 50000
Parsing done. [Time taken: 0:00:05.288997]

Parsing wrd9631.txt
Parsing file: ../dataset/Windows\wrd9631.txt
Total size after encoding is 49968 50000
Parsing done. [Time taken: 0:00:05.218391]

Parsing wrd9632.txt
Parsing file: ../dataset/Windows\wrd9632.txt
Total size after encoding is 49997 50000
Parsing done. [Time taken: 0:00:05.506648]

Parsing wrd9633.txt
Parsing file: ../dataset/Windows\wrd9633.txt
Total size after encoding is 49996 50000
Parsing done. [Time taken: 0:00:05.198257]

Parsing wrd9634

Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.794483]

Parsing wrd9683.txt
Parsing file: ../dataset/Windows\wrd9683.txt
Total size after encoding is 49992 50000
Parsing done. [Time taken: 0:00:04.809992]

Parsing wrd9684.txt
Parsing file: ../dataset/Windows\wrd9684.txt
Total size after encoding is 49983 50000
Parsing done. [Time taken: 0:00:05.197859]

Parsing wrd9685.txt
Parsing file: ../dataset/Windows\wrd9685.txt
Total size after encoding is 49926 50002
Parsing done. [Time taken: 0:00:05.286659]

Parsing wrd9686.txt
Parsing file: ../dataset/Windows\wrd9686.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.157419]

Parsing wrd9687.txt
Parsing file: ../dataset/Windows\wrd9687.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.123351]

Parsing wrd9688.txt
Parsing file: ../dataset/Windows\wrd9688.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.049002]

Parsing wrd9689

Total size after encoding is 49900 50000
Parsing done. [Time taken: 0:00:04.472268]

Parsing wrd9738.txt
Parsing file: ../dataset/Windows\wrd9738.txt
Total size after encoding is 49934 50000
Parsing done. [Time taken: 0:00:05.672245]

Parsing wrd9739.txt
Parsing file: ../dataset/Windows\wrd9739.txt
Total size after encoding is 49996 50000
Parsing done. [Time taken: 0:00:04.843276]

Parsing wrd9740.txt
Parsing file: ../dataset/Windows\wrd9740.txt
Total size after encoding is 49996 50000
Parsing done. [Time taken: 0:00:04.911475]

Parsing wrd9741.txt
Parsing file: ../dataset/Windows\wrd9741.txt
Total size after encoding is 49993 50000
Parsing done. [Time taken: 0:00:05.165906]

Parsing wrd9742.txt
Parsing file: ../dataset/Windows\wrd9742.txt
Total size after encoding is 49974 50000
Parsing done. [Time taken: 0:00:05.196009]

Parsing wrd9743.txt
Parsing file: ../dataset/Windows\wrd9743.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.266193]

Parsing wrd9744

Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.923511]

Parsing wrd9793.txt
Parsing file: ../dataset/Windows\wrd9793.txt
Total size after encoding is 49918 50000
Parsing done. [Time taken: 0:00:05.022675]

Parsing wrd9794.txt
Parsing file: ../dataset/Windows\wrd9794.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.307056]

Parsing wrd9795.txt
Parsing file: ../dataset/Windows\wrd9795.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.118645]

Parsing wrd9796.txt
Parsing file: ../dataset/Windows\wrd9796.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.036345]

Parsing wrd9797.txt
Parsing file: ../dataset/Windows\wrd9797.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.283215]

Parsing wrd9798.txt
Parsing file: ../dataset/Windows\wrd9798.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.048985]

Parsing wrd9799

Total size after encoding is 49934 50000
Parsing done. [Time taken: 0:00:05.573754]

Parsing wrd9848.txt
Parsing file: ../dataset/Windows\wrd9848.txt
Total size after encoding is 49996 50000
Parsing done. [Time taken: 0:00:05.268702]

Parsing wrd9849.txt
Parsing file: ../dataset/Windows\wrd9849.txt
Total size after encoding is 49996 50000
Parsing done. [Time taken: 0:00:04.873260]

Parsing wrd9850.txt
Parsing file: ../dataset/Windows\wrd9850.txt
Total size after encoding is 49993 50000
Parsing done. [Time taken: 0:00:05.282337]

Parsing wrd9851.txt
Parsing file: ../dataset/Windows\wrd9851.txt
Total size after encoding is 49975 50000
Parsing done. [Time taken: 0:00:04.846776]

Parsing wrd9852.txt
Parsing file: ../dataset/Windows\wrd9852.txt
Total size after encoding is 49999 50000
Parsing done. [Time taken: 0:00:05.015623]

Parsing wrd9853.txt
Parsing file: ../dataset/Windows\wrd9853.txt
Total size after encoding is 49986 50000
Parsing done. [Time taken: 0:00:05.302625]

Parsing wrd9854

Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.313728]

Parsing wrd990003.txt
Parsing file: ../dataset/Windows\wrd990003.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.173612]

Parsing wrd990004.txt
Parsing file: ../dataset/Windows\wrd990004.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.303193]

Parsing wrd990005.txt
Parsing file: ../dataset/Windows\wrd990005.txt
Total size after encoding is 49928 50000
Parsing done. [Time taken: 0:00:05.005823]

Parsing wrd990006.txt
Parsing file: ../dataset/Windows\wrd990006.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.187531]

Parsing wrd990007.txt
Parsing file: ../dataset/Windows\wrd990007.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.276667]

Parsing wrd990008.txt
Parsing file: ../dataset/Windows\wrd990008.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.

Parsing done. [Time taken: 0:00:04.971427]

Parsing wrd990056.txt
Parsing file: ../dataset/Windows\wrd990056.txt
Total size after encoding is 49979 50000
Parsing done. [Time taken: 0:00:05.020135]

Parsing wrd990057.txt
Parsing file: ../dataset/Windows\wrd990057.txt
Total size after encoding is 49999 50000
Parsing done. [Time taken: 0:00:04.736812]

Parsing wrd990058.txt
Parsing file: ../dataset/Windows\wrd990058.txt
Total size after encoding is 49998 50000
Parsing done. [Time taken: 0:00:05.174916]

Parsing wrd990059.txt
Parsing file: ../dataset/Windows\wrd990059.txt
Total size after encoding is 49985 50000
Parsing done. [Time taken: 0:00:05.279023]

Parsing wrd990060.txt
Parsing file: ../dataset/Windows\wrd990060.txt
Total size after encoding is 49994 50000
Parsing done. [Time taken: 0:00:05.304856]

Parsing wrd990061.txt
Parsing file: ../dataset/Windows\wrd990061.txt
Total size after encoding is 49987 50000
Parsing done. [Time taken: 0:00:05.461220]

Parsing wrd990062.txt
Parsing fi

Parsing done. [Time taken: 0:00:05.241428]

Parsing wrd990109.txt
Parsing file: ../dataset/Windows\wrd990109.txt
Total size after encoding is 49996 50000
Parsing done. [Time taken: 0:00:05.149527]

Parsing wrd990110.txt
Parsing file: ../dataset/Windows\wrd990110.txt
Total size after encoding is 49996 50000
Parsing done. [Time taken: 0:00:05.112679]

Parsing wrd990111.txt
Parsing file: ../dataset/Windows\wrd990111.txt
Total size after encoding is 49996 50000
Parsing done. [Time taken: 0:00:06.087688]

Parsing wrd990112.txt
Parsing file: ../dataset/Windows\wrd990112.txt
Total size after encoding is 49994 50000
Parsing done. [Time taken: 0:00:05.046996]

Parsing wrd990113.txt
Parsing file: ../dataset/Windows\wrd990113.txt
Total size after encoding is 49980 50000
Parsing done. [Time taken: 0:00:04.689165]

Parsing wrd990114.txt
Parsing file: ../dataset/Windows\wrd990114.txt
Total size after encoding is 49992 50000
Parsing done. [Time taken: 0:00:05.109132]

Parsing wrd990115.txt
Parsing fi

Parsing done. [Time taken: 0:00:05.002264]

Parsing wrd990162.txt
Parsing file: ../dataset/Windows\wrd990162.txt
Total size after encoding is 49984 50000
Parsing done. [Time taken: 0:00:05.256225]

Parsing wrd990163.txt
Parsing file: ../dataset/Windows\wrd990163.txt
Total size after encoding is 49993 50000
Parsing done. [Time taken: 0:00:04.881864]

Parsing wrd990164.txt
Parsing file: ../dataset/Windows\wrd990164.txt
Total size after encoding is 49973 50000
Parsing done. [Time taken: 0:00:05.265390]

Parsing wrd990165.txt
Parsing file: ../dataset/Windows\wrd990165.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.013834]

Parsing wrd990166.txt
Parsing file: ../dataset/Windows\wrd990166.txt
Total size after encoding is 49994 50000
Parsing done. [Time taken: 0:00:04.815394]

Parsing wrd990167.txt
Parsing file: ../dataset/Windows\wrd990167.txt
Total size after encoding is 49979 50000
Parsing done. [Time taken: 0:00:05.438831]

Parsing wrd990168.txt
Parsing fi

Parsing done. [Time taken: 0:00:04.533649]

Parsing wrd990215.txt
Parsing file: ../dataset/Windows\wrd990215.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.826974]

Parsing wrd990216.txt
Parsing file: ../dataset/Windows\wrd990216.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.977833]

Parsing wrd990217.txt
Parsing file: ../dataset/Windows\wrd990217.txt
Total size after encoding is 49928 50000
Parsing done. [Time taken: 0:00:05.260681]

Parsing wrd990218.txt
Parsing file: ../dataset/Windows\wrd990218.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.872025]

Parsing wrd990219.txt
Parsing file: ../dataset/Windows\wrd990219.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.631195]

Parsing wrd990220.txt
Parsing file: ../dataset/Windows\wrd990220.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.517275]

Parsing wrd990221.txt
Parsing fi

Parsing done. [Time taken: 0:00:05.096951]

Parsing wrd990268.txt
Parsing file: ../dataset/Windows\wrd990268.txt
Total size after encoding is 49997 50000
Parsing done. [Time taken: 0:00:05.049081]

Parsing wrd990269.txt
Parsing file: ../dataset/Windows\wrd990269.txt
Total size after encoding is 49994 50000
Parsing done. [Time taken: 0:00:05.123762]

Parsing wrd990270.txt
Parsing file: ../dataset/Windows\wrd990270.txt
Total size after encoding is 49999 50000
Parsing done. [Time taken: 0:00:05.060684]

Parsing wrd990271.txt
Parsing file: ../dataset/Windows\wrd990271.txt
Total size after encoding is 49980 50000
Parsing done. [Time taken: 0:00:04.745404]

Parsing wrd990272.txt
Parsing file: ../dataset/Windows\wrd990272.txt
Total size after encoding is 49993 50000
Parsing done. [Time taken: 0:00:05.180013]

Parsing wrd990273.txt
Parsing file: ../dataset/Windows\wrd990273.txt
Total size after encoding is 49988 50000
Parsing done. [Time taken: 0:00:04.690042]

Parsing wrd990274.txt
Parsing fi

Parsing done. [Time taken: 0:00:04.942145]

Parsing wrd990321.txt
Parsing file: ../dataset/Windows\wrd990321.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.659967]

Parsing wrd990322.txt
Parsing file: ../dataset/Windows\wrd990322.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.979831]

Parsing wrd990323.txt
Parsing file: ../dataset/Windows\wrd990323.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.908963]

Parsing wrd990324.txt
Parsing file: ../dataset/Windows\wrd990324.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.047409]

Parsing wrd990325.txt
Parsing file: ../dataset/Windows\wrd990325.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.612122]

Parsing wrd990326.txt
Parsing file: ../dataset/Windows\wrd990326.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.121583]

Parsing wrd990327.txt
Parsing fi

Parsing done. [Time taken: 0:00:04.536189]

Parsing wrd990374.txt
Parsing file: ../dataset/Windows\wrd990374.txt
Total size after encoding is 49992 50000
Parsing done. [Time taken: 0:00:04.863722]

Parsing wrd990375.txt
Parsing file: ../dataset/Windows\wrd990375.txt
Total size after encoding is 49815 50000
Parsing done. [Time taken: 0:00:04.767665]

Parsing wrd990376.txt
Parsing file: ../dataset/Windows\wrd990376.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.308228]

Parsing wrd990377.txt
Parsing file: ../dataset/Windows\wrd990377.txt
Total size after encoding is 49996 50000
Parsing done. [Time taken: 0:00:05.022169]

Parsing wrd990378.txt
Parsing file: ../dataset/Windows\wrd990378.txt
Total size after encoding is 49995 50000
Parsing done. [Time taken: 0:00:05.065871]

Parsing wrd990379.txt
Parsing file: ../dataset/Windows\wrd990379.txt
Total size after encoding is 49994 50000
Parsing done. [Time taken: 0:00:04.898062]

Parsing wrd990380.txt
Parsing fi

Parsing done. [Time taken: 0:00:05.032392]

Parsing wrd990427.txt
Parsing file: ../dataset/Windows\wrd990427.txt
Total size after encoding is 49994 50000
Parsing done. [Time taken: 0:00:04.959169]

Parsing wrd990428.txt
Parsing file: ../dataset/Windows\wrd990428.txt
Total size after encoding is 49998 50000
Parsing done. [Time taken: 0:00:05.174890]

Parsing wrd990429.txt
Parsing file: ../dataset/Windows\wrd990429.txt
Total size after encoding is 49981 50000
Parsing done. [Time taken: 0:00:05.079299]

Parsing wrd990430.txt
Parsing file: ../dataset/Windows\wrd990430.txt
Total size after encoding is 49993 50000
Parsing done. [Time taken: 0:00:05.095949]

Parsing wrd990431.txt
Parsing file: ../dataset/Windows\wrd990431.txt
Total size after encoding is 49985 50000
Parsing done. [Time taken: 0:00:05.287329]

Parsing wrd990432.txt
Parsing file: ../dataset/Windows\wrd990432.txt
Total size after encoding is 49988 50000
Parsing done. [Time taken: 0:00:05.340408]

Parsing wrd990433.txt
Parsing fi

Parsing done. [Time taken: 0:00:05.109651]

Parsing wrd990480.txt
Parsing file: ../dataset/Windows\wrd990480.txt
Total size after encoding is 49993 50000
Parsing done. [Time taken: 0:00:04.817156]

Parsing wrd990481.txt
Parsing file: ../dataset/Windows\wrd990481.txt
Total size after encoding is 49970 50000
Parsing done. [Time taken: 0:00:05.148590]

Parsing wrd990482.txt
Parsing file: ../dataset/Windows\wrd990482.txt
Total size after encoding is 49985 50000
Parsing done. [Time taken: 0:00:05.027160]

Parsing wrd990483.txt
Parsing file: ../dataset/Windows\wrd990483.txt
Total size after encoding is 49995 50000
Parsing done. [Time taken: 0:00:05.246426]

Parsing wrd990484.txt
Parsing file: ../dataset/Windows\wrd990484.txt
Total size after encoding is 49997 50000
Parsing done. [Time taken: 0:00:04.679437]

Parsing wrd990485.txt
Parsing file: ../dataset/Windows\wrd990485.txt
Total size after encoding is 49999 50000
Parsing done. [Time taken: 0:00:05.062706]

Parsing wrd990486.txt
Parsing fi

Parsing done. [Time taken: 0:00:04.854809]

Parsing wrd990533.txt
Parsing file: ../dataset/Windows\wrd990533.txt
Total size after encoding is 49980 50000
Parsing done. [Time taken: 0:00:04.523075]

Parsing wrd990534.txt
Parsing file: ../dataset/Windows\wrd990534.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.213546]

Parsing wrd990535.txt
Parsing file: ../dataset/Windows\wrd990535.txt
Total size after encoding is 49991 50000
Parsing done. [Time taken: 0:00:04.558805]

Parsing wrd990536.txt
Parsing file: ../dataset/Windows\wrd990536.txt
Total size after encoding is 49981 50000
Parsing done. [Time taken: 0:00:05.029518]

Parsing wrd990537.txt
Parsing file: ../dataset/Windows\wrd990537.txt
Total size after encoding is 49981 50000
Parsing done. [Time taken: 0:00:05.144566]

Parsing wrd990538.txt
Parsing file: ../dataset/Windows\wrd990538.txt
Total size after encoding is 49991 50000
Parsing done. [Time taken: 0:00:04.979494]

Parsing wrd990539.txt
Parsing fi

Parsing done. [Time taken: 0:00:04.934435]

Parsing wrd990586.txt
Parsing file: ../dataset/Windows\wrd990586.txt
Total size after encoding is 49990 50000
Parsing done. [Time taken: 0:00:05.495911]

Parsing wrd990587.txt
Parsing file: ../dataset/Windows\wrd990587.txt
Total size after encoding is 49980 50000
Parsing done. [Time taken: 0:00:05.045394]

Parsing wrd990588.txt
Parsing file: ../dataset/Windows\wrd990588.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.899710]

Parsing wrd990589.txt
Parsing file: ../dataset/Windows\wrd990589.txt
Total size after encoding is 49992 50000
Parsing done. [Time taken: 0:00:05.342101]

Parsing wrd990590.txt
Parsing file: ../dataset/Windows\wrd990590.txt
Total size after encoding is 49981 50000
Parsing done. [Time taken: 0:00:05.250585]

Parsing wrd990591.txt
Parsing file: ../dataset/Windows\wrd990591.txt
Total size after encoding is 49994 50000
Parsing done. [Time taken: 0:00:04.921063]

Parsing wrd990592.txt
Parsing fi

Parsing done. [Time taken: 0:00:05.220815]

Parsing wrd990639.txt
Parsing file: ../dataset/Windows\wrd990639.txt
Total size after encoding is 49997 50000
Parsing done. [Time taken: 0:00:04.824753]

Parsing wrd990640.txt
Parsing file: ../dataset/Windows\wrd990640.txt
Total size after encoding is 49978 50000
Parsing done. [Time taken: 0:00:04.805095]

Parsing wrd990641.txt
Parsing file: ../dataset/Windows\wrd990641.txt
Total size after encoding is 49994 50000
Parsing done. [Time taken: 0:00:04.585738]

Parsing wrd990642.txt
Parsing file: ../dataset/Windows\wrd990642.txt
Total size after encoding is 49990 50000
Parsing done. [Time taken: 0:00:04.554496]

Parsing wrd990643.txt
Parsing file: ../dataset/Windows\wrd990643.txt
Total size after encoding is 49983 50000
Parsing done. [Time taken: 0:00:05.179900]

Parsing wrd990644.txt
Parsing file: ../dataset/Windows\wrd990644.txt
Total size after encoding is 49993 50000
Parsing done. [Time taken: 0:00:04.987866]

Parsing wrd990645.txt
Parsing fi

Parsing done. [Time taken: 0:00:05.088633]

Parsing wrd990692.txt
Parsing file: ../dataset/Windows\wrd990692.txt
Total size after encoding is 49988 50000
Parsing done. [Time taken: 0:00:05.015464]

Parsing wrd990693.txt
Parsing file: ../dataset/Windows\wrd990693.txt
Total size after encoding is 49982 50000
Parsing done. [Time taken: 0:00:05.059700]

Parsing wrd990694.txt
Parsing file: ../dataset/Windows\wrd990694.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.741527]

Parsing wrd990695.txt
Parsing file: ../dataset/Windows\wrd990695.txt
Total size after encoding is 49987 50000
Parsing done. [Time taken: 0:00:05.284361]

Parsing wrd990696.txt
Parsing file: ../dataset/Windows\wrd990696.txt
Total size after encoding is 49938 50000
Parsing done. [Time taken: 0:00:05.021907]

Parsing wrd990697.txt
Parsing file: ../dataset/Windows\wrd990697.txt
Total size after encoding is 49580 50000
Parsing done. [Time taken: 0:00:05.851100]

Parsing wrd990698.txt
Parsing fi

Parsing done. [Time taken: 0:00:04.911502]

Parsing wrd990745.txt
Parsing file: ../dataset/Windows\wrd990745.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.977015]

Parsing wrd990746.txt
Parsing file: ../dataset/Windows\wrd990746.txt
Total size after encoding is 49973 50000
Parsing done. [Time taken: 0:00:04.571484]

Parsing wrd990747.txt
Parsing file: ../dataset/Windows\wrd990747.txt
Total size after encoding is 49994 50000
Parsing done. [Time taken: 0:00:04.653183]

Parsing wrd990748.txt
Parsing file: ../dataset/Windows\wrd990748.txt
Total size after encoding is 49972 50000
Parsing done. [Time taken: 0:00:04.902839]

Parsing wrd990749.txt
Parsing file: ../dataset/Windows\wrd990749.txt
Total size after encoding is 49993 50000
Parsing done. [Time taken: 0:00:05.551072]

Parsing wrd990750.txt
Parsing file: ../dataset/Windows\wrd990750.txt
Total size after encoding is 49972 50000
Parsing done. [Time taken: 0:00:04.657392]

Parsing wrd990751.txt
Parsing fi

Parsing done. [Time taken: 0:00:05.760319]

Parsing wrd990798.txt
Parsing file: ../dataset/Windows\wrd990798.txt
Total size after encoding is 49729 50000
Parsing done. [Time taken: 0:00:05.143948]

Parsing wrd990799.txt
Parsing file: ../dataset/Windows\wrd990799.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.898351]

Parsing wrd990800.txt
Parsing file: ../dataset/Windows\wrd990800.txt
Total size after encoding is 49992 50000
Parsing done. [Time taken: 0:00:04.890269]

Parsing wrd990801.txt
Parsing file: ../dataset/Windows\wrd990801.txt
Total size after encoding is 49993 50000
Parsing done. [Time taken: 0:00:05.510044]

Parsing wrd990802.txt
Parsing file: ../dataset/Windows\wrd990802.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.833246]

Parsing wrd990803.txt
Parsing file: ../dataset/Windows\wrd990803.txt
Total size after encoding is 49988 50000
Parsing done. [Time taken: 0:00:05.374394]

Parsing wrd990804.txt
Parsing fi

Parsing done. [Time taken: 0:00:05.321885]

Parsing wrd990851.txt
Parsing file: ../dataset/Windows\wrd990851.txt
Total size after encoding is 49975 50000
Parsing done. [Time taken: 0:00:04.853291]

Parsing wrd990852.txt
Parsing file: ../dataset/Windows\wrd990852.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.086241]

Parsing wrd990853.txt
Parsing file: ../dataset/Windows\wrd990853.txt
Total size after encoding is 49994 50000
Parsing done. [Time taken: 0:00:04.974001]

Parsing wrd990854.txt
Parsing file: ../dataset/Windows\wrd990854.txt
Total size after encoding is 49977 50000
Parsing done. [Time taken: 0:00:05.199968]

Parsing wrd990855.txt
Parsing file: ../dataset/Windows\wrd990855.txt
Total size after encoding is 49978 50000
Parsing done. [Time taken: 0:00:05.273671]

Parsing wrd990856.txt
Parsing file: ../dataset/Windows\wrd990856.txt
Total size after encoding is 49989 50000
Parsing done. [Time taken: 0:00:05.129723]

Parsing wrd990857.txt
Parsing fi

Parsing done. [Time taken: 0:00:04.596998]

Parsing wrd990904.txt
Parsing file: ../dataset/Windows\wrd990904.txt
Total size after encoding is 49981 50000
Parsing done. [Time taken: 0:00:04.892352]

Parsing wrd990905.txt
Parsing file: ../dataset/Windows\wrd990905.txt
Total size after encoding is 49988 50000
Parsing done. [Time taken: 0:00:04.875658]

Parsing wrd990906.txt
Parsing file: ../dataset/Windows\wrd990906.txt
Total size after encoding is 49991 50000
Parsing done. [Time taken: 0:00:04.853336]

Parsing wrd990907.txt
Parsing file: ../dataset/Windows\wrd990907.txt
Total size after encoding is 49982 50000
Parsing done. [Time taken: 0:00:05.057469]

Parsing wrd990908.txt
Parsing file: ../dataset/Windows\wrd990908.txt
Total size after encoding is 49993 50000
Parsing done. [Time taken: 0:00:05.031531]

Parsing wrd990909.txt
Parsing file: ../dataset/Windows\wrd990909.txt
Total size after encoding is 49967 50000
Parsing done. [Time taken: 0:00:05.815487]

Parsing wrd990910.txt
Parsing fi

Parsing done. [Time taken: 0:00:04.599830]

Parsing wrd990957.txt
Parsing file: ../dataset/Windows\wrd990957.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:05.187400]

Parsing wrd990958.txt
Parsing file: ../dataset/Windows\wrd990958.txt
Total size after encoding is 49991 50000
Parsing done. [Time taken: 0:00:05.102539]

Parsing wrd990959.txt
Parsing file: ../dataset/Windows\wrd990959.txt
Total size after encoding is 49496 50237
Parsing done. [Time taken: 0:00:05.342623]

Parsing wrd990960.txt
Parsing file: ../dataset/Windows\wrd990960.txt
Total size after encoding is 49992 50000
Parsing done. [Time taken: 0:00:05.458838]

Parsing wrd990961.txt
Parsing file: ../dataset/Windows\wrd990961.txt
Total size after encoding is 49992 50000
Parsing done. [Time taken: 0:00:04.659842]

Parsing wrd990962.txt
Parsing file: ../dataset/Windows\wrd990962.txt
Total size after encoding is 50000 50000
Parsing done. [Time taken: 0:00:04.894298]

Parsing wrd990963.txt
Parsing fi

Parsing done. [Time taken: 0:00:05.740981]

Parsing wrd991010.txt
Parsing file: ../dataset/Windows\wrd991010.txt
Total size after encoding is 49990 50000
Parsing done. [Time taken: 0:00:06.947188]

Parsing wrd991011.txt
Parsing file: ../dataset/Windows\wrd991011.txt
Total size after encoding is 49978 50000
Parsing done. [Time taken: 0:00:06.436077]

Parsing wrd991012.txt
Parsing file: ../dataset/Windows\wrd991012.txt
Total size after encoding is 49983 50000
Parsing done. [Time taken: 0:00:05.569365]

Parsing wrd991013.txt
Parsing file: ../dataset/Windows\wrd991013.txt


In [None]:
# Split the windowed sessions in `deeplog_df` into train / test-normal / test-abnormal
# files, each written by deeplog_file_generator using the "EventId" feature.
#
# NOTE(review): `deeplog_df` is only produced by the disabled sliding_window call below,
# and `train_ratio` / `output_dir` are not set in this cell — they must already exist
# from an earlier cell for this one to run. Confirm with Restart Kernel -> Run All.
#
# sampling with sliding window (disabled)
# NOTE(review): the column list below is reconstructed; the original comment carried a
# stray `<Date> <Time>, <Type> <Admin> <Content>` fragment inside it — verify the
# intended columns before re-enabling.
#deeplog_df = sliding_window(df[["timestamp", "Label", "EventId", "deltaT"]],
                            #para={"window_size": float(window_size)*60, "step_size": float(step_size) * 60}
                            #)
#output_dir += window_name

#########
# Train #
#########
# Rows with Label == 0 are shuffled with a fixed seed so the split is reproducible.
df_normal = deeplog_df[deeplog_df["Label"] == 0]
df_normal = df_normal.sample(frac=1, random_state=12).reset_index(drop=True) #shuffle
normal_len = len(df_normal)
# train_ratio >= 1 is taken as an absolute number of rows, otherwise as a fraction.
train_len = int(train_ratio) if train_ratio >= 1 else int(normal_len * train_ratio)
# Never ask for more rows than exist; otherwise the sizes reported below would be
# wrong (overstated training size, negative test-normal size).
train_len = min(train_len, normal_len)

train = df_normal[:train_len]
deeplog_file_generator(os.path.join(output_dir,'train'), train, ["EventId"])
# Report the number of rows actually written rather than the requested count.
print("training size {}".format(len(train)))


###############
# Test Normal #
###############
# Everything in the shuffled Label == 0 frame that was not used for training.
test_normal = df_normal[train_len:]
deeplog_file_generator(os.path.join(output_dir, 'test_normal'), test_normal, ["EventId"])
print("test normal size {}".format(len(test_normal)))


#################
# Test Abnormal #
#################
# All rows with Label == 1 go to the abnormal test file (none are used for training).
df_abnormal = deeplog_df[deeplog_df["Label"] == 1]
deeplog_file_generator(os.path.join(output_dir,'test_abnormal'), df_abnormal, ["EventId"])
print('test abnormal size {}'.format(len(df_abnormal)))


In [None]:
# Merge every Excel workbook (*.xlsx) found in `path` into one workbook.
# NOTE(review): hardcoded absolute local path — consider moving it to a configurable
# directory constant so the notebook runs on other machines.
path = "C:/downloads"

# Excel files in the path. Sorted so the row order of the merged output is
# deterministic (glob.glob returns matches in arbitrary order).
file_list = sorted(glob.glob(path + "/*.xlsx"))

# list of excel files we want to merge.
# pd.read_excel(file_path) reads the excel
# data into pandas dataframe.
excl_list = [pd.read_excel(file) for file in file_list]

# Build the merged dataframe with a single concat: DataFrame.append was removed in
# pandas 2.0, and appending inside a loop re-copies the accumulated frame every time.
# pd.concat raises on an empty list, so fall back to an empty dataframe when no
# workbook was found (the same result the append loop produced in that case).
excl_merged = pd.concat(excl_list, ignore_index=True) if excl_list else pd.DataFrame()

# exports the dataframe into excel file with
# specified name.
excl_merged.to_excel('total_food_sales.xlsx', index=False)