In [1]:
from drain import model_v1, evaluator
from drain.optimizer import Optimizer
from drain import Drain
import pandas as pd
import os
import sys
sys.path.append('../')

In [6]:
benchmark_settings = {
    'HDFS': {
        'log_file': 'HDFS/HDFS_2k.log',
        'log_format': '<Date> <Time> <Pid> <Level> <Component>: <Content>',
        'regex': [r'blk_-?\d+', r'(\d+\.){3}\d+(:\d+)?'],
        'st': 0.8,
        'depth': 5
        },

    'Hadoop': {
        'log_file': 'Hadoop/Hadoop_2k.log',
        'log_format': '<Date> <Time> <Level> \[<Process>\] <Component>: <Content>',
        'removeCol':[0,1],
        'regex': [r'(\d+\.){3}\d+'],
        'st': 0.5,
        'depth': 4        
        },

    'Spark': {
        'log_file': 'Spark/Spark_2k.log',
        'log_format': '<Date> <Time> <Level> <Component>: <Content>',
        'removeCol': [0, 1],
        'regex': [r'(\d+\.){3}\d+', r'\b[KGTM]?B\b', r'([\w-]+\.){2,}[\w-]+'],
        'st': 0.5,
        'depth': 4
        },

    'Zookeeper': {
        'log_file': 'Zookeeper/Zookeeper_2k.log',
        'log_format': '<Date> <Time> - <Level>  \[<Node>:<Component>@<Id>\] - <Content>',
        'removeCol': [0, 1],
        'regex': [r'(/|)(\d+\.){3}\d+(:\d+)?'],
        'st': 0.5,
        'depth': 4        
        },

    'BGL': {
        'log_file': 'BGL/BGL_2k.log',
        'log_format': '<Label> <Timestamp> <Date> <Node> <Time> <NodeRepeat> <Type> <Component> <Level> <Content>',
        'removeCol': [0, 1,2,3,4,5],
        'regex': [r'core\.\d+'],
        'st': 0.5,
        'depth': 4        
        },

    'HPC': {
        'log_file': 'HPC/HPC_2k.log',
        'log_format': '<LogId> <Node> <Component> <State> <Time> <Flag> <Content>',
        'removeCol': [0, 1,4],
        'regex': [r'=\d+'],
        'st': 0.5,
        'depth': 4
        },

    'Thunderbird': {
        'log_file': 'Thunderbird/Thunderbird_2k.log',
        'removeCol': [0, 1,4],
        'log_format': '<Label> <Timestamp> <Date> <User> <Month> <Day> <Time> <Location> <Component>(\[<PID>\])?: <Content>',
        'regex': [r'(\d+\.){3}\d+'],
        'st': 0.5,
        'depth': 4        
        },

    'Windows': {
        'log_file': 'Windows/Windows_2k.log',
        'log_format': '<Date> <Time>, <Level>                  <Component>    <Content>',
        'regex': [r'0x.*?\s'],
        'st': 0.7,
        'depth': 5      
        },

    'Linux': {
        'log_file': 'Linux/Linux_2k.log',
        'log_format': '<Month> <Date> <Time> <Level> <Component>(\[<PID>\])?: <Content>',
        'regex': [r'(\d+\.){3}\d+', r'\d{2}:\d{2}:\d{2}'],
        'st': 0.39,
        'depth': 6        
        },

    'Andriod': {
        'log_file': 'Andriod/Andriod_2k.log',
        'log_format': '<Date> <Time>  <Pid>  <Tid> <Level> <Component>: <Content>',
        'regex': [r'(/[\w-]+)+', r'([\w-]+\.){2,}[\w-]+', r'\b(\-?\+?\d+)\b|\b0[Xx][a-fA-F\d]+\b|\b[a-fA-F\d]{4,}\b'],
        'st': 0.2,
        'depth': 6   
        },

    'HealthApp': {
        'log_file': 'HealthApp/HealthApp_2k.log',
        'log_format': '<Time>\|<Component>\|<Pid>\|<Content>',
        'regex': [],
        'st': 0.2,
        'depth': 4
        },

    'Apache': {
        'log_file': 'Apache/Apache_2k.log',
        'log_format': '\[<Time>\] \[<Level>\] <Content>',
        'regex': [r'(\d+\.){3}\d+'],
        'st': 0.5,
        'depth': 4        
        },

    'Proxifier': {
        'log_file': 'Proxifier/Proxifier_2k.log',
        'log_format': '\[<Time>\] <Program> - <Content>',
        'regex': [r'<\d+\ssec', r'([\w-]+\.)+[\w-]+(:\d+)?', r'\d{2}:\d{2}(:\d{2})*', r'[KGTM]B'],
        'st': 0.6,
        'depth': 3
        },

    'OpenSSH': {
        'log_file': 'OpenSSH/OpenSSH_2k.log',
        'log_format': '<Date> <Day> <Time> <Component> sshd\[<Pid>\]: <Content>',
        'regex': [r'(\d+\.){3}\d+', r'([\w-]+\.){2,}[\w-]+'],
        'st': 0.6,
        'depth': 5   
        },

    'OpenStack': {
        'log_file': 'OpenStack/OpenStack_2k.log',
        'log_format': '<Logrecord> <Date> <Time> <Pid> <Level> <Component> \[<ADDR>\] <Content>',
        'regex': [r'((\d+\.){3}\d+,?)+', r'/.+?\s', r'\d+'],
        'st': 0.5,
        'depth': 5
        },

    'Mac': {
        'log_file': 'Mac/Mac_2k.log',
        'log_format': '<Month>  <Date> <Time> <User> <Component>\[<PID>\]( \(<Address>\))?: <Content>',
        'regex': [r'([\w-]+\.){2,}[\w-]+'],
        'st': 0.7,
        'depth': 6   
        },
}
input_dir = 'data/logs/' # The input directory of log file
output_dir = 'data/logs/result/' # The output directory of parsing results

In [7]:
def printClusters(logClusters):
    for logCluster in logClusters:
        print("eventId: "+str(logCluster.eventId)+" template: "+' '.join(logCluster.logTemplate),end=' ')
        print("parentNode: " ,end=' ')
        for node in logCluster.parentNode:
            print(node.token,end=' ')
        print()

In [8]:
optimize_settings = {
    'HDFS':[8,0.7,0.8,1,0.9],
    'Hadoop':[9,0.5,0.8,1,0.8],
    'Spark':[9,0.6,0.8,1,1],
    'Zookeeper':[9,0.6,0.8,1,0.9],
    'BGL':[9,0.7,0.8,1,0.9],
    'HPC':[8,0.6,0.8,1,0.7],
    'Windows':[8,0.9,0.8,1,0.7],
    'Andriod':[8,1,0.9,100,1]
}

In [80]:
for dataset,param in optimize_settings.items():
    setting = benchmark_settings[dataset]
    print('\n=== Evaluation on %s ==='%dataset)
    indir = os.path.join(input_dir, os.path.dirname(setting['log_file']))
    log_file = os.path.basename(setting['log_file'])
    parser1 = model_v1.Drain(log_format=setting['log_format'], rex=setting['regex'], 
                            depth=param[0], st=param[1])
    parser1.fit(isReconstruct=True,inputFile=input_dir+setting['log_file'],
               outputFile=output_dir+log_file + '_structured.csv')
    print('优化前,聚类数: ',len(parser1.logClusters),end=' ')
    F1_measure, accuracy = evaluator.evaluate(
                       groundtruth=os.path.join(indir, log_file + '_structured.csv'),
                       parsedresult=os.path.join(output_dir, log_file + '_structured.csv')
                       )
    opt = Optimizer()
    opt.modify(method='merge_sub_tree',
               resultFile=output_dir+log_file + '_structured.csv', 
               logparser = parser1,st=param[2],nst=param[3])
    logClusters = parser1.logClusters
    print('合并子树优化,聚类数: ',len(logClusters),end=' ')
    F1_measure, accuracy = evaluator.evaluate(
                       groundtruth=os.path.join(indir, log_file + '_structured.csv'),
                       parsedresult=os.path.join(output_dir, log_file + '_structured.csv')
                       )
    opt.modify(method = 'LCS',resultFile=output_dir+log_file + '_structured.csv',
               logparser=parser1,st = param[4])

    print('合并聚类优化,聚类数: ',len(parser1.logClusters),end=' ')
    F1_measure, accuracy = evaluator.evaluate(
                           groundtruth=os.path.join(indir, log_file + '_structured.csv'),
                           parsedresult=os.path.join(output_dir, log_file + '_structured.csv')
                           )


=== Evaluation on HDFS ===
build the prefix tree process takes 1.8409967422485352
the number of log used to build the parser is  2000
*********************************************
优化前,聚类数:  14 Precision: 1.0000, Recall: 1.0000, F1_measure: 1.0000, Parsing_Accuracy: 1.0000
合并子树优化,聚类数:  14 Precision: 1.0000, Recall: 1.0000, F1_measure: 1.0000, Parsing_Accuracy: 1.0000
合并聚类优化,聚类数:  14 Precision: 1.0000, Recall: 1.0000, F1_measure: 1.0000, Parsing_Accuracy: 1.0000

=== Evaluation on Hadoop ===
build the prefix tree process takes 1.8610014915466309
the number of log used to build the parser is  2000
*********************************************
优化前,聚类数:  142 Precision: 1.0000, Recall: 0.9993, F1_measure: 0.9996, Parsing_Accuracy: 0.9725
合并子树优化,聚类数:  117 Precision: 1.0000, Recall: 0.9998, F1_measure: 0.9999, Parsing_Accuracy: 0.9845
合并聚类优化,聚类数:  114 Precision: 1.0000, Recall: 0.9999, F1_measure: 1.0000, Parsing_Accuracy: 0.9870

=== Evaluation on Spark ===
build the prefix tree process take

In [10]:
dataset = 'Mac'
setting =benchmark_settings[dataset]
indir = os.path.join(input_dir, os.path.dirname(setting['log_file']))
log_file = os.path.basename(setting['log_file'])

In [11]:
parser = Drain.LogParser(log_format=setting['log_format'], indir=indir, outdir=output_dir, rex=setting['regex'],
                         depth=setting['depth'], st=setting['st'])
parser.parse(log_file)
logClusters = parser.logCluL
print('原Drain,聚类数: ', len(logClusters), end=' ')
F1_measure, accuracy = evaluator.evaluate(
    groundtruth=os.path.join(indir, log_file + '_structured.csv'),
    parsedresult=os.path.join(output_dir, log_file + '_structured.csv')
)
df = pd.read_csv(os.path.join(indir,log_file + '_structured.csv'))
print('真实聚类数: ', len(df['EventId'].drop_duplicates()))

原Drain,聚类数:  578 Precision: 0.9759, Recall: 0.9750, F1_measure: 0.9755, Parsing_Accuracy: 0.7865
真实聚类数:  341


In [12]:
print('\n=== Evaluation on %s ==='%dataset)
indir = os.path.join(input_dir, os.path.dirname(setting['log_file']))
log_file = os.path.basename(setting['log_file'])
parser1 = model_v1.Drain(log_format=setting['log_format'], rex=setting['regex'], 
                        depth=8, st=0.8)
parser1.fit(isReconstruct=True,inputFile=input_dir+setting['log_file'],
           outputFile=output_dir+log_file + '_structured.csv')
print('优化前,聚类数: ',len(parser1.logClusters),end=' ')
F1_measure, accuracy = evaluator.evaluate(
                   groundtruth=os.path.join(indir, log_file + '_structured.csv'),
                   parsedresult=os.path.join(output_dir, log_file + '_structured.csv')
                   )
opt = Optimizer()
opt.modify(method='merge_sub_tree',
           resultFile=output_dir+log_file + '_structured.csv', 
           logparser = parser1,st=0.8,nst=1)
logClusters = parser1.logClusters
print('合并子树优化,聚类数: ',len(logClusters),end=' ')
F1_measure, accuracy = evaluator.evaluate(
                   groundtruth=os.path.join(indir, log_file + '_structured.csv'),
                   parsedresult=os.path.join(output_dir, log_file + '_structured.csv')
                   )
opt.modify(method = 'LCS',resultFile=output_dir+log_file + '_structured.csv',
           logparser=parser1,st = 0.9)

print('合并聚类优化,聚类数: ',len(parser1.logClusters),end=' ')
F1_measure, accuracy = evaluator.evaluate(
                       groundtruth=os.path.join(indir, log_file + '_structured.csv'),
                       parsedresult=os.path.join(output_dir, log_file + '_structured.csv')
                       )


=== Evaluation on Mac ===
build the prefix tree process takes 2.846994400024414
the number of log used to build the parser is  2000
*********************************************
优化前,聚类数:  551 Precision: 0.9830, Recall: 0.9267, F1_measure: 0.9540, Parsing_Accuracy: 0.6985
合并子树优化,聚类数:  337 Precision: 0.9740, Recall: 0.9891, F1_measure: 0.9815, Parsing_Accuracy: 0.7775
合并聚类优化,聚类数:  335 Precision: 0.9728, Recall: 0.9891, F1_measure: 0.9809, Parsing_Accuracy: 0.7700


In [8]:
printClusters(parser1.logClusters)

eventId: 1 template: Notification time out: <*> parentNode:  * 
eventId: 2 template: Received connection request <*> parentNode:  <*> 
eventId: 3 template: Send worker leaving thread parentNode:  thread 
eventId: 4 template: Interrupted while waiting for message on queue parentNode:  queue 
eventId: 5 template: Connection broken for id <*> my id = <*> error = parentNode:  id 
eventId: 6 template: Interrupting SendWorker parentNode:  SendWorker 
eventId: 7 template: Closed socket connection for client <*> which had sessionid <*> parentNode:  which 
eventId: 8 template: caught end of stream exception parentNode:  exception 
eventId: 9 template: Client attempting to renew session <*> at <*> parentNode:  at 
eventId: 10 template: Client attempting to establish new session at <*> parentNode:  at 
eventId: 11 template: Established session <*> with negotiated timeout <*> for client <*> parentNode:  * 
eventId: 12 template: Accepted socket connection from <*> parentNode:  <*> 
eventId: 13 temp

In [41]:
opt = Optimizer()
opt.modify(method='merge_sub_tree',
           resultFile=output_dir+log_file + '_structured.csv', 
           logparser = parser1,nst=1)
logClusters = parser1.logClusters
print('合并子树优化,聚类数: ',len(logClusters),end=' ')
F1_measure, accuracy = evaluator.evaluate(
                   groundtruth=os.path.join(indir, log_file + '_structured.csv'),
                   parsedresult=os.path.join(output_dir, log_file + '_structured.csv')
                   )
opt.modify(method = 'seq_dist',resultFile=output_dir+log_file + '_structured.csv',
           logparser=parser1,st = 1)

print('合并聚类优化,聚类数: ',len(parser1.logClusters),end=' ')
F1_measure, accuracy = evaluator.evaluate(
                       groundtruth=os.path.join(indir, log_file + '_structured.csv'),
                       parsedresult=os.path.join(output_dir, log_file + '_structured.csv')
                       )

合并子树优化,聚类数:  48 Precision: 0.9998, Recall: 0.9998, F1_measure: 0.9998, Parsing_Accuracy: 0.9885
合并聚类优化,聚类数:  48 Precision: 0.9998, Recall: 0.9998, F1_measure: 0.9998, Parsing_Accuracy: 0.9885


In [31]:
printClusters(parser.logClusters)

eventId: 1 template: PacketResponder <*> for block <*> terminating parentNode:  block 
eventId: 2 template: BLOCK* NameSystem.addStoredBlock: blockMap updated: <*> is added to <*> size <*> parentNode:  updated: 
eventId: 3 template: Received block <*> of size <*> from /<*> parentNode:  of 
eventId: 4 template: <*> block <*> src: /<*> dest: /<*> parentNode:  src: src: 
eventId: 5 template: BLOCK* NameSystem.allocateBlock: <*> <*> parentNode:  <*> 
eventId: 6 template: Verification succeeded for <*> parentNode:  <*> 
eventId: 7 template: Deleting block <*> file <*> parentNode:  file 
eventId: 8 template: <*> Served block <*> to /<*> parentNode:  <*> 
eventId: 9 template: <*>:Got exception while serving <*> to /<*>: parentNode:  serving 
eventId: 10 template: BLOCK* NameSystem.delete: <*> is added to invalidSet of <*> parentNode:  is 
eventId: 11 template: <*> Starting thread to transfer block <*> to <*> parentNode:  to 
eventId: 12 template: BLOCK* ask <*> to <*> <*> parentNode:  to 


In [96]:
for i in parser.logCluL:
    print(' '.join(i.logTemplate))

PacketResponder <*> for block <*> terminating
BLOCK* NameSystem.addStoredBlock: blockMap updated: <*> is added to <*> size <*>
Received block <*> of size <*> from /<*>
Receiving block <*> src: /<*> dest: /<*>
BLOCK* NameSystem.allocateBlock: <*> <*>
Verification succeeded for <*>
Deleting block <*> file <*>
<*> Served block <*> to /<*>
<*>:Got exception while serving <*> to /<*>:
BLOCK* NameSystem.delete: <*> is added to invalidSet of <*>
<*> Starting thread to transfer block <*> to <*>
BLOCK* ask <*> to delete <*>
Received block <*> src: /<*> dest: /<*> of size 67108864
BLOCK* ask <*> to delete <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <

In [34]:
parser = Drain.LogParser(log_format=setting['log_format'], indir=indir, 
                         outdir=output_dir, rex=setting['regex'], depth=4, st=0.5)
parser.parse(log_file)

print('原Drain,聚类数: ',len(logClusters),end=' ')
F1_measure, accuracy = evaluator.evaluate(
                       groundtruth=os.path.join(indir, log_file + '_structured.csv'),
                       parsedresult=os.path.join(output_dir, log_file + '_structured.csv')
                       )

df = pd.read_csv(os.path.join(indir,log_file + '_structured.csv'))
print('真实聚类数: ', len(df['EventId'].drop_duplicates()))

原Drain,聚类数:  12 Precision: 1.0000, Recall: 1.0000, F1_measure: 1.0000, Parsing_Accuracy: 0.9975
真实聚类数:  14


In [7]:

for dataset, setting in benchmark_settings.items():

    print('\n=== Evaluation on %s ==='%dataset)
    indir = os.path.join(input_dir, os.path.dirname(setting['log_file']))
    log_file = os.path.basename(setting['log_file'])
    try:
        parser = model_v1.Drain(log_format=setting['log_format'], rex=setting['regex'], depth=setting['depth']+1, st=setting['st'])
        parser.fit(isReconstruct=True,inputFile=input_dir+setting['log_file'],outputFile=output_dir+log_file + '_structured.csv')
    except:
        continue
    opt = Optimizer()
    logClusters = parser.logClusters
    print('优化前,聚类数: ',len(logClusters),end=' ')
    F1_measure, accuracy = evaluator.evaluate(
                       groundtruth=os.path.join(indir, log_file + '_structured.csv'),
                       parsedresult=os.path.join(output_dir, log_file + '_structured.csv')
                       )
    v0_bechmark_result.append([dataset, F1_measure, accuracy])
#     printClusters(logClusters)
    opt.modify(method='merge_sub_tree',resultFile=output_dir+log_file + '_structured.csv', logparser = parser)
    logClusters = parser.logClusters
    print('合并子树优化,聚类数: ',len(logClusters),end=' ')
    F1_measure, accuracy = evaluator.evaluate(
                       groundtruth=os.path.join(indir, log_file + '_structured.csv'),
                       parsedresult=os.path.join(output_dir, log_file + '_structured.csv')
                       )
#     printClusters(logClusters)
    # createPlot(drain)
    v1_bechmark_result.append([dataset, F1_measure, accuracy])
    opt.modify(method = 'seq_dist',resultFile=output_dir+log_file + '_structured.csv',logparser=parser,st = 0.8)
    logClusters = parser.logClusters
    print('合并聚类优化,聚类数: ',len(logClusters),end=' ')
#     printClusters(logClusters)
    F1_measure, accuracy = evaluator.evaluate(
                           groundtruth=os.path.join(indir, log_file + '_structured.csv'),
                           parsedresult=os.path.join(output_dir, log_file + '_structured.csv')
                           )
    v2_bechmark_result.append([dataset, F1_measure, accuracy])
    parser = Drain.LogParser(log_format=setting['log_format'], indir=indir, outdir=output_dir, rex=setting['regex'], depth=setting['depth'], st=setting['st'])
    parser.parse(log_file)
    logClusters = parser.logCluL
    print('原Drain,聚类数: ',len(logClusters),end=' ')
    F1_measure, accuracy = evaluator.evaluate(
                           groundtruth=os.path.join(indir, log_file + '_structured.csv'),
                           parsedresult=os.path.join(output_dir, log_file + '_structured.csv')
                           )
    origin_bechmark_result.append([dataset, F1_measure, accuracy])
    pd.read_csv(log_file + '_structured.csv')
    print('真实聚类数: ',)

print('\n=== Overall evaluation results ===')
df_result = pd.DataFrame(origin_bechmark_result, columns=['Dataset', 'F1_measure', 'Accuracy'])
df_result.set_index('Dataset', inplace=True)
print(df_result)
df_result = pd.DataFrame(v0_bechmark_result, columns=['Dataset', 'F1_measure', 'Accuracy'])
df_result.set_index('Dataset', inplace=True)
print(df_result)
df_result = pd.DataFrame(v1_bechmark_result, columns=['Dataset', 'F1_measure', 'Accuracy'])
df_result.set_index('Dataset', inplace=True)
print(df_result)
df_result = pd.DataFrame(v2_bechmark_result, columns=['Dataset', 'F1_measure', 'Accuracy'])
df_result.set_index('Dataset', inplace=True)
print(df_result)
df_result.T.to_csv('Drain_bechmark_result.csv')


=== Evaluation on HDFS ===
build the prefix tree process takes 2.0962367057800293
the number of log used to build the parser is  2000
*********************************************
优化前,聚类数:  13 Precision: 1.0000, Recall: 1.0000, F1_measure: 1.0000, Parsing_Accuracy: 0.9970
合并子树优化,聚类数:  13 Precision: 1.0000, Recall: 1.0000, F1_measure: 1.0000, Parsing_Accuracy: 0.9970
合并聚类优化,聚类数:  12 Precision: 0.9977, Recall: 1.0000, F1_measure: 0.9988, Parsing_Accuracy: 0.8500
原Drain,聚类数:  17 Precision: 1.0000, Recall: 1.0000, F1_measure: 1.0000, Parsing_Accuracy: 0.9975

=== Evaluation on Hadoop ===

=== Evaluation on Spark ===
build the prefix tree process takes 1.6864142417907715
the number of log used to build the parser is  2000
*********************************************
优化前,聚类数:  30 Precision: 0.9839, Recall: 1.0000, F1_measure: 0.9919, Parsing_Accuracy: 0.9220
合并子树优化,聚类数:  28 Precision: 0.7263, Recall: 1.0000, F1_measure: 0.8415, Parsing_Accuracy: 0.6175
合并聚类优化,聚类数:  28 Precision: 0.7263, Re

In [16]:
a = {1:2,2:1}
b = {1:1}
list(set(a.keys()).intersection(b.keys()))

[1]

In [None]:
'''
parsing phase
'''
def parse_log(logPath,parsed_result):

    rex = ['blk_(|-)[0-9]+', '(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)']
    removeCol = [0,1,2]
    myParser = Drain(rex=rex,removeCol=removeCol,st=0.5)
    myParser.fit(isReconstruct=True,inputFile=logPath,outputFile=parsed_result)

#     myParser.save(savePath=cluster_result)


In [11]:
parse_log('data/logs/HDFS/HDFS_2k.log','data/logs/result/hdfs.csv')

load data: 2000it [00:00, 6644.67it/s]


build the prefix tree process takes 0.3579902648925781
the number of log used to build the parser is  2000
*********************************************


In [12]:
from drain.evaluator import evaluate

In [13]:
evaluate('data/logs/result/hdfs.csv','data/logs/HDFS/HDFS_2k.log_structured.csv')

Precision: 1.0000, Recall: 1.0000, F1_measure: 1.0000, Parsing_Accuracy: 0.9970


(0.9999901893648792, 0.997)

In [14]:
parse_log('data/logs/Apache/Apache_2k.log','data/logs/result/apache.csv')

load data: 2000it [00:00, 6501.99it/s]


build the prefix tree process takes 0.32255101203918457
the number of log used to build the parser is  2000
*********************************************


In [15]:
evaluate('data/logs/result/apache.csv','data/logs/Apache/Apache_2k.log_structured.csv')

Precision: 1.0000, Recall: 1.0000, F1_measure: 1.0000, Parsing_Accuracy: 1.0000


(1.0, 1.0)