# Training state for drain3

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType,StructField,Row, StringType

# Obtain the Spark session for this notebook (an existing one is reused by
# getOrCreate). The app is named "training_drain3" and the SQL time parser
# policy is pinned to LEGACY.
spark = (
    SparkSession.builder
    .appName("training_drain3")
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/17 12:00:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/08/17 12:00:07 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### Load training data

In [2]:
# Path of the raw training log, relative to the notebook's working directory.
log_file_path = "../data/raw/BGL_train.log"

# Read the log as plain text; the resulting frame has a single string column
# named `value` holding one raw line per row.
base_df = spark.read.text(log_file_path)

# Inspect the schema, then preview the first 20 rows without truncating them.
base_df.printSchema()
base_df.show(n=20, truncate=False)


root
 |-- value: string (nullable = true)

+---------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                              |
+---------------------------------------------------------------------------------------------------------------------------------------------------+
|- 1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.50.363779 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected|
|- 1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.50.527847 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected|
|- 1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.50.675872 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected|
|- 1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-1

### Split raw log lines into fields

In [3]:
def split_line(log_line):
    """Split one raw log line into label, timestamp, date and content.

    The line is right-stripped and split on single spaces. Tokens 0, 1 and 2
    become ``label``, ``timestamp`` and ``date``; tokens 8 onward are joined
    back with single spaces into ``content``. Tokens 3-7 are discarded.

    Args:
        log_line: Raw text of one log line, or ``None``.

    Returns:
        A ``Row`` with the fields ``label``, ``timestamp``, ``date`` and
        ``content``, in that order. A ``None`` or blank line produces a row
        whose four fields are all ``None``. A line with fewer than three
        tokens produces ``None`` for the missing leading fields rather than
        raising ``IndexError``; a line with fewer than nine tokens produces an
        empty ``content`` string.
    """
    # Positional Row factory; the field order must match the struct schema
    # this function is registered with as a UDF.
    make_row = Row('label', 'timestamp', 'date', 'content')

    # Null or blank input: return nulls instead of raising, so that a single
    # malformed line cannot fail the whole Spark job.
    stripped = log_line.rstrip() if log_line is not None else ''
    if not stripped:
        return make_row(None, None, None, None)

    tokens = stripped.split(' ')

    # Pad so that short lines yield None for whichever leading fields are absent.
    label, timestamp, date = (tokens[:3] + [None] * 3)[:3]

    # NOTE(review): token 8 is the first token kept in `content`; in the sample
    # output this is the severity word (e.g. "INFO") -- confirm that including
    # it in the mined message is intended.
    content = ' '.join(tokens[8:])

    return make_row(label, timestamp, date, content)

# Struct type returned by the UDF: four nullable string fields, listed in the
# same order as the values split_line puts into its Row.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ('label', 'timestamp', 'date', 'content')
])

# Wrap the Python parser as a Spark UDF producing the struct above.
udf_split = udf(split_line, schema)

# Parse every raw line into a struct column, then expand the struct so its
# fields become the only top-level columns of the result.
parsed_column = udf_split(base_df["value"])
processed_df = (
    base_df
    .withColumn("parsed", parsed_column)
    .select("parsed.*")
)

# Preview the first five parsed rows at full width.
processed_df.show(5, truncate=False)

+-----+----------+----------+---------------------------------------------+
|label|timestamp |date      |content                                      |
+-----+----------+----------+---------------------------------------------+
|-    |1117838570|2005.06.03|INFO instruction cache parity error corrected|
|-    |1117838570|2005.06.03|INFO instruction cache parity error corrected|
|-    |1117838570|2005.06.03|INFO instruction cache parity error corrected|
|-    |1117838570|2005.06.03|INFO instruction cache parity error corrected|
|-    |1117838570|2005.06.03|INFO instruction cache parity error corrected|
+-----+----------+----------+---------------------------------------------+
only showing top 5 rows



### Train Drain3 on the normal ("-" labelled) logs

In [4]:
from drain3 import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig
from drain3.file_persistence import FilePersistence

# Miner configuration: read drain3.ini, then switch profiling off.
config = TemplateMinerConfig()
config.load("drain3.ini")
config.profiling_enabled = False

# The miner's state goes through a file-backed persistence handler.
# NOTE(review): if drain3_state_bgl.bin already exists from an earlier run the
# miner may start from that state rather than from scratch -- confirm.
persistence = FilePersistence("drain3_state_bgl.bin")
template_miner = TemplateMiner(persistence, config)

# Partition the parsed logs on the label column: "-" marks a success (normal)
# line, any other label marks an error line.
success_logs = processed_df.filter(processed_df["label"] == "-")
error_logs = processed_df.filter(processed_df["label"] != "-")

# Report the size of each set; every count() runs its own Spark job.
for caption, frame in (
    ("total log", processed_df),
    ("success log", success_logs),
    ("error log", error_logs),
):
    print(f"{caption}: {frame.count()}")

total log: 673599


                                                                                

success log: 454128
error log: 219471


In [5]:
# Train the miner on every success-labelled message. Rows are streamed to the
# driver partition by partition (in the same order collect() would return
# them) instead of materialising the whole frame in driver memory, and only
# the column that is actually used is selected.
for row in success_logs.select("content").toLocalIterator():
    template_miner.add_log_message(row["content"])

# Persist the final state explicitly so the saved file reflects the whole
# training pass.
# NOTE(review): add_log_message is expected to snapshot only when a template
# changes or a periodic interval elapses -- confirm against the installed
# drain3 version.
template_miner.save_state("training completed")

print("Number of templates: ", len(template_miner.drain.clusters))

Number of templates:  176


In [6]:
# Evaluate on the error-labelled logs: a message counts as a correct
# prediction when the miner finds no matching template for it.
correct_predictions = 0
total_error_logs = 0

# Stream rows to the driver and tally the total while iterating, so no extra
# count() jobs (each re-running the parsing UDF) are needed for the report.
for row in error_logs.select("content").toLocalIterator():
    total_error_logs += 1
    cluster = template_miner.match(row["content"])
    if cluster is None:
        correct_predictions += 1

if total_error_logs:
    print("Correct predictions: ", correct_predictions / total_error_logs * 100, "% (", correct_predictions, "/", total_error_logs, ")")
else:
    # Avoid a ZeroDivisionError when there is nothing to evaluate.
    print("Correct predictions:  n/a (no error logs to evaluate)")

                                                                                

Correct predictions:  100.0 % ( 219471 / 219471 )
