In [141]:
from pyspark.sql import SparkSession

spark = SparkSession\
.builder\
.appName("pyspark-notebook").\
config("spark.sql.legacy.timeParserPolicy", "LEGACY").\
getOrCreate()

In [142]:
log_file_path= "dataset/BGL_2k.log"

base_df = spark.read.text(log_file_path)
# Let's look at the schema
base_df.printSchema()
base_df.show(truncate=False)

root
 |-- value: string (nullable = true)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                                                        |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|- 1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.50.675872 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected                                                          |
|- 1117838573 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.53.276129 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache

### Parsing

In [143]:
from pyspark.sql.functions import split, regexp_extract
split_df = base_df.select(regexp_extract('value', r'^([^\s]+\s)', 1).alias('host'),
                          regexp_extract('value', r'\[(.*?)\]', 1).alias('timestamp'),
                          regexp_extract('value', r'^.*"\w+\s+([^\s]+)\s+HTTP.*"', 1).alias('path'),
                          regexp_extract('value', r'^.*"\s+([^\s]+)', 1).cast('integer').alias('status'),
                          regexp_extract('value', r'^.*\s+(\d+)$', 1).cast('integer').alias('content_size'))
split_df.show(truncate=False)

+--------+---------+----+------+------------+
|host    |timestamp|path|status|content_size|
+--------+---------+----+------+------------+
|-       |         |    |null  |null        |
|-       |         |    |null  |null        |
|-       |         |    |null  |null        |
|-       |         |    |null  |null        |
|-       |         |    |null  |null        |
|-       |         |    |null  |null        |
|-       |         |    |null  |null        |
|-       |         |    |null  |null        |
|APPREAD |         |    |null  |null        |
|APPREAD |         |    |null  |null        |
|-       |         |    |null  |null        |
|-       |         |    |null  |null        |
|-       |         |    |null  |null        |
|-       |         |    |null  |null        |
|-       |         |    |null  |null        |
|-       |         |    |null  |null        |
|-       |         |    |null  |null        |
|-       |         |    |null  |null        |
|-       |         |    |null  |nu

### Cleaning

In [24]:
base_df.filter(base_df['value'].isNull()).count()

0

In [25]:
bad_rows_df = split_df.filter(split_df['host'].isNull() |
                              split_df['timestamp'].isNull() |
                              split_df['path'].isNull() |
                              split_df['status'].isNull() |
                             split_df['content_size'].isNull())
bad_rows_df.count()

0

In [26]:
from pyspark.sql.functions import col, sum

def count_null(col_name):
  return sum(col(col_name).isNull().cast('integer')).alias(col_name)
exprs = []
[exprs.append(count_null(col_name)) for col_name in split_df.columns]
split_df.agg(*exprs).show()

+----+---------+----+------+------------+
|host|timestamp|path|status|content_size|
+----+---------+----+------+------------+
|   0|        0|   0|     0|           0|
+----+---------+----+------+------------+



In [27]:
from pyspark.sql.functions import *
logs_df = split_df.select('*', to_timestamp(split_df['timestamp'],"dd/MMM/yyyy:HH:mm:ss ZZZZ").cast('timestamp').alias('time')).drop('timestamp')
total_log_entries = logs_df.count()
print(total_log_entries)
logs_df.show(truncate=False)

203025
+----------------+--------------------------------------------+------+------------+-------------------+
|host            |path                                        |status|content_size|time               |
+----------------+--------------------------------------------+------+------------+-------------------+
|97.86.147.130   |/niches                                     |405   |3170        |2023-08-15 15:32:09|
|162.176.171.13  |/ubiquitous/empower/content/roi             |502   |10984       |2023-08-15 15:32:09|
|70.109.63.88    |/reinvent/innovative                        |304   |5075        |2023-08-15 15:32:09|
|237.80.86.112   |/target                                     |100   |24483       |2023-08-15 15:32:09|
|198.91.2.115    |/infrastructures                            |503   |24834       |2023-08-15 15:32:09|
|37.79.127.83    |/open-source/scale/synergies/engage         |500   |11962       |2023-08-15 15:32:09|
|28.59.248.3     |/maximize/b2c                          

### Parser logs from drain3

In [133]:
from drain3 import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig

persistence_type = "FILE"
persistence = FilePersistence("drain3_state.bin")

config = TemplateMinerConfig()
config.load("drain3.ini")
config.profiling_enabled = False

template_miner = TemplateMiner(persistence, config)
print(f"Drain3 started with '{persistence_type}' persistence")
print(f"{len(config.masking_instructions)} masking instructions are in use")
print(f"Starting training mode. Reading from std-in ('q' to finish)")

Drain3 started with 'FILE' persistence
4 masking instructions are in use
Starting training mode. Reading from std-in ('q' to finish)


In [134]:
with open("dataset/apache_error.log") as f:
    lines = f.readlines()

In [140]:
total_lines = 0
for line in lines:
    total_lines += 1
    line = line.rstrip()
    split_line = line.split('] ')
    message = split_line[4]
    # template_miner.add_log_message(message)
    print(f"Message: {message}")
    print(f"Date: {split_line[0][1:]}")
    break

print('Total lines: {}'.format(total_lines))
print('Number of clusters: {}'.format(len(template_miner.drain.clusters)))

Message: You can't index the port without copying the cross-platform HTTP microchip!
Date: Tue Aug 15 15:32:26 2023
Total lines: 1
Number of clusters: 183


In [136]:
example_log =\
"""
We need to compress the wireless HTTP transmitter!
"""

In [137]:
cluster = template_miner.match(example_log)
if cluster is None:
        print(f"No match found")
else:
    template = cluster.get_template()
    print(f"Matched template #{cluster.cluster_id}: {template}")
    print(f"Parameters: {template_miner.get_parameter_list(template, example_log)}")

Matched template #7: We need to <:*:> the <:*:> <:*:> <:*:>
Parameters: []
