In [0]:
############
# Answer to question 
# logs
#-------------
#Author: AdrianJ
#V1.0 Created(2023-09-01)
############

In [0]:
import re
from datetime import datetime
from pyspark.sql import functions as F

### Log Processing

The following data comes from the _Learning Spark_ book.

#### **Question #1**: Parsing Logs
Parse the logs into a DataFrame/Spark SQL table that can be queried. This should be done using the Dataset API.

In [0]:
#Regex patterns used to extract the individual fields from each access-log line.
#They are written so that they behave the same in Python's `re` and in the
#Java regex engine used by F.regexp_extract.

#host: the first whitespace-delimited token on the line (IP address or host name,
#including names without a dot such as "localhost")
host_reg = r'^(\S+)\s'
#timestamp: dd/Mon/yyyy:HH:mm:ss followed by a signed UTC offset,
#e.g. "24/Sep/2014:22:25:44 +0000" or "... -0700"
ts_rg = r'\d{2}/[A-Za-z]{3}/\d{4}:\d{2}:\d{2}:\d{2} [+-]\d{4}'
#request: the quoted "<method> <endpoint> <protocol>" section
#(group 1 = method, group 2 = endpoint, group 3 = protocol, which may be empty)
uri_rg = r'\"(\S+)\s(\S+)\s*(\S*)\"'
#status: the three digits that directly follow a closing quote; anchoring on the
#quote keeps a 3-digit token earlier in the line from being read as the status
status_rg = r'\"\s(\d{3})\s'

In [0]:
#Load every log file as plain text; the regexes below are run against the 'value' column
raw_log_lines = spark.read.text("/databricks-datasets/learning-spark/data-001/fake_logs/*.log")

#Extract the logs information
#NB! due to limited time for the assignment only extracting partial logs

#(output column, regex pattern, capture group) for every string-typed field
string_field_specs = [
    ('host', host_reg, 1),
    ('timestamp', ts_rg, 0),
    ('method', uri_rg, 1),
    ('endpoint', uri_rg, 2),
    ('protocol', uri_rg, 3),
]
string_columns = [
    F.regexp_extract('value', pattern, group).alias(column_name)
    for column_name, pattern, group in string_field_specs
]
#status is the only field that is cast to a number
status_column = F.regexp_extract('value', status_rg, 1).cast('integer').alias('status')

df_logs = raw_log_lines.select(*string_columns, status_column)

print("INFO: Logs parsed")
print(" First 3 rows from logs")
df_logs.show(3)

INFO: Logs parsed
 First 3 rows from logs
+-------------+--------------------+------+--------------+--------+------+
|         host|           timestamp|method|      endpoint|protocol|status|
+-------------+--------------------+------+--------------+--------+------+
| 66.249.69.97|24/Sep/2014:22:25...|   GET|/071300/242153|HTTP/1.1|   404|
|71.19.157.174|24/Sep/2014:22:26...|   GET|        /error|HTTP/1.1|   404|
|71.19.157.174|24/Sep/2014:22:26...|   GET|  /favicon.ico|HTTP/1.1|   200|
+-------------+--------------------+------+--------------+--------+------+
only showing top 3 rows



#### **Question #2**: Analysis
Generate some insights from the log data.

In [0]:
# 'timestamp' is still the raw log string (e.g. '24/Sep/2014:22:25:44 +0000').
# Taking min/max of those strings orders them as text (day-of-month first), which
# is only chronological when all rows share the same day, and lines where the
# timestamp regex found nothing hold '' which would always win the minimum.
# So: order by a parsed timestamp, but keep the raw string next to it for display.
parsed_ts = F.when(
    F.col("timestamp") != "",
    F.to_timestamp(F.col("timestamp"), "dd/MMM/yyyy:HH:mm:ss Z"),
)
# Structs compare field by field, so min/max of (parsed, raw) is driven by the
# parsed value; rows without a usable timestamp become NULL and are ignored.
ts_pair = F.when(
    parsed_ts.isNotNull(),
    F.struct(parsed_ts.alias("parsed"), F.col("timestamp").alias("raw")),
)

# One aggregation (a single Spark job) for all three figures.
log_summary = (
    df_logs
    .agg(
        F.min(ts_pair).alias("earliest"),
        F.max(ts_pair).alias("latest"),
        # status is an integer column, so compare against an integer
        F.count(F.when(F.col("status") == 404, 1)).alias("not_found"),
    )
    .first()
)

# earliest/latest are NULL when no line carried a parseable timestamp
min_ts = log_summary["earliest"]["raw"] if log_summary["earliest"] is not None else None
max_ts = log_summary["latest"]["raw"] if log_summary["latest"] is not None else None
total_of_404 = log_summary["not_found"]
print(f"Between {min_ts} and {max_ts} there was a total of {total_of_404} request errors")

Between 24/Sep/2014:22:25:44 +0000 and 24/Sep/2014:22:26:37 +0000 there was a total of 3 request errors
