# PySpark Log File Analysis
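This notebook parses synthetic Apache access logs with PySpark and reports the most active client IPs, the most requested URLs, and request counts by HTTP status class.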

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract, col, when
# Get the active Spark session, or create one named "LogAnalysis" if none exists.
spark = (
    SparkSession.builder
    .appName("LogAnalysis")
    .getOrCreate()
)

In [None]:
# Load the raw access log as text: one row per line, exposed in the string
# column 'value' that the extractions below read from.
log_df = spark.read.text("../data/access_log.txt")

# Apache Common Log Format:
#   host ident user [timestamp] "method url protocol" status size
# Capture groups: 1=ip, 2=timestamp, 3=method, 4=url, 5=status, 6=size.
#   * ident and user are matched with \S+ rather than a literal "- -", so
#     lines that carry a remote user name still parse.
#   * size accepts "-" as well as digits, because Apache writes "-" for the
#     response size when no body was sent; requiring digits would make the
#     whole pattern fail and blank out every field of that line.
pattern = r'^(\S+) \S+ \S+ \[(.*?)\] "(\S+) (\S+) \S+" (\d{3}) (\d+|-)'

# Split each raw line into typed columns. regexp_extract yields an empty
# string when the pattern does not match, so malformed lines produce empty
# string fields.
# NOTE(review): a "-" size (and any empty status/size) is not numeric; with
# non-ANSI cast semantics it becomes null -- confirm that null, rather than 0,
# is the desired value for bodiless responses.
logs = log_df.select(
    regexp_extract('value', pattern, 1).alias('ip'),
    regexp_extract('value', pattern, 2).alias('timestamp'),
    regexp_extract('value', pattern, 3).alias('method'),
    regexp_extract('value', pattern, 4).alias('url'),
    regexp_extract('value', pattern, 5).cast('int').alias('status'),
    regexp_extract('value', pattern, 6).cast('int').alias('size'),
)

In [None]:
# Top IPs
# Count requests per client IP and display the busiest clients first.
(
    logs.groupBy('ip')
    .count()
    .orderBy(col('count').desc())
    .show()
)

In [None]:
# Most Requested URLs
# Count requests per URL and display the most frequently hit ones first.
(
    logs.groupBy('url')
    .count()
    .orderBy(col('count').desc())
    .show()
)

In [None]:
# Error Rate Analysis
# Label every request with its HTTP status class, then count each class.
#   'unknown' - status is null (e.g. the line did not match the parsing
#               regex); checked first, because a null status fails every
#               comparison below and would otherwise be counted as 'OK'.
#   '4xx'     - 400 <= status < 500
#   '5xx'     - status >= 500
#   'OK'      - any other (non-null) status
logs = logs.withColumn(
    'error_type',
    when(col('status').isNull(), 'unknown')
    .when((col('status') >= 400) & (col('status') < 500), '4xx')
    .when(col('status') >= 500, '5xx')
    .otherwise('OK'),
)
logs.groupBy('error_type').count().show()

In [None]:
# Analysis is complete: stop the Spark session created at the top of the notebook.
spark.stop()