In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, sum, count

# Создание SparkSession
spark = SparkSession.builder.appName("final_task").getOrCreate()
# Чтение данных
web_server_logs_df = spark.read.csv("web_server_logs.csv", header=True, inferSchema=True)

web_server_logs_df = web_server_logs_df.withColumn("timestamp", to_date(col("timestamp"), "yyyy-MM-dd"))

top_ip = web_server_logs_df.groupBy("ip").count().orderBy(col("count").desc()).limit(10).withColumnRenamed("count", "req_count")
print("Top 10 active IP addreses:")
top_ip.show()

method_count = web_server_logs_df.groupBy("method").count().limit(10).withColumnRenamed("count", "method_count")
print("Request count by HTTP method:")
method_count.show()

NumberOf404 = web_server_logs_df.filter(web_server_logs_df.response_code == '404').count()
print("Number of 404 response codes:", NumberOf404)

size_by_day = web_server_logs_df.groupBy("timestamp").agg(sum(col("response_size")).alias("total_response_size")).orderBy("timestamp")
print("Total response size by day:")
size_by_day.show()

spark.stop()


Top 10 active IP addreses:
+---------------+---------+
|             ip|req_count|
+---------------+---------+
| 145.24.162.146|        2|
|  100.37.110.47|        1|
|   20.32.75.124|        1|
|  43.12.236.141|        1|
| 42.175.236.159|        1|
|     94.6.71.59|        1|
| 180.94.109.180|        1|
| 183.176.40.146|        1|
|   40.218.10.25|        1|
|175.112.188.140|        1|
+---------------+---------+

Request count by HTTP method:
+------+------------+
|method|method_count|
+------+------------+
|  POST|       25065|
|DELETE|       25100|
|   PUT|       24851|
|   GET|       24984|
+------+------------+

Number of 404 response codes: 24947
Total response size by day:
+----------+-------------------+
| timestamp|total_response_size|
+----------+-------------------+
|2024-01-01|            1935318|
|2024-01-02|            2135349|
|2024-01-03|            2072520|
|2024-01-04|            1946777|
|2024-01-05|            1836362|
|2024-01-06|            2059674|
|2024-01-07|