<a href="https://colab.research.google.com/github/andrey-de/stepik_de_jun_spark_in_collab/blob/main/SparkTaskFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install faker

Collecting faker
  Downloading faker-37.5.3-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.5.3-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.5.3


In [20]:
import csv
from faker import Faker
import random

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date
from pyspark.sql.functions import sum, desc, max, min, count
from pyspark.sql.functions import year, month, avg

In [3]:
fake = Faker()

num_records = 100000

http_methods = ['GET', 'POST', 'PUT', 'DELETE']
response_codes = [200, 301, 404, 500]

file_path = "/content/sample_data/web_server_logs.csv"

with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['ip', 'timestamp', 'method', 'url', 'response_code', 'response_size'])

    for _ in range(num_records):
        ip = fake.ipv4()
        timestamp = fake.date_time_this_year().isoformat()
        method = random.choice(http_methods)
        url = fake.uri_path()
        response_code = random.choice(response_codes)
        response_size = random.randint(100, 10000)

        writer.writerow([ip, timestamp, method, url, response_code, response_size])

print(f"Сгенерировано {num_records} записей и сохранено в {file_path}")

Сгенерировано 100000 записей и сохранено в /content/sample_data/web_server_logs.csv


In [6]:
# Create SparkSession
spark = SparkSession.builder.appName("WebServerLogs").getOrCreate()

In [7]:
# Read CSV-files
df = spark.read.csv("/content/sample_data/web_server_logs.csv", header=True, inferSchema=True)

In [9]:
# Print schemas
df.printSchema()

root
 |-- ip: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- method: string (nullable = true)
 |-- url: string (nullable = true)
 |-- response_code: integer (nullable = true)
 |-- response_size: integer (nullable = true)



In [28]:
#result 1
df_res1 = df.select("ip").groupBy("ip").agg(count("ip").alias("request_count"))
print("Top 10 active IP addresses:")
df_res1.orderBy(desc("request_count")).show(10)

Top 10 active IP addresses:
+--------------+-------------+
|            ip|request_count|
+--------------+-------------+
| 25.119.58.204|            2|
| 193.65.83.104|            2|
|99.218.136.185|            1|
|  78.119.214.6|            1|
| 133.52.61.164|            1|
| 214.174.73.79|            1|
|  183.3.248.91|            1|
|194.60.158.181|            1|
|44.239.105.197|            1|
|209.86.206.217|            1|
+--------------+-------------+
only showing top 10 rows



In [29]:
#result 2
df_res2 = df.select("method").groupBy("method").agg(count("method").alias("method_count"))
print("Request count by HTTP method:")
df_res2.orderBy(desc("method_count")).show(10)

Request count by HTTP method:
+------+------------+
|method|method_count|
+------+------------+
|  POST|       25063|
|   GET|       25060|
|   PUT|       25010|
|DELETE|       24867|
+------+------------+



In [23]:
#result 3
df_res3 = df.filter("response_code = 404")
print(f'Number of 404 response codes: {df_res3.count()}')


Number of 404 response codes: 24994


In [30]:
#result 4
df_res4 = df.withColumn("date", to_date("timestamp")).select("date", "response_size").groupBy("date").agg(sum("response_size").alias("total_response_size"))
print("Total response size by day:")
df_res4.orderBy(("date")).show(25)

Total response size by day:
+----------+-------------------+
|      date|total_response_size|
+----------+-------------------+
|2025-01-01|            2475782|
|2025-01-02|            2338529|
|2025-01-03|            2397206|
|2025-01-04|            2100137|
|2025-01-05|            2382718|
|2025-01-06|            2162305|
|2025-01-07|            2323693|
|2025-01-08|            2103839|
|2025-01-09|            2235231|
|2025-01-10|            2699383|
|2025-01-11|            2346570|
|2025-01-12|            2379490|
|2025-01-13|            2264299|
|2025-01-14|            2180146|
|2025-01-15|            2333613|
|2025-01-16|            2322633|
|2025-01-17|            2147212|
|2025-01-18|            2265839|
|2025-01-19|            2371105|
|2025-01-20|            2337550|
|2025-01-21|            2424899|
|2025-01-22|            2217645|
|2025-01-23|            2313095|
|2025-01-24|            2331356|
|2025-01-25|            2462046|
+----------+-------------------+
only showing to

In [21]:
spark.stop()