In [20]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession,DataFrame
from pyspark.sql.functions import col, round, to_date, year, month, sum, avg

conf = (
     SparkConf()
    .setAppName('Simple_Spark')
    .setMaster('local[*]')
)


# Spark сессия
spark = (
    SparkSession
    .builder
    .config(conf=conf)
    # .enableHiveSupport()
    .getOrCreate()
)

24/09/06 10:58:38 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [21]:
weather_df = spark.read.csv("weather_data.csv", header=True)
weather_col = weather_df.columns

In [22]:
weather_col

['station_id', 'date', 'temperature', 'precipitation', 'wind_speed']

In [23]:
weather_df.filter(col("station_id").isNull()).count()
weather_df.filter(col("date").isNull()).count()
weather_df.filter(col("temperature").isNull()).count()
weather_df.filter(col("precipitation").isNull()).count()
weather_df.filter(col("wind_speed").isNull()).count()


0

In [24]:
# Преобразуйте столбец date в формат даты.
weather_format_df = (weather_df
.withColumn("date", to_date(col("date")))
.withColumn("temperature", col("temperature").cast("Double"))
)

weather_format_df.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- date: date (nullable = true)
 |-- temperature: double (nullable = true)
 |-- precipitation: string (nullable = true)
 |-- wind_speed: string (nullable = true)



In [25]:
weather_df.count()

1000

In [26]:
# Найдите топ-5 самых жарких дней за все время наблюдений.
(weather_format_df
.select("date", "temperature")
.orderBy(col("temperature").desc())
.limit(5)
.show()
)

+----------+------------------+
|      date|       temperature|
+----------+------------------+
|2021-08-20|39.982828249354846|
|2023-12-02| 39.96797489293784|
|2022-03-28|  39.8246894248997|
|2019-02-11| 39.76737697836647|
|2020-06-10| 39.69147838355929|
+----------+------------------+



In [27]:
# Найдите метеостанцию с наибольшим количеством осадков за последний год
(
    weather_format_df
    .filter(year(col("date")) == 2023)
    .groupBy("station_id")
    .agg(sum(col("precipitation")).alias("sum_precipitation"))
    .orderBy(col("sum_precipitation").desc())
    .limit(1)
    .select("station_id", "sum_precipitation")
    .show()
)

+----------+-----------------+
|station_id|sum_precipitation|
+----------+-----------------+
| station_5|642.9302626767898|
+----------+-----------------+



In [28]:
# Подсчитайте среднюю температуру по месяцам за все время наблюдений.

(
    weather_format_df
    .withColumn("month", month(col("date")))
    .groupBy("month")
    .agg(avg("temperature").alias("avg_temp"))
    .orderBy("month")
    .show()
)


+-----+------------------+
|month|          avg_temp|
+-----+------------------+
|    1|11.356518462550754|
|    2| 9.067229891101926|
|    3| 7.244080205633994|
|    4|12.024529009744693|
|    5| 9.902883346912718|
|    6|13.421092297254138|
|    7|6.1857183016954576|
|    8|  10.9678002814186|
|    9| 9.596744236573942|
|   10|  9.09884344821895|
|   11| 7.265889994697494|
|   12|11.218592100674337|
+-----+------------------+



In [29]:
spark.stop()