In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

In [3]:
# Obtain the Spark session for this notebook (getOrCreate returns an existing
# session if one is already active) and report which Spark version is running.
spark = (
    SparkSession.builder
    .appName("Basic Spark Lab")
    .getOrCreate()
)

print(f"Spark Version: {spark.version}")

Spark Version: 3.5.0


In [4]:
# Build a DataFrame with data about cities: one (city, country, population)
# tuple per row, with column names supplied separately as the schema.
columns = ["City", "Country", "Population"]
data = [
    ("Rome", "Italy", 2873000),
    ("Vienna", "Austria", 1900000),
    ("Berlin", "Germany", 3669000),
]
df = spark.createDataFrame(data, schema=columns)

In [5]:
# Print the DataFrame's rows as a text table.
df.show()

+------+-------+----------+
|  City|Country|Population|
+------+-------+----------+
|  Rome|  Italy|   2873000|
|Vienna|Austria|   1900000|
|Berlin|Germany|   3669000|
+------+-------+----------+



In [6]:
# Sort the cities by population, smallest first.
df_sorted = df.sort("Population", ascending=True)
df_sorted.show()

+------+-------+----------+
|  City|Country|Population|
+------+-------+----------+
|Vienna|Austria|   1900000|
|  Rome|  Italy|   2873000|
|Berlin|Germany|   3669000|
+------+-------+----------+



In [7]:
# Find the city with the largest population: order by population from
# largest to smallest and keep only the first row.
most_populated_city = df.orderBy("Population", ascending=False).limit(1)
most_populated_city.show()

+------+-------+----------+
|  City|Country|Population|
+------+-------+----------+
|Berlin|Germany|   3669000|
+------+-------+----------+



In [8]:
# Add a "Population_Category" column that labels each city by population:
# below 2,000,000; from 2,000,000 up to (but not including) 3,000,000;
# and everything else.
population = col("Population")
population_category = (
    when(population < 2000000, "Маленьке")
    # Branches are evaluated in order, so rows reaching this one are already
    # known not to be below 2,000,000; only the upper bound needs checking.
    .when(population < 3000000, "Середнє")
    .otherwise("Велике")
)
df_with_category = df.withColumn("Population_Category", population_category)
df_with_category.show()

+------+-------+----------+-------------------+
|  City|Country|Population|Population_Category|
+------+-------+----------+-------------------+
|  Rome|  Italy|   2873000|            Середнє|
|Vienna|Austria|   1900000|           Маленьке|
|Berlin|Germany|   3669000|             Велике|
+------+-------+----------+-------------------+



In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()