In [1]:
# Run the shared session notebook first. The `spark` name used by later cells is
# not defined anywhere in this notebook, so it presumably comes from there — TODO confirm.
%run nb0.spark-session.ipynb

`ArrayType` models a column as an ordered collection of elements that share a single data type, allowing Spark SQL to store and manipulate arrays natively. It supports nested schemas (an `ArrayType` can hold `StructType`, `MapType`, or other `ArrayType` values) and tracks nullability at both the column level (`nullable`) and the element level (`containsNull`). Array columns work with functions such as `size`, `array_max`, `array_sort`, and `explode`, and can be produced by aggregate functions such as `collect_set`.

In [9]:
from pyspark.sql import Row, functions as F

# Sample data: one Row per user with an array of integer scores.
# User 3 deliberately has an empty array so the later cells show how the
# array functions treat an empty collection.
users = [
    Row(user_id=uid, scores=vals)
    for uid, vals in [
        (1, [83, 91, 77]),
        (2, [68, 88]),
        (3, []),
    ]
]
scores_df = spark.createDataFrame(users)
display(scores_df)

user_id,scores
1,"[83, 91, 77]"
2,"[68, 88]"
3,[]


In [10]:
# Print the inferred schema: `scores` should appear as an array column whose
# element type and containsNull flag are listed on the nested `element` line.
scores_df.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- scores: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [11]:
# Per-user array summaries: number of elements and the largest element.
# For an empty array, size is 0 and array_max is NULL (user 3 in the output).
(
    scores_df
    .select(
        F.col("user_id"),
        F.col("scores"),
        F.size(F.col("scores")).alias("score_count"),
        F.array_max(F.col("scores")).alias("top_score"),
    )
    .show()
)

+-------+------------+-----------+---------+
|user_id|      scores|score_count|top_score|
+-------+------------+-----------+---------+
|      1|[83, 91, 77]|          3|       91|
|      2|    [68, 88]|          2|       88|
|      3|          []|          0|     NULL|
+-------+------------+-----------+---------+



In [14]:
# Collapse (user_id, topic) pairs into one row per user holding the distinct
# topics as an array. collect_set does not guarantee element order, so the
# arrays may come back in any order.
topics_df = (
    spark.createDataFrame(
        data=[(1, "A"), (1, "B"), (2, "B"), (2, "C")],
        schema=["user_id", "topic"],
    )
    .groupBy("user_id")
    .agg(F.collect_set("topic").alias("topics"))
)

display(topics_df)

user_id,topics
1,"[A, B]"
2,"[C, B]"


In [4]:
# Sort each user's topic array in ascending order so the displayed result no
# longer depends on the order collect_set happened to produce.
(
    topics_df
    .select(
        F.col("user_id"),
        F.array_sort(F.col("topics")).alias("sorted_topics"),
    )
    .show()
)

+-------+-------------+
|user_id|sorted_topics|
+-------+-------------+
|      1|       [A, B]|
|      2|       [B, C]|
+-------+-------------+



In [17]:
# Flatten the array: one output row per score. explode_outer (unlike explode)
# keeps users whose array is empty, emitting a single row with a NULL score.
df_exploded = scores_df.select(
    F.col("user_id"),
    F.explode_outer(F.col("scores")).alias("score"),
)
display(df_exploded)

user_id,score
1,83.0
1,91.0
1,77.0
2,68.0
2,88.0
3,


In [19]:
# Average score per user.
# The alias has to be applied to the aggregate *column*: calling .alias() on the
# DataFrame returned by .avg() only names the DataFrame and leaves the column
# header as "avg(score)". Users whose only score is NULL get a NULL average.
(
    df_exploded
    .groupBy("user_id")
    .agg(F.avg("score").alias("avg_score"))
    .show()
)

+-------+-----------------+
|user_id|       avg(score)|
+-------+-----------------+
|      1|83.66666666666667|
|      2|             78.0|
|      3|             NULL|
+-------+-----------------+

