<a href="https://colab.research.google.com/github/amrit6878/Learning-PySpark/blob/main/FunctionsOfArrayType.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Commonly used ArrayType functions, with examples:
1. `size()` - Returns the number of elements in the array.
2. `array_contains()` - Checks if a value exists in the array (returns boolean).
3. `explode()` - Converts an array into multiple rows (flattening).
4. `concat()` - Merges two or more array columns.
5. `array_distinct()` - Removes duplicate values from an array.
6. `array_union()` - Returns the union of two arrays, removing duplicates.
7. `array_intersect()` - Returns common elements of two arrays.
8. `sort_array()` - Sorts array elements in ascending/descending order.


In [2]:
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType

# Create (or reuse) the Spark session used by every cell in this notebook.
spark = SparkSession.builder.appName("FunctionsOfArrayType").getOrCreate()

# Sample rows laid out as (id, name, skills, scores).
# Row 3 lists "Python" twice, so it contains a duplicate array element.
data = [
    (1, "Amrit", ["Python", "Spark", "SQL"], [85, 90, 88]),
    (2, "Riya", ["Java", "Python"], [78, 85]),
    (3, "John", ["Python", "JavaScript", "Python"], [60, 75, 70]),
    (4, "Sara", ["C++", "Java"], [95, 88]),
    (5, "Alex", ["Go", "Rust", "Python"], [82, 79, 91]),
]

# Explicit schema: two scalar columns followed by two array columns.
# Every field is declared nullable.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("skills", ArrayType(StringType()), True)
    .add("scores", ArrayType(IntegerType()), True)
)

df = spark.createDataFrame(data, schema)

# Show the full (untruncated) rows, then the resolved schema.
df.show(truncate=False)
df.printSchema()

+---+-----+----------------------------+------------+
|id |name |skills                      |scores      |
+---+-----+----------------------------+------------+
|1  |Amrit|[Python, Spark, SQL]        |[85, 90, 88]|
|2  |Riya |[Java, Python]              |[78, 85]    |
|3  |John |[Python, JavaScript, Python]|[60, 75, 70]|
|4  |Sara |[C++, Java]                 |[95, 88]    |
|5  |Alex |[Go, Rust, Python]          |[82, 79, 91]|
+---+-----+----------------------------+------------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- scores: array (nullable = true)
 |    |-- element: integer (containsNull = true)



In [3]:
# size(): number of elements in each row's `skills` array.
df.select(
    col("name"),
    size(col("skills")).alias("num_skills"),
).show()

+-----+----------+
| name|num_skills|
+-----+----------+
|Amrit|         3|
| Riya|         2|
| John|         3|
| Sara|         2|
| Alex|         3|
+-----+----------+



In [4]:
# array_contains(): boolean flag per row -- is "Python" one of the skills?
df.select(
    col("name"),
    array_contains(col("skills"), "Python").alias("knows_python"),
).show()

+-----+------------+
| name|knows_python|
+-----+------------+
|Amrit|        true|
| Riya|        true|
| John|        true|
| Sara|       false|
| Alex|        true|
+-----+------------+



In [5]:
# explode(): emit one output row per element of `skills`,
# repeating `name` alongside each element.
df.select(
    col("name"),
    explode(col("skills")).alias("skill"),
).show()

+-----+----------+
| name|     skill|
+-----+----------+
|Amrit|    Python|
|Amrit|     Spark|
|Amrit|       SQL|
| Riya|      Java|
| Riya|    Python|
| John|    Python|
| John|JavaScript|
| John|    Python|
| Sara|       C++|
| Sara|      Java|
| Alex|        Go|
| Alex|      Rust|
| Alex|    Python|
+-----+----------+



In [6]:
# concat(): append a one-element array ["SQL"] to each row's `skills`.
# Duplicates are kept, so a row that already lists "SQL" ends up with it twice.
df.withColumn(
    "updated_skills",
    concat(df["skills"], array(lit("SQL"))),
).show(truncate=False)

+---+-----+----------------------------+------------+---------------------------------+
|id |name |skills                      |scores      |updated_skills                   |
+---+-----+----------------------------+------------+---------------------------------+
|1  |Amrit|[Python, Spark, SQL]        |[85, 90, 88]|[Python, Spark, SQL, SQL]        |
|2  |Riya |[Java, Python]              |[78, 85]    |[Java, Python, SQL]              |
|3  |John |[Python, JavaScript, Python]|[60, 75, 70]|[Python, JavaScript, Python, SQL]|
|4  |Sara |[C++, Java]                 |[95, 88]    |[C++, Java, SQL]                 |
|5  |Alex |[Go, Rust, Python]          |[82, 79, 91]|[Go, Rust, Python, SQL]          |
+---+-----+----------------------------+------------+---------------------------------+



In [7]:
# array_distinct(): drop repeated elements within each row's `skills` array.
df.select(
    col("name"),
    array_distinct(col("skills")).alias("unique_skills"),
).show(truncate=False)

+-----+--------------------+
|name |unique_skills       |
+-----+--------------------+
|Amrit|[Python, Spark, SQL]|
|Riya |[Java, Python]      |
|John |[Python, JavaScript]|
|Sara |[C++, Java]         |
|Alex |[Go, Rust, Python]  |
+-----+--------------------+



In [8]:
# array_union(): union of `skills` with the literal array ["Python"];
# the result contains no duplicate elements.
df.select(
    col("name"),
    array_union(col("skills"), array(lit("Python"))).alias("skills_union"),
).show(truncate=False)

+-----+--------------------+
|name |skills_union        |
+-----+--------------------+
|Amrit|[Python, Spark, SQL]|
|Riya |[Java, Python]      |
|John |[Python, JavaScript]|
|Sara |[C++, Java, Python] |
|Alex |[Go, Rust, Python]  |
+-----+--------------------+



In [9]:
# array_intersect(): keep only the skills that also appear in ["Python", "Java"].
df.select(
    col("name"),
    array_intersect(
        col("skills"),
        array(lit("Python"), lit("Java")),
    ).alias("common_skills"),
).show(truncate=False)


+-----+--------------+
|name |common_skills |
+-----+--------------+
|Amrit|[Python]      |
|Riya |[Java, Python]|
|John |[Python]      |
|Sara |[Java]        |
|Alex |[Python]      |
+-----+--------------+



In [10]:
# sort_array(): order each row's `scores`; asc=False sorts highest first.
df.select(
    col("name"),
    sort_array(col("scores"), asc=False).alias("sorted_scores"),
).show(truncate=False)

+-----+-------------+
|name |sorted_scores|
+-----+-------------+
|Amrit|[90, 88, 85] |
|Riya |[85, 78]     |
|John |[75, 70, 60] |
|Sara |[95, 88]     |
|Alex |[91, 82, 79] |
+-----+-------------+

