In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json

# Create Spark session
spark = SparkSession.builder \
    .appName("Test Spark JSONs") \
    .master("local[*]") \
    .enableHiveSupport() \
    .getOrCreate()

# Sample data with JSON strings
json_data = [
    ('{"name": "John", "age": 30, "city": "NYC"}',),
    ('{"name": "Jane", "age": 25, "city": "LA"}',),
]
   

In [2]:

df_str = spark.createDataFrame(json_data, ["json_string"])
# Convert JSON string to struct
struct_json = StructType([
    StructField('name', dataType=StringType(), nullable=True),
    StructField('age', dataType=IntegerType(), nullable=True),
    StructField('city', dataType=StringType(), nullable=True),
])

df_json = df_str.select('json_string', from_json(col("json_string"), struct_json).alias("json_data"))
df_json.show(truncate=False)

+------------------------------------------+---------------+
|json_string                               |json_data      |
+------------------------------------------+---------------+
|{"name": "John", "age": 30, "city": "NYC"}|{John, 30, NYC}|
|{"name": "Jane", "age": 25, "city": "LA"} |{Jane, 25, LA} |
+------------------------------------------+---------------+



In [3]:
df_extracted_json = df_json.select(col("json_data.name").alias("name"),
                                   col("json_data.age").alias("age"),
                                   col("json_data.city").alias("city"))
df_extracted_json.show(truncate=False)
df_extracted_json.printSchema()
df_extracted_json.columns

+----+---+----+
|name|age|city|
+----+---+----+
|John|30 |NYC |
|Jane|25 |LA  |
+----+---+----+

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- city: string (nullable = true)



['name', 'age', 'city']

In [5]:
df_sel = df_extracted_json.select('*')
df_sel.show(truncate=False)

+----+---+----+
|name|age|city|
+----+---+----+
|John|30 |NYC |
|Jane|25 |LA  |
+----+---+----+



In [22]:
column_stats = df_extracted_json.describe().collect()
for stat in column_stats:
    print(stat)

Row(summary='count', name='2', age='2', city='2')
Row(summary='mean', name=None, age='27.5', city=None)
Row(summary='stddev', name=None, age='3.5355339059327378', city=None)
Row(summary='min', name='Jane', age='25', city='LA')
Row(summary='max', name='John', age='30', city='NYC')


In [None]:
# [catalog].[database/schema].[table]
current_catalog = spark.catalog.currentCatalog()
print(f"Current Catalog: {current_catalog}")

current_db = spark.catalog.currentDatabase()
print(f"Current database/schema: {current_db}")


Current database: default
Current schema: spark_catalog


In [23]:
df_str.write.mode("overwrite").saveAsTable("df_str")
spark.sql("DROP TABLE IF EXISTS df_str")

df_str.write.mode("overwrite").saveAsTable("original_jsons")


In [6]:
df_json_sql = spark.sql("""
select json_string, from_json(json_string, 'struct<name:string, age:int, city:string>') as json_data
from original_jsons
""")
df_json_sql.show(truncate=False)

+------------------------------------------+---------------+
|json_string                               |json_data      |
+------------------------------------------+---------------+
|{"name": "John", "age": 30, "city": "NYC"}|{John, 30, NYC}|
|{"name": "Jane", "age": 25, "city": "LA"} |{Jane, 25, LA} |
+------------------------------------------+---------------+



In [20]:
df = spark.table("spark_catalog.default.original_jsons")
df.show(truncate=False)

+------------------------------------------+
|json_string                               |
+------------------------------------------+
|{"name": "John", "age": 30, "city": "NYC"}|
|{"name": "Jane", "age": 25, "city": "LA"} |
+------------------------------------------+



---
## Creating Arrays

In [8]:
# Create array from multiple columns
df_arrays = spark.createDataFrame([
    ("John", 25, 30),
    ("Jane", 22, 28),
], ["name", "score1", "score2"])

df_with_array = df_arrays.select(
    col("name"),
    array(col("score1"), col("score2")).alias("scores")
)

# Create array with literals
df_literal_array = df_arrays.select(
    col("name"),
    array(lit(1), lit(2), lit(3)).alias("fixed_array")
)

In [9]:
df_arrays.show(truncate=False)
df_with_array.show(truncate=False)
df_literal_array.show(truncate=False)

+----+------+------+
|name|score1|score2|
+----+------+------+
|John|25    |30    |
|Jane|22    |28    |
+----+------+------+

+----+--------+
|name|scores  |
+----+--------+
|John|[25, 30]|
|Jane|[22, 28]|
+----+--------+

+----+-----------+
|name|fixed_array|
+----+-----------+
|John|[1, 2, 3]  |
|Jane|[1, 2, 3]  |
+----+-----------+



In [16]:
# Array Operations
df_with_array.select('*',size(col("scores")).alias("array_size")) \
    .select('*', array_contains(col("scores"), 5).alias("contains_25")).show(truncate=False)

+----+--------+----------+-----------+
|name|scores  |array_size|contains_25|
+----+--------+----------+-----------+
|John|[25, 30]|2         |false      |
|Jane|[22, 28]|2         |false      |
+----+--------+----------+-----------+



In [None]:
# test out array disinct, intersect, union, except
data_rows = [
    ("John", [1, 2, 3], [3,4,5], ['high-school', 'bachelors', 'masters']),
    ("Jane", [2, 3, 4], [5,6,7], ['high-school', 'bachelors']),
    ("Ron", [1, 2, 2], [2, 3, 4], ['bachelors', 'masters']),
    ("Alice", [1, 2, 3], [1, 5, 4, 5, 6], ['high-school', 'bachelors', 'masters']),
]
df_arrays = spark.createDataFrame(data_rows, schema=["name", "scores1", "scores2", "degrees"])
df_arrays.show(truncate=False)

+-----+---------+---------------+---------------------------------+
|name |scores1  |scores2        |degrees                          |
+-----+---------+---------------+---------------------------------+
|John |[1, 2, 3]|[3, 4, 5]      |[high-school, bachelors, masters]|
|Jane |[2, 3, 4]|[5, 6, 7]      |[high-school, bachelors]         |
|Ron  |[1, 2, 2]|[2, 3, 4]      |[bachelors, masters]             |
|Alice|[1, 2, 3]|[1, 5, 4, 5, 6]|[high-school, bachelors, masters]|
+-----+---------+---------------+---------------------------------+



DataFrame[name: string, scores1: array<bigint>, scores2: array<bigint>, degrees: array<string>]

In [25]:
df_arr_op = df_arrays.alias('arr').withColumn("distinct_scores1", array_distinct(col("scores1"))) \
    .withColumn("overlap", array_intersect(col("scores1"), col("scores2"))) \
    .withColumn("union", array_union(col("scores1"), col("arr.scores2"))) \
    .withColumn("except", array_except(col("union"), array(lit(4),lit(5),lit(6))))

df_arr_op.show(truncate=False)

+-----+---------+---------------+---------------------------------+----------------+-------+------------------+---------+
|name |scores1  |scores2        |degrees                          |distinct_scores1|overlap|union             |except   |
+-----+---------+---------------+---------------------------------+----------------+-------+------------------+---------+
|John |[1, 2, 3]|[3, 4, 5]      |[high-school, bachelors, masters]|[1, 2, 3]       |[3]    |[1, 2, 3, 4, 5]   |[1, 2, 3]|
|Jane |[2, 3, 4]|[5, 6, 7]      |[high-school, bachelors]         |[2, 3, 4]       |[]     |[2, 3, 4, 5, 6, 7]|[2, 3, 7]|
|Ron  |[1, 2, 2]|[2, 3, 4]      |[bachelors, masters]             |[1, 2]          |[2]    |[1, 2, 3, 4]      |[1, 2, 3]|
|Alice|[1, 2, 3]|[1, 5, 4, 5, 6]|[high-school, bachelors, masters]|[1, 2, 3]       |[1]    |[1, 2, 3, 5, 4, 6]|[1, 2, 3]|
+-----+---------+---------------+---------------------------------+----------------+-------+------------------+---------+



In [32]:
df_options = df_arr_op.selectExpr("uuid() as uid",
    "row_number() over(order by name) as row_num",
    "current_timestamp() as inserted_dt",
    """case when array_contains(degrees, 'high-school') then 1
        when array_contains(degrees, 'bachelors') then 2
        when array_contains(degrees, 'masters') then 3
        else 0 end as degree_level """,
    "*")

df_options.show(truncate=False)

+------------------------------------+-------+--------------------------+------------+-----+---------+---------------+---------------------------------+----------------+-------+------------------+---------+
|uid                                 |row_num|inserted_dt               |degree_level|name |scores1  |scores2        |degrees                          |distinct_scores1|overlap|union             |except   |
+------------------------------------+-------+--------------------------+------------+-----+---------+---------------+---------------------------------+----------------+-------+------------------+---------+
|15eca0ca-3f7f-4994-8445-20ae46e05812|1      |2025-06-08 11:01:33.801207|1           |Alice|[1, 2, 3]|[1, 5, 4, 5, 6]|[high-school, bachelors, masters]|[1, 2, 3]       |[1]    |[1, 2, 3, 5, 4, 6]|[1, 2, 3]|
|55df402e-ad96-47c3-b052-5a34c7eed2da|2      |2025-06-08 11:01:33.801207|1           |Jane |[2, 3, 4]|[5, 6, 7]      |[high-school, bachelors]         |[2, 3, 4]       |[] 