In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import random
import json

# Generate sample JSON data
def generate_json(i):
    return json.dumps({
        "id": i,
        "user": f"user_{i % 100}",
        "score": random.randint(0, 100),
        "tags": [f"tag_{j}" for j in range(i % 5)],
        "metadata": {
            "ip": f"192.168.1.{i % 255}",
            "device": f"device_{i % 10}"
        }
    })

generate_json_udf = udf(generate_json, StringType())

df = spark.range(0, 100_000).withColumn(
    "json_str",
    generate_json_udf("id")
)

df.write.mode("overwrite").saveAsTable("ankurnayyar_cat1.demo_schema.benchmark_json_data")

In [0]:
from pyspark.sql.types import *

json_schema = StructType([
    StructField("id", LongType()),
    StructField("user", StringType()),
    StructField("score", IntegerType()),
    StructField("tags", ArrayType(StringType())),
    StructField("metadata", StructType([
        StructField("ip", StringType()),
        StructField("device", StringType())
    ]))
])


In [0]:
import time

df_raw = spark.table("ankurnayyar_cat1.demo_schema.benchmark_json_data")



In [0]:

start_time = time.time()
df_from_json = df_raw.withColumn("parsed", from_json("json_str", json_schema))
df_from_json.select("parsed.*").count()

print("from_json() duration:", time.time() - start_time)


from_json() duration: 19.20588707923889


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

# Define the schema matching your JSON structure
schema = StructType([
    StructField("id", IntegerType()),
    StructField("user", StringType()),
    StructField("score", IntegerType()),
    StructField("tags", ArrayType(StringType())),
    StructField("metadata", StructType([
        StructField("ip", StringType()),
        StructField("device", StringType())
    ]))
])

start_time = time.time()

df_variant = df_raw.select(
    from_json("json_str", schema).alias("json_var")
)
df_variant.select(
    "json_var.id",
    "json_var.user",
    "json_var.metadata.ip"
).count()

print("variant (from_json) duration:", time.time() - start_time)

variant (from_json) duration: 5.684422254562378


In [0]:
# Benchmarking JSON Parsing vs Variant Type

spark.sql("CREATE CATALOG IF NOT EXISTS ankurnayyar_cat1")
spark.sql("USE CATALOG ankurnayyar_cat1")
spark.sql("CREATE SCHEMA IF NOT EXISTS demo_schema")
spark.sql("USE SCHEMA demo_schema")

from pyspark.sql.functions import expr, col, from_json, udf
from pyspark.sql.types import *
import random, json
import time
import pandas as pd

# UDF to generate JSON string
def generate_json(i):
    return json.dumps({
        "id": i,
        "user": f"user_{i % 100}",
        "score": random.randint(0, 100),
        "tags": [f"tag_{j}" for j in range(i % 5)],
        "metadata": {
            "ip": f"192.168.1.{i % 255}",
            "device": f"device_{i % 10}"
        }
    })

generate_json_udf = udf(generate_json, StringType())

# Create 100,000 rows of JSON data as a table
df = spark.range(0, 100_0000).withColumn("json_str", generate_json_udf(col("id")))
df.write.mode("overwrite").saveAsTable("json_data_raw")

# Define schema for from_json
json_schema = StructType([
    StructField("id", LongType(), nullable=False),
    StructField("user", StringType(), nullable=True),
    StructField("score", IntegerType(), nullable=True),
    StructField("tags", ArrayType(StringType()), nullable=True),
    StructField("metadata", StructType([
        StructField("ip", StringType(), nullable=True),
        StructField("device", StringType(), nullable=True)
    ]), nullable=True)
])

# Benchmark setup
df_raw = spark.table("json_data_raw")

df_parsed = df_raw.withColumn(
    "parsed",
    from_json(col("json_str"), json_schema)
)
df_variant = df_raw.withColumn("v", expr("parse_json(json_str)"))

# Function to measure time of a lambda or function
def measure(func, *args, **kwargs):
    start = time.time()
    result = func(*args, **kwargs)
    end = time.time()
    duration = end - start
    return result, duration

# List of benchmark operations
benchmarks = []

# B1: parse JSON / variant
_, dur = measure(lambda: df_parsed.count())
benchmarks.append(("B1_parse_from_json_count", "from_json", dur))

_, dur = measure(lambda: df_variant.count())
benchmarks.append(("B1_parse_variant_count", "variant", dur))

# B2: select nested fields
_, dur = measure(lambda: df_parsed.select("parsed.user", "parsed.metadata.device").count())
benchmarks.append(("B2_select_from_json", "from_json", dur))

_, dur = measure(lambda: df_variant.select(
    expr("cast(v:user as string)").alias("user"),
    expr("cast(v:metadata.device as string)").alias("device")
).count())
benchmarks.append(("B2_select_variant", "variant", dur))

# B3: filter on nested field
_, dur = measure(lambda: df_parsed.filter(col("parsed.metadata.device") == "device_1").count())
benchmarks.append(("B3_filter_from_json", "from_json", dur))

_, dur = measure(lambda: df_variant.filter(
    expr("cast(v:metadata.device as string) = 'device_1'")
).count())
benchmarks.append(("B3_filter_variant", "variant", dur))

# B4: distinct count
_, dur = measure(lambda: df_parsed.select("parsed.user").distinct().count())
benchmarks.append(("B4_distinct_from_json", "from_json", dur))

_, dur = measure(lambda: df_variant.select(
    expr("cast(v:user as string)").alias("user")
).distinct().count())
benchmarks.append(("B4_distinct_variant", "variant", dur))

# B5: group by nested field
_, dur = measure(lambda: df_parsed.groupBy("parsed.metadata.device").count().count())
benchmarks.append(("B5_group_from_json", "from_json", dur))

_, dur = measure(lambda: df_variant.groupBy(
    expr("cast(v:metadata.device as string)").alias("device")
).count().count())
benchmarks.append(("B5_group_variant", "variant", dur))

# Writing to delta tables
path_from_json = "/tmp/benchmark/from_json_data"
path_variant = "/tmp/benchmark/variant_data"

_, dur = measure(lambda: df_parsed.write.format("delta").mode("overwrite").save(path_from_json))
benchmarks.append(("B6_write_from_json_delta", "from_json", dur))

_, dur = measure(lambda: df_variant.write.format("delta").mode("overwrite").save(path_variant))
benchmarks.append(("B6_write_variant_delta", "variant", dur))

# Reading back and projecting
_, dur = measure(lambda: spark.read.format("delta").load(path_from_json).select("parsed.user").count())
benchmarks.append(("B7_read_from_json_delta", "from_json", dur))

_, dur = measure(lambda: spark.read.format("delta").load(path_variant).select(
    expr("cast(v:user as string)").alias("user")
).count())
benchmarks.append(("B7_read_variant_delta", "variant", dur))

# Display results
df_res = pd.DataFrame(benchmarks, columns=["Benchmark", "Method", "Duration_sec"])
display(df_res)

# Save results as table
spark.createDataFrame(df_res).write.mode("overwrite").saveAsTable("benchmark_results")

Benchmark,Method,Duration_sec
B1_parse_from_json_count,from_json,0.924936056137085
B1_parse_variant_count,variant,0.7764875888824463
B2_select_from_json,from_json,0.6358168125152588
B2_select_variant,variant,0.6142821311950684
B3_filter_from_json,from_json,0.5060315132141113
B3_filter_variant,variant,0.5699584484100342
B4_distinct_from_json,from_json,0.6705121994018555
B4_distinct_variant,variant,0.6077392101287842
B5_group_from_json,from_json,0.7451913356781006
B5_group_variant,variant,0.7954986095428467


Databricks visualization. Run in Databricks to view.

In [0]:
# If not already installed, run in a separate cell:
# %pip install psutil

import psutil

def measure_with_memory(func, *args, **kwargs):
    process = psutil.Process()
    mem_before = process.memory_info().rss
    start = time.time()
    result = func(*args, **kwargs)
    end = time.time()
    mem_after = process.memory_info().rss
    duration = end - start
    mem_used_mb = (mem_after - mem_before) / (1024 * 1024)
    return result, duration, mem_used_mb

benchmarks = []

# Example for one benchmark:
_, dur, mem = measure_with_memory(lambda: df_parsed.count())
benchmarks.append(("B1_parse_from_json_count", "from_json", dur, mem))

# Repeat for all other benchmarks:
_, dur, mem = measure_with_memory(lambda: df_variant.count())
benchmarks.append(("B1_parse_variant_count", "variant", dur, mem))

# ...repeat for all other operations, replacing measure() with measure_with_memory()

# When done, create the DataFrame:
df_res = pd.DataFrame(
    benchmarks, 
    columns=["Benchmark", "Method", "Duration_sec", "Memory_MB"]
)
display(df_res)

Benchmark,Method,Duration_sec,Memory_MB
B1_parse_from_json_count,from_json,0.7810497283935547,0.0
B1_parse_variant_count,variant,0.7221431732177734,0.00390625
