In [2]:
# PySpark Configuration Basics and Dynamic Config Example

from pyspark.sql import SparkSession

# Build (or reuse) the session; every option here is supplied up front,
# before the session exists, rather than through spark.conf.set() later.
spark = (
    SparkSession.builder
    .appName("PySpark Configuration Demo")
    .master("local[*]")
    .config("spark.executor.memory", "1g")
    .config("spark.driver.memory", "1g")
    .getOrCreate()
)




In [3]:
# Dump every (key, value) pair held by the SparkContext's configuration
print("=== Spark Configurations (Initial) ===")
initial_conf = spark.sparkContext.getConf().getAll()
for conf_entry in initial_conf:
    print(conf_entry)

# === Change settings on the live session ===
print("\n=== Setting Dynamic Configuration ===")
runtime_overrides = {
    "spark.sql.shuffle.partitions": "10",                 # number of partitions for shuffle
    "spark.sql.execution.arrow.pyspark.enabled": "true",  # Arrow optimization
}
for conf_key, conf_value in runtime_overrides.items():
    spark.conf.set(conf_key, conf_value)

# Fetch both values again to confirm what the session now reports
shuffle_partitions = spark.conf.get("spark.sql.shuffle.partitions")
arrow_enabled = spark.conf.get("spark.sql.execution.arrow.pyspark.enabled")

print(f"Shuffle Partitions: {shuffle_partitions}")
print(f"Arrow Enabled: {arrow_enabled}")


=== Spark Configurations (Initial) ===
('spark.app.name', 'PySpark Configuration Demo')
('spark.driver.host', 'dell')
('spark.executor.id', 'driver')
('spark.driver.memory', '1g')
('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirec

In [4]:

# === Invalid Configuration Test (Try / Except) ===
# Looking up a key that was never set raises; the handler prints the error
# so the cell finishes instead of aborting the notebook.
missing_key = "non.existing.config"
try:
    print("\n=== Trying to fetch non-existent config ===")
    missing_value = spark.conf.get(missing_key)
    print(missing_value)
except Exception as err:
    print("Caught Error:", err)

# Shut the session down now that the demo is finished
spark.stop()


=== Trying to fetch non-existent config ===
Caught Error: An error occurred while calling o170.get.
: java.util.NoSuchElementException: non.existing.config
	at org.apache.spark.sql.errors.QueryExecutionErrors$.noSuchElementExceptionError(QueryExecutionErrors.scala:2138)
	at org.apache.spark.sql.internal.SQLConf.$anonfun$getConfString$3(SQLConf.scala:5041)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.internal.SQLConf.getConfString(SQLConf.scala:5041)
	at org.apache.spark.sql.RuntimeConfig.get(RuntimeConfig.scala:81)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gate

In [5]:
# | Section                         | Purpose                        |
# | ------------------------------- | ------------------------------ |
# | `SparkSession.builder.config()` | Static config at session start |
# | `spark.conf.set()`              | Dynamic config at runtime      |
# | `spark.conf.get()`              | Read configs dynamically       |
# | `getAll()`                      | List all configs               |


In [6]:
# ⚙️ 1. Why Configuration Matters in PySpark?
# Spark has hundreds of configuration options. You can:

# Allocate memory and CPU resources.

# Control parallelism (e.g., number of partitions).

# Tune performance (e.g., caching, shuffle behavior).

# Enable/disable features (e.g., Arrow, adaptive execution).

# You can set configs in 2 ways:

# Static: Before creating SparkSession.

# Dynamic: After session starts using spark.conf.set().



In [7]:
from pyspark.sql import SparkSession

# Assemble the builder step by step: app name and master first, then the
# memory options, and finally create (or reuse) the session.
session_builder = SparkSession.builder.appName("ConfigDemo").master("local[*]")
session_builder = session_builder.config("spark.executor.memory", "1g")
session_builder = session_builder.config("spark.driver.memory", "1g")
spark = session_builder.getOrCreate()


In [8]:
# Override the shuffle partition count on the running session
spark.conf.set("spark.sql.shuffle.partitions", "10")

# Read it back and show the value the session now holds
current_shuffle_partitions = spark.conf.get("spark.sql.shuffle.partitions")
print(current_shuffle_partitions)


10


In [10]:
from pyspark.sql import SparkSession

# Step 0: Stop any session that is still running from an earlier cell.
# getOrCreate() returns an already-active session instead of building a new
# one; in that case the app name and memory options below would not be applied
# and Step 2 would print the earlier session's settings.
leftover_session = SparkSession.getActiveSession()
if leftover_session is not None:
    leftover_session.stop()

# Step 1: Create SparkSession with static configs
spark = (
    SparkSession.builder
    .appName("PySpark Config Demo")
    .master("local[*]")
    .config("spark.executor.memory", "1g")
    .config("spark.driver.memory", "1g")
    .getOrCreate()
)

# Step 2: Print all static configs
print("=== Static Spark Configurations ===")
for k, v in spark.sparkContext.getConf().getAll():
    print(f"{k}: {v}")

# Step 3: Set dynamic configs
spark.conf.set("spark.sql.shuffle.partitions", "10")
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Step 4: Read and print dynamic configs
print("\n=== Dynamic Configs After Setting ===")
print("spark.sql.shuffle.partitions:", spark.conf.get("spark.sql.shuffle.partitions"))
print("spark.sql.execution.arrow.pyspark.enabled:", spark.conf.get("spark.sql.execution.arrow.pyspark.enabled"))

# Step 5: Read a non-existing config key safely.
# Supplying a default makes spark.conf.get() return that default for an
# unset key instead of raising, so no try/except is needed here.
print("\nTrying to get non.existing.config:")
print(spark.conf.get("non.existing.config", "<not set>"))

# Step 6: Stop session (optional at end of notebook)
spark.stop()


=== Static Spark Configurations ===
spark.app.name: PySpark Config Demo
spark.driver.host: dell
spark.executor.id: driver
spark.driver.memory: 1g
spark.driver.extraJavaOptions: -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false
spark.executor.mem

In [11]:
# | Feature                 | Static Config           | Dynamic Config                 |
# | ----------------------- | ----------------------- | ------------------------------ |
# | Set Before SparkSession | ✅                       | ❌                              |
# | Set After SparkSession  | ❌                       | ✅                              |
# | Examples                | `spark.executor.memory` | `spark.sql.shuffle.partitions` |
# | Requires Restart        | ✅                       | ❌                              |


In [12]:
# | Scenario                              | Configuration                                      |
# | ------------------------------------- | -------------------------------------------------- |
# | Want to reduce shuffle overhead       | `spark.sql.shuffle.partitions = 10`                |
# | Want faster Spark ↔ Pandas conversion | `spark.sql.execution.arrow.pyspark.enabled = true` |
# | For big joins                         | `spark.sql.autoBroadcastJoinThreshold`             |


In [13]:
# ✅ 1. spark.conf.set("spark.sql.shuffle.partitions", "10")
# 📌 What it does:
# This sets the number of partitions to use when shuffling data in Spark SQL operations (like groupBy, join, orderBy, etc.).

# 💡 What is "shuffle"?
# Shuffling = Moving data between partitions (e.g., different worker nodes) during distributed operations. It's expensive (in terms of time and resources).

# 🧠 Why change this?
# Default is 200 partitions — too many for small datasets.

# If you're running on a laptop or small cluster, set this to 10 or lower to reduce overhead.

# ✅ Summary:
# Tells Spark: "Use 10 partitions instead of 200 during shuffle-heavy operations (like groupBy, join)."

# ✅ 2. spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
# 📌 What it does:
# This enables Apache Arrow for faster data transfer between Spark and Pandas (for .toPandas() and for spark.createDataFrame() on a Pandas DataFrame).

# 💡 What is Arrow?
# Arrow is a high-performance, in-memory format that makes Spark ↔ Pandas communication much faster.

# 🧠 Why use it?
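# If you often do:

# Example (Python):
# df.toPandas()
# or

# Example (Python):
# @pandas_udf(...)
# def my_udf(...):
#     ...
# Then Arrow-based transfer can make these operations significantly faster (the gain depends on data size and column types).
# Note: Pandas UDFs exchange data through Arrow regardless of this flag; the setting itself governs toPandas() and createDataFrame() from a Pandas DataFrame.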

# ✅ Summary:
# Tells Spark: "Use Apache Arrow to speed up communication between Spark and Pandas (for DataFrame conversion and Pandas UDFs)."

# 🎯 Real Use Case Example
# Example (Python):
# # This will be faster when Arrow is enabled
# pdf = spark.read.csv("data.csv", header=True).toPandas()
