In [None]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id

# Session with explicit driver/executor sizing for a large aggregation job.
# NOTE(review): driver/executor sizing is presumably only honoured when no
# Spark session/JVM is already running in this kernel — confirm on a fresh kernel.
spark_builder = (
    SparkSession.builder
    .appName("Driver vs Executor Config - 100GB Data")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "30g")
    .config("spark.executor.cores", "5")
    .config("spark.driver.cores", "4")
    # 200 is also Spark's documented default for this setting; it is pinned
    # here so the shuffle width of the groupBy below is explicit.
    .config("spark.sql.shuffle.partitions", "200")
)

spark = spark_builder.getOrCreate()

try:
    # Build a 100-million-row frame with a single `id` column.
    # withColumn("id", ...) REPLACES the consecutive ids produced by range()
    # with generated ids that are unique and increasing but not consecutive,
    # which is why the per-key counts below are close to, but not exactly,
    # 1,000,000 each.
    # NOTE(review): one 64-bit column is far below the 100 GB in the app name;
    # extra columns would be needed to approach that size.
    large_df = spark.range(0, 100_000_000).withColumn("id", monotonically_increasing_id())

    # Bucket the ids into 100 keys and count rows per key; the groupBy forces
    # a shuffle, which exercises the executors.
    result = large_df.selectExpr("id % 100 as key").groupBy("key").count()
    result.show()
finally:
    # Always release the session, even if the job fails, so a later
    # getOrCreate() does not silently reuse it with these settings.
    spark.stop()


+---+-------+
|key|  count|
+---+-------+
| 26| 999999|
| 29|1000000|
| 65|1000000|
| 19| 999999|
| 54|1000000|
|  0|1000001|
| 22|1000000|
|  7|1000000|
| 77|1000000|
| 34| 999999|
| 50|1000000|
| 94|1000000|
| 57|1000000|
| 32|1000000|
| 43|1000000|
| 84|1000001|
| 31|1000000|
| 39|1000000|
| 98|1000000|
| 25| 999999|
+---+-------+
only showing top 20 rows



In [None]:
from pyspark.sql import SparkSession

# NameNode address, used both as the default filesystem and for explicit paths.
HDFS_URI = "hdfs://localhost:9000"

# Local-mode session whose Hadoop configuration points at the HDFS NameNode.
spark = (
    SparkSession.builder
    .appName("Spark with HDFS")
    .master("local[*]")
    .config("spark.hadoop.fs.defaultFS", HDFS_URI)
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Confirm HDFS access by reading a known text file (one row per line).
df = spark.read.text(f"{HDFS_URI}/input/myfile.txt")
df.show()

# List the HDFS root through the Hadoop FileSystem API, reusing the session
# created above (a second getOrCreate() would only return this same session).
# The trailing "/" makes the target an absolute path: the filesystem root.
# NOTE(review): `_jvm` and `_jsc` are private PySpark attributes; this relies
# on the py4j gateway and may change between PySpark versions.
hadoop_fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(
    spark._jsc.hadoopConfiguration()
)
root_path = spark._jvm.org.apache.hadoop.fs.Path(f"{HDFS_URI}/")

# Display the entries as plain path strings instead of an opaque Java array.
[status.getPath().toString() for status in hadoop_fs.listStatus(root_path)]




In [None]:

from pyspark.sql import SparkSession

# getOrCreate() hands back an already-running session without applying new
# executor settings or the app name, so stop any leftover session first to
# make sure the static configuration below is actually used.
leftover_session = SparkSession.getActiveSession()
if leftover_session is not None:
    leftover_session.stop()

# Static allocation: executor cores, heap and overhead memory are fixed up front.
spark_static = (
    SparkSession.builder
    .appName("StaticConfigExample")
    .config("spark.executor.cores", "4")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.memoryOverhead", "2g")
    .getOrCreate()
)

try:
    print("\n===> Static Spark Session Started")
    # Double every id in a 100-million-row range and show the first 5 rows.
    spark_static.range(100000000).selectExpr("id * 2 as value").show(5)
finally:
    # Release the session even on failure so the next session is built fresh.
    spark_static.stop()

# Dynamic allocation: executor count may vary between 2 and 10, starting at 4.
# NOTE(review): dynamic allocation with spark.shuffle.service.enabled presumably
# requires an external shuffle service on the cluster nodes — confirm for the
# target deployment.
spark_dynamic = (
    SparkSession.builder
    .appName("DynamicAllocationExample")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "2")
    .config("spark.dynamicAllocation.maxExecutors", "10")
    .config("spark.dynamicAllocation.initialExecutors", "4")
    .config("spark.shuffle.service.enabled", "true")
    .getOrCreate()
)

try:
    print("\n===> Dynamic Spark Session Started")
    # repartition(200) forces a shuffle; the empty groupBy() then aggregates
    # the whole frame into a single global row count.
    spark_dynamic.range(50000000).repartition(200).groupBy().count().show()
finally:
    # Release the session even if the job fails.
    spark_dynamic.stop()




===> Static Spark Session Started
+-----+
|value|
+-----+
|    0|
|    2|
|    4|
|    6|
|    8|
+-----+
only showing top 5 rows


===> Dynamic Spark Session Started
+--------+
|   count|
+--------+
|50000000|
+--------+

