In [5]:
from deltalake import DeltaTable
from deltalake.writer import write_deltalake
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import Row, Window
import pyspark.sql.types as T
import pyspark.sql.functions as F

In [2]:
# Build the Spark configuration one setting at a time (each .set() returns the conf itself).
spark_conf = (
    SparkConf()
    # Address of the master node, as set within the docker compose file.
    .set("spark.master", "spark://localhost:7077")
    # Client mode: the local host acts as the driver program (should be the default).
    .set("spark.submit.deployMode", "client")
    # Bind the driver to all available network interfaces.
    .set("spark.driver.bindAddress", "0.0.0.0")
    # Application name displayed in the Spark UI.
    .set("spark.app.name", "spark-local-cluster")
    # Memory allocated to the executor in the cluster
    # (can't exceed the amount allocated in the docker compose file).
    .set("spark.executor.memory", "4g")
)

# Reuse an existing session if one is already running, otherwise create one from spark_conf.
spark = (
    pyspark.sql.SparkSession.builder
    .config(conf=spark_conf)
    .getOrCreate()
)

In [8]:
# Three (name, age) sample rows used as the demo input.
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 35),
]

# Only column names are supplied; no explicit column types are given here.
df = spark.createDataFrame(data, schema=["name", "age"])

In [10]:
# Convert the Spark DataFrame to a pandas DataFrame; write_deltalake is handed
# the pandas object, not the Spark one.
dfp = df.toPandas()

# Write the pandas frame as a Delta table at ../data/ex1.
write_deltalake(
    "../data/ex1",
    dfp,
    mode="overwrite",   # replace any table data already present at this path
    partition_by=None,  # no partition columns are specified
)

In [11]:
# Open the Delta table that lives at ../data/ex1.
dt = DeltaTable("../data/ex1")

# Load the table contents into a pandas DataFrame ...
result_dfp = dt.to_pandas()

# ... and turn that pandas frame back into a Spark DataFrame.
result_df = spark.createDataFrame(data=result_dfp)

In [12]:
# Print the rows of the DataFrame that was read back from the Delta table.
result_df.show()

+-------+---+
|   name|age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 35|
+-------+---+



In [13]:
# Print each column's name, type and nullability for the read-back DataFrame.
result_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)

