In [None]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql import Window

# Build (or reuse) the Spark session used by the cells below. The session
# downloads the S3A connector and Delta Lake jars, points S3A at the MinIO
# endpoint, and enables the Delta SQL extension and catalog.
spark = (SparkSession.builder
            .appName('DeltaMinIO')
            .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1,"
                                            "com.amazonaws:aws-java-sdk-bundle:1.12.709,"
                                            "io.delta:delta-core_2.12:2.3.0"
                    )
            # S3 / Minio
            # Credentials are read from the environment so real secrets do not
            # have to be saved in the notebook. The fallbacks are the values this
            # notebook originally hard-coded; override them outside a sandbox.
            .config("spark.hadoop.fs.s3a.access.key", os.environ.get("MINIO_ACCESS_KEY", "andreyolv"))
            .config("spark.hadoop.fs.s3a.secret.key", os.environ.get("MINIO_SECRET_KEY", "andreyolv"))
            .config("spark.hadoop.fs.s3a.endpoint", "http://minio.minio:9000")
            # Path-style URLs (endpoint/bucket/key) instead of bucket subdomains.
            .config("spark.hadoop.fs.s3a.path.style.access", True)
            .config("spark.hadoop.fs.s3a.fast.upload", True)
            # 104857600 bytes = 100 MiB per multipart part.
            .config("spark.hadoop.fs.s3a.multipart.size", 104857600)
            # Uses the same `spark.hadoop.` prefix as every other S3A option so
            # it is forwarded to the Hadoop configuration in the same way.
            .config("spark.hadoop.fs.s3a.connection.maximum", 100)
            .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            # Delta
            .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
            .getOrCreate()
        )

In [None]:
# Column layout of the sample DataFrame; every column is declared nullable.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("city", StringType(), True)
)

# Three sample rows, listed in the same column order as the schema.
data = [
    (1, "Alice", 25, "New York"),
    (2, "Bob", 30, "San Francisco"),
    (3, "Charlie", 35, "Chicago"),
]

# Materialise the rows as a Spark DataFrame with the explicit schema.
df = spark.createDataFrame(data, schema=schema)

# Last expression of the cell: display the rows as a pandas DataFrame.
df.toPandas()

## Save Delta

In [None]:
# S3A path used by the save and load cells below.
s3_location = "s3a://lakehouse/delta"

In [None]:
# Write the DataFrame to the Delta location. The writer's default mode raises
# an error when the target already exists, so re-running the notebook would
# fail here; "overwrite" replaces the table contents and keeps the cell
# idempotent under Restart & Run All.
df.write.format("delta").mode("overwrite").save(s3_location)

## Load Delta

In [None]:
# Read the Delta table back from the same location and display it as a
# pandas DataFrame (last expression of the cell).
(
    spark.read
    .load(s3_location, format="delta")
    .toPandas()
)