In [None]:
import pyspark
from pyspark.sql import SparkSession
from delta import *
import warnings
warnings.filterwarnings('ignore')

In [None]:
import os

# Set AWS credentials (for accessing S3 or other AWS services).
# setdefault() keeps credentials that are already exported in the environment;
# the placeholders below are only used when a variable is missing. Replace them
# locally if needed, and never commit real keys with the notebook.
os.environ.setdefault('AWS_ACCESS_KEY_ID', '<Access Key>')
os.environ.setdefault('AWS_SECRET_ACCESS_KEY', '<Access Secret Key>')


In [None]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

# Spark configuration: Delta Lake catalog and SQL extension, a local master,
# and S3A access using the AWS credentials read from the environment variables.
conf = pyspark.conf.SparkConf()
conf.setAppName("MY_APP")
conf.setMaster("local[*]")  # replace the * with a core count; * uses all cores
conf.setAll(
    [
        # Delta Lake integration
        ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
        ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
        ("spark.sql.shuffle.partitions", "4"),
        # S3A file system: endpoint, SSL and implementation class
        ("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com"),
        ("spark.hadoop.fs.s3a.connection.ssl.enabled", "true"),
        ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
        # Credentials come from the environment, not from literals in this cell
        ("spark.hadoop.fs.s3a.access.key", os.environ['AWS_ACCESS_KEY_ID']),
        ("spark.hadoop.fs.s3a.secret.key", os.environ['AWS_SECRET_ACCESS_KEY']),
    ]
)

# Extra Maven coordinates (Hadoop S3A support and the AWS SDK bundle) handed to
# configure_spark_with_delta_pip together with the session builder.
extra_packages = [
    f"org.apache.hadoop:{artifact}:3.3.4"
    for artifact in ("hadoop-aws", "hadoop-common")
]
extra_packages.append("com.amazonaws:aws-java-sdk-bundle:1.12.262")

# Initialize the Spark session with Delta support.
# NOTE(review): the builder names the app "MyApp" while `conf` carries
# "MY_APP" -- confirm which one is intended.
builder = SparkSession.builder.appName("MyApp")
builder = builder.config(conf=conf)

spark = configure_spark_with_delta_pip(builder, extra_packages=extra_packages).getOrCreate()

# Load the weather-history CSV from S3 with the default CSV reader options,
# then print a preview of the resulting DataFrame.
df = spark.read.load("s3a://aws-s3-open/bits/spa2/weatherHistory.csv", format="csv")
df.show()


In [None]:
# Display the effective Spark configuration as (key, value) pairs.
# The session was configured with the S3 access and secret keys, so any
# credential-like setting is masked to keep secrets out of the saved output.
[
    (
        key,
        "*********(redacted)"
        if any(marker in key.lower() for marker in ("secret", "password", "token", "access.key"))
        else value,
    )
    for key, value in spark.sparkContext.getConf().getAll()
]

In [None]:
	# Target location of the Delta table on S3.
	s3_path = "s3a://aws-s3-open/deltaspcs/table1"
	# Without an explicit save mode the write fails once the path already holds
	# data; "overwrite" replaces the table contents with `df`, so this cell can
	# be re-run (e.g. Restart Kernel -> Run All).
	df.write.format("delta").mode("overwrite").save(s3_path)

In [None]:
# Register the Delta table with a name in the Spark Catalog (external table)
table_name = "table1"

# Register it as an external table that points at the Delta files in S3.
# IF NOT EXISTS keeps the cell re-runnable: a plain CREATE TABLE raises once
# the name has already been registered in the current catalog.
spark.sql(f"CREATE TABLE IF NOT EXISTS {table_name} USING DELTA LOCATION '{s3_path}'")

# You can now query the Delta table by its name
spark.sql(f"SELECT * FROM {table_name}").show()