In [1]:
import os
from pyspark import SparkConf
from pyspark.sql import SparkSession

In [2]:
# Read the AWS credentials and the target bucket name from the process
# environment. Subscripting os.environ raises KeyError if a variable is unset,
# so a misconfigured environment fails here rather than later inside Spark.
aws_access_key_id, aws_secret_access_key, bucket_name = (
    os.environ[var_name]
    for var_name in (
        'AWS_ACCESS_KEY_ID',
        'AWS_SECRET_ACCESS_KEY',
        'TF_VAR_test_s3_bucket',
    )
)

In [3]:
# Build the Spark configuration for S3A access using the credentials read from
# the environment in the previous cell.
conf = SparkConf()
conf.set("spark.hadoop.fs.s3a.access.key", aws_access_key_id)
conf.set("spark.hadoop.fs.s3a.secret.key", aws_secret_access_key)
conf.set("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
# NOTE(review): this key is not in the spark.* namespace and looks like an AWS
# SDK JVM system property; it may have no effect when set on SparkConf --
# confirm whether it should be passed through JVM options instead. The value
# is a Python bool here, kept as-is.
conf.set("com.amazonaws.services.s3.enableV4", True)

# Get (or create) a session on the "local[1]" master with the S3A settings above.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('s3-writer')
    .config(conf=conf)
    .getOrCreate()
)

In [4]:
# Small two-row sample frame with columns "id" and "label" for the round trip.
df = spark.createDataFrame(
    data=[(1, "foo"), (2, "bar")],
    schema=["id", "label"],
)

In [5]:
# Destination prefix inside the test bucket, addressed through the s3a:// scheme.
s3_path = f"s3a://{bucket_name}/foobar"

In [6]:
# Write the sample frame to s3_path as CSV with a header row, using
# "overwrite" save mode.
(
    df.write
    .format('csv')
    .option('header', 'true')
    .mode("overwrite")
    .save(s3_path)
)

In [7]:
# Read the CSV data back from the same S3 prefix, with the 'header' option set
# to match how it was written.
df_read = (
    spark.read
    .format("csv")
    .option('header', 'true')
    .load(s3_path)
)

In [8]:
# Print the frame read back from S3 so it can be compared by eye with the rows
# written above.
df_read.show()

+---+-----+
| id|label|
+---+-----+
|  1|  foo|
|  2|  bar|
+---+-----+

