In [6]:
from pyspark.sql import SparkSession

# Start with a clean session
spark = SparkSession.builder \
    .appName("MinIO_connection") \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.2") \
    .getOrCreate()

# Clear all problematic properties first
hadoop_conf = spark._jsc.hadoopConfiguration()

# List of properties that might have duration strings
duration_props = [
    "fs.s3a.threads.keepalivetime",
    "hadoop.security.groups.shell.command.timeout",
    "hadoop.service.shutdown.timeout",
    "yarn.resourcemanager.delegation-token-renewer.thread-timeout", 
    "yarn.federation.gpg.webapp.connect-timeout",
    "yarn.federation.gpg.webapp.read-timeout",
    "fs.s3a.retry.interval",
    "fs.s3a.retry.throttle.interval",
    "fs.s3a.connection.ttl",
    "fs.s3a.multipart.purge.age"
]

print("Clearing problematic properties...")
for prop in duration_props:
    hadoop_conf.unset(prop)
    print(f"  Unset: {prop}")

# Now set them with numeric values where needed
numeric_props = {
    "fs.s3a.threads.keepalivetime": "60",  # seconds as number
    "hadoop.security.groups.shell.command.timeout": "0",  # 0 seconds
    "fs.s3a.retry.interval": "500",  # milliseconds
    "fs.s3a.retry.throttle.interval": "100",  # milliseconds
    "fs.s3a.connection.ttl": "300000",  # 5 minutes in ms
}
hadoop_conf.set(
    "fs.s3a.aws.credentials.provider",
    "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"
)
for prop, value in numeric_props.items():
    hadoop_conf.set(prop, value)
    print(f"  Set {prop} = {value}")

# Configure MinIO
minio_configs = {
    "fs.s3a.endpoint": "http://minio:9000",
    "fs.s3a.access.key": "admin",
    "fs.s3a.secret.key": "password", 
    "fs.s3a.path.style.access": "true",
    "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "fs.s3a.connection.ssl.enabled": "false",
    "fs.s3a.connection.timeout": "60000",
    "fs.s3a.socket.timeout": "60000",
    "fs.s3a.connection.establish.timeout": "5000",
}

print("\nSetting MinIO configuration...")
for key, value in minio_configs.items():
    hadoop_conf.set(key, value)
    print(f"  Set {key} = {value}")

print("\nConfiguration complete!")

Clearing problematic properties...
  Unset: fs.s3a.threads.keepalivetime
  Unset: hadoop.security.groups.shell.command.timeout
  Unset: hadoop.service.shutdown.timeout
  Unset: yarn.resourcemanager.delegation-token-renewer.thread-timeout
  Unset: yarn.federation.gpg.webapp.connect-timeout
  Unset: yarn.federation.gpg.webapp.read-timeout
  Unset: fs.s3a.retry.interval
  Unset: fs.s3a.retry.throttle.interval
  Unset: fs.s3a.connection.ttl
  Unset: fs.s3a.multipart.purge.age
  Set fs.s3a.threads.keepalivetime = 60
  Set hadoop.security.groups.shell.command.timeout = 0
  Set fs.s3a.retry.interval = 500
  Set fs.s3a.retry.throttle.interval = 100
  Set fs.s3a.connection.ttl = 300000

Setting MinIO configuration...
  Set fs.s3a.endpoint = http://minio:9000
  Set fs.s3a.access.key = admin
  Set fs.s3a.secret.key = password
  Set fs.s3a.path.style.access = true
  Set fs.s3a.impl = org.apache.hadoop.fs.s3a.S3AFileSystem
  Set fs.s3a.connection.ssl.enabled = false
  Set fs.s3a.connection.timeout 

In [7]:
# Print ALL Hadoop properties
print("=== Current Hadoop Properties ===")
iterator = hadoop_conf.iterator()
while iterator.hasNext():
    prop = iterator.next()
    key = prop.getKey()
    value = prop.getValue()
    if "timeout" in key.lower() or "s3a" in key.lower():
        print(f"{key}: {value}")

=== Current Hadoop Properties ===
fs.s3a.select.output.csv.record.delimiter: \n
yarn.app.mapreduce.am.job.committer.cancel-timeout: 60000
fs.s3a.select.input.csv.quote.character: "
fs.s3a.path.style.access: true
fs.s3a.access.key: admin
fs.s3a.select.input.compression: none
fs.s3a.max.total.tasks: 32
fs.s3a.vectored.read.min.seek.size: 128K
fs.s3a.select.output.csv.quote.fields: always
fs.s3a.vectored.read.max.merged.size: 2M
fs.s3a.socket.timeout: 60000
ha.failover-controller.new-active.rpc-timeout.ms: 60000
fs.s3a.select.input.csv.header: none
mapreduce.outputcommitter.factory.scheme.s3a: org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory
ha.health-monitor.rpc-timeout.ms: 45000
fs.s3a.impl: org.apache.hadoop.fs.s3a.S3AFileSystem
adl.http.timeout: -1
hadoop.security.kms.client.timeout: 60
fs.s3a.select.enabled: true
yarn.nodemanager.health-checker.timeout-ms: 1200000
ipc.client.connect.timeout: 20000
fs.s3a.committer.staging.tmp.path: tmp/staging
ha.zookeeper.session-timeout.ms: 1000

In [8]:
# Create test data
data = [("Nvidia", 1000), ("OpenAI", 2000), ("Databricks", 3000)]
df = spark.createDataFrame(data, ["Company", "Valuation"])

# Test write
try:
    print("Writing to MinIO...")
    df.write.mode("overwrite").parquet("s3a://raw-data/test.parquet")
    print("✓ Write successful!")
    
    # Test read
    print("Reading from MinIO...")
    df_read = spark.read.parquet("s3a://raw-data/test.parquet")
    df_read.show()
    print("✓ Read successful!")
    
except Exception as e:
    print(f"✗ Error: {e}")
    import traceback
    traceback.print_exc()

Writing to MinIO...


26/01/05 17:49:37 WARN Base64: JAXB is unavailable. Will fallback to SDK implementation which may be less performant.If you are using Java 9+, you will need to include javax.xml.bind:jaxb-api as a dependency.
                                                                                

✓ Write successful!
Reading from MinIO...
+----------+---------+
|   Company|Valuation|
+----------+---------+
|Databricks|     3000|
|    Nvidia|     1000|
|    OpenAI|     2000|
+----------+---------+

✓ Read successful!


In [9]:
df.show()

                                                                                

+---------+----------+
|  Company|StockPrice|
+---------+----------+
|    Apple|       100|
|   Google|       200|
|Microsoft|       300|
+---------+----------+



In [4]:
print("Setting credentials provider...")
hadoop_conf.set(
    "fs.s3a.aws.credentials.provider",
    "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"
)
print("  Set fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")


Setting credentials provider...
  Set fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider
