In [3]:
from pyspark.sql import SparkSession

# Start with a clean session
spark = SparkSession.builder \
    .appName("MinIO_Fetch") \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.2") \
    .getOrCreate()

# Clear all problematic properties first
hadoop_conf = spark._jsc.hadoopConfiguration()

# List of properties that might have duration strings
duration_props = [
    "fs.s3a.threads.keepalivetime",
    "hadoop.security.groups.shell.command.timeout",
    "hadoop.service.shutdown.timeout",
    "yarn.resourcemanager.delegation-token-renewer.thread-timeout", 
    "yarn.federation.gpg.webapp.connect-timeout",
    "yarn.federation.gpg.webapp.read-timeout",
    "fs.s3a.retry.interval",
    "fs.s3a.retry.throttle.interval",
    "fs.s3a.connection.ttl",
    "fs.s3a.multipart.purge.age"
]

print("Clearing problematic properties...")

hadoop_conf.set(
    "fs.s3a.aws.credentials.provider",
    "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"
)
for prop in duration_props:
    hadoop_conf.unset(prop)
    print(f"  Unset: {prop}")

# Now set them with numeric values where needed
numeric_props = {
    "fs.s3a.threads.keepalivetime": "60",  # seconds as number
    "hadoop.security.groups.shell.command.timeout": "0",  # 0 seconds
    "fs.s3a.retry.interval": "500",  # milliseconds
    "fs.s3a.retry.throttle.interval": "100",  # milliseconds
    "fs.s3a.connection.ttl": "300000",  # 5 minutes in ms
}

for prop, value in numeric_props.items():
    hadoop_conf.set(prop, value)
    print(f"  Set {prop} = {value}")

# Configure MinIO
minio_configs = {
    "fs.s3a.endpoint": "http://minio:9000",
    "fs.s3a.access.key": "admin",
    "fs.s3a.secret.key": "password", 
    "fs.s3a.path.style.access": "true",
    "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "fs.s3a.connection.ssl.enabled": "false",
    "fs.s3a.connection.timeout": "60000",
    "fs.s3a.socket.timeout": "60000",
    "fs.s3a.connection.establish.timeout": "5000",
}

print("\nSetting MinIO configuration...")
for key, value in minio_configs.items():
    hadoop_conf.set(key, value)
    print(f"  Set {key} = {value}")

print("\nConfiguration complete!")

Clearing problematic properties...
  Unset: fs.s3a.threads.keepalivetime
  Unset: hadoop.security.groups.shell.command.timeout
  Unset: hadoop.service.shutdown.timeout
  Unset: yarn.resourcemanager.delegation-token-renewer.thread-timeout
  Unset: yarn.federation.gpg.webapp.connect-timeout
  Unset: yarn.federation.gpg.webapp.read-timeout
  Unset: fs.s3a.retry.interval
  Unset: fs.s3a.retry.throttle.interval
  Unset: fs.s3a.connection.ttl
  Unset: fs.s3a.multipart.purge.age
  Set fs.s3a.threads.keepalivetime = 60
  Set hadoop.security.groups.shell.command.timeout = 0
  Set fs.s3a.retry.interval = 500
  Set fs.s3a.retry.throttle.interval = 100
  Set fs.s3a.connection.ttl = 300000

Setting MinIO configuration...
  Set fs.s3a.endpoint = http://minio:9000
  Set fs.s3a.access.key = admin
  Set fs.s3a.secret.key = password
  Set fs.s3a.path.style.access = true
  Set fs.s3a.impl = org.apache.hadoop.fs.s3a.S3AFileSystem
  Set fs.s3a.connection.ssl.enabled = false
  Set fs.s3a.connection.timeout 

In [9]:
print("Reading from MinIO...")
df_read = spark.read.text("s3a://raw-data/sec-edgar-filings-raw/AAPL/10-K/0000320193-24-000123")
print(df_read.count())
print("✓ Read successful!")

Reading from MinIO...
88885
✓ Read successful!


In [5]:
import pyspark.sql.functions as F
from bs4 import BeautifulSoup


# Sample DataFrame with HTML content
data = spark.read.text("s3a://raw-data/sec-edgar-filings-raw/AAPL/10-K/0000320193-24-000123")
df = spark.createDataFrame(data, ["html_content"])

# Define a Python function to strip HTML tags
def strip_html_tags(html_content):
    if html_content:
        soup = BeautifulSoup(html_content, "html.parser")
        return soup.get_text()
    return None

# Register the function as a PySpark UDF
strip_html_tags_udf = F.udf(strip_html_tags, StringType()) # Import StringType from pyspark.sql.types

# Apply the UDF to the DataFrame column
cleaned_df = df.withColumn("cleaned_text", strip_html_tags_udf(F.col("html_content")))
cleaned_df.show(truncate=False)


PySparkTypeError: [INVALID_TYPE] Argument `data` should not be a DataFrame.