# Bronze → Silver Layer

## 1. Configuration Spark

In [None]:
from pyspark.sql import SparkSession

import os

# Connection settings for the Garage object store, accessed through the S3A
# connector. Each value can be overridden with an environment variable of the
# same name so that credentials do not have to live in the notebook.
# NOTE(security): the fallback values below are credentials committed to
# source control; rotate them and remove the defaults once the environment
# variables are provisioned.
GARAGE_ENDPOINT = os.environ.get("GARAGE_ENDPOINT", "http://garage:3900")
GARAGE_ACCESS_KEY = os.environ.get(
    "GARAGE_ACCESS_KEY",
    "GKa25124b4fd82613c063217f3",
)
GARAGE_SECRET_KEY = os.environ.get(
    "GARAGE_SECRET_KEY",
    "008126399688f9b1efc3a3093079b066e4c6471fa256b52788da0c927194147e",
)

# Root locations of the Bronze (read) and Silver (written) datasets.
BRONZE_PATH = "s3a://bronze/hackernews"
SILVER_PATH = "s3a://silver/hackernews"

# Build (or reuse) the Spark session for the Bronze -> Silver job.
# The extra packages are the Hadoop S3A connector, the AWS SDK bundle and
# Delta Lake; the two `spark.sql.*` settings register the Delta extension and
# catalog. Multi-object delete is switched off for S3A
# (presumably because the Garage endpoint does not support it -- confirm).
spark = (
    SparkSession.builder
    .appName("BronzeToSilver")
    .master("spark://spark:7077")
    .config(
        "spark.jars.packages",
        ",".join([
            "org.apache.hadoop:hadoop-aws:3.3.4",
            "com.amazonaws:aws-java-sdk-bundle:1.12.262",
            "io.delta:delta-spark_2.12:3.3.0",
        ]),
    )
    .config(
        "spark.sql.extensions",
        "io.delta.sql.DeltaSparkSessionExtension",
    )
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .config("spark.hadoop.fs.s3a.multiobjectdelete.enable", "false")
    .config("spark.sql.shuffle.partitions", "10")
    .getOrCreate()
)

# Point the S3A filesystem at the Garage endpoint: credentials, region name,
# path-style bucket addressing and plain-HTTP access are applied to the
# session's Hadoop configuration, in the order listed.
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
for _option, _value in (
    ("fs.s3a.endpoint", GARAGE_ENDPOINT),
    ("fs.s3a.access.key", GARAGE_ACCESS_KEY),
    ("fs.s3a.secret.key", GARAGE_SECRET_KEY),
    ("fs.s3a.endpoint.region", "garage"),
    ("fs.s3a.path.style.access", "true"),
    ("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("fs.s3a.connection.ssl.enabled", "false"),
):
    hadoop_conf.set(_option, _value)

## 2. Création bucket Silver

In [None]:
# The "silver" bucket must be created manually via the Garage CLI/WebUI if needed

## 3. Lecture Bronze

In [None]:
# Load the Bronze Delta tables for stories and comments.
stories_bronze = (
    spark.read
    .format("delta")
    .load(f"{BRONZE_PATH}/stories")
)
comments_bronze = (
    spark.read
    .format("delta")
    .load(f"{BRONZE_PATH}/comments")
)

# Row counts as a quick sanity check (each count triggers a Spark job).
story_count = stories_bronze.count()
comment_count = comments_bronze.count()
print(f"Stories: {story_count}, Comments: {comment_count}")

In [None]:
# Print the column names and types of the Bronze stories table.
stories_bronze.printSchema()

## 4. Fonctions de nettoyage

In [None]:
from pyspark.sql.functions import col, when, regexp_replace, regexp_extract, length, trim, coalesce, lit

def clean_html(column):
    """Return a Column holding the plain-text version of an HTML string column.

    Processing order:
      1. every ``<...>`` tag is replaced by a space,
      2. runs of whitespace are collapsed to a single space,
      3. a fixed set of HTML entities is decoded,
      4. the result is trimmed.

    Args:
        column: Name of the string column to clean.

    Returns:
        A Column expression; rows where the input is null yield ``""``.
    """
    c = col(column)
    # Strip tags while literal angle brackets are still escaped as &lt; / &gt;,
    # so escaped text such as "&lt;b&gt;" is not mistaken for markup.
    c = regexp_replace(c, r"<[^>]+>", " ")
    c = regexp_replace(c, r"\s+", " ")

    # (pattern, replacement) pairs applied in order. "&amp;" must be decoded
    # LAST: decoding it earlier turns "&amp;lt;" into "&lt;", which a later
    # pass would decode again into "<" (double unescaping).
    html_entities = (
        (r"&#x27;", "'"),
        (r"&#x2F;", "/"),
        (r"&quot;", '"'),
        (r"&lt;", "<"),
        (r"&gt;", ">"),
        (r"&amp;", "&"),
    )
    for pattern, replacement in html_entities:
        c = regexp_replace(c, pattern, replacement)

    return when(col(column).isNull(), lit("")).otherwise(trim(c))

def extract_domain(column):
    """Return a Column with the host name extracted from a URL string column.

    The scheme (``http://`` or ``https://``) and an optional leading ``www.``
    are skipped; the capture stops at the first ``/``, ``:``, ``?`` or ``#`` so
    that a port, query string or fragment is never included in the domain
    (e.g. ``https://example.com?ref=x`` gives ``example.com``).

    Args:
        column: Name of the string column containing the URL.

    Returns:
        A Column expression; per Spark's ``regexp_extract``, rows whose value
        does not match the pattern yield an empty string.
    """
    return regexp_extract(col(column), r"https?://(?:www\.)?([^/:?#]+)", 1)

## 5. Nettoyage Stories

In [None]:
from pyspark.sql import Window
from pyspark.sql.functions import row_number

# Build the Silver stories table from Bronze:
#   - drop rows without an id,
#   - keep exactly one row per id,
#   - add the cleaned text and the URL's domain,
#   - project the Silver columns.
#
# `dropDuplicates(["id"])` keeps an arbitrary row per id, so the result could
# differ between runs if Bronze holds several snapshots of a story. Ranking by
# `_ingested_at` (newest first) makes the choice deterministic and keeps the
# most recently ingested snapshot.
_latest_story_first = Window.partitionBy("id").orderBy(col("_ingested_at").desc())

stories_silver = (
    stories_bronze
    .filter(col("id").isNotNull())
    .withColumn("_rn", row_number().over(_latest_story_first))
    .filter(col("_rn") == 1)
    .drop("_rn")
    .withColumn("text_clean", clean_html("text"))
    .withColumn("domain", extract_domain("url"))
    .select("id", "by", "title", "url", "domain", "score", "descendants",
            "text_clean", "timestamp", "_ingested_at")
)

# Preview a few rows, truncating long values to 40 characters.
stories_silver.show(3, truncate=40)

## 6. Nettoyage Comments

In [None]:
from pyspark.sql import Window
from pyspark.sql.functions import row_number

# Build the Silver comments table from Bronze:
#   - drop rows without an id,
#   - keep only the most recently ingested row per id,
#   - drop comments flagged deleted or dead (a null flag counts as False),
#   - add the cleaned text and drop comments whose cleaned text is empty,
#   - project the Silver columns.
#
# The deduplication runs BEFORE the deleted/dead filters and is ordered by
# `_ingested_at`: filtering first and then calling `dropDuplicates` (which
# keeps an arbitrary row) could let an older, still-live snapshot survive
# after a later snapshot marked the comment as deleted or dead.
_latest_comment_first = Window.partitionBy("id").orderBy(col("_ingested_at").desc())

comments_silver = (
    comments_bronze
    .filter(col("id").isNotNull())
    .withColumn("_rn", row_number().over(_latest_comment_first))
    .filter(col("_rn") == 1)
    .drop("_rn")
    # `== False` is intentional: this is a Column comparison, not a Python bool.
    .filter(coalesce(col("deleted"), lit(False)) == False)
    .filter(coalesce(col("dead"), lit(False)) == False)
    .withColumn("text_clean", clean_html("text"))
    .filter(length(col("text_clean")) > 0)
    .select("id", "by", "parent", "text_clean", "timestamp", "_ingested_at")
)

# Preview a few rows, truncating long values to 40 characters.
comments_silver.show(3, truncate=40)

## 7. Écriture Silver

In [None]:
# Persist the cleaned stories as a Delta table, replacing any existing data.
(
    stories_silver.write
    .format("delta")
    .mode("overwrite")
    .save(f"{SILVER_PATH}/stories")
)

In [None]:
# Persist the cleaned comments as a Delta table, replacing any existing data.
(
    comments_silver.write
    .format("delta")
    .mode("overwrite")
    .save(f"{SILVER_PATH}/comments")
)

## 8. Vérification

In [None]:
# Read both Silver tables back from storage and preview three rows of each,
# truncating long values to 30 characters.
for _table in ("stories", "comments"):
    spark.read.format("delta").load(f"{SILVER_PATH}/{_table}").show(3, truncate=30)

In [None]:
# Show the five most frequent non-empty domains among the Silver stories.
silver_stories = spark.read.format("delta").load(f"{SILVER_PATH}/stories")
domain_counts = (
    silver_stories
    .where(col("domain") != "")
    .groupBy("domain")
    .count()
    .sort(col("count").desc())
)
domain_counts.show(5)

In [None]:
# Stop the Spark session now that the notebook has finished.
spark.stop()