# Silver → Gold Layer (Spark NLP)

## 1. Configuration Spark + Spark NLP

In [None]:
import os

from pyspark.sql import SparkSession

# Connection settings for the Garage S3-compatible object store.
#
# SECURITY: the access key and secret key used to be plain literals in this
# cell. They are now read from the environment when the variables are set.
# The literal fallbacks are kept only so the notebook keeps running as
# before; anyone who can read this notebook can read them, so they should be
# rotated and the fallbacks removed.
GARAGE_ENDPOINT = os.environ.get("GARAGE_ENDPOINT", "http://garage:3900")
GARAGE_ACCESS_KEY = os.environ.get(
    "GARAGE_ACCESS_KEY",
    "GKa25124b4fd82613c063217f3",
)
GARAGE_SECRET_KEY = os.environ.get(
    "GARAGE_SECRET_KEY",
    "008126399688f9b1efc3a3093079b066e4c6471fa256b52788da0c927194147e",
)

# Delta table roots for the input (Silver) and output (Gold) layers.
SILVER_PATH = "s3a://silver/hackernews"
GOLD_PATH = "s3a://gold/hackernews"

# Maven coordinates resolved at session start: S3A filesystem support,
# Delta Lake, and Spark NLP.
SPARK_PACKAGES = ",".join([
    "org.apache.hadoop:hadoop-aws:3.3.4",
    "com.amazonaws:aws-java-sdk-bundle:1.12.262",
    "io.delta:delta-spark_2.12:3.3.0",
    "com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.0",
])

spark = (
    SparkSession.builder
    .appName("SilverToGold-SparkNLP")
    .master("spark://spark:7077")
    .config("spark.jars.packages", SPARK_PACKAGES)
    # Enable Delta Lake SQL support and catalog integration.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # Bulk delete is switched off -- presumably because the Garage endpoint
    # does not handle S3 multi-object delete requests; confirm.
    .config("spark.hadoop.fs.s3a.multiobjectdelete.enable", "false")
    .config("spark.sql.shuffle.partitions", "10")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Point the S3A connector at Garage.
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.endpoint", GARAGE_ENDPOINT)
hadoop_conf.set("fs.s3a.access.key", GARAGE_ACCESS_KEY)
hadoop_conf.set("fs.s3a.secret.key", GARAGE_SECRET_KEY)
hadoop_conf.set("fs.s3a.endpoint.region", "garage")
hadoop_conf.set("fs.s3a.path.style.access", "true")
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
# Follow the endpoint scheme so an https:// override from the environment is
# not silently downgraded; the default http:// endpoint still yields "false".
use_ssl = GARAGE_ENDPOINT.lower().startswith("https://")
hadoop_conf.set("fs.s3a.connection.ssl.enabled", "true" if use_ssl else "false")

In [None]:
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import (
    Tokenizer, SentimentDLModel, NerDLModel, NerConverter,
    SentenceDetector, WordEmbeddingsModel
)
from pyspark.ml import Pipeline

# Show which Spark NLP Python package version this kernel is using.
nlp_version = sparknlp.version()
print(f"Spark NLP version: {nlp_version}")

## 2. Lecture Silver

In [None]:
def _read_silver(table):
    """Load the Silver Delta table stored under ``SILVER_PATH/<table>``."""
    return spark.read.format("delta").load(f"{SILVER_PATH}/{table}")


comments_silver = _read_silver("comments")
stories_silver = _read_silver("stories")

# Each count() runs a Spark job over the corresponding table.
n_comments = comments_silver.count()
n_stories = stories_silver.count()
print(f"Comments: {n_comments}, Stories: {n_stories}")

## 3. Pipeline Sentiment Analysis

In [None]:
# SentenceEmbeddings is not in the notebook's Spark NLP import cell, so it is
# imported here where it is needed.
from sparknlp.annotator import SentenceEmbeddings

# Wrap the raw comment text into Spark NLP's document annotation.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text_clean")
    .setOutputCol("document")
)

# Split each document into sentences.
sentence_detector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenize every sentence.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Pretrained GloVe word vectors for each token.
word_embeddings = (
    WordEmbeddingsModel.pretrained("glove_100d", "en")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# SentimentDLModel consumes ONE column of sentence-level embeddings, not a
# (sentence, word-embeddings) pair. Average the word vectors of each sentence
# to produce that column.
sentence_embeddings = (
    SentenceEmbeddings()
    .setInputCols(["sentence", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Pretrained sentiment classifier; emits one label per sentence into the
# "sentiment" column.
sentiment_model = (
    SentimentDLModel.pretrained("sentimentdl_glove_imdb", "en")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

sentiment_pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer,
    word_embeddings,
    sentence_embeddings,
    sentiment_model,
])

In [None]:
from pyspark.sql.functions import col, explode

# Fit the sentiment pipeline and annotate the Silver comments. This is the
# step that defines `comments_with_sentiment`, which the flattening cell that
# follows reads. Every stage is pretrained or rule-based, so fit() only
# assembles the PipelineModel.
sentiment_model_fitted = sentiment_pipeline.fit(comments_silver)
comments_with_sentiment = sentiment_model_fitted.transform(comments_silver)

In [None]:
from pyspark.sql.functions import col, explode, expr

# "sentiment.result" is an array of labels; explode() yields one output row
# per label, so a comment can appear on several rows.
sentiment_labels = explode(col("sentiment.result"))
kept_columns = ["id", "by", "parent", "text_clean", "timestamp", "sentiment_result"]

comments_sentiment = (
    comments_with_sentiment
    .withColumn("sentiment_result", sentiment_labels)
    .select(*kept_columns)
)

comments_sentiment.show(n=5, truncate=50)

## 4. Pipeline NER (Named Entity Recognition)

In [None]:
from pyspark.sql.functions import explode_outer

# Define the NER pipeline that the following cell fits. It reuses the
# document_assembler, sentence_detector, tokenizer and word_embeddings
# annotators built in the sentiment section.

# NOTE(review): "onto_100" was chosen because the entity plot further down
# filters on ORG / PRODUCT / PERSON (OntoNotes-style labels) and this model is
# built on glove_100d embeddings -- confirm it matches the intended model.
ner_tagger = (
    NerDLModel.pretrained("onto_100", "en")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Merge token-level IOB tags into whole entity chunks. The output column is
# named "entities" because the flattening cell below explodes that column.
ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("entities")
)

ner_pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer,
    word_embeddings,
    ner_tagger,
    ner_converter,
])

In [None]:
# Build the NER PipelineModel, then annotate the Silver comments with it.
ner_model_fitted = ner_pipeline.fit(dataset=comments_silver)
comments_with_ner = ner_model_fitted.transform(dataset=comments_silver)

In [None]:
from pyspark.sql.functions import col, explode_outer, size

# explode_outer keeps comments with no entity: they yield a single row whose
# entity fields are null.
entity_struct = explode_outer(col("entities"))

comments_entities = (
    comments_with_ner
    .withColumn("entity", entity_struct)
    .select(
        "id",
        "by",
        "text_clean",
        col("entity.result").alias("entity_text"),
        col("entity.metadata.entity").alias("entity_type"),
    )
)

# Preview only the rows that actually carry an entity.
has_entity = col("entity_text").isNotNull()
comments_entities.where(has_entity).show(10, truncate=40)

## 5. Requête SparkSQL - Sentiment par domaine

In [None]:
# Expose both DataFrames to Spark SQL under the names used in the query.
for view_name, frame in (
    ("comments_sentiment", comments_sentiment),
    ("stories", stories_silver),
):
    frame.createOrReplaceTempView(view_name)

# Per story domain: number of joined sentiment rows, positive/negative counts
# and positive share. Only rows whose parent is a story id are joined;
# domains with fewer than 5 rows are dropped and the 20 largest are kept.
SENTIMENT_BY_DOMAIN_SQL = """
    SELECT 
        s.domain,
        COUNT(*) as comment_count,
        SUM(CASE WHEN c.sentiment_result = 'pos' THEN 1 ELSE 0 END) as positive,
        SUM(CASE WHEN c.sentiment_result = 'neg' THEN 1 ELSE 0 END) as negative,
        ROUND(SUM(CASE WHEN c.sentiment_result = 'pos' THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) as positive_pct
    FROM comments_sentiment c
    JOIN stories s ON c.parent = s.id
    WHERE s.domain != ''
    GROUP BY s.domain
    HAVING COUNT(*) >= 5
    ORDER BY comment_count DESC
    LIMIT 20
"""

sentiment_by_domain = spark.sql(SENTIMENT_BY_DOMAIN_SQL)

sentiment_by_domain.show(n=20, truncate=False)

## 6. Visualisation Pandas + Seaborn

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Aggregate in Spark, then bring the small per-label result to pandas.
label_counts = comments_sentiment.groupBy("sentiment_result").count()
sentiment_df = label_counts.toPandas()

# Bar chart of row counts per sentiment label, saved to disk and displayed.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=sentiment_df, x="sentiment_result", y="count", palette="viridis", ax=ax)
ax.set_title("Distribution des sentiments dans les commentaires HackerNews")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Nombre de commentaires")
fig.tight_layout()
fig.savefig("sentiment_distribution.png")
plt.show()

In [None]:
# Keep only the entity types of interest, count mentions per
# (text, type) pair and collect the 15 most frequent ones.
wanted_types = ["ORG", "PRODUCT", "PERSON"]
mention_counts = (
    comments_entities
    .filter(col("entity_type").isin(wanted_types))
    .groupBy("entity_text", "entity_type")
    .count()
)
top_entities = (
    mention_counts
    .orderBy(col("count").desc())
    .limit(15)
    .toPandas()
)

# Horizontal bars coloured by entity type, saved to disk and displayed.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=top_entities,
    x="count",
    y="entity_text",
    hue="entity_type",
    dodge=False,
    ax=ax,
)
ax.set_title("Top entités mentionnées dans les commentaires HackerNews")
ax.set_xlabel("Nombre de mentions")
ax.set_ylabel("Entité")
fig.tight_layout()
fig.savefig("top_entities.png")
plt.show()

## 7. Écriture Gold

In [None]:
# Persist the flattened sentiment rows to Gold, replacing any previous run.
comments_sentiment_target = f"{GOLD_PATH}/comments_sentiment"
(
    comments_sentiment.write
    .format("delta")
    .mode("overwrite")
    .save(comments_sentiment_target)
)

In [None]:
# Mention count per (entity text, entity type), most frequent first; rows
# without an entity are dropped before aggregating.
with_entity = comments_entities.filter(col("entity_text").isNotNull())
entity_counts = with_entity.groupBy("entity_text", "entity_type").count()
entities_aggregated = entity_counts.orderBy(col("count").desc())

# Persist the aggregate to Gold, replacing any previous run.
entities_target = f"{GOLD_PATH}/entities"
(
    entities_aggregated.write
    .format("delta")
    .mode("overwrite")
    .save(entities_target)
)

In [None]:
# Persist the per-domain sentiment summary to Gold, replacing any previous run.
sentiment_by_domain_target = f"{GOLD_PATH}/sentiment_by_domain"
(
    sentiment_by_domain.write
    .format("delta")
    .mode("overwrite")
    .save(sentiment_by_domain_target)
)

## 8. Vérification

In [None]:
# Read each Gold table back from storage and print a few rows as a sanity
# check. Each entry pairs a table name with the arguments passed to show().
verification_plan = [
    ("comments_sentiment", {"n": 5, "truncate": 40}),
    ("entities", {"n": 10}),
    ("sentiment_by_domain", {"n": 10}),
]

for gold_table, show_args in verification_plan:
    spark.read.format("delta").load(f"{GOLD_PATH}/{gold_table}").show(**show_args)

In [None]:
# Shut down the Spark session created at the top of the notebook; cells that
# use `spark` cannot be re-run afterwards without recreating it.
spark.stop()