# 🥈 Silver — Clean + NLP Features

Adds sentiment (VADER), text length features, engagement ratios.
All downstream analysis notebooks read from Silver.


In [None]:
# pip install vaderSentiment before running this notebook
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

# Session-level Spark options for the Silver stage.
_spark_conf = {
    'spark.driver.memory': '4g',
    'spark.sql.shuffle.partitions': '8',
    'spark.sql.adaptive.enabled': 'true',
}

# Local-mode session ('local[2]'), reused if one already exists.
_builder = SparkSession.builder.appName('Reddit-Silver').master('local[2]')
for _option, _setting in _spark_conf.items():
    _builder = _builder.config(_option, _setting)
spark = _builder.getOrCreate()

# Keep Spark's own log output down to warnings and errors.
spark.sparkContext.setLogLevel('WARN')

# Relative locations of the Bronze input and the Silver output.
BRONZE_PATH = '../data/bronze/parquet'
SILVER_PATH = '../data/silver/posts'

bronze = spark.read.parquet(BRONZE_PATH)
# count() is an action, so this forces a pass over the Bronze data.
row_total = bronze.count()
print(f'Loaded {row_total:,} rows')

In [None]:
# VADER sentiment UDF
# UDF = User Defined Function - lets you run Python code row-by-row in Spark
# We broadcast the analyser so each executor gets one copy (not re-created per row)
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Broadcast a single analyser instance so executors reuse it instead of
# constructing a new one for every row.
analyser_broadcast = spark.sparkContext.broadcast(SentimentIntensityAnalyzer())


@F.udf(returnType=DoubleType())
def get_sentiment(text):
    """Return the VADER compound sentiment score for ``text``.

    Args:
        text: The text to score; may be None or empty.

    Returns:
        The ``compound`` entry of ``polarity_scores`` as a float, or 0.0
        when ``text`` is None/empty or scoring raises an exception.
    """
    if not text:
        return 0.0
    try:
        # Cap the input at 512 characters before scoring.
        scores = analyser_broadcast.value.polarity_scores(text[:512])
        return float(scores['compound'])
    except Exception:
        # Best-effort fallback: a row that cannot be scored gets 0.0.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        return 0.0


print('VADER UDF registered ✓')

In [None]:
# Build Silver - all derived columns
def _word_count(column_name):
    """Return a Column with the whitespace-separated word count of a text column.

    Null, empty and whitespace-only values count as 0 words. Without the
    guard, splitting an empty string yields one empty token (count 1) and
    `F.size` of a null array does not yield 0.
    """
    trimmed = F.trim(F.col(column_name))
    return F.when(
        trimmed.isNotNull() & (trimmed != ''),
        F.size(F.split(trimmed, r'\s+'))
    ).otherwise(0)


silver = (
    bronze
    # Replace missing numeric values so the ratios below never see nulls.
    .fillna({'score': 0, 'num_comments': 0, 'upvote_ratio': 0.0,
             'total_awards_received': 0})

    # Sentiment on title (fast) and full_text (richer but slower)
    .withColumn('title_sentiment',    get_sentiment('title'))
    .withColumn('fulltext_sentiment', get_sentiment('full_text'))

    # Sentiment label, derived from the title score only
    .withColumn('sentiment_label',
        F.when(F.col('title_sentiment') >=  0.05, 'positive')
         .when(F.col('title_sentiment') <= -0.05, 'negative')
         .otherwise('neutral')
    )

    # Text features (both counts treat null/blank text as 0 words)
    .withColumn('title_word_count', _word_count('title'))
    .withColumn('selftext_word_count', _word_count('selftext'))
    .withColumn('has_body', F.col('selftext_word_count') > 10)

    # Engagement features: comments are weighted twice as much as score
    .withColumn('engagement_score',
        F.col('score') + (F.col('num_comments') * 2)
    )
    # Ragebait ratio: high comments relative to score = controversy.
    # For score <= 0 the raw comment count is used to avoid dividing by
    # zero or by a negative score.
    .withColumn('controversy_ratio',
        F.when(F.col('score') > 0,
            F.col('num_comments').cast('double') / F.col('score').cast('double')
        ).otherwise(F.col('num_comments').cast('double'))
    )

    # Time features
    .withColumn('hour_of_day',  F.hour('created_ts'))
    .withColumn('day_of_week',  F.dayofweek('created_ts'))  # 1=Sunday, 7=Saturday
    .withColumn('week',         F.weekofyear('created_ts').cast('string'))
    .withColumn('year_month',   F.date_format('created_ts', 'yyyy-MM'))

    # Score per comment; falls back to the raw score when there are no
    # comments so the division is never by zero.
    .withColumn('score_per_comment',
        F.when(F.col('num_comments') > 0,
            F.col('score').cast('double') / F.col('num_comments')
        ).otherwise(F.col('score').cast('double'))
    )

    # Is removed? True whenever removed_by_category is populated.
    .withColumn('is_removed',
        F.col('removed_by_category').isNotNull()
    )
)

# Quick visual sanity check of a few derived columns.
silver.select('subreddit','title','title_sentiment','sentiment_label',
              'controversy_ratio','engagement_score').show(5, truncate=60)

In [None]:
# Persist Silver as Parquet, replacing any previous output at SILVER_PATH.
# NOTE(review): 'year' is not derived in the Silver build of this notebook;
# presumably it is carried over from Bronze - confirm before relying on it.
(
    silver.write
    .mode('overwrite')
    .partitionBy('subreddit', 'year')
    .parquet(SILVER_PATH)
)
print('Silver done ✓')
# Release the local Spark session now that the stage is finished.
spark.stop()