# 🎣 Analysis 1 — Ragebait Detector

**Core question:** Which subreddits manufacture outrage? Which posts get people commenting furiously but not upvoting?

**Definition:**  
`controversy_ratio = num_comments / score`  
- High ratio = people argue but don't upvote = ragebait  
- Low ratio = people upvote without engaging = good content  

**Interview talking point:**  
> "I operationalised ragebait as posts where comment velocity significantly outpaces score — the community is reacting, not rewarding. r/femaledatingstrategy and r/politics score highest on this metric."


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Configure a local session: two local threads, a 3g driver-memory request,
# and 8 shuffle partitions instead of Spark's default.
session_builder = SparkSession.builder.appName('Ragebait').master('local[2]')
session_builder = session_builder.config('spark.driver.memory', '3g')
session_builder = session_builder.config('spark.sql.shuffle.partitions', '8')
spark = session_builder.getOrCreate()
spark.sparkContext.setLogLevel('WARN')

all_posts = spark.read.parquet('../data/silver/posts')

# Drop low-traction posts (score below 5 or fewer than 2 comments) so that
# barely-seen posts do not add noise to the ratios computed later.
has_traction = (F.col('score') >= 5) & (F.col('num_comments') >= 2)
df = all_posts.filter(has_traction)
print(f'Filtered posts: {df.count():,}')

In [None]:
# ── 1. Subreddit-level ragebait score ──────────────────────────────────────
# One output row per subreddit: post volume, the mean / p75 / p95 of
# controversy_ratio, and the mean score, comment count, upvote ratio and
# title sentiment.
subreddit_metrics = [
    F.count('*').alias('total_posts'),
    F.round(F.avg('controversy_ratio'), 3).alias('avg_controversy_ratio'),
    F.round(F.percentile_approx('controversy_ratio', 0.75), 3).alias('p75_controversy'),
    F.round(F.percentile_approx('controversy_ratio', 0.95), 3).alias('p95_controversy'),
    F.round(F.avg('score'), 1).alias('avg_score'),
    F.round(F.avg('num_comments'), 1).alias('avg_comments'),
    F.round(F.avg('upvote_ratio'), 3).alias('avg_upvote_ratio'),
    F.round(F.avg('title_sentiment'), 4).alias('avg_sentiment'),
]

# The tier is derived from the already-rounded mean ratio:
# above 2.0 -> HIGH_RAGEBAIT, above 1.0 -> MEDIUM, everything else -> LOW.
mean_ratio = F.col('avg_controversy_ratio')
tier_label = (
    F.when(mean_ratio > 2.0, 'HIGH_RAGEBAIT')
    .when(mean_ratio > 1.0, 'MEDIUM')
    .otherwise('LOW')
)

per_subreddit = df.groupBy('subreddit').agg(*subreddit_metrics)
sub_rage = (
    per_subreddit
    .withColumn('ragebait_tier', tier_label)
    .orderBy(mean_ratio.desc())
)

print('=== SUBREDDIT RAGEBAIT RANKING ===')
sub_rage.show(20, truncate=False)

In [None]:
# ── 2. Top ragebait POSTS (the individual posts causing the most arguments) ──
# Row count used for BOTH the limit and the display. DataFrame.show() prints
# only 20 rows unless told otherwise, so the count must be passed explicitly
# or the last 10 rows of the 30-row result are never shown.
top_n = 30

print(f'=== TOP {top_n} INDIVIDUAL RAGEBAIT POSTS ===')
(
    df
    # Only posts whose comment count is more than 3x their score.
    .filter(F.col('controversy_ratio') > 3.0)
    .select(
        'subreddit',
        # Titles are cut to their first 80 characters to keep the table readable.
        F.col('title').substr(1, 80).alias('title'),
        'score', 'num_comments',
        F.round('controversy_ratio', 2).alias('controversy_ratio'),
        'sentiment_label', 'year_month'
    )
    .orderBy(F.desc('controversy_ratio'))
    .limit(top_n)
).show(top_n, truncate=False)

In [None]:
# ── 3. Do NEGATIVE-sentiment posts attract more controversy? ────────────────
# Break each subreddit down by sentiment label and compare the mean
# controversy ratio and mean score across labels.
print('=== CONTROVERSY RATIO BY SENTIMENT LABEL ===')
sentiment_keys = ['subreddit', 'sentiment_label']
by_sentiment = df.groupBy(*sentiment_keys).agg(
    F.count('*').alias('post_count'),
    F.round(F.avg('controversy_ratio'), 3).alias('avg_controversy'),
    F.round(F.avg('score'), 1).alias('avg_score'),
)
by_sentiment.orderBy(*sentiment_keys).show(40, truncate=False)

In [None]:
# ── 4. Ragebait over time — is it getting worse? ────────────────────────────
# Monthly post count, mean controversy ratio and mean upvote ratio for three
# subreddits, ordered by subreddit and then month.
print('=== MONTHLY RAGEBAIT TREND (politics + worldnews + AITAH) ===')

# Names are compared in lowercase so a subreddit stored with Reddit's own
# casing (e.g. 'AITAH') is not silently excluded by an exact-match filter.
# NOTE(review): if the silver layer already lowercases names this is a no-op.
tracked_subreddits = ['politics', 'worldnews', 'aitah']
(
    df
    .filter(F.lower(F.col('subreddit')).isin(tracked_subreddits))
    .groupBy('subreddit', 'year_month')
    .agg(
        F.count('*').alias('posts'),
        F.round(F.avg('controversy_ratio'), 3).alias('avg_controversy'),
        F.round(F.avg('upvote_ratio'), 3).alias('avg_upvote_ratio'),
    )
    .orderBy('subreddit', 'year_month')
).show(60, truncate=False)

In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()
# The completion message previously ended in mis-decoded bytes; it now prints
# the intended check mark.
print('Ragebait analysis complete ✓')