# 🔄 Analysis 2 — Echo Chamber Score

**Core question:** Does a subreddit only reward posts that agree with its dominant sentiment?

**Method:**  
1. Find each subreddit's dominant sentiment (positive/negative/neutral)  
2. Compute `corr(title_sentiment, upvote_ratio)` per subreddit  
3. Strong correlation in either direction (high |corr|) = upvotes track post sentiment = echo chamber  
4. Near-zero correlation = community upvotes based on other factors = healthier discourse  

**Interview talking point:**  
> "I quantified echo chamber behaviour by correlating post sentiment with upvote ratio within each subreddit. r/collapse and r/conservative show the strongest correlation — they heavily reward posts that match the community's dominant emotional tone."


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Build a local Spark session: 2 worker threads, 3g of driver memory and
# 8 shuffle partitions.
spark_builder = SparkSession.builder.appName('EchoChamber').master('local[2]')
for conf_key, conf_value in (
    ('spark.driver.memory', '3g'),
    ('spark.sql.shuffle.partitions', '8'),
):
    spark_builder = spark_builder.config(conf_key, conf_value)
spark = spark_builder.getOrCreate()
spark.sparkContext.setLogLevel('WARN')

# Keep only posts with a score of at least 5, a strictly positive upvote
# ratio, and a non-null title sentiment value.
has_min_score = F.col('score') >= 5
has_positive_ratio = F.col('upvote_ratio') > 0
has_title_sentiment = F.col('title_sentiment').isNotNull()

df = spark.read.parquet('../data/silver/posts').where(
    has_min_score & has_positive_ratio & has_title_sentiment
)
posts_loaded = df.count()
print(f'Posts loaded: {posts_loaded:,}')

In [None]:
# ── 1. Dominant sentiment per subreddit ──
# Count posts per (subreddit, sentiment_label) and keep the most frequent
# label for each subreddit.
#
# row_number() is used instead of rank() so that exactly one row survives per
# subreddit. rank() assigns 1 to every label tied for the top count, which
# would emit several "dominant" rows for that subreddit and duplicate its
# posts in any later join on 'subreddit'. Ties are broken alphabetically by
# label so the result is deterministic across runs.
dominant_sentiment = (
    df.groupBy('subreddit', 'sentiment_label')
    .count()
    .withColumn('rank',
        F.row_number().over(
            Window.partitionBy('subreddit').orderBy(
                F.desc('count'),
                F.asc('sentiment_label'),
            )
        )
    )
    .filter(F.col('rank') == 1)
    .select(
        'subreddit',
        F.col('sentiment_label').alias('dominant_sentiment'),
        F.col('count').alias('dominant_count')
    )
)

print('=== DOMINANT SENTIMENT PER SUBREDDIT ===')
dominant_sentiment.show(20, truncate=False)

In [None]:
# ── 2. Echo chamber score = |corr(sentiment, upvote_ratio)| ──
# Pearson correlation is built into Spark — no UDF needed.
# Range of the raw correlation: -1 to +1
#   +1 = positive posts always get upvoted (positive echo chamber)
#   -1 = negative posts always get upvoted (negativity echo chamber)
#    0 = sentiment doesn't predict upvotes at all
#
# corr() has no defined value when a subreddit has fewer than two posts or
# when either column is constant within it. Depending on Spark version and
# configuration that surfaces as null or NaN. Spark orders NaN above every
# other number, so an unguarded NaN would satisfy `> 0.3` and be labelled
# STRONG_ECHO, while an unguarded null would silently fall through to
# WEAK_ECHO. Both cases are normalised to a null score with an explicit
# UNDEFINED tier, and sorted after all real scores.
#
# NOTE(review): abs() treats either sign as echo behaviour; whether the sign
# actually matches the subreddit's dominant sentiment is not checked here.

echo_scores = (
    df.groupBy('subreddit')
    .agg(
        F.count('*').alias('post_count'),
        F.round(F.corr('title_sentiment', 'upvote_ratio'), 4).alias('sentiment_upvote_corr'),
        F.round(F.corr('title_sentiment', 'score'), 4).alias('sentiment_score_corr'),
        F.round(F.avg('title_sentiment'), 4).alias('avg_sentiment'),
        F.round(F.avg('upvote_ratio'), 4).alias('avg_upvote_ratio'),
        F.round(F.stddev('title_sentiment'), 4).alias('sentiment_stddev'),
    )
    .withColumn('echo_chamber_score',
        F.when(
            F.col('sentiment_upvote_corr').isNull()
            | F.isnan(F.col('sentiment_upvote_corr')),
            F.lit(None).cast('double'),
        ).otherwise(
            F.round(F.abs(F.col('sentiment_upvote_corr')), 4)
        )
    )
    .withColumn('echo_tier',
        F.when(F.col('echo_chamber_score').isNull(), 'UNDEFINED')
         .when(F.col('echo_chamber_score') > 0.3, 'STRONG_ECHO')
         .when(F.col('echo_chamber_score') > 0.15, 'MODERATE_ECHO')
         .otherwise('WEAK_ECHO')
    )
    # Undefined scores go to the bottom instead of competing with real ones.
    .orderBy(F.col('echo_chamber_score').desc_nulls_last())
)

print('=== ECHO CHAMBER SCORES (higher = stronger echo chamber) ===')
echo_scores.show(20, truncate=False)

In [None]:
# ── 3. Sentiment distribution per subreddit ──
# Percentage of posts carrying each sentiment label, per subreddit.
# Reading the output:
# - Narrow distribution = homogeneous community (echo chamber)
# - Wide distribution = diverse viewpoints (healthier)

print('=== SENTIMENT DISTRIBUTION BY SUBREDDIT ===')

# One row per subreddit with a count column for each of the three pivoted
# labels; label combinations that never occur are filled with 0.
label_counts = (
    df.groupBy('subreddit', 'sentiment_label')
    .count()
    .groupBy('subreddit')
    .pivot('sentiment_label', ['positive', 'neutral', 'negative'])
    .sum('count')
    .fillna(0)
)

# Total is the sum of the three pivoted label counts only.
label_total = F.col('positive') + F.col('neutral') + F.col('negative')

sentiment_distribution = label_counts.select(
    'subreddit',
    label_total.alias('total'),
    F.round(F.col('positive') / label_total * 100, 1).alias('positive_pct'),
    F.round(F.col('neutral') / label_total * 100, 1).alias('neutral_pct'),
    F.round(F.col('negative') / label_total * 100, 1).alias('negative_pct'),
).orderBy(F.desc('negative_pct'))

sentiment_distribution.show(20, truncate=False)

In [None]:
# ── 4. How does upvote ratio differ for aligned vs misaligned posts? ──
# Attach each subreddit's dominant sentiment to its posts, flag whether a
# post's own label equals it, then compare average upvote ratio and score
# between the two groups.

# dominant_sentiment is broadcast to the join rather than shuffled.
enriched = df.join(F.broadcast(dominant_sentiment), 'subreddit', 'left')

matches_dominant = F.col('sentiment_label') == F.col('dominant_sentiment')
alignment_keys = ['subreddit', 'sentiment_aligned']

aligned_analysis = (
    enriched
    .withColumn('sentiment_aligned', matches_dominant)
    .groupBy(*alignment_keys)
    .agg(
        F.count('*').alias('post_count'),
        F.round(F.avg('upvote_ratio'), 4).alias('avg_upvote_ratio'),
        F.round(F.avg('score'), 1).alias('avg_score'),
    )
    .orderBy(*alignment_keys)
)

print('=== ALIGNED VS MISALIGNED POST PERFORMANCE ===')
print('(sentiment_aligned=True means post matches subreddits dominant tone)')
aligned_analysis.show(40, truncate=False)

In [None]:
# Release the Spark session's resources, then confirm completion.
spark.stop()
# The check mark was stored as mis-decoded UTF-8 bytes; restored to U+2713.
print('Echo chamber analysis complete ✓')