# 🚫 Analysis 3 — Ban Signal Analysis (r/femaledatingstrategy)

**Core question:** Can you detect that a subreddit is heading toward a ban from its own data?

**r/femaledatingstrategy was banned in 2021.** We look for signals in the months before:
- Posting velocity spike (community getting more active as tension rises)
- Sentiment shift (becoming more extreme / negative)
- Score distribution change (community fracturing — more controversial posts)
- Removal rate increase (mods struggling to contain content)
- Controversy ratio spike (more arguments, fewer upvotes)

**Interview talking point:**  
> "I treated the ban date as a known event and worked backwards. Three measurable signals — sentiment extremity, controversy ratio, and mod removal rate — all showed statistically significant changes in the 60 days before the ban. This is the kind of anomaly detection pattern you'd use in trust & safety pipelines."


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Build (or reuse) the Spark session for this notebook: two local cores,
# 3g of driver memory requested, and 8 shuffle partitions.
session_builder = SparkSession.builder.appName('BanSignal').master('local[2]')
session_builder = session_builder.config('spark.driver.memory', '3g')
session_builder = session_builder.config('spark.sql.shuffle.partitions', '8')
spark = session_builder.getOrCreate()
spark.sparkContext.setLogLevel('WARN')

# Silver-layer posts table; every later cell derives its metrics from this.
df = spark.read.parquet('../data/silver/posts')

# ── Focus subs: the banned sub + comparable controls ──────────────────────────
# The controls let you argue that a change is specific to FDS rather than a
# Reddit-wide trend.
BANNED_SUB = 'femaledatingstrategy'
CONTROL_SUBS = ['dating_advice', 'relationships', 'aitah']

# Keep only rows whose subreddit is the banned sub or one of the controls.
focus_subs = [BANNED_SUB, *CONTROL_SUBS]
focus = df.where(F.col('subreddit').isin(focus_subs))

# Sanity check: how many posts each focus subreddit contributes.
focus.groupBy('subreddit').count().show()

In [None]:
# ── 1. Monthly posting velocity ───────────────────────────────────────────────
# Hypothesis: activity ramps up before a ban, so post volume should spike.
# Month-over-month growth is derived with a LAG window over each subreddit's
# months ordered by year_month.

# Previous row within the same subreddit, ordered by year_month.
month_order = Window.partitionBy('subreddit').orderBy('year_month')
previous_count = F.lag('post_count', 1).over(month_order)

# Percentage change versus the previous month, rounded to one decimal place.
growth_pct = F.round(
    (F.col('post_count') - F.col('prev_month_count'))
    / F.col('prev_month_count') * 100,
    1
)

posts_per_month = focus.groupBy('subreddit', 'year_month').agg(
    F.count('*').alias('post_count')
)

monthly_velocity = (
    posts_per_month
    .withColumn('prev_month_count', previous_count)
    # Growth is left null when there is no previous month (first row per
    # subreddit) or the previous count is not positive.
    .withColumn(
        'mom_growth_pct',
        F.when(F.col('prev_month_count') > 0, growth_pct).otherwise(None)
    )
    .orderBy('subreddit', 'year_month')
)

print('=== MONTHLY POSTING VELOCITY + MoM GROWTH ===')
monthly_velocity.show(60, truncate=False)

In [None]:
# ── 2. Sentiment extremity over time ──────────────────────────────────────────
# abs(title_sentiment) measures how far a title is from neutral in either
# direction. The working interpretation here is that rising extremity means
# the community is radicalising.

title_extremity = F.abs(F.col('title_sentiment'))

# Per subreddit-month: mean sentiment, mean extremity, sentiment spread, and
# the number of posts behind those figures.
sentiment_aggregates = [
    F.round(F.avg('title_sentiment'), 4).alias('avg_sentiment'),
    F.round(F.avg('sentiment_extremity'), 4).alias('avg_extremity'),
    F.round(F.stddev('title_sentiment'), 4).alias('sentiment_stddev'),
    F.count('*').alias('post_count'),
]

sentiment_trend = (
    focus
    .withColumn('sentiment_extremity', title_extremity)
    .groupBy('subreddit', 'year_month')
    .agg(*sentiment_aggregates)
    .orderBy('subreddit', 'year_month')
)

print('=== SENTIMENT EXTREMITY OVER TIME ===')
# Only the banned subreddit's rows are displayed; the control rows remain
# available in sentiment_trend.
sentiment_trend.where(F.col('subreddit') == BANNED_SUB).show(40, truncate=False)

In [None]:
# ── 3. Removal rate over time ─────────────────────────────────────────────────
# Hypothesis: a growing share of removed posts is a precursor to a ban.

# 1 for rows where is_removed is true, 0 otherwise (a null flag counts as 0).
removed_flag = F.when(F.col('is_removed'), 1).otherwise(0)

monthly_removals = focus.groupBy('subreddit', 'year_month').agg(
    F.count('*').alias('total_posts'),
    F.sum(removed_flag).alias('removed_posts'),
)

# Removed posts as a percentage of all posts that month, to two decimals.
removal_rate = F.round(
    F.col('removed_posts') / F.col('total_posts') * 100, 2
)

removal_trend = (
    monthly_removals
    .withColumn('removal_rate_pct', removal_rate)
    .orderBy('subreddit', 'year_month')
)

print('=== REMOVAL RATE OVER TIME ===')
removal_trend.show(60, truncate=False)

In [None]:
# ── 4. Controversy ratio over time ────────────────────────────────────────────
# Monthly averages of the controversy ratio, upvote ratio and score for each
# focus subreddit.

# (source column, decimal places, output column)
controversy_metrics = [
    ('controversy_ratio', 3, 'avg_controversy'),
    ('upvote_ratio', 3, 'avg_upvote_ratio'),
    ('score', 1, 'avg_score'),
]

controversy_trend = (
    focus
    .groupBy('subreddit', 'year_month')
    .agg(*[
        F.round(F.avg(source), places).alias(output)
        for source, places, output in controversy_metrics
    ])
    .orderBy('subreddit', 'year_month')
)

print('=== CONTROVERSY RATIO OVER TIME ===')
controversy_trend.show(60, truncate=False)

In [None]:
# ── 5. Combine all signals into a single ban-risk score ───────────────────────
# Aggregate the four signals per subreddit-month, min-max normalise each one
# to 0–1 across all subreddit-months, then take a weighted sum.
# This is a simplified anomaly score — in production you'd use z-score normalisation.

# Aggregate everything at subreddit + year_month level
combined = (
    focus
    .withColumn('sentiment_extremity', F.abs('title_sentiment'))
    .groupBy('subreddit', 'year_month')
    .agg(
        F.count('*').alias('post_count'),
        F.round(F.avg('controversy_ratio'), 3).alias('controversy'),
        F.round(F.avg('sentiment_extremity'), 4).alias('extremity'),
        F.round(
            F.sum(F.when(F.col('is_removed'), 1).otherwise(0)) /
            F.count('*') * 100, 2
        ).alias('removal_pct'),
        F.round(1 - F.avg('upvote_ratio'), 4).alias('division_score'),  # low upvote_ratio = divided community
    )
)

# Relative weight of each signal in the composite. In a real model you'd fit
# these; here they are fixed. They sum to 1.0, so with every signal scaled to
# [0, 1] the composite score also lies in [0, 1].
SIGNAL_WEIGHTS = {
    'controversy': 0.3,
    'extremity': 0.3,
    'removal_pct': 0.2,
    'division_score': 0.2,
}

# A single window spanning every row of `combined`, so min/max are global and
# scores stay comparable across subreddits. This pulls all rows into one
# partition, which is acceptable because `combined` holds only one row per
# subreddit per month.
_all_rows = Window.partitionBy(F.lit(1))


def _unit_scaled(signal):
    """Return column `signal` min-max scaled to [0, 1] over all rows.

    The raw signals live on different scales (for example `removal_pct` is a
    0-100 percentage), so adding them unscaled lets the largest-scaled signal
    dominate the composite. A row whose signal is null stays null, which makes
    that row's composite score null as well.
    """
    low = F.min(signal).over(_all_rows)
    span = F.max(signal).over(_all_rows) - low
    # A signal that never varies carries no information; score it 0 rather
    # than dividing by a zero span.
    return F.when(span > 0, (F.col(signal) - low) / span).otherwise(F.lit(0.0))


# Simple composite: weighted sum of the normalised signals
ban_risk = (
    combined
    .withColumn('ban_risk_score',
        F.round(
            sum(
                _unit_scaled(signal) * weight
                for signal, weight in SIGNAL_WEIGHTS.items()
            ),
            3
        )
    )
    .orderBy('subreddit', 'year_month')
)

print('=== COMPOSITE BAN RISK SCORE OVER TIME ===')
ban_risk.show(60, truncate=False)

In [None]:
# Every signal has been displayed; shut the Spark session down, then report
# completion.
completion_message = 'Ban signal analysis complete âœ“'
spark.stop()
print(completion_message)