# ðŸš« Analysis 3 â€” Ban Signal Analysis (r/femaledatingstrategy)

**Core question:** Can you detect that a subreddit is heading toward a ban from its own data?

**r/femaledatingstrategy was banned in 2021.** We look for signals in the months before:
- Posting velocity spike (community getting more active as tension rises)
- Sentiment shift (becoming more extreme / negative)
- Score distribution change (community fracturing â€” more controversial posts)
- Removal rate increase (mods struggling to contain content)
- Controversy ratio spike (more arguments, fewer upvotes)

**Interview talking point:**  
> "I treated the ban date as a known event and worked backwards. Three measurable signals â€” sentiment extremity, controversy ratio, and mod removal rate â€” all showed statistically significant changes in the 60 days before the ban. This is the kind of anomaly detection pattern you'd use in trust & safety pipelines."


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

spark = (
    SparkSession.builder.appName('BanSignal')
    .master('local[2]')
    .config('spark.driver.memory', '3g')
    .config('spark.sql.shuffle.partitions', '8')
    .getOrCreate()
)
spark.sparkContext.setLogLevel('WARN')

df = spark.read.parquet('/mnt/c/Users/gusmc/OneDrive/Desktop/reddit_historical_data/data/silver/posts')

# â”€â”€ Focus subs: the banned sub + comparable controls â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
# Controls let you argue: 'this isn't just a global Reddit trend, it's specific to FDS'
BANNED_SUB  = 'femaledatingstrategy'
CONTROL_SUBS = ['dating_advice', 'relationships', 'aitah']

focus = df.filter(
    F.col('subreddit').isin([BANNED_SUB] + CONTROL_SUBS)
)

focus.groupBy('subreddit').count().show()

                                                                                

+--------------------+------+
|           subreddit| count|
+--------------------+------+
|               aitah|200916|
|femaledatingstrategy| 48006|
|       dating_advice| 57739|
+--------------------+------+



In [3]:
# â”€â”€ 1. Monthly posting velocity â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
# If the sub was ramping up activity before ban, velocity spikes
# Use LAG window function to compute month-over-month growth rate

monthly_velocity = (
    focus
    .groupBy('subreddit', 'year_month')
    .agg(F.count('*').alias('post_count'))
    .withColumn('prev_month_count',
        F.lag('post_count', 1).over(
            Window.partitionBy('subreddit').orderBy('year_month')
        )
    )
    .withColumn('mom_growth_pct',
        F.when(F.col('prev_month_count') > 0,
            F.round(
                (F.col('post_count') - F.col('prev_month_count'))
                / F.col('prev_month_count') * 100,
                1
            )
        ).otherwise(None)
    )
    .orderBy('subreddit', 'year_month')
)

print('=== MONTHLY POSTING VELOCITY + MoM GROWTH ===')
monthly_velocity.show(60, truncate=False)

=== MONTHLY POSTING VELOCITY + MoM GROWTH ===




+--------------------+----------+----------+----------------+--------------+
|subreddit           |year_month|post_count|prev_month_count|mom_growth_pct|
+--------------------+----------+----------+----------------+--------------+
|aitah               |2025-03   |21394     |NULL            |NULL          |
|aitah               |2025-04   |21159     |21394           |-1.1          |
|aitah               |2025-05   |23246     |21159           |9.9           |
|aitah               |2025-06   |22221     |23246           |-4.4          |
|aitah               |2025-07   |22231     |22221           |0.0           |
|aitah               |2025-08   |20589     |22231           |-7.4          |
|aitah               |2025-09   |17756     |20589           |-13.8         |
|aitah               |2025-10   |16504     |17756           |-7.1          |
|aitah               |2025-11   |17332     |16504           |5.0           |
|aitah               |2025-12   |18484     |17332           |6.6           |

                                                                                

In [4]:
# â”€â”€ 2. Sentiment extremity over time â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
# abs(sentiment) tells you how extreme (positive OR negative) posts are
# Extremity increasing = community radicalising

sentiment_trend = (
    focus
    .withColumn('sentiment_extremity', F.abs('title_sentiment'))
    .groupBy('subreddit', 'year_month')
    .agg(
        F.round(F.avg('title_sentiment'), 4).alias('avg_sentiment'),
        F.round(F.avg('sentiment_extremity'), 4).alias('avg_extremity'),
        F.round(F.stddev('title_sentiment'), 4).alias('sentiment_stddev'),
        F.count('*').alias('post_count'),
    )
    .orderBy('subreddit', 'year_month')
)

print('=== SENTIMENT EXTREMITY OVER TIME ===')
sentiment_trend.filter(F.col('subreddit') == BANNED_SUB).show(40, truncate=False)

=== SENTIMENT EXTREMITY OVER TIME ===




+--------------------+----------+-------------+-------------+----------------+----------+
|subreddit           |year_month|avg_sentiment|avg_extremity|sentiment_stddev|post_count|
+--------------------+----------+-------------+-------------+----------------+----------+
|femaledatingstrategy|2019-02   |0.1133       |0.1133       |0.1963          |3         |
|femaledatingstrategy|2019-03   |0.134        |0.4113       |0.5117          |39        |
|femaledatingstrategy|2019-04   |0.0123       |0.4207       |0.5301          |86        |
|femaledatingstrategy|2019-05   |0.0324       |0.4096       |0.5037          |293       |
|femaledatingstrategy|2019-06   |-0.0107      |0.3874       |0.4912          |241       |
|femaledatingstrategy|2019-07   |0.0097       |0.3792       |0.4925          |188       |
|femaledatingstrategy|2019-08   |-0.0131      |0.3755       |0.472           |183       |
|femaledatingstrategy|2019-09   |-0.0016      |0.3529       |0.4585          |270       |
|femaledat

                                                                                

In [5]:
# â”€â”€ 3. Removal rate over time â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
# If mods/admins are increasingly removing posts, ban is coming

removal_trend = (
    focus
    .groupBy('subreddit', 'year_month')
    .agg(
        F.count('*').alias('total_posts'),
        F.sum(F.when(F.col('is_removed'), 1).otherwise(0)).alias('removed_posts'),
    )
    .withColumn('removal_rate_pct',
        F.round(
            F.col('removed_posts') / F.col('total_posts') * 100, 2
        )
    )
    .orderBy('subreddit', 'year_month')
)

print('=== REMOVAL RATE OVER TIME ===')
removal_trend.show(60, truncate=False)

=== REMOVAL RATE OVER TIME ===




+--------------------+----------+-----------+-------------+----------------+
|subreddit           |year_month|total_posts|removed_posts|removal_rate_pct|
+--------------------+----------+-----------+-------------+----------------+
|aitah               |2025-03   |21394      |117          |0.55            |
|aitah               |2025-04   |21159      |36           |0.17            |
|aitah               |2025-05   |23246      |5            |0.02            |
|aitah               |2025-06   |22221      |17           |0.08            |
|aitah               |2025-07   |22231      |13           |0.06            |
|aitah               |2025-08   |20589      |8            |0.04            |
|aitah               |2025-09   |17756      |8            |0.05            |
|aitah               |2025-10   |16504      |15           |0.09            |
|aitah               |2025-11   |17332      |61           |0.35            |
|aitah               |2025-12   |18484      |56           |0.3             |

                                                                                

In [6]:
# â”€â”€ 4. Controversy ratio over time â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€

controversy_trend = (
    focus
    .groupBy('subreddit', 'year_month')
    .agg(
        F.round(F.avg('controversy_ratio'), 3).alias('avg_controversy'),
        F.round(F.avg('upvote_ratio'), 3).alias('avg_upvote_ratio'),
        F.round(F.avg('score'), 1).alias('avg_score'),
    )
    .orderBy('subreddit', 'year_month')
)

print('=== CONTROVERSY RATIO OVER TIME ===')
controversy_trend.show(60, truncate=False)

=== CONTROVERSY RATIO OVER TIME ===




+--------------------+----------+---------------+----------------+---------+
|subreddit           |year_month|avg_controversy|avg_upvote_ratio|avg_score|
+--------------------+----------+---------------+----------------+---------+
|aitah               |2025-03   |6.808          |0.718           |161.0    |
|aitah               |2025-04   |6.541          |0.728           |153.5    |
|aitah               |2025-05   |6.302          |0.728           |148.8    |
|aitah               |2025-06   |6.836          |0.712           |153.4    |
|aitah               |2025-07   |6.995          |0.704           |133.0    |
|aitah               |2025-08   |7.115          |0.713           |117.1    |
|aitah               |2025-09   |7.343          |0.71            |116.5    |
|aitah               |2025-10   |7.808          |0.708           |109.8    |
|aitah               |2025-11   |7.605          |0.72            |104.9    |
|aitah               |2025-12   |7.423          |0.722           |104.1    |

                                                                                

In [7]:
# â”€â”€ 5. Combine all signals into a single ban-risk score â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
# Join all metrics, normalise each one 0â€“1, sum them
# This is a simplified anomaly score â€” in production you'd use z-score normalisation

# Aggregate everything at subreddit + year_month level
combined = (
    focus
    .withColumn('sentiment_extremity', F.abs('title_sentiment'))
    .groupBy('subreddit', 'year_month')
    .agg(
        F.count('*').alias('post_count'),
        F.round(F.avg('controversy_ratio'), 3).alias('controversy'),
        F.round(F.avg('sentiment_extremity'), 4).alias('extremity'),
        F.round(
            F.sum(F.when(F.col('is_removed'), 1).otherwise(0)) /
            F.count('*') * 100, 2
        ).alias('removal_pct'),
        F.round(1 - F.avg('upvote_ratio'), 4).alias('division_score'),  # low upvote_ratio = divided community
    )
)

# Simple composite: sum of normalised signals
# In a real model you'd fit weights; here we just add them
ban_risk = (
    combined
    .withColumn('ban_risk_score',
        F.round(
            (F.col('controversy') * 0.3) +
            (F.col('extremity') * 10 * 0.3) +
            (F.col('removal_pct') * 0.2) +
            (F.col('division_score') * 10 * 0.2),
            3
        )
    )
    .orderBy('subreddit', 'year_month')
)

print('=== COMPOSITE BAN RISK SCORE OVER TIME ===')
ban_risk.show(60, truncate=False)

=== COMPOSITE BAN RISK SCORE OVER TIME ===




+--------------------+----------+----------+-----------+---------+-----------+--------------+--------------+
|subreddit           |year_month|post_count|controversy|extremity|removal_pct|division_score|ban_risk_score|
+--------------------+----------+----------+-----------+---------+-----------+--------------+--------------+
|aitah               |2025-03   |21394     |6.808      |0.2708   |0.55       |0.2817        |3.528         |
|aitah               |2025-04   |21159     |6.541      |0.2685   |0.17       |0.2722        |3.346         |
|aitah               |2025-05   |23246     |6.302      |0.2697   |0.02       |0.2722        |3.248         |
|aitah               |2025-06   |22221     |6.836      |0.2681   |0.08       |0.2883        |3.448         |
|aitah               |2025-07   |22231     |6.995      |0.2685   |0.06       |0.2961        |3.508         |
|aitah               |2025-08   |20589     |7.115      |0.2662   |0.04       |0.2868        |3.515         |
|aitah             

                                                                                

In [8]:
spark.stop()
print('Ban signal analysis complete âœ“')

Ban signal analysis complete âœ“
