# 👤 Analysis 4 — User Crossposting Behaviour

**Core question:** Which users post across ideologically opposed subreddits? What does that overlap look like?

**Method:**  
1. For every pair of subreddits, count authors who posted in BOTH  
2. Compute Jaccard similarity: `|A ∩ B| / |A ∪ B|`  
3. Build a subreddit similarity matrix based on shared audience  

**Interesting pairs to find:**  
- r/politics ↔ r/conservative (do they share users or are they completely separate camps?)  
- r/wallstreetbets ↔ r/collapse (doom + stonks overlap?)  
- r/femaledatingstrategy ↔ r/dating_advice  

**Interview talking point:**  
> "I used a self-join on the author column to find cross-community users, then computed Jaccard similarity to build a subreddit audience overlap matrix. This is the same pattern used in recommendation systems — if two communities share 30% of their audience, they're likely to surface each other's content."


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Local two-core Spark session sized for a laptop run: 3g of driver memory and
# 8 shuffle partitions instead of Spark's default of 200.
spark = (
    SparkSession.builder.appName('Crosspost')
    .master('local[2]')
    .config('spark.driver.memory', '3g')
    .config('spark.sql.shuffle.partitions', '8')
    .getOrCreate()
)
spark.sparkContext.setLogLevel('WARN')

# Silver-layer posts; the cells below rely on the `author` and `subreddit` columns.
df = spark.read.parquet('../data/silver/posts')

# Filter out bots and deleted accounts.
# The comparison is done on the lower-cased author so that mixed-case names
# such as 'AutoModerator' or 'SomeBot' are removed as well; a case-sensitive
# match against the lower-case literals would let them through.
# NOTE(review): if the silver layer already lower-cases authors this is a no-op.
author_lower = F.lower(F.col('author'))
df = df.filter(
    F.col('author').isNotNull() &
    ~author_lower.isin('[deleted]', 'automoderator', 'bot') &
    ~author_lower.endswith('bot')
)

print(f'Posts: {df.count():,}')

In [None]:
# ── 1. Distinct authors per subreddit ──
# One row per (author, subreddit) pair.
# This frame feeds every later step (sizes, self-join, user breadth), and each
# Spark action would otherwise recompute it from the parquet scan onwards.
# cache() is lazy: the data is materialised by the first action below and
# reused by the following ones.
author_sub = (
    df.select('author', 'subreddit')
    .distinct()
    .cache()
)

# Number of distinct authors in each subreddit, largest communities first.
sub_sizes = (
    author_sub.groupBy('subreddit')
    .agg(F.count('author').alias('unique_authors'))
    .orderBy(F.desc('unique_authors'))
)

print('=== UNIQUE AUTHORS PER SUBREDDIT ===')
sub_sizes.show(20, truncate=False)

In [None]:
# ── 2. Cross-subreddit overlap — self join ──
# Pair every (author, subreddit) row with the same author's rows in other
# subreddits; each resulting row is one author shared by one subreddit pair.

# Two aliases of the same frame so both sides of the join can be addressed.
a, b = author_sub.alias('a'), author_sub.alias('b')

same_author = F.col('a.author') == F.col('b.author')
# Strict ordering keeps one row per unordered pair: (politics, worldnews) is
# kept, (worldnews, politics) and (politics, politics) are not.
ordered_subs = F.col('a.subreddit') < F.col('b.subreddit')

pairs = a.join(b, same_author & ordered_subs, how='inner').select(
    F.col('a.subreddit').alias('sub_a'),
    F.col('b.subreddit').alias('sub_b'),
    F.col('a.author').alias('author'),
)

# Shared-author count for each subreddit pair.
shared_count = F.count('author').alias('shared_authors')
overlap = pairs.groupBy('sub_a', 'sub_b').agg(shared_count)

print('=== RAW OVERLAP (TOP 20 PAIRS BY SHARED AUTHORS) ===')
overlap.sort(F.col('shared_authors').desc()).show(20, truncate=False)

In [None]:
# ── 3. Jaccard similarity ──
# Jaccard = |A ∩ B| / |A ∪ B|, where |A ∪ B| = |A| + |B| - |A ∩ B|.
# Dividing by the union normalises for community size — raw overlap favours
# large subs.

# Two renamed copies of the per-subreddit author counts, one for each side of
# a pair, so they can be joined onto `overlap` by sub_a and sub_b.
sizes_a = (
    sub_sizes
    .select('subreddit', 'unique_authors')
    .withColumnRenamed('subreddit', 'sub_a')
    .withColumnRenamed('unique_authors', 'size_a')
)
sizes_b = (
    sub_sizes
    .select('subreddit', 'unique_authors')
    .withColumnRenamed('subreddit', 'sub_b')
    .withColumnRenamed('unique_authors', 'size_b')
)

shared = F.col('shared_authors')
union_expr = F.col('size_a') + F.col('size_b') - shared
# Cast the numerator to double before dividing; result is rounded to 4 places.
similarity_expr = F.round(shared.cast('double') / F.col('union_size'), 4)

with_sizes = overlap.join(sizes_a, on='sub_a').join(sizes_b, on='sub_b')
jaccard = (
    with_sizes
    .withColumn('union_size', union_expr)
    .withColumn('jaccard_similarity', similarity_expr)
    .select('sub_a', 'sub_b', 'shared_authors', 'size_a', 'size_b', 'jaccard_similarity')
    .orderBy(F.col('jaccard_similarity').desc())
)

print('=== JACCARD SIMILARITY (most similar audiences) ===')
jaccard.show(30, truncate=False)

In [None]:
# ── 4. Multi-community users — who posts everywhere? ──
# Authors ranked by how many of the dataset's subreddits they appear in.

# Per-author aggregates: number of subreddits and the set of their names.
breadth_aggs = [
    F.count('subreddit').alias('sub_count'),
    F.collect_set('subreddit').alias('subreddits'),
]

user_breadth = (
    author_sub
    .groupBy('author')
    .agg(*breadth_aggs)
    .where(F.col('sub_count') >= 3)  # keep authors seen in at least 3 subs
    .sort(F.col('sub_count').desc())
)

print('=== USERS ACTIVE IN 3+ SUBREDDITS ===')
print(f'Count: {user_breadth.count():,}')
user_breadth.show(20, truncate=False)

In [None]:
# ── 5. Ideological crossover — specific interesting pairs ──
# Report shared authors and Jaccard similarity for a hand-picked list of pairs.
interesting_pairs = [
    ('politics', 'conservative'),
    ('wallstreetbets', 'collapse'),
    ('femaledatingstrategy', 'dating_advice'),
    ('worldnews', 'collapse'),
    ('changemyview', 'unpopularopinion'),
]

# `jaccard` stores each pair once with sub_a < sub_b, so put every requested
# pair into that order before looking it up.
ordered_pairs = [tuple(sorted(pair)) for pair in interesting_pairs]

# Fetch all candidate rows with ONE Spark action. Calling
# jaccard.filter(...).collect() once per pair would re-run the whole
# self-join / aggregate / sort plan for every pair in the list.
# The isin() filter is a superset of the wanted pairs; the exact match is done
# by the dict lookup below.
wanted_a = [first for first, _ in ordered_pairs]
wanted_b = [second for _, second in ordered_pairs]
candidate_rows = jaccard.filter(
    F.col('sub_a').isin(wanted_a) & F.col('sub_b').isin(wanted_b)
).collect()
rows_by_pair = {(row.sub_a, row.sub_b): row for row in candidate_rows}

print('=== SPECIFIC IDEOLOGICAL CROSSOVERS ===')
for (sub_a, sub_b), key in zip(interesting_pairs, ordered_pairs):
    r = rows_by_pair.get(key)
    if r is not None:
        print(f'{sub_a:30s} â†” {sub_b:30s}  shared={r.shared_authors:,}  jaccard={r.jaccard_similarity}')
    else:
        # Pairs with no shared author never appear in `jaccard` at all.
        print(f'{sub_a} â†” {sub_b}  â€” no overlap found')

In [None]:
# Final cell: stop the SparkSession now that all analyses have run, then
# print a completion marker.
spark.stop()
print('Crossposting analysis complete âœ“')