# Compare Twitter and Reddit Populations

This script evaluates how similar subsets of Twitter and Reddit populations are, both within each platform and across platforms.
The populations we test are:
- Random US Twitter users active in 2015-2018
    - Must have tweeted >= 100 times in this time frame
- Politically engaged US Twitter users active in 2015-2018
    - Must have tweeted >= 100 times in this time frame
    - Must follow at least 5 politicians
- IRA Twitter accounts identified by Twitter
- Random Reddit accounts active in 2015-2018
    - Must have posted/commented >= 100 times in this time frame
- Politically engaged Redditors active in 2015-2018
    - Must have posted/commented >= 100 times in subreddits /r/politics identifies as US-political in this time frame
- IRA Reddit accounts identified by Reddit

In [None]:
%matplotlib inline

In [None]:
import json
import time
import string
import datetime
import matplotlib

import scipy.stats

import numpy as np
import pandas as pd
import statsmodels.api as sm

import matplotlib.pyplot as plt

In [None]:
import sklearn.preprocessing
import sklearn.metrics

In [None]:
from utils.youtube import strip_video_id_from_url

In [None]:
def convert_date(date):
    """Parse a ``created_at`` string such as ``"Mon Jan 01 00:00:00 +0000 2018"``.

    The ``+0000`` offset is matched literally, so the returned ``datetime``
    is naive. Used as the ``created_at`` converter for the Twitter sample
    CSVs loaded in this file.
    """
    created_at_format = "%a %b %d %H:%M:%S +0000 %Y"
    parsed = datetime.datetime.strptime(date, created_at_format)
    return parsed

def convert_time(timestamp):
    """Convert a Unix epoch timestamp (seconds) to a naive UTC ``datetime``.

    Args:
        timestamp: Seconds since the epoch, as a number or as a string.
            Strings may hold an integer ("1500000000") or a float
            ("1500000000.0"); both are accepted.

    Returns:
        A naive ``datetime.datetime`` expressed in UTC.

    Raises:
        ValueError: If a string timestamp is not numeric.
    """
    seconds = float(timestamp) if isinstance(timestamp, str) else timestamp

    # Go through an aware UTC datetime and drop tzinfo again; this yields the
    # same naive value as the deprecated datetime.utcfromtimestamp().
    aware = datetime.datetime.fromtimestamp(seconds, tz=datetime.timezone.utc)
    return aware.replace(tzinfo=None)

def get_top_tlds(links_df, user_id_field, top_n=100):
    """Return the ``top_n`` domains shared by the most distinct users.

    Args:
        links_df: DataFrame with a ``"tld"`` column and a ``user_id_field``
            column, one row per shared link.
        user_id_field: Name of the column identifying the sharing user.
        top_n: Maximum number of domains to return.

    Returns:
        List of ``tld`` values ordered by descending distinct-user count.
        Ties keep the order in which ``groupby`` yields the groups.
    """
    users_per_tld = {
        tld: len(set(rows[user_id_field]))
        for tld, rows in links_df.groupby("tld")
    }

    # The sort is stable, so equal counts stay in group order.
    ranked = sorted(users_per_tld.items(), key=lambda pair: pair[1], reverse=True)
    return [tld for tld, _ in ranked[:top_n]]

def links_df_to_shares(links_df, domains, user_id_field):
    """Count, per user, how often each domain in ``domains`` was shared.

    Args:
        links_df: DataFrame with a ``"tld"`` column and a ``user_id_field``
            column, one row per shared link.
        domains: List of domains; one output column per entry, in this order.
        user_id_field: Name of the column identifying the sharing user.

    Returns:
        DataFrame with one row per user: the user id followed by the share
        count for each domain (0 when the user never shared it).
    """
    rows = []
    for user, posts in links_df.groupby(user_id_field):
        counts = {tld: n for tld, n in posts["tld"].value_counts().items()}
        rows.append([user, *(counts.get(domain, 0) for domain in domains)])

    return pd.DataFrame(rows, columns=[user_id_field] + domains)

def links_to_norm_matrix(links_df, domains, user_id_field):
    """Build a row-wise L2-normalised user-by-domain share matrix.

    Args:
        links_df: DataFrame with a ``"tld"`` column and a ``user_id_field``
            column, one row per shared link.
        domains: Iterable of domains to use as columns. A non-list iterable
            (set, frozenset, tuple, ...) is converted with ``list()``, so the
            column order is that iterable's iteration order.
        user_id_field: Name of the column identifying the sharing user.

    Returns:
        The result of ``sklearn.preprocessing.normalize`` on the per-user
        share counts: one row per user, one column per domain.
    """
    # links_df_to_shares concatenates `domains` onto a list to build its
    # column labels, so anything that is not already a list is materialised.
    if not isinstance(domains, list):
        domains = list(domains)

    shares_df = links_df_to_shares(links_df, domains, user_id_field)
    return sklearn.preprocessing.normalize(shares_df[domains], norm="l2", axis=1)

def get_top_channels(yt_df, user_id_field, top_n=100):
    """Return the ``top_n`` channels shared by the most distinct users.

    Args:
        yt_df: DataFrame with a ``"channel_id"`` column and a
            ``user_id_field`` column, one row per shared video link.
        user_id_field: Name of the column identifying the sharing user.
        top_n: Maximum number of channels to return.

    Returns:
        List of ``channel_id`` values ordered by descending distinct-user
        count. Ties keep the order in which ``groupby`` yields the groups.
    """
    audience_size = {}
    for channel, shares in yt_df.groupby("channel_id"):
        audience_size[channel] = len(set(shares[user_id_field]))

    by_audience = sorted(audience_size, key=lambda chan: audience_size[chan], reverse=True)
    return by_audience[:top_n]

def channels_df_to_shares(links_df, channels, user_id_field):
    """Count, per user, how often each channel in ``channels`` was shared.

    Args:
        links_df: DataFrame with a ``"channel_id"`` column and a
            ``user_id_field`` column, one row per shared video link.
        channels: List of channel ids; one output column per entry, in order.
        user_id_field: Name of the column identifying the sharing user.

    Returns:
        DataFrame with one row per user: the user id followed by the share
        count for each channel (0 when the user never shared it).
    """
    header = [user_id_field] + channels

    records = []
    for user, user_posts in links_df.groupby(user_id_field):
        tallies = dict(user_posts["channel_id"].value_counts().items())
        records.append([user] + [tallies.get(chan, 0) for chan in channels])

    return pd.DataFrame(records, columns=header)

def channels_to_norm_matrix(links_df, channels, user_id_field):
    """Build a row-wise L2-normalised user-by-channel share matrix.

    Args:
        links_df: DataFrame with a ``"channel_id"`` column and a
            ``user_id_field`` column, one row per shared video link.
        channels: Iterable of channel ids to use as columns. A non-list
            iterable (set, frozenset, tuple, ...) is converted with
            ``list()``, so the column order is its iteration order.
        user_id_field: Name of the column identifying the sharing user.

    Returns:
        The result of ``sklearn.preprocessing.normalize`` on the per-user
        share counts: one row per user, one column per channel.
    """
    # channels_df_to_shares concatenates `channels` onto a list to build its
    # column labels, so anything that is not already a list is materialised.
    if not isinstance(channels, list):
        channels = list(channels)

    shares_df = channels_df_to_shares(links_df, channels, user_id_field)
    return sklearn.preprocessing.normalize(shares_df[channels], norm="l2", axis=1)


In [None]:
# Load the domain list and keep a lower-cased set of its "domain" column
political_domains_df = pd.read_csv("DomainIdeology.csv")
political_domains = set(map(str.lower, political_domains_df["domain"]))
print("Political Domains:", len(political_domains))

In [None]:
# Number of most widely shared domains kept per population
#. (passed as top_n to get_top_tlds)
top_n_domains = 150
# Number of most widely shared YouTube channels kept per population
#. (passed as top_n to get_top_channels)
top_n_channels = 250
# Number of bootstrap resamples drawn for each mean-similarity distribution
bootstrap_count = 5000

In [None]:
# Maps link-shortener domains to the canonical domain they stand for.
#. Applied to the "tld" column of every links DataFrame so that shortened
#. and full links are counted under the same domain.
short_domain_map = {
    "abcn.ws": "abcnews.go.com",
    "amzn.to": "amazon.com",
    "apne.ws": "apnews.com",
    "apple.co": "apple.com",
    "bbc.in": "bbc.com",
    "ble.ac": "bleacherreport.com",
    "bloom.bg": "bloomberg.com",
    "bzfd.it": "buzzfeed.com",
    "cbsloc.al": "cbslocal.com",
    "cnb.cx": "cnbc.com",
    "cnn.it": "cnn.com",
    "dailym.ai": "dailymail.co.uk",
    "econ.st": "economist.com",
    "es.pn": "espn.com",
    "fxn.ws": "foxnews.com",
    "hill.cm": "thehill.com",
    "huff.to": "huffingtonpost.com",
    "lat.ms": "latimes.com",
    "lnkd.in": "linkedin.com",
    "n.pr": "npr.org",
    "nbcnews.to": "nbcnews.com",
    "nydn.us": "nydailynews.com",
    "nyp.st": "nypost.com",
    "nyti.ms": "nytimes.com",
    "on.rt.com": "rt.com",
    "on.wsj.com": "wsj.com",
    "politi.co": "politico.com",
    "redd.it": "reddit.com",
    "reut.rs": "reuters.com",
    "thebea.st": "thedailybeast.com",
    "ti.me": "time.com",
    "tmblr.co": "tumblr.com",
    "usat.ly": "usatoday.com",
    "wapo.st": "washingtonpost.com",
    "wp.me": "wordpress.com",
    "wpo.st": "washingtonpost.com",
    "yhoo.it": "yahoo.com",
    "youtu.be": "youtube.com",
}

In [None]:
# Load the per-link records for the three Twitter populations.
#. The two sample files are parsed with convert_date; the IRA file uses a
#. "%Y-%m-%d %H:%M:%S" timestamp layout and gets its own converter.
twitter_links_df_rand = pd.read_csv(
    "twitter_random_us_sample_links.csv",
    converters={"created_at": convert_date},
)
twitter_links_df_poli = pd.read_csv(
    "twitter_political_us_sample_links.csv",
    converters={"created_at": convert_date},
)
twitter_links_df_ira = pd.read_csv(
    "twitter_ira_links.csv",
    converters={
        "created_at": lambda raw: datetime.datetime.strptime(raw, "%Y-%m-%d %H:%M:%S"),
    },
)

In [None]:
# Replace known link-shortener domains with their canonical domain;
#. anything not in short_domain_map is left unchanged
for _links_df in (twitter_links_df_rand, twitter_links_df_poli, twitter_links_df_ira):
    _links_df["tld"] = _links_df["tld"].apply(lambda tld: short_domain_map.get(tld, tld))

In [None]:

# Rank domains within each Twitter population by number of distinct sharers
twitter_top_tlds_rand = get_top_tlds(twitter_links_df_rand, "user_id", top_n_domains)
twitter_top_tlds_poli = get_top_tlds(twitter_links_df_poli, "user_id", top_n_domains)
twitter_top_tlds_ira = get_top_tlds(twitter_links_df_ira, "user_id", top_n_domains)

# Population label -> set of that population's top domains
top_tld_map_twitter = dict(
    random=set(twitter_top_tlds_rand),
    political=set(twitter_top_tlds_poli),
    ira=set(twitter_top_tlds_ira),
)

for population, tlds in top_tld_map_twitter.items():
    print(population, ":", len(tlds))

# twitter_merged_tlds = set(twitter_top_tlds_rand)\
#     .intersection(twitter_top_tlds_poli)
# #     .intersection(twitter_top_tlds_ira)
# print("Merged Top TLDs:", len(twitter_merged_tlds))


In [None]:
# Load the per-link records for the three Reddit populations;
#. created_at is parsed from an epoch timestamp by convert_time
reddit_links_df_rand = pd.read_csv(
    "reddit_random_links.csv",
    converters={"created_at": convert_time},
)
reddit_links_df_poli = pd.read_csv(
    "reddit_political_links.csv",
    converters={"created_at": convert_time},
)
reddit_links_df_ira = pd.read_csv(
    "reddit_troll_links.csv",
    converters={"created_at": convert_time},
)

In [None]:
# Replace known link-shortener domains with their canonical domain;
#. anything not in short_domain_map is left unchanged
for _links_df in (reddit_links_df_rand, reddit_links_df_poli, reddit_links_df_ira):
    _links_df["tld"] = _links_df["tld"].apply(lambda tld: short_domain_map.get(tld, tld))

In [None]:

# Rank domains within each Reddit population by number of distinct sharers
reddit_top_tlds_rand = get_top_tlds(reddit_links_df_rand, "user_name", top_n_domains)
reddit_top_tlds_poli = get_top_tlds(reddit_links_df_poli, "user_name", top_n_domains)
reddit_top_tlds_ira = get_top_tlds(reddit_links_df_ira, "user_name", top_n_domains)

# Population label -> set of that population's top domains
top_tld_map_reddit = dict(
    random=set(reddit_top_tlds_rand),
    political=set(reddit_top_tlds_poli),
    ira=set(reddit_top_tlds_ira),
)

for population, tlds in top_tld_map_reddit.items():
    print(population, ":", len(tlds))

# reddit_merged_tlds = set(reddit_top_tlds_rand)\
#     .intersection(reddit_top_tlds_poli)
# #     .intersection(reddit_top_tlds_ira)
# print("Merged Top TLDs:", len(reddit_merged_tlds))

In [None]:
# cross_platform_tlds = twitter_merged_tlds.intersection(reddit_merged_tlds)

# # Pop off these ultra-common TLDs. 
# #. The motivation here is that twitter.com is injected 
# #.  when a Twitter user retweets, and sharing within 
# #.  Reddit through crossposting increases its prevalence.
# #.  These phenomena will make activity across populations
# #.  appear artificially more similar than they ought to be.
# # cross_platform_tlds.remove("twitter.com")
# # cross_platform_tlds.remove("reddit.com")

# print("Cross-Platform Top TLDs:", len(cross_platform_tlds))

In [None]:
# print("Cross-Platform TLDs:")
# for x in sorted(cross_platform_tlds):
#     print("\t", x)

In [None]:
# print("TLDs in Reddit but not Twitter:")
# for tld in sorted(reddit_merged_tlds.difference(twitter_merged_tlds)):
#     print("\t", tld)

In [None]:
# print("TLDs in Twitter but not Reddit:")
# for tld in sorted(twitter_merged_tlds.difference(reddit_merged_tlds)):
#     print("\t", tld)

In [None]:
# Use the following to test whether focusing only on political domains 
#. changes outcomes
# top_tld_map_twitter = {k: political_domains for k in top_tld_map_twitter}
# top_tld_map_reddit = {k: political_domains for k in top_tld_map_reddit}

In [None]:
# Domains present in the top lists of both populations of each pair.
#. Cross-population comparisons are restricted to these shared domains so
#. that both matrices are built over the same columns.
twitter_overlap_rand_poli = top_tld_map_twitter["random"].intersection(top_tld_map_twitter["political"])
twitter_overlap_rand_ira = top_tld_map_twitter["random"].intersection(top_tld_map_twitter["ira"])
twitter_overlap_poli_ira = top_tld_map_twitter["political"].intersection(top_tld_map_twitter["ira"])

# Per-user, L2-normalised share vectors over each population's own top domains
twitter_user_links_mat_rand = links_to_norm_matrix(twitter_links_df_rand, top_tld_map_twitter["random"], "user_id")
twitter_user_links_mat_poli = links_to_norm_matrix(twitter_links_df_poli, top_tld_map_twitter["political"], "user_id")
twitter_user_links_mat_ira = links_to_norm_matrix(twitter_links_df_ira, top_tld_map_twitter["ira"], "user_id")

# Calculate pairwise similarity among users within each population
t2t_rand_rand_sim = sklearn.metrics.pairwise.cosine_similarity(twitter_user_links_mat_rand, twitter_user_links_mat_rand)
t2t_poli_poli_sim = sklearn.metrics.pairwise.cosine_similarity(twitter_user_links_mat_poli, twitter_user_links_mat_poli)
t2t_ira_ira_sim = sklearn.metrics.pairwise.cosine_similarity(twitter_user_links_mat_ira, twitter_user_links_mat_ira)

# Calculate pairwise similarity among users across populations
t2t_ira_rand_sim = sklearn.metrics.pairwise.cosine_similarity(
    links_to_norm_matrix(twitter_links_df_ira, twitter_overlap_rand_ira, "user_id"), 
    links_to_norm_matrix(twitter_links_df_rand, twitter_overlap_rand_ira, "user_id"))
t2t_ira_poli_sim = sklearn.metrics.pairwise.cosine_similarity(
    links_to_norm_matrix(twitter_links_df_ira, twitter_overlap_poli_ira, "user_id"), 
    links_to_norm_matrix(twitter_links_df_poli, twitter_overlap_poli_ira, "user_id"))
t2t_rand_poli_sim = sklearn.metrics.pairwise.cosine_similarity(
    links_to_norm_matrix(twitter_links_df_rand, twitter_overlap_rand_poli, "user_id"), 
    links_to_norm_matrix(twitter_links_df_poli, twitter_overlap_rand_poli, "user_id"))

# Collapse similarities down to get the mean similarity for each user on the left to all users on the right
#. For the within-population groups the user's similarity to themselves is removed:
#. the diagonal entry is subtracted and the user count is reduced by 1.
#. The actual diagonal is used rather than a constant 1 because a user with no
#. links in the selected domains has an all-zero row, whose self-similarity is 0.
t2t_rand_rand_sim_avg = (np.sum(t2t_rand_rand_sim, axis=1) - np.diag(t2t_rand_rand_sim)) / (twitter_user_links_mat_rand.shape[0] - 1)
t2t_poli_poli_sim_avg = (np.sum(t2t_poli_poli_sim, axis=1) - np.diag(t2t_poli_poli_sim)) / (twitter_user_links_mat_poli.shape[0] - 1)
t2t_ira_ira_sim_avg = (np.sum(t2t_ira_ira_sim, axis=1) - np.diag(t2t_ira_ira_sim)) / (twitter_user_links_mat_ira.shape[0] - 1)

# Across-population groups have no self-pairs, so a plain row mean is correct
t2t_ira_rand_sim_avg = np.mean(t2t_ira_rand_sim, axis=1)
t2t_ira_poli_sim_avg = np.mean(t2t_ira_poli_sim, axis=1)
t2t_rand_poli_sim_avg = np.mean(t2t_rand_poli_sim, axis=1)

# # Plot what these similarities look like
# plt.hist(t2t_ira_rand_sim_avg, bins=20, density=True, alpha=0.35, label="ira-rand")
# plt.hist(t2t_ira_poli_sim_avg, bins=20, density=True, alpha=0.35, label="ira-poli")
# plt.hist(t2t_ira_ira_sim_avg, bins=20, density=True, alpha=0.35, label="ira-ira")

# plt.legend()
# plt.show()

# Show similarity distribution within populations in this platform.
#. Each bootstrap list holds bootstrap_count means, each taken over a
#. with-replacement resample (same size as the original) of the per-user averages.
#. No random_state is passed, so the draws depend on the order of these statements.
#. NOTE(review): sklearn.utils is not imported explicitly among the imports visible
#. in this file; presumably it is loaded as a side effect of sklearn.metrics — confirm.
t2t_rand_rand_sim_avg_bootstrap = [sklearn.utils.resample(t2t_rand_rand_sim_avg, replace=True, n_samples=t2t_rand_rand_sim_avg.shape[0]).mean() for i in range(bootstrap_count)]
t2t_poli_poli_sim_avg_bootstrap = [sklearn.utils.resample(t2t_poli_poli_sim_avg, replace=True, n_samples=t2t_poli_poli_sim_avg.shape[0]).mean() for i in range(bootstrap_count)]
t2t_ira_ira_sim_avg_bootstrap = [sklearn.utils.resample(t2t_ira_ira_sim_avg, replace=True, n_samples=t2t_ira_ira_sim_avg.shape[0]).mean() for i in range(bootstrap_count)]

plt.hist(t2t_rand_rand_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="rand-rand")
plt.hist(t2t_poli_poli_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="poli-poli")
plt.hist(t2t_ira_ira_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="ira-ira")

plt.title("Within-Population Similarity")
plt.legend()
plt.show()

# Show similarity distribution across populations in this platform
#. (IRA users compared against random and political users, with ira-ira as reference).
#. NOTE(review): the ira-ira bootstrap is drawn a second time here and replaces the
#. list computed above; later cells see this second draw.
t2t_ira_rand_sim_avg_bootstrap = [sklearn.utils.resample(t2t_ira_rand_sim_avg, replace=True, n_samples=t2t_ira_rand_sim_avg.shape[0]).mean() for i in range(bootstrap_count)]
t2t_ira_poli_sim_avg_bootstrap = [sklearn.utils.resample(t2t_ira_poli_sim_avg, replace=True, n_samples=t2t_ira_poli_sim_avg.shape[0]).mean() for i in range(bootstrap_count)]
t2t_ira_ira_sim_avg_bootstrap = [sklearn.utils.resample(t2t_ira_ira_sim_avg, replace=True, n_samples=t2t_ira_ira_sim_avg.shape[0]).mean() for i in range(bootstrap_count)]

plt.hist(t2t_ira_rand_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="ira-rand")
plt.hist(t2t_ira_poli_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="ira-poli")
plt.hist(t2t_ira_ira_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="ira-ira")

plt.title("Across-Population Similarity")
plt.legend()
plt.show()

In [None]:
# Test whether the within-group similarities across the three populations are equal
#. NOTE(review): every sample passed to these tests is a list of bootstrap_count
#. bootstrap means, so the sample size seen by the tests is bootstrap_count,
#. not the number of users — keep that in mind when reading the p-values.
f_stat, p_val = scipy.stats.f_oneway(t2t_rand_rand_sim_avg_bootstrap, t2t_poli_poli_sim_avg_bootstrap, t2t_ira_ira_sim_avg_bootstrap)
print("p-value for ANOVA:", p_val, f_stat)

t_stat, p_val = scipy.stats.ttest_ind(t2t_poli_poli_sim_avg_bootstrap, t2t_ira_ira_sim_avg_bootstrap, axis=0, equal_var=False)
print("p-value for Welch's t-Test between within-Poli and within-IRA:", p_val, t_stat)

# equal_var=False selects Welch's t-test, matching the printed label and the
#. other t-tests in this file (equal_var=True would be Student's t-test)
t_stat, p_val = scipy.stats.ttest_ind(t2t_ira_rand_sim_avg_bootstrap, t2t_ira_poli_sim_avg_bootstrap, axis=0, equal_var=False)
print("p-value for Welch's t-Test between IRA-Rand and IRA-Poli:", p_val, t_stat)

In [None]:
# Render each Twitter share matrix as an image, transposed so that
#. rows of the image correspond to matrix columns
for _share_mat in (twitter_user_links_mat_rand, twitter_user_links_mat_poli, twitter_user_links_mat_ira):
    plt.imshow(_share_mat.T)
    plt.show()

In [None]:
# Print, per Twitter population, the ten domains with the highest mean normalised share.
#. The domain labels come from iterating the same set object that
#. links_to_norm_matrix turned into its column list, so this pairing relies on
#. both iterations of that set yielding the same order.
for label, mat in zip(
        ("random", "political", "ira"),
        (twitter_user_links_mat_rand, twitter_user_links_mat_poli, twitter_user_links_mat_ira)):
    print(label)
    domains = top_tld_map_twitter[label]
    column_means = np.mean(mat, axis=0)
    ranked = sorted(zip(domains, column_means), key=lambda pair: pair[1], reverse=True)
    for tld, prop in ranked[:10]:
        print("\t", tld, prop)

In [None]:
# Domains present in the top lists of both populations of each pair.
#. Cross-population comparisons are restricted to these shared domains so
#. that both matrices are built over the same columns.
reddit_overlap_rand_poli = top_tld_map_reddit["random"].intersection(top_tld_map_reddit["political"])
reddit_overlap_rand_ira = top_tld_map_reddit["random"].intersection(top_tld_map_reddit["ira"])
reddit_overlap_poli_ira = top_tld_map_reddit["political"].intersection(top_tld_map_reddit["ira"])

# Per-user, L2-normalised share vectors over each population's own top domains
reddit_user_links_mat_rand = links_to_norm_matrix(reddit_links_df_rand, top_tld_map_reddit["random"], "user_name")
reddit_user_links_mat_poli = links_to_norm_matrix(reddit_links_df_poli, top_tld_map_reddit["political"], "user_name")
reddit_user_links_mat_ira = links_to_norm_matrix(reddit_links_df_ira, top_tld_map_reddit["ira"], "user_name")

# Calculate pairwise similarity among users within each population
r2r_rand_rand_sim = sklearn.metrics.pairwise.cosine_similarity(reddit_user_links_mat_rand, reddit_user_links_mat_rand)
r2r_poli_poli_sim = sklearn.metrics.pairwise.cosine_similarity(reddit_user_links_mat_poli, reddit_user_links_mat_poli)
r2r_ira_ira_sim = sklearn.metrics.pairwise.cosine_similarity(reddit_user_links_mat_ira, reddit_user_links_mat_ira)

# Calculate pairwise similarity among users across populations
r2r_ira_rand_sim = sklearn.metrics.pairwise.cosine_similarity(
    links_to_norm_matrix(reddit_links_df_ira, reddit_overlap_rand_ira, "user_name"), 
    links_to_norm_matrix(reddit_links_df_rand, reddit_overlap_rand_ira, "user_name"))
r2r_ira_poli_sim = sklearn.metrics.pairwise.cosine_similarity(
    links_to_norm_matrix(reddit_links_df_ira, reddit_overlap_poli_ira, "user_name"), 
    links_to_norm_matrix(reddit_links_df_poli, reddit_overlap_poli_ira, "user_name"))
r2r_rand_poli_sim = sklearn.metrics.pairwise.cosine_similarity(
    links_to_norm_matrix(reddit_links_df_rand, reddit_overlap_rand_poli, "user_name"), 
    links_to_norm_matrix(reddit_links_df_poli, reddit_overlap_rand_poli, "user_name"))

# Collapse similarities down to get the mean similarity for each user on the left to all users on the right
#. For the within-population groups the user's similarity to themselves is removed:
#. the diagonal entry is subtracted and the user count is reduced by 1.
#. The actual diagonal is used rather than a constant 1 because a user with no
#. links in the selected domains has an all-zero row, whose self-similarity is 0.
r2r_rand_rand_sim_avg = (np.sum(r2r_rand_rand_sim, axis=1) - np.diag(r2r_rand_rand_sim)) / (reddit_user_links_mat_rand.shape[0] - 1)
r2r_poli_poli_sim_avg = (np.sum(r2r_poli_poli_sim, axis=1) - np.diag(r2r_poli_poli_sim)) / (reddit_user_links_mat_poli.shape[0] - 1)
r2r_ira_ira_sim_avg = (np.sum(r2r_ira_ira_sim, axis=1) - np.diag(r2r_ira_ira_sim)) / (reddit_user_links_mat_ira.shape[0] - 1)

# Across-population groups have no self-pairs, so a plain row mean is correct
r2r_ira_rand_sim_avg = np.mean(r2r_ira_rand_sim, axis=1)
r2r_ira_poli_sim_avg = np.mean(r2r_ira_poli_sim, axis=1)
r2r_rand_poli_sim_avg = np.mean(r2r_rand_poli_sim, axis=1)

# # Plot what these similarities look like
# plt.hist(r2r_ira_rand_sim_avg, bins=20, density=True, alpha=0.35, label="ira-rand")
# plt.hist(r2r_ira_poli_sim_avg, bins=20, density=True, alpha=0.35, label="ira-poli")
# plt.hist(r2r_ira_ira_sim_avg, bins=20, density=True, alpha=0.35, label="ira-ira")

# plt.legend()
# plt.show()

# Show similarity distribution within populations in this platform.
#. Each bootstrap list holds bootstrap_count means, each taken over a
#. with-replacement resample (same size as the original) of the per-user averages.
#. No random_state is passed, so the draws depend on the order of these statements.
r2r_rand_rand_sim_avg_bootstrap = [sklearn.utils.resample(r2r_rand_rand_sim_avg, replace=True, n_samples=r2r_rand_rand_sim_avg.shape[0]).mean() for i in range(bootstrap_count)]
r2r_poli_poli_sim_avg_bootstrap = [sklearn.utils.resample(r2r_poli_poli_sim_avg, replace=True, n_samples=r2r_poli_poli_sim_avg.shape[0]).mean() for i in range(bootstrap_count)]
r2r_ira_ira_sim_avg_bootstrap = [sklearn.utils.resample(r2r_ira_ira_sim_avg, replace=True, n_samples=r2r_ira_ira_sim_avg.shape[0]).mean() for i in range(bootstrap_count)]

plt.hist(r2r_rand_rand_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="rand-rand")
plt.hist(r2r_poli_poli_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="poli-poli")
plt.hist(r2r_ira_ira_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="ira-ira")

plt.title("Within-Population Similarity")
plt.legend()
plt.show()

# Show similarity distribution across populations in this platform
#. (IRA users compared against random and political users, with ira-ira as reference).
#. NOTE(review): the ira-ira bootstrap is drawn a second time here and replaces the
#. list computed above; later cells see this second draw.
r2r_ira_rand_sim_avg_bootstrap = [sklearn.utils.resample(r2r_ira_rand_sim_avg, replace=True, n_samples=r2r_ira_rand_sim_avg.shape[0]).mean() for i in range(bootstrap_count)]
r2r_ira_poli_sim_avg_bootstrap = [sklearn.utils.resample(r2r_ira_poli_sim_avg, replace=True, n_samples=r2r_ira_poli_sim_avg.shape[0]).mean() for i in range(bootstrap_count)]
r2r_ira_ira_sim_avg_bootstrap = [sklearn.utils.resample(r2r_ira_ira_sim_avg, replace=True, n_samples=r2r_ira_ira_sim_avg.shape[0]).mean() for i in range(bootstrap_count)]

plt.hist(r2r_ira_rand_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="ira-rand")
plt.hist(r2r_ira_poli_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="ira-poli")
plt.hist(r2r_ira_ira_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="ira-ira")

plt.title("Across-Population Similarity")
plt.legend()
plt.show()

In [None]:
# One-way ANOVA: are the within-group bootstrap means equal across the three populations?
f_stat, p_val = scipy.stats.f_oneway(
    r2r_rand_rand_sim_avg_bootstrap,
    r2r_poli_poli_sim_avg_bootstrap,
    r2r_ira_ira_sim_avg_bootstrap,
)
print("p-value for ANOVA:", p_val, f_stat)

# Pairwise comparisons; equal_var=False requests the unequal-variance (Welch) form
t_stat, p_val = scipy.stats.ttest_ind(
    r2r_poli_poli_sim_avg_bootstrap,
    r2r_ira_ira_sim_avg_bootstrap,
    axis=0,
    equal_var=False,
)
print("p-value for Welch's t-Test between within-Poli and within-IRA:", p_val, t_stat)

t_stat, p_val = scipy.stats.ttest_ind(
    r2r_ira_rand_sim_avg_bootstrap,
    r2r_ira_poli_sim_avg_bootstrap,
    axis=0,
    equal_var=False,
)
print("p-value for Welch's t-Test between IRA-Rand and IRA-Poli:", p_val, t_stat)

In [None]:
# Render each Reddit share matrix as an image, transposed so that
#. rows of the image correspond to matrix columns
for _share_mat in (reddit_user_links_mat_rand, reddit_user_links_mat_poli, reddit_user_links_mat_ira):
    plt.imshow(_share_mat.T)
    plt.show()

In [None]:
# Print, per Reddit population, the ten domains with the highest mean normalised share.
#. The domain labels come from iterating the same set object that
#. links_to_norm_matrix turned into its column list, so this pairing relies on
#. both iterations of that set yielding the same order.
for label, mat in zip(
        ("random", "political", "ira"),
        (reddit_user_links_mat_rand, reddit_user_links_mat_poli, reddit_user_links_mat_ira)):
    print(label)
    domains = top_tld_map_reddit[label]
    column_means = np.mean(mat, axis=0)
    ranked = sorted(zip(domains, column_means), key=lambda pair: pair[1], reverse=True)
    for tld, prop in ranked[:10]:
        print("\t", tld, prop)

In [None]:
# Domains in the top lists of the same population on both platforms
r2t_overlap_rand = top_tld_map_reddit["random"] & top_tld_map_twitter["random"]
r2t_overlap_poli = top_tld_map_reddit["political"] & top_tld_map_twitter["political"]
r2t_overlap_ira = top_tld_map_reddit["ira"] & top_tld_map_twitter["ira"]

# Calculate pairwise similarity among users across platforms
#. (earlier variant kept for reference: it compared the per-platform matrices directly)
# r2t_rand_sim = sklearn.metrics.pairwise.cosine_similarity(reddit_user_links_mat_rand, twitter_user_links_mat_rand)
# r2t_poli_sim = sklearn.metrics.pairwise.cosine_similarity(reddit_user_links_mat_poli, twitter_user_links_mat_poli)
# r2t_ira_sim = sklearn.metrics.pairwise.cosine_similarity(reddit_user_links_mat_ira, twitter_user_links_mat_ira)

# Reddit users are the rows and Twitter users the columns; both sides are
#. rebuilt over the shared domains so their columns line up
r2t_rand_sim = sklearn.metrics.pairwise.cosine_similarity(
    links_to_norm_matrix(reddit_links_df_rand, r2t_overlap_rand, "user_name"),
    links_to_norm_matrix(twitter_links_df_rand, r2t_overlap_rand, "user_id"),
)
r2t_poli_sim = sklearn.metrics.pairwise.cosine_similarity(
    links_to_norm_matrix(reddit_links_df_poli, r2t_overlap_poli, "user_name"),
    links_to_norm_matrix(twitter_links_df_poli, r2t_overlap_poli, "user_id"),
)
r2t_ira_sim = sklearn.metrics.pairwise.cosine_similarity(
    links_to_norm_matrix(reddit_links_df_ira, r2t_overlap_ira, "user_name"),
    links_to_norm_matrix(twitter_links_df_ira, r2t_overlap_ira, "user_id"),
)

# Mean similarity of each Reddit user (row) to all Twitter users (columns)
r2t_rand_sim_avg = np.mean(r2t_rand_sim, axis=1)
r2t_poli_sim_avg = np.mean(r2t_poli_sim, axis=1)
r2t_ira_sim_avg = np.mean(r2t_ira_sim, axis=1)

# Show the Reddit-to-Twitter similarity distribution for each population.
#. Each bootstrap list holds bootstrap_count means, each taken over a
#. with-replacement resample (same size as the original) of the per-user averages.
#. No random_state is passed, so the draws depend on the order of these statements.
r2t_rand_sim_avg_bootstrap = [sklearn.utils.resample(r2t_rand_sim_avg, replace=True, n_samples=r2t_rand_sim_avg.shape[0]).mean() for i in range(bootstrap_count)]
r2t_poli_sim_avg_bootstrap = [sklearn.utils.resample(r2t_poli_sim_avg, replace=True, n_samples=r2t_poli_sim_avg.shape[0]).mean() for i in range(bootstrap_count)]
r2t_ira_sim_avg_bootstrap = [sklearn.utils.resample(r2t_ira_sim_avg, replace=True, n_samples=r2t_ira_sim_avg.shape[0]).mean() for i in range(bootstrap_count)]

plt.hist(r2t_rand_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="Random R2T")
plt.hist(r2t_poli_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="Political R2T")
plt.hist(r2t_ira_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="IRA R2T")

plt.title("Within-Population, Across-Platform Similarity")
plt.legend()
plt.show()


In [None]:
# Overlay the IRA bootstrap distributions: against Reddit random users,
#. against Reddit political users, and against IRA accounts on Twitter
for _means, _legend in (
        (r2r_ira_rand_sim_avg_bootstrap, "IRA-Random in Reddit"),
        (r2r_ira_poli_sim_avg_bootstrap, "IRA-Political in Reddit"),
        (r2t_ira_sim_avg_bootstrap, "IRA Reddit-to-Twitter")):
    plt.hist(_means, bins=20, density=True, alpha=0.35, label=_legend)

plt.title("Within-Population, Across-Platform Similarity")
plt.legend()
plt.show()

In [None]:
# One-way ANOVA: are the across-platform bootstrap means equal across the three populations?
f_stat, p_val = scipy.stats.f_oneway(
    r2t_rand_sim_avg_bootstrap,
    r2t_poli_sim_avg_bootstrap,
    r2t_ira_sim_avg_bootstrap,
)
print("p-value for ANOVA:", p_val, f_stat)

# Pairwise comparisons against IRA; equal_var=False requests the unequal-variance (Welch) form
t_stat, p_val = scipy.stats.ttest_ind(
    r2t_rand_sim_avg_bootstrap,
    r2t_ira_sim_avg_bootstrap,
    axis=0,
    equal_var=False,
)
print("p-value for Welch's t-Test between Random and IRA:", p_val, t_stat)

t_stat, p_val = scipy.stats.ttest_ind(
    r2t_poli_sim_avg_bootstrap,
    r2t_ira_sim_avg_bootstrap,
    axis=0,
    equal_var=False,
)
print("p-value for Welch's t-Test between Poli and IRA:", p_val, t_stat)

# YouTube Channel Distributions

We've checked the differences in top-level domain sharing, but we know YouTube was very popular as well. Now, we turn to the distributions of YouTube channels.

In [None]:
# For each Twitter population: keep the youtube.com links (as an independent copy),
#. derive a video id from each link, and drop rows where no id was obtained

twitter_yt_df_rand = twitter_links_df_rand.loc[twitter_links_df_rand["tld"] == "youtube.com"].copy()
twitter_yt_df_rand["video_id"] = twitter_yt_df_rand["link"].apply(strip_video_id_from_url)
twitter_yt_df_rand = twitter_yt_df_rand.dropna(subset=["video_id"])

twitter_yt_df_poli = twitter_links_df_poli.loc[twitter_links_df_poli["tld"] == "youtube.com"].copy()
twitter_yt_df_poli["video_id"] = twitter_yt_df_poli["link"].apply(strip_video_id_from_url)
twitter_yt_df_poli = twitter_yt_df_poli.dropna(subset=["video_id"])

twitter_yt_df_ira = twitter_links_df_ira.loc[twitter_links_df_ira["tld"] == "youtube.com"].copy()
twitter_yt_df_ira["video_id"] = twitter_yt_df_ira["link"].apply(strip_video_id_from_url)
twitter_yt_df_ira = twitter_yt_df_ira.dropna(subset=["video_id"])


In [None]:
# Union of the video ids seen in any Twitter population
twitter_video_ids = (
    set(twitter_yt_df_rand["video_id"])
    | set(twitter_yt_df_poli["video_id"])
    | set(twitter_yt_df_ira["video_id"])
)
print("Unique YT Videos:", len(twitter_video_ids))

# Write one id per line under a "video_id" header, skipping blank ids
with open("twitter_all_video_ids.csv", "w") as out_file:
    out_file.write("video_id\n")
    for video_id in twitter_video_ids:
        if video_id.strip():
            out_file.write("%s\n" % video_id)

In [None]:
# For each Reddit population: keep the youtube.com links (as an independent copy),
#. derive a video id from each link, and drop rows where no id was obtained

reddit_yt_df_rand = reddit_links_df_rand.loc[reddit_links_df_rand["tld"] == "youtube.com"].copy()
reddit_yt_df_rand["video_id"] = reddit_yt_df_rand["link"].apply(strip_video_id_from_url)
reddit_yt_df_rand = reddit_yt_df_rand.dropna(subset=["video_id"])

reddit_yt_df_poli = reddit_links_df_poli.loc[reddit_links_df_poli["tld"] == "youtube.com"].copy()
reddit_yt_df_poli["video_id"] = reddit_yt_df_poli["link"].apply(strip_video_id_from_url)
reddit_yt_df_poli = reddit_yt_df_poli.dropna(subset=["video_id"])

reddit_yt_df_ira = reddit_links_df_ira.loc[reddit_links_df_ira["tld"] == "youtube.com"].copy()
reddit_yt_df_ira["video_id"] = reddit_yt_df_ira["link"].apply(strip_video_id_from_url)
reddit_yt_df_ira = reddit_yt_df_ira.dropna(subset=["video_id"])


In [None]:
# Union of the video ids seen in any Reddit population
reddit_video_ids = (
    set(reddit_yt_df_rand["video_id"])
    | set(reddit_yt_df_poli["video_id"])
    | set(reddit_yt_df_ira["video_id"])
)
print("Unique YT Videos:", len(reddit_video_ids))

# Write one id per line under a "video_id" header, skipping blank ids;
#. "%" characters are removed from each id before it is written
with open("reddit_all_video_ids.csv", "w") as out_file:
    out_file.write("video_id\n")
    for video_id in reddit_video_ids:
        if video_id.strip():
            video_id = video_id.replace("%", "")
            out_file.write("%s\n" % video_id)

In [None]:
# Video ids seen on either platform
all_video_ids = reddit_video_ids | twitter_video_ids
print("Merged YT Videos:", len(all_video_ids))

# Write one id per line under a "video_id" header, skipping blank ids
with open("twitter+reddit_all_video_ids.csv", "w") as out_file:
    out_file.write("video_id\n")
    for video_id in all_video_ids:
        if video_id.strip():
            out_file.write("%s\n" % video_id)

In [None]:

# Map each video id to its channel id using the Twitter video metadata file.
#. Built by zipping the two columns directly rather than looping over rows with
#. iterrows(); when a video id occurs more than once, the last row still wins.
twitter_yt_meta_df = pd.read_csv("twitter_video_metadata.csv")
twitter_yt_vid2chan_map = dict(zip(twitter_yt_meta_df["video_id"], twitter_yt_meta_df["channel_id"]))

# Videos missing from the metadata get a channel_id of None
twitter_yt_df_rand["channel_id"] = twitter_yt_df_rand["video_id"].apply(twitter_yt_vid2chan_map.get)
twitter_yt_df_poli["channel_id"] = twitter_yt_df_poli["video_id"].apply(twitter_yt_vid2chan_map.get)
twitter_yt_df_ira["channel_id"] = twitter_yt_df_ira["video_id"].apply(twitter_yt_vid2chan_map.get)

# Rank channels within each Twitter population by number of distinct sharers
twitter_top_yt_chans_rand = get_top_channels(twitter_yt_df_rand, user_id_field="user_id", top_n=top_n_channels)
twitter_top_yt_chans_poli = get_top_channels(twitter_yt_df_poli, user_id_field="user_id", top_n=top_n_channels)
twitter_top_yt_chans_ira = get_top_channels(twitter_yt_df_ira, user_id_field="user_id", top_n=top_n_channels)

# Population label -> set of that population's top channels
top_yt_chans_map_twitter = {
    "random": set(twitter_top_yt_chans_rand),
    "political": set(twitter_top_yt_chans_poli),
    "ira": set(twitter_top_yt_chans_ira),
}

for population, chans in top_yt_chans_map_twitter.items():
    print(population, ":", len(chans))

# twitter_merged_yt_chans = set(twitter_top_yt_chans_rand)\
#     .intersection(twitter_top_yt_chans_poli)
# #     .intersection(twitter_top_yt_chans_ira)
# print("Merged Top Channels:", len(twitter_merged_yt_chans))



In [None]:

# Map each video id to its channel id using the Reddit video metadata file.
#. Built by zipping the two columns directly rather than looping over rows with
#. iterrows(); when a video id occurs more than once, the last row still wins.
reddit_yt_meta_df = pd.read_csv("reddit_all_video_metadata.csv")
reddit_yt_vid2chan_map = dict(zip(reddit_yt_meta_df["video_id"], reddit_yt_meta_df["channel_id"]))

# Videos missing from the metadata get a channel_id of None
reddit_yt_df_rand["channel_id"] = reddit_yt_df_rand["video_id"].apply(reddit_yt_vid2chan_map.get)
reddit_yt_df_poli["channel_id"] = reddit_yt_df_poli["video_id"].apply(reddit_yt_vid2chan_map.get)
reddit_yt_df_ira["channel_id"] = reddit_yt_df_ira["video_id"].apply(reddit_yt_vid2chan_map.get)

# Rank channels within each Reddit population by number of distinct sharers
reddit_top_yt_chans_rand = get_top_channels(reddit_yt_df_rand, user_id_field="user_name", top_n=top_n_channels)
reddit_top_yt_chans_poli = get_top_channels(reddit_yt_df_poli, user_id_field="user_name", top_n=top_n_channels)
reddit_top_yt_chans_ira = get_top_channels(reddit_yt_df_ira, user_id_field="user_name", top_n=top_n_channels)

# Population label -> set of that population's top channels
top_yt_chans_map_reddit = {
    "random": set(reddit_top_yt_chans_rand),
    "political": set(reddit_top_yt_chans_poli),
    "ira": set(reddit_top_yt_chans_ira),
}

for population, chans in top_yt_chans_map_reddit.items():
    print(population, ":", len(chans))

# reddit_merged_yt_chans = set(reddit_top_yt_chans_rand)\
#     .intersection(reddit_top_yt_chans_poli)
# #     .intersection(reddit_top_yt_chans_ira)
# print("Merged Top Channels:", len(reddit_merged_yt_chans))




In [None]:
# all_top_channels = reddit_merged_yt_chans.intersection(twitter_merged_yt_chans)
# print("Cross-Platform Top Channels:", len(all_top_channels))

In [None]:
# channels = sorted(all_top_channels)

In [None]:
# --- Twitter: YouTube-channel sharing similarity within and across populations ---

# Channels that appear in the top-channel sets of both populations in each pair.
twitter_overlap_rand_poli = top_yt_chans_map_twitter["random"].intersection(top_yt_chans_map_twitter["political"])
twitter_overlap_rand_ira = top_yt_chans_map_twitter["random"].intersection(top_yt_chans_map_twitter["ira"])
twitter_overlap_poli_ira = top_yt_chans_map_twitter["political"].intersection(top_yt_chans_map_twitter["ira"])

print("Rand->Poli Overlap:", len(twitter_overlap_rand_poli))
print("Rand->IRA Overlap:", len(twitter_overlap_rand_ira))
print("Poli->IRA Overlap:", len(twitter_overlap_poli_ira))

# Per-population matrices built over that population's own top channels
# (presumably one row per user -- confirm against channels_to_norm_matrix).
twitter_user_links_mat_rand = channels_to_norm_matrix(twitter_yt_df_rand, top_yt_chans_map_twitter["random"], "user_id")
twitter_user_links_mat_poli = channels_to_norm_matrix(twitter_yt_df_poli, top_yt_chans_map_twitter["political"], "user_id")
twitter_user_links_mat_ira = channels_to_norm_matrix(twitter_yt_df_ira, top_yt_chans_map_twitter["ira"], "user_id")

# Pairwise similarity among users within each population
t2t_rand_rand_sim = sklearn.metrics.pairwise.cosine_similarity(twitter_user_links_mat_rand, twitter_user_links_mat_rand)
t2t_poli_poli_sim = sklearn.metrics.pairwise.cosine_similarity(twitter_user_links_mat_poli, twitter_user_links_mat_poli)
t2t_ira_ira_sim = sklearn.metrics.pairwise.cosine_similarity(twitter_user_links_mat_ira, twitter_user_links_mat_ira)

# Pairwise similarity among users across populations. Both sides of each
# comparison are rebuilt over the channels the two populations share so that
# their columns line up.
t2t_ira_rand_sim = sklearn.metrics.pairwise.cosine_similarity(
    channels_to_norm_matrix(twitter_yt_df_ira, twitter_overlap_rand_ira, "user_id"),
    channels_to_norm_matrix(twitter_yt_df_rand, twitter_overlap_rand_ira, "user_id"))
t2t_ira_poli_sim = sklearn.metrics.pairwise.cosine_similarity(
    channels_to_norm_matrix(twitter_yt_df_ira, twitter_overlap_poli_ira, "user_id"),
    channels_to_norm_matrix(twitter_yt_df_poli, twitter_overlap_poli_ira, "user_id"))
t2t_rand_poli_sim = sklearn.metrics.pairwise.cosine_similarity(
    channels_to_norm_matrix(twitter_yt_df_rand, twitter_overlap_rand_poli, "user_id"),
    channels_to_norm_matrix(twitter_yt_df_poli, twitter_overlap_rand_poli, "user_id"))

# Collapse similarities down to get the mean similarity for each user on the left to all users on the right.
#. For the within-population groups the self-similarity effect is removed by subtracting each
#. user's own diagonal entry and dividing by (n - 1). The diagonal is subtracted explicitly
#. instead of a hard-coded 1 because an all-zero user vector has self-similarity 0, not 1.
t2t_rand_rand_sim_avg = (
    (np.sum(t2t_rand_rand_sim, axis=1) - np.diag(t2t_rand_rand_sim))
    / (t2t_rand_rand_sim.shape[0] - 1)
)
t2t_poli_poli_sim_avg = (
    (np.sum(t2t_poli_poli_sim, axis=1) - np.diag(t2t_poli_poli_sim))
    / (t2t_poli_poli_sim.shape[0] - 1)
)
t2t_ira_ira_sim_avg = (
    (np.sum(t2t_ira_ira_sim, axis=1) - np.diag(t2t_ira_ira_sim))
    / (t2t_ira_ira_sim.shape[0] - 1)
)

# Across-population averages: no self-pairs exist, so a plain row mean is right.
# NOTE: the IRA-IRA average is intentionally NOT recomputed here with a raw mean;
# doing so would re-introduce the self-similarity removed above.
t2t_ira_rand_sim_avg = np.mean(t2t_ira_rand_sim, axis=1)
t2t_ira_poli_sim_avg = np.mean(t2t_ira_poli_sim, axis=1)
t2t_rand_poli_sim_avg = np.mean(t2t_rand_poli_sim, axis=1)

# # Plot what these similarities look like
# plt.hist(t2t_ira_rand_sim_avg, bins=20, density=True, alpha=0.35, label="ira-rand")
# plt.hist(t2t_ira_poli_sim_avg, bins=20, density=True, alpha=0.35, label="ira-poli")
# plt.hist(t2t_ira_ira_sim_avg, bins=20, density=True, alpha=0.35, label="ira-ira")

# plt.legend()
# plt.show()

# Show similarity distribution within populations in this platform.
# Each bootstrap entry is the mean of one resample (with replacement, same size)
# of the per-user average similarities.
t2t_rand_rand_sim_avg_bootstrap = [
    sklearn.utils.resample(t2t_rand_rand_sim_avg, replace=True, n_samples=t2t_rand_rand_sim_avg.shape[0]).mean()
    for _ in range(bootstrap_count)
]
t2t_poli_poli_sim_avg_bootstrap = [
    sklearn.utils.resample(t2t_poli_poli_sim_avg, replace=True, n_samples=t2t_poli_poli_sim_avg.shape[0]).mean()
    for _ in range(bootstrap_count)
]
t2t_ira_ira_sim_avg_bootstrap = [
    sklearn.utils.resample(t2t_ira_ira_sim_avg, replace=True, n_samples=t2t_ira_ira_sim_avg.shape[0]).mean()
    for _ in range(bootstrap_count)
]

plt.hist(t2t_rand_rand_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="rand-rand")
plt.hist(t2t_poli_poli_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="poli-poli")
plt.hist(t2t_ira_ira_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="ira-ira")

plt.title("Within-Population Similarity")
plt.legend()
plt.show()

# Show similarity distribution across populations in this platform.
# The IRA-IRA bootstrap computed above is reused as the baseline series.
t2t_ira_rand_sim_avg_bootstrap = [
    sklearn.utils.resample(t2t_ira_rand_sim_avg, replace=True, n_samples=t2t_ira_rand_sim_avg.shape[0]).mean()
    for _ in range(bootstrap_count)
]
t2t_ira_poli_sim_avg_bootstrap = [
    sklearn.utils.resample(t2t_ira_poli_sim_avg, replace=True, n_samples=t2t_ira_poli_sim_avg.shape[0]).mean()
    for _ in range(bootstrap_count)
]

plt.hist(t2t_ira_rand_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="ira-rand")
plt.hist(t2t_ira_poli_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="ira-poli")
plt.hist(t2t_ira_ira_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="ira-ira")

plt.title("Across-Population Similarity")
plt.legend()
plt.show()

In [None]:
# --- Reddit: YouTube-channel sharing similarity within and across populations ---

# Channels that appear in the top-channel sets of both populations in each pair.
reddit_overlap_rand_poli = top_yt_chans_map_reddit["random"].intersection(top_yt_chans_map_reddit["political"])
reddit_overlap_rand_ira = top_yt_chans_map_reddit["random"].intersection(top_yt_chans_map_reddit["ira"])
reddit_overlap_poli_ira = top_yt_chans_map_reddit["political"].intersection(top_yt_chans_map_reddit["ira"])

print("Rand->Poli Overlap:", len(reddit_overlap_rand_poli))
print("Rand->IRA Overlap:", len(reddit_overlap_rand_ira))
print("Poli->IRA Overlap:", len(reddit_overlap_poli_ira))

# Per-population matrices built over that population's own top channels
# (presumably one row per user -- confirm against channels_to_norm_matrix).
# These use the *Reddit* top-channel map; the Twitter map was previously used
# here by mistake, which measured Reddit users against Twitter's channel set.
reddit_user_links_mat_rand = channels_to_norm_matrix(reddit_yt_df_rand, top_yt_chans_map_reddit["random"], "user_name")
reddit_user_links_mat_poli = channels_to_norm_matrix(reddit_yt_df_poli, top_yt_chans_map_reddit["political"], "user_name")
reddit_user_links_mat_ira = channels_to_norm_matrix(reddit_yt_df_ira, top_yt_chans_map_reddit["ira"], "user_name")

# Pairwise similarity among users within each population
r2r_rand_rand_sim = sklearn.metrics.pairwise.cosine_similarity(reddit_user_links_mat_rand, reddit_user_links_mat_rand)
r2r_poli_poli_sim = sklearn.metrics.pairwise.cosine_similarity(reddit_user_links_mat_poli, reddit_user_links_mat_poli)
r2r_ira_ira_sim = sklearn.metrics.pairwise.cosine_similarity(reddit_user_links_mat_ira, reddit_user_links_mat_ira)

# Pairwise similarity among users across populations. Both sides of each
# comparison are rebuilt over the channels the two populations share so that
# their columns line up.
r2r_ira_rand_sim = sklearn.metrics.pairwise.cosine_similarity(
    channels_to_norm_matrix(reddit_yt_df_ira, reddit_overlap_rand_ira, "user_name"),
    channels_to_norm_matrix(reddit_yt_df_rand, reddit_overlap_rand_ira, "user_name"))
r2r_ira_poli_sim = sklearn.metrics.pairwise.cosine_similarity(
    channels_to_norm_matrix(reddit_yt_df_ira, reddit_overlap_poli_ira, "user_name"),
    channels_to_norm_matrix(reddit_yt_df_poli, reddit_overlap_poli_ira, "user_name"))
r2r_rand_poli_sim = sklearn.metrics.pairwise.cosine_similarity(
    channels_to_norm_matrix(reddit_yt_df_rand, reddit_overlap_rand_poli, "user_name"),
    channels_to_norm_matrix(reddit_yt_df_poli, reddit_overlap_rand_poli, "user_name"))

# Collapse similarities down to get the mean similarity for each user on the left to all users on the right.
#. For the within-population groups the self-similarity effect is removed by subtracting each
#. user's own diagonal entry and dividing by (n - 1). The diagonal is subtracted explicitly
#. instead of a hard-coded 1 because an all-zero user vector has self-similarity 0, not 1.
r2r_rand_rand_sim_avg = (
    (np.sum(r2r_rand_rand_sim, axis=1) - np.diag(r2r_rand_rand_sim))
    / (r2r_rand_rand_sim.shape[0] - 1)
)
r2r_poli_poli_sim_avg = (
    (np.sum(r2r_poli_poli_sim, axis=1) - np.diag(r2r_poli_poli_sim))
    / (r2r_poli_poli_sim.shape[0] - 1)
)
r2r_ira_ira_sim_avg = (
    (np.sum(r2r_ira_ira_sim, axis=1) - np.diag(r2r_ira_ira_sim))
    / (r2r_ira_ira_sim.shape[0] - 1)
)

# Across-population averages: no self-pairs exist, so a plain row mean is right.
# NOTE: the IRA-IRA average is intentionally NOT recomputed here with a raw mean;
# doing so would re-introduce the self-similarity removed above.
r2r_ira_rand_sim_avg = np.mean(r2r_ira_rand_sim, axis=1)
r2r_ira_poli_sim_avg = np.mean(r2r_ira_poli_sim, axis=1)
r2r_rand_poli_sim_avg = np.mean(r2r_rand_poli_sim, axis=1)

# # Plot what these similarities look like
# plt.hist(r2r_ira_rand_sim_avg, bins=20, density=True, alpha=0.35, label="ira-rand")
# plt.hist(r2r_ira_poli_sim_avg, bins=20, density=True, alpha=0.35, label="ira-poli")
# plt.hist(r2r_ira_ira_sim_avg, bins=20, density=True, alpha=0.35, label="ira-ira")

# plt.legend()
# plt.show()

# Show similarity distribution within populations in this platform.
# Each bootstrap entry is the mean of one resample (with replacement, same size)
# of the per-user average similarities.
r2r_rand_rand_sim_avg_bootstrap = [
    sklearn.utils.resample(r2r_rand_rand_sim_avg, replace=True, n_samples=r2r_rand_rand_sim_avg.shape[0]).mean()
    for _ in range(bootstrap_count)
]
r2r_poli_poli_sim_avg_bootstrap = [
    sklearn.utils.resample(r2r_poli_poli_sim_avg, replace=True, n_samples=r2r_poli_poli_sim_avg.shape[0]).mean()
    for _ in range(bootstrap_count)
]
r2r_ira_ira_sim_avg_bootstrap = [
    sklearn.utils.resample(r2r_ira_ira_sim_avg, replace=True, n_samples=r2r_ira_ira_sim_avg.shape[0]).mean()
    for _ in range(bootstrap_count)
]

plt.hist(r2r_rand_rand_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="rand-rand")
plt.hist(r2r_poli_poli_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="poli-poli")
plt.hist(r2r_ira_ira_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="ira-ira")

plt.title("Within-Population Similarity")
plt.legend()
plt.show()

# Show similarity distribution across populations in this platform.
# The IRA-IRA bootstrap computed above is reused as the baseline series.
r2r_ira_rand_sim_avg_bootstrap = [
    sklearn.utils.resample(r2r_ira_rand_sim_avg, replace=True, n_samples=r2r_ira_rand_sim_avg.shape[0]).mean()
    for _ in range(bootstrap_count)
]
r2r_ira_poli_sim_avg_bootstrap = [
    sklearn.utils.resample(r2r_ira_poli_sim_avg, replace=True, n_samples=r2r_ira_poli_sim_avg.shape[0]).mean()
    for _ in range(bootstrap_count)
]

plt.hist(r2r_ira_rand_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="ira-rand")
plt.hist(r2r_ira_poli_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="ira-poli")
plt.hist(r2r_ira_ira_sim_avg_bootstrap, bins=20, density=True, alpha=0.35, label="ira-ira")

plt.title("Across-Population Similarity")
plt.legend()
plt.show()

In [None]:
# --- Reddit-to-Twitter: same population compared across the two platforms ---

# Top channels shared by the Reddit and Twitter versions of each population.
r2t_overlap_rand = top_yt_chans_map_reddit["random"] & top_yt_chans_map_twitter["random"]
r2t_overlap_poli = top_yt_chans_map_reddit["political"] & top_yt_chans_map_twitter["political"]
r2t_overlap_ira = top_yt_chans_map_reddit["ira"] & top_yt_chans_map_twitter["ira"]

print("Random Overlap:", len(r2t_overlap_rand))
print("Political Overlap:", len(r2t_overlap_poli))
print("IRA Overlap:", len(r2t_overlap_ira))

# Pairwise cosine similarity between Reddit users (left) and Twitter users
# (right), with both sides built over the shared channel set.
r2t_reddit_mat_rand = channels_to_norm_matrix(reddit_yt_df_rand, r2t_overlap_rand, "user_name")
r2t_twitter_mat_rand = channels_to_norm_matrix(twitter_yt_df_rand, r2t_overlap_rand, "user_id")
r2t_rand_sim = sklearn.metrics.pairwise.cosine_similarity(r2t_reddit_mat_rand, r2t_twitter_mat_rand)

r2t_reddit_mat_poli = channels_to_norm_matrix(reddit_yt_df_poli, r2t_overlap_poli, "user_name")
r2t_twitter_mat_poli = channels_to_norm_matrix(twitter_yt_df_poli, r2t_overlap_poli, "user_id")
r2t_poli_sim = sklearn.metrics.pairwise.cosine_similarity(r2t_reddit_mat_poli, r2t_twitter_mat_poli)

r2t_reddit_mat_ira = channels_to_norm_matrix(reddit_yt_df_ira, r2t_overlap_ira, "user_name")
r2t_twitter_mat_ira = channels_to_norm_matrix(twitter_yt_df_ira, r2t_overlap_ira, "user_id")
r2t_ira_sim = sklearn.metrics.pairwise.cosine_similarity(r2t_reddit_mat_ira, r2t_twitter_mat_ira)

# Mean similarity of each left-hand (Reddit) user to all right-hand (Twitter) users.
r2t_rand_sim_avg = r2t_rand_sim.mean(axis=1)
r2t_poli_sim_avg = r2t_poli_sim.mean(axis=1)
r2t_ira_sim_avg = r2t_ira_sim.mean(axis=1)

# Bootstrap the mean of the per-user averages for each population
# (resampling with replacement, same sample size as the original).
r2t_rand_sim_avg_bootstrap = [
    np.mean(sklearn.utils.resample(r2t_rand_sim_avg, replace=True, n_samples=len(r2t_rand_sim_avg)))
    for _ in range(bootstrap_count)
]
r2t_poli_sim_avg_bootstrap = [
    np.mean(sklearn.utils.resample(r2t_poli_sim_avg, replace=True, n_samples=len(r2t_poli_sim_avg)))
    for _ in range(bootstrap_count)
]
r2t_ira_sim_avg_bootstrap = [
    np.mean(sklearn.utils.resample(r2t_ira_sim_avg, replace=True, n_samples=len(r2t_ira_sim_avg)))
    for _ in range(bootstrap_count)
]

# Overlay the three bootstrap distributions on one histogram.
for r2t_bootstrap_means, r2t_series_label in (
    (r2t_rand_sim_avg_bootstrap, "Random R2T"),
    (r2t_poli_sim_avg_bootstrap, "Political R2T"),
    (r2t_ira_sim_avg_bootstrap, "IRA R2T"),
):
    plt.hist(r2t_bootstrap_means, bins=20, density=True, alpha=0.35, label=r2t_series_label)

plt.title("Within-Population, Across-Platform Similarity")
plt.legend()
plt.show()



In [None]:
# Compare how Reddit IRA accounts relate to other Reddit populations versus
# how they relate to Twitter IRA accounts, using the bootstrapped means
# computed in the earlier cells.
ira_comparison_series = (
    (r2r_ira_rand_sim_avg_bootstrap, "IRA-Random in Reddit"),
    (r2r_ira_poli_sim_avg_bootstrap, "IRA-Political in Reddit"),
    (r2t_ira_sim_avg_bootstrap, "IRA Reddit-to-Twitter"),
)
for ira_bootstrap_means, ira_series_label in ira_comparison_series:
    plt.hist(ira_bootstrap_means, bins=20, density=True, alpha=0.35, label=ira_series_label)

plt.title("Within-Population, Across-Platform Similarity")
plt.legend()
plt.show()

In [None]:
# Build channel_id -> channel_title lookups from each platform's YouTube
# metadata frame; if a channel_id repeats, the last row wins.
channel_map_reddit = {}
for _, reddit_meta_row in reddit_yt_meta_df.iterrows():
    channel_map_reddit[reddit_meta_row["channel_id"]] = reddit_meta_row["channel_title"]

channel_map_twitter = {}
for _, twitter_meta_row in twitter_yt_meta_df.iterrows():
    channel_map_twitter[twitter_meta_row["channel_id"]] = twitter_meta_row["channel_title"]

# Sizes of the IRA top-channel sets on Reddit and Twitter, respectively.
reddit_ira_channels = top_yt_chans_map_reddit["ira"]
twitter_ira_channels = top_yt_chans_map_twitter["ira"]
print(len(reddit_ira_channels), len(twitter_ira_channels))

# List every IRA top channel (sorted by id) with its title, per platform.
# A channel id missing from the metadata lookup raises KeyError.
banner = "*" * 100

print(banner, "\nReddit:")
for ira_channel_id in sorted(reddit_ira_channels):
    print(ira_channel_id, channel_map_reddit[ira_channel_id])

print(banner, "\nTwitter:")
for ira_channel_id in sorted(twitter_ira_channels):
    print(ira_channel_id, channel_map_twitter[ira_channel_id])