_All data was collected on 05-07 October 2024._

In [29]:
import hashlib
import logging
import os
import shutil
import string
import sqlite3
import time
import random
import re
import requests
from collections import defaultdict

import networkx as nx
import praw
import pandas as pd
import tqdm.notebook as tqdm_notebook
import tqdm.contrib.telegram as tqdm_telegram
from selenium import webdriver #also need a standalone chromedriver executable
from dotenv import load_dotenv
from matplotlib_venn import venn2
from pprint import pprint

In [2]:
# Send INFO-and-above log records to run.log, each prefixed with a timestamp
# and a level name padded to 8 characters.
logging.basicConfig(
    filename='run.log',
    level=logging.INFO,
    format='%(asctime)s %(levelname)-8s %(message)s',
)
# Load variables from .env into the process environment; the trailing
# semicolon keeps the call's return value out of the cell output.
load_dotenv();

In [3]:
def notify(title, message):
    """Send a notification through the `noti` command-line tool, if installed.

    https://github.com/variadico/noti

    Args:
        title: Notification title (passed to ``noti -t``).
        message: Notification body (passed to ``noti -m``).

    Returns:
        None. The call is best-effort: nothing happens when ``noti`` is not on
        PATH, and a failure to launch it is logged instead of raised.
    """
    import subprocess  # local import: only this helper needs it

    if not shutil.which("noti"):
        return

    # An argument list (no shell involved) means quotes or shell
    # metacharacters in title/message can neither break nor hijack the command.
    command = ["noti", "-g", "-t", str(title), "-m", str(message)]
    try:
        subprocess.run(command, check=False)
    except OSError as exc:
        logging.warning("Could not run noti: %s", exc)

In [15]:
def tqdm(*args, **kwargs):
    """Build a progress bar, mirrored to Telegram when credentials are set.

    https://tqdm.github.io/docs/contrib.telegram/

    When both TG_TOKEN and TG_CHAT_ID are present (and non-empty) in the
    environment, a Telegram-backed bar is returned; otherwise the notebook
    bar is used. All positional and keyword arguments are forwarded as given.
    """
    tg_token = os.environ.get('TG_TOKEN')
    tg_chat_id = os.environ.get('TG_CHAT_ID')

    if not (tg_token and tg_chat_id):
        return tqdm_notebook.tqdm(*args, **kwargs)

    return tqdm_telegram.tqdm(
        *args,
        **kwargs,
        token=tg_token,
        chat_id=tg_chat_id,
        mininterval=4,  # TG rate limits are 20 messages per min
    )
                                    

# Getting list of subreddits

There are two Reddit lists that list best/popular communities:
* www.reddit.com/best/communities/1/
* www.reddit.com/subreddits/

The first one is paginated without end and omits many well-known popular communities. The reason is probably that Reddit doesn't want to suggest NSFW or controversial communities to new users or advertisers. Some examples of missing subs:
* `r/iamatotalpieceofshit`
* `r/therewasanattempt`
* `r/illusions`
* `r/Palestine`
* `r/PublicFreakout`
* `r/CombatFootage`
* `r/FUCKYOUINPARTICULAR`
* `r/IsItBullshit`
* `r/IdiotsFightingThings`
* `r/JordanPeterson`
* `r/MensRights`
* `r/OneSecondBeforeDisast`
* `r/TIHI`
* `r/ThatsInsane`
* `r/TrueUnpopularOpinion`
* `r/Wellthatsucks`
* `r/Whatcouldgowrong`
* `r/cringe`
* `r/sex`
* `r/stupidquestions`

Therefore, I decided to use www.reddit.com/subreddits/ only. It's a list of 4387 subreddits.

##  Getting most popular subreddits from www.reddit.com/subreddits/

In [None]:
#Old Reddit blocks non-browser requests, so I've got to use Selenium
pop_subs = set()
url = "https://www.reddit.com/subreddits"
visited = {url}  # pages already requested, to stop if pagination ever loops
regex_subs = re.compile(r'class="titlerow"><a href="https://www.reddit.com/r/([^"]+)/"')
regex_next = re.compile(r'<span class="next-button">\s*<a href="([^"]*)')
driver = webdriver.Chrome()

try:
    with open("popular-subreddits.txt", "w") as out:
        while True:
            # Load the page, retrying up to 5 times with a growing delay until
            # a "next" link is found. The last page has no such link, so it is
            # retried 5 times as well before the crawl stops.
            attempts = 0
            matches = None
            while not matches and attempts < 5:
                driver.get(url)
                source = driver.page_source
                next_subs = regex_subs.findall(source)
                matches = regex_next.findall(source)
                time.sleep(5 + 5 * attempts)
                attempts += 1

            # Write each newly seen name straight away (one per line), so an
            # interrupted crawl still leaves the names gathered so far on disk.
            new_subs = [s for s in dict.fromkeys(next_subs) if s and s not in pop_subs]
            pop_subs.update(new_subs)
            out.writelines(f"{s}\n" for s in new_subs)
            out.flush()

            print(f"Processed {url}, {len(next_subs)} subs")
            if not matches:
                break #reached end of list

            next_url = matches[0]
            if next_url in visited:
                break #pagination is looping back on itself
            visited.add(next_url)
            url = next_url
            time.sleep(5)
finally:
    # Always shut the browser down, even if the crawl fails half-way.
    driver.quit()

print(f"Compiled popular-subreddits.txt ({len(pop_subs)} subs)")
notify("reddit-mods-ds", f"Compiled popular-subreddits.txt ({len(pop_subs)} subs)")

In [37]:
# Number of subreddit names currently held in pop_subs.
len(pop_subs)

4386

In [25]:
# Save the collected names, one per line. Sorting makes the file contents
# reproducible, since pop_subs may be a set with arbitrary iteration order.
with open('popular-subreddits.txt', 'w') as f:
    f.write('\n'.join(sorted(pop_subs)))

# Inspecting subreddits

To increase speed, I will be using two different Reddit accounts. I have authorized a `script` application for each of them on www.reddit.com/prefs/apps.

The collected data will be stored in a SQLite database with the following schema:
![](schema.png)

In [5]:
class DBFactory:
    """Callable factory for connections to the local ``db.sqlite3`` database.

    Calling an instance opens a new connection, makes sure the ``subs``,
    ``mods`` and ``users`` tables exist, and returns the connection. The most
    recently opened connection is also kept on the class attribute ``db``.
    """

    # Most recently opened connection (shared through the class).
    db = None

    # Table definitions, executed on every call; IF NOT EXISTS makes
    # re-running them against an existing database a no-op.
    _TABLE_DDL = (
        '''
            CREATE TABLE IF NOT EXISTS subs(
                name TEXT PRIMARY KEY,
                nsubscr INTEGER,
                processed BOOLEAN DEFAULT 0
            )
        ''',
        '''
            CREATE TABLE IF NOT EXISTS mods(
                mod TEXT,
                sub TEXT,
                PRIMARY KEY (mod, sub)
            )
        ''',
        '''
            CREATE TABLE IF NOT EXISTS users(
                name TEXT PRIMARY KEY,
                is_bot BOOLEAN,
                is_suspended BOOLEAN,
                last_active INTEGER,
                last_comment TEXT,
                processed BOOLEAN DEFAULT 0
            )
        ''',
    )

    def __call__(self):
        """Open a connection, prepare the schema, and return the connection."""
        # A new connection is opened on every call; the previous one (if any)
        # is simply replaced on the class attribute.
        DBFactory.db = sqlite3.connect('db.sqlite3')
        self.create_tables()
        self.init_tables()
        return self.db

    def create_tables(self):
        """Create the three tables if they are missing, then commit."""
        cur = self.db.cursor()
        for ddl in self._TABLE_DDL:
            cur.execute(ddl)
        self.db.commit()
        cur.close()

    def init_tables(self):
        """Hook for seeding tables; currently it only commits and returns."""
        cur = self.db.cursor()
        self.db.commit()
        cur.close()
        

In [6]:
class SubInspector:
    """Records a subreddit's subscriber count and moderators in the database.

    Args:
        reddit: Reddit API client used for lookups; it must provide
            ``subreddit(name)``. It can be swapped later with ``set_reddit``.
    """

    def __init__(self, reddit):
        self.reddit = reddit
        self.db = DBFactory()()
        self.cursor = self.db.cursor()

    def inspect(self, *, sub_name=None):
        """Fetch one subreddit and store it together with its moderators.

        Args:
            sub_name: Name of the subreddit to inspect.

        Returns:
            0 if the subreddit is already marked as processed (no API call is
            made), 1 if it was fetched and stored.
        """
        # NOCASE: the name stored below is the one reported by the API object,
        # which may differ in letter case from the name passed in.
        self.cursor.execute(
            "SELECT name FROM subs WHERE name=? COLLATE NOCASE AND processed=1",
            (sub_name,),
        )
        if self.cursor.fetchone():
            return 0 #sub already processed

        # Collect everything from the API before touching the database, so a
        # failed request leaves no half-written rows in the open transaction.
        sub = self.reddit.subreddit(sub_name)
        n_subscribers = sub.subscribers
        # Read the name only after subscribers has been loaded.
        # NOTE(review): PRAW objects load lazily, so display_name can change to
        # Reddit's own spelling once data is fetched - confirm. Reading it once,
        # here, guarantees subs.name and mods.sub use the same key.
        name = sub.display_name
        mod_names = [mod.name for mod in sub.moderator()]

        self.cursor.execute(
            "INSERT OR IGNORE INTO subs(name, nsubscr) VALUES(?,?)", (name, n_subscribers)
        )
        self.cursor.executemany(
            "INSERT OR IGNORE INTO mods(mod, sub) VALUES(?,?)",
            [(mod_name, name) for mod_name in mod_names],
        )
        self.cursor.executemany(
            "INSERT OR IGNORE INTO users(name) VALUES(?)",
            [(mod_name,) for mod_name in mod_names],
        )
        self.cursor.execute("UPDATE subs SET processed = 1 WHERE name = ?", (name,))
        self.db.commit()

        logging.info(f'Inspected sub {name}')
        return 1 #sub is new

    def set_reddit(self, reddit):
        """Switch to a different Reddit API client for subsequent lookups."""
        self.reddit = reddit

    def __del__(self):
        # The cursor is missing when __init__ failed before creating it.
        cursor = getattr(self, "cursor", None)
        if cursor is not None:
            cursor.close()
            

In [33]:
# One PRAW client per Reddit account; every credential is looked up in the
# process environment (None when the variable is not set).
_env = os.environ.get

reddit1 = praw.Reddit(
    username=_env("USER_NAME_1"),
    password=_env("USER_PASS_1"),
    client_id=_env("CLIENT_ID_1"),
    client_secret=_env("CLIENT_SECRET_1"),
    user_agent=_env("USER_AGENT_1"),
)

reddit2 = praw.Reddit(
    username=_env("USER_NAME_2"),
    password=_env("USER_PASS_2"),
    client_id=_env("CLIENT_ID_2"),
    client_secret=_env("CLIENT_SECRET_2"),
    user_agent=_env("USER_AGENT_2"),
)

In [36]:
# Read the subreddit names, ignoring blank lines.
with open('popular-subreddits.txt', 'r') as f:
    pop_subs = [line.strip() for line in f if line.strip()]

inspector = SubInspector(reddit1)
npr = 0  # subs actually fetched from Reddit during this run

for sub_name in tqdm(pop_subs, desc="reddit-mods-db (subs)"):
    if not inspector.inspect(sub_name=sub_name):
        # Already processed: no API call was made, so there is nothing to
        # throttle. Skipping here also stops the checks below from firing
        # again and again while npr sits on a multiple of 25/50/500/1000.
        continue

    npr += 1
    time.sleep(0.5)

    #magic to overcome Reddit's ratelimits
    if npr % 25 == 0:
        # alternate between the two accounts every 25 fetched subs
        inspector.set_reddit(reddit2 if inspector.reddit is reddit1 else reddit1)
    # the pauses stack: 10 s every 50, 20 s every 500, 30 s every 1000
    if npr % 50 == 0:
        time.sleep(10)
    if npr % 500 == 0:
        time.sleep(10)
    if npr % 1000 == 0:
        time.sleep(10)

notify("reddit-mod-db", f"Done processing all {len(pop_subs)} popular subs!")

  0%|          | 0/4386 [00:00<?, ?it/s]

# Inspecting moderators

Let's inspect moderators to weed out bots and those who have been inactive for a long time.

In [7]:
class UserInspector:
    """Fills in the ``users`` table: suspension, last activity and bot flag.

    Args:
        reddit: Reddit API client used for lookups; it must provide
            ``redditor(name)``. It can be swapped later with ``set_reddit``.
    """

    # Username patterns that mark an account as a bot. This is a heuristic:
    # e.g. the first pattern matches any name that merely ends in "bot".
    _BOT_NAME_PATTERNS = tuple(re.compile(pattern) for pattern in (
        r"[Bb][Oo][Tt][\W\d]*$",
        r"[\-_][Bb][Oo][Tt][\-_]",
        r"[\W\d][Bb][Oo][Tt][A-Z\d\-_]",
        r"[Bb][Oo][Tt]\W+$",
        r"^[Bb][Oo][Tt][A-Z0-9\-_]",
    ))

    # Phrases in a (lower-cased) comment that mark its author as a bot.
    _BOT_COMMENT_PATTERNS = (
        re.compile(r'i(\'m| am) a bot'),
        #this was left by a bot
        #this comment was made by a bot
        #this message has been posted by a bot
        #etc
        re.compile(r'this\s\w*\s(was|has been)?\s?\w+\sby a bot'),
    )

    def __init__(self, reddit):
        self.reddit = reddit
        self.db = DBFactory()()
        self.cursor = self.db.cursor()

    def inspect(self, *, user_name=None):
        """Look one user up and store the findings in the ``users`` table.

        Args:
            user_name: Name of the user to inspect.

        Returns:
            0 if the user is already marked as processed (no API call is
            made), 1 if the user was looked up and the row updated.
        """
        self.cursor.execute("SELECT name FROM users WHERE name=? AND processed=1", (user_name,))
        if self.cursor.fetchone():
            return 0 #user already processed

        user = self.reddit.redditor(user_name)

        try:
            # The mere presence of the attribute is taken as "suspended".
            is_suspended = hasattr(user, "is_suspended")
        except Exception:
            # hasattr only swallows AttributeError; any other error raised
            # while the attribute is looked up is treated as "suspended" too.
            # Catching Exception (not everything) lets Ctrl-C interrupt a run
            # without the current user being recorded as suspended.
            is_suspended = True
        if is_suspended:
            logging.info(f'Found suspended user {user_name}')
            self.cursor.execute("UPDATE users SET is_suspended=1,processed=1 WHERE name=?", (user_name,))
            self.db.commit()
            return 1

        # First item of each listing, or None when the listing is empty.
        last_post = next(user.submissions.new(), None)
        last_comment = next(user.comments.new(), None)
        last_comment_body = last_comment.body if last_comment is not None else None

        # Latest of the two timestamps, truncated to an integer; None when the
        # user has neither a post nor a comment.
        # NOTE(review): this reads `.created`; confirm it is the intended
        # timestamp field (as opposed to `.created_utc`).
        timestamps = [item.created for item in (last_post, last_comment) if item is not None]
        last_active = int(max(timestamps)) if timestamps else None

        is_bot = self.is_bot(user_name, last_comment_body)
        if is_bot:
            logging.info(f'Identified user {user_name} as bot')

        self.cursor.execute(
            "UPDATE users SET is_suspended=0,processed=1,last_active=?,last_comment=?,is_bot=? WHERE name=?",
            (last_active, last_comment_body, is_bot, user_name)
        )

        self.db.commit()
        logging.info(f'Inspected user {user_name}')
        return 1

    def get_user_list(self, only_unprocessed=True):
        """Return user names from the database.

        Args:
            only_unprocessed: When true (the default), return only users whose
                ``processed`` flag is 0; otherwise return every user.
        """
        if only_unprocessed:
            self.cursor.execute("SELECT name FROM users WHERE processed=0")
        else:
            self.cursor.execute("SELECT name FROM users")
        return [row[0] for row in self.cursor.fetchall()]

    def set_reddit(self, reddit):
        """Switch to a different Reddit API client for subsequent lookups."""
        self.reddit = reddit

    @staticmethod
    def is_bot(user_name, last_comment):
        """Guess whether an account is a bot.

        Declared as a static method so that both ``self.is_bot(name, text)``
        and ``UserInspector.is_bot(name, text)`` work.

        Args:
            user_name: The account name, checked against the name patterns.
            last_comment: Text of the user's latest comment, or None/empty.

        Returns:
            True if the name matches a bot-like pattern, or the comment
            (lower-cased, with surrounding punctuation stripped) contains a
            bot self-description; False otherwise.
        """
        #let's look at the user_name
        if any(pattern.search(user_name) for pattern in UserInspector._BOT_NAME_PATTERNS):
            return True
        #now let's use the last comment by the user
        if not last_comment:
            return False
        text = last_comment.lower().strip(string.punctuation)
        return any(pattern.search(text) for pattern in UserInspector._BOT_COMMENT_PATTERNS)

    def __del__(self):
        # The cursor is missing when __init__ failed before creating it.
        cursor = getattr(self, "cursor", None)
        if cursor is not None:
            cursor.close()
            

In [8]:
# One PRAW client per Reddit account; every credential is looked up in the
# process environment (None when the variable is not set).
_env = os.environ.get

reddit1 = praw.Reddit(
    username=_env("USER_NAME_1"),
    password=_env("USER_PASS_1"),
    client_id=_env("CLIENT_ID_1"),
    client_secret=_env("CLIENT_SECRET_1"),
    user_agent=_env("USER_AGENT_1"),
)

reddit2 = praw.Reddit(
    username=_env("USER_NAME_2"),
    password=_env("USER_PASS_2"),
    client_id=_env("CLIENT_ID_2"),
    client_secret=_env("CLIENT_SECRET_2"),
    user_agent=_env("USER_AGENT_2"),
)

In [19]:
inspector = UserInspector(reddit2)
users = inspector.get_user_list()
npr = 0  # users actually looked up on Reddit during this run

for user_name in tqdm(users, desc="reddit-mod-db (users)"):
    if not inspector.inspect(user_name=user_name):
        # Already processed: no API call was made, so there is nothing to
        # throttle. Skipping here also stops the checks below from firing
        # again and again while npr sits on a multiple of 25/50/500/1000.
        continue

    npr += 1
    time.sleep(0.5)

    #magic to overcome Reddit's ratelimits
    if npr % 25 == 0:
        # alternate between the two accounts every 25 inspected users
        inspector.set_reddit(reddit2 if inspector.reddit is reddit1 else reddit1)
    # the pauses stack: 10 s every 50, 20 s every 500, 30 s every 1000
    if npr % 50 == 0:
        time.sleep(10)
    if npr % 500 == 0:
        time.sleep(10)
    if npr % 1000 == 0:
        time.sleep(10)

notify("reddit-mod-db", f"Done processing all {len(users)} users!")

reddit-mod-db (users):   0%|          | 0/1388 [00:00<?, ?it/s]

# Writing output 

## Pooling data

In [80]:
# Connection and cursor shared by the export cells below.
db_factory = DBFactory()
db = db_factory()
cursor = db.cursor()

Usernames are personal data and can potentially be used to identify individuals. We will therefore encode all moderator usernames by hashing them with SHA-256.

In [84]:
def sha256_enc(x: str) -> str:
    """Return the SHA-256 digest of ``x`` as a lowercase hex string.

    The text is encoded with ``str.encode()``'s default encoding before
    hashing. The same input always yields the same digest, so a username
    maps to the same value in every exported file.
    """
    digest = hashlib.sha256(x.encode())
    return digest.hexdigest()

In [81]:
# Export every subreddit with its subscriber count, largest first.
with open('../subreddits.csv', 'w') as f:
    f.write('name,n_members\n')  # header row
    cursor.execute("""
        SELECT name, nsubscr FROM subs ORDER BY nsubscr DESC;
    """)
    sub_rows = cursor.fetchall()
    f.writelines(f'{name},{n_members}\n' for name, n_members in tqdm(sub_rows))

  0%|          | 0/4386 [00:00<?, ?it/s]

In [85]:
# Export one row per (moderator, subreddit) pair, ordered by subreddit size.
# The first column is a 1-based row number; usernames are written hashed.
with open('../moderators.csv', 'w') as f:
    f.write(',username,subreddit\n')  # header row; the number column is unnamed
    cursor.execute("""
        SELECT mods.mod, mods.sub FROM mods LEFT JOIN subs ON mods.sub = subs.name ORDER BY subs.nsubscr DESC;
    """)
    numbered_rows = enumerate(tqdm(cursor.fetchall()), start=1)
    f.writelines(
        f'{i},{sha256_enc(moderator)},{subreddit}\n'
        for i, (moderator, subreddit) in numbered_rows
    )

  0%|          | 0/43563 [00:00<?, ?it/s]

In [87]:
# Export the per-user findings; usernames are written hashed, and database
# NULLs end up as the text "None".
with open('../users.csv', 'w') as f:
    f.write('username,is_suspended,is_bot,last_active\n')  # header row
    cursor.execute("""
        SELECT name, is_suspended, is_bot, last_active FROM users;
    """)
    user_rows = cursor.fetchall()
    f.writelines(
        f'{sha256_enc(name)},{is_suspended},{is_bot},{last_active}\n'
        for (name, is_suspended, is_bot, last_active) in tqdm(user_rows)
    )

  0%|          | 0/30943 [00:00<?, ?it/s]

## Writing graph 

In [98]:
# Load the three exported CSV files back into DataFrames.
df_subs, df_users = (pd.read_csv(path) for path in ('../subreddits.csv', '../users.csv'))
# moderators.csv starts with an unnamed row-number column: use it as the index
df_mods = pd.read_csv('../moderators.csv', index_col=0)

In [99]:
# Preview the first rows of the subreddit table.
df_subs.head()

Unnamed: 0,name,n_members
0,announcements,307637503
1,funny,63920084
2,AskReddit,48597308
3,gaming,43603831
4,worldnews,41113833


Writing as a bipartite graph, where both mods and subs are nodes.

In [105]:
g = nx.Graph()

# Nodes must be keyed by the same identifiers the edge list uses (hashed
# username / subreddit name). Iterating with iterrows() and using its first
# item would key them by integer row index instead, which makes user and
# subreddit nodes collide and leaves the edge endpoints without attributes.
for row in df_users.itertuples(index=False):
    g.add_node(
        row.username,
        is_suspended=row.is_suspended,
        is_bot=row.is_bot,
        last_active=row.last_active,
        bipartite=0,  # moderator side of the bipartite graph
    )
for row in df_subs.itertuples(index=False):
    g.add_node(row.name, size=row.n_members, bipartite=1)  # subreddit side

# One edge per (moderator, subreddit) pair.
g.add_edges_from(df_mods[['username', 'subreddit']].itertuples(index=False, name=None))
nx.write_gexf(g, '../graph.gexf')