In [1]:
import yaml
from pathlib import Path
import pandas as pd

In [2]:
# Root directory holding the Reddit-derived data files used throughout this notebook.
reddit = Path('/data/language-model-toxicity/data/reddit')

# Read the banned-subreddit listing. The encoding is given explicitly so that
# decoding does not depend on the platform's default locale.
# NOTE(review): yaml.load with FullLoader can construct more than plain
# scalars/lists/mappings; if this file only contains plain data,
# yaml.safe_load would be the safer choice — confirm and switch.
with open(reddit / 'banned_subs.yml', encoding='utf-8') as f:
    subs_yaml = yaml.load(f, Loader=yaml.FullLoader)

In [3]:
# NOTE(review): `tuples` is not read by any of the visible cells — it looks
# like a leftover; confirm nothing else depends on it before removing.
tuples = []

def get_tuples(subs):
    """Flatten a ban-reason mapping into ``(subreddit, reason, subreason)`` tuples.

    ``subs`` maps each ban reason to a list whose entries are either a
    subreddit name (``str``) or a nested mapping of sub-reason to a list of
    subreddit names.

    Yields:
        ``(name, reason, None)`` for names listed directly under a reason and
        ``(name, reason, subreason)`` for names listed under a nested
        sub-reason.

    A reason whose value is ``None`` or empty (for example a YAML key with
    nothing under it) contributes no tuples instead of raising ``TypeError``.
    """
    for reason, entries in subs.items():
        # `entries` may be None when the key has no value; treat it as empty.
        for entry in entries or ():
            if isinstance(entry, str):
                yield (entry, reason, None)
            else:
                # Nested mapping: recurse one level. Nesting is at most two
                # levels deep, so the inner call's own third field is unused.
                for name, subreason, _ in get_tuples(entry):
                    yield (name, reason, subreason)

# Materialise the flattened (subreddit, reason, subreason) tuples and load
# them into a frame with one row per banned subreddit entry.
data = [*get_tuples(subs_yaml)]
subs = pd.DataFrame(
    data,
    columns=["subreddit", "ban_reason", "ban_subreason"],
)
# Optional export of the flattened listing (left disabled):
# subs.to_csv(reddit / 'banned_subreddits.csv', index=False)

In [4]:
# TODO: generate this CSV from the notebook by running the sqlite queries that
# join banned subreddits with urls and documents; for now it is read as a
# pre-built file.
docs_csv = reddit / 'banned_subreddit_docs.csv'
docs = pd.read_csv(docs_csv)

In [5]:
# Display the loaded documents frame for a quick visual check.
docs

Unnamed: 0,id,location,url_hash,text,url,md5_hash,domain,subreddit,karma,ban_reason,ban_subreason
0,6560466,0814248-0086a411c4db4f6ac9a5929385fd38b1,0086a411c4db4f6ac9a5929385fd38b1,Donald Trump slammed Massachusetts Sen. Elizab...,http://nypost.com/2016/06/27/trump-blasts-eliz...,0086a411c4db4f6ac9a5929385fd38b1,nypost.com,The_Donald,9,ban_wave,
1,1426830,0175643-f13c6c08a4d412672b71d82c374a5e80,f13c6c08a4d412672b71d82c374a5e80,U.S. equities enjoyed a rebound on a classic “...,http://www.msn.com/en-us/money/markets/wall-st...,f13c6c08a4d412672b71d82c374a5e80,msn.com,SargonofAkkad,3,quarantined,
2,4558714,0559857-6906ab09f592d10810d20f538f76f802,6906ab09f592d10810d20f538f76f802,"What is Ivan Reitman, the director and produce...",http://mashable.com/2016/06/30/ghostbusters-iv...,6906ab09f592d10810d20f538f76f802,mashable.com,uncensorednews,35,violent_content,encouraging_violence
3,7682540,0957728-97fbe4fe8d4058aa193eeb140b339501,97fbe4fe8d4058aa193eeb140b339501,Keith and Helen Harvey\n\n2016-06-30 21:48:47 ...,http://www.therebel.media/dear_aziz_ansari_we_...,97fbe4fe8d4058aa193eeb140b339501,therebel.media,GavinMcInnes,16,violent_content,
4,4890345,0600488-a89c112077e35677ebf65ecc8fb4a55c,a89c112077e35677ebf65ecc8fb4a55c,On Thursday’s broadcast of the Fox News Channe...,http://www.breitbart.com/video/2016/07/01/loca...,a89c112077e35677ebf65ecc8fb4a55c,breitbart.com,The_Donald,23,ban_wave,
...,...,...,...,...,...,...,...,...,...,...,...
216345,5679959,0701286-f5dbc1071d8a2d00ff9816404ce72df9,f5dbc1071d8a2d00ff9816404ce72df9,Worldwide known cyber security company CrowdSt...,https://guccifer2.wordpress.com/2016/06/15/dnc,f5dbc1071d8a2d00ff9816404ce72df9,guccifer2.wordpress.com,The_Donald,33,ban_wave,
216346,7083048,0881098-19d325a5871317a813237ae5bcb70a67,19d325a5871317a813237ae5bcb70a67,Three boys surprised the girl while they were ...,https://www.theguardian.com/us-news/2016/jun/1...,19d325a5871317a813237ae5bcb70a67,theguardian.com,uncensorednews,3,violent_content,encouraging_violence
216347,785962,0096902-c28e10262bdc7136ea698d2c3f244851,c28e10262bdc7136ea698d2c3f244851,CNN’s Don Lemon snapped at Trump supporter Kay...,http://www.mediaite.com/online/don-lemon-snaps...,c28e10262bdc7136ea698d2c3f244851,mediaite.com,The_Donald,39,ban_wave,
216348,1244812,0153249-b132ad0850fc160e04bce91dd3432e2d,b132ad0850fc160e04bce91dd3432e2d,Republican U.S. Presidential candidate Donald ...,http://www.haaretz.com/world-news/u-s-election...,b132ad0850fc160e04bce91dd3432e2d,haaretz.com,The_Donald,3,ban_wave,


In [6]:
# Count documents per subreddit. Naming the index with `rename_axis` and the
# values with `reset_index(name=...)` always yields the columns
# ['subreddit', 'doc_count'], independent of how the installed pandas version
# names the result of value_counts() (the naming changed in pandas 2.0, which
# broke the previous rename-by-'index' approach).
doc_counts = (
    docs['subreddit']
    .value_counts()
    .rename_axis('subreddit')
    .reset_index(name='doc_count')
)

# Inner join on the subreddit name: only banned subreddits that have at least
# one document are kept. Most-represented subreddits come first.
top_subs = (
    subs.merge(doc_counts, on='subreddit')
    .sort_values(by='doc_count', ascending=False)
)
top_subs

Unnamed: 0,subreddit,ban_reason,ban_subreason,doc_count
150,The_Donald,ban_wave,,141266
33,WhiteRights,violent_content,proliferation_of_violent_content,13513
170,european,unmoderated,,12746
46,uncensorednews,violent_content,encouraging_violence,10599
193,SargonofAkkad,quarantined,,4260
...,...,...,...,...
76,QAnon,harassment_or_harassing_content,inciting_harassment,1
75,AFTERTHESTQRM,harassment_or_harassing_content,inciting_harassment,1
130,JustBeBlack,no_reason_given,,1
71,SubforWhitePeopleOnly,harassment_or_harassing_content,,1


In [18]:
def print_corpus(sub: str, path):
    """Write the text of every document from one subreddit to ``path``.

    Rows of the module-level ``docs`` frame whose ``subreddit`` equals ``sub``
    are de-duplicated on ``url_hash`` (first occurrence kept) and their
    ``text`` values are written to the file, separated by newlines.

    Args:
        sub: Subreddit name to select (exact match).
        path: Destination file; it is created or overwritten.

    Note:
        Documents are separated by a single newline. Texts that themselves
        contain newlines are written as-is, so document boundaries are not
        recoverable from the output file.
    """
    corpus = docs.loc[docs.subreddit == sub].drop_duplicates(subset='url_hash').text
    # Explicit UTF-8: the texts contain non-ASCII characters (curly quotes
    # etc.), so relying on the platform default encoding can fail or garble
    # the output on non-UTF-8 locales.
    with open(path, 'w', encoding='utf-8') as f:
        print(*corpus, file=f, sep='\n')

In [22]:
# Subreddits whose document text is exported, one file per subreddit.
target_subs = [
    'The_Donald',
    'WhiteRights',
    'TheRedPill',
    'NationalSocialism',
]

for sub in target_subs:
    # Each corpus is written to <reddit>/<subreddit>.txt.
    corpus_path = reddit / f'{sub}.txt'
    print_corpus(sub, corpus_path)