In [12]:
import pandas as pd
from pathlib import Path

# Root directory for every input file read by the cells below
# (the reddit/ and openwebtext/ sub-folders and mbfc.csv).
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
DATA_DIR = Path('/data/language-model-toxicity/data')

In [13]:
# Load the banned-subreddit table and collapse its two reason columns into one:
# take `ban_subreason` where it is present, otherwise fall back to `ban_reason`.
banned_subs = (
    pd.read_csv(DATA_DIR / 'reddit' / 'banned_subreddits.csv')
    .assign(ban_reason=lambda frame: frame['ban_subreason'].combine_first(frame['ban_reason']))
    .drop(columns='ban_subreason')
)
banned_subs

Unnamed: 0,subreddit,ban_reason
0,1040808DChess,violent_content
1,1350IsPrettyNifty,violent_content
2,2meirl4cubed_meirl,violent_content
3,3rdMusic,violent_content
4,AccidentaISuicide,violent_content
...,...,...
3222,VaccineSnakeOil,quarantined
3223,vaccinetruths,quarantined
3224,VaccineUniversity,quarantined
3225,VaccineVictimSlander,quarantined


In [14]:
# Load mbfc.csv, discard the raw URL and reference columns, and expose the
# normalized source URL as `domain` and the `fact` rating as `factual_reliability`.
mbfc = (
    pd.read_csv(DATA_DIR / 'mbfc.csv')
    .drop(columns=['source_url', 'ref'])
    .rename(columns={'source_url_normalized': 'domain', 'fact': 'factual_reliability'})
)
mbfc

Unnamed: 0,domain,factual_reliability,bias
0,villagevoice.com,high,left
1,insideclimatenews.org,high,left-center
2,fury.news,low,extreme-right
3,now8news.com,low,center
4,constitution.com,low,extreme-right
...,...,...,...
1061,fusion.kinja.com,mixed,left-center
1062,aptnnews.ca,high,center
1063,wsws.org,high,left
1064,spin.com-death-and-taxes,high,left


In [15]:
# One row per (document, subreddit) posting; fully identical rows are removed.
docs = pd.read_csv(
    DATA_DIR / 'openwebtext' / 'openwebtext_subreddits.csv'
).drop_duplicates()
docs

Unnamed: 0,url,md5_hash,domain,subreddit,karma
0,http://fsutorch.com/2016/06/25/ferris-water-te...,2d70235438926114f4e518ccc10532b7,fsutorch.com,Michigan,21
1,http://www.huffingtonpost.com/tara-hedman/what...,ba3c686567ca9510d303bb16d2ac89e7,huffingtonpost.com,daddit,3
2,http://www.atlasobscura.com/articles/the-stran...,6bb504c50d730ba73438971fc75104db,atlasobscura.com,Libraries,23
3,https://propakistani.pk/2016/06/27/nadra-launc...,51b97a36e8e3da611a674a53c0fe03a2,propakistani.pk,pakistan,14
4,http://www.localmemphis.com/news/local-news/on...,3ab39f04111eb90901b80647f2bfe539,localmemphis.com,Bad_Cop_No_Donut,82
...,...,...,...,...,...
7356587,http://www.recode.net/2016/6/8/11883518/app-bo...,ddbc49ca678fcfb816056902864b8f48,recode.net,economy,3
7356588,http://www.bbc.com/future/story/20160623-polya...,ec2ad1f4e1be48a45775356aa11eb6b1,bbc.com,tangentiallyspeaking,6
7356589,https://majorleaguefantasysports.com/2016/06/2...,adc4ef5a825cccc55b5e166ef7c85b00,majorleaguefantasysports.com,Browns,15
7356590,https://www.washingtonpost.com/sports/olympics...,e15344e2d85640feba1e76524cfbbd88,washingtonpost.com,Swimming,62


In [16]:
# Number of rows in `docs` per domain, as a two-column frame
# (`domain`, `domain_prevalence`) ordered from most to least frequent.
#
# The axis and value names are set explicitly instead of renaming whatever
# `value_counts().to_frame().reset_index()` happens to produce: pandas < 2.0
# yields columns ('index', 'domain') while pandas >= 2.0 yields
# ('domain', 'count'), so a fixed rename map only works on one of the two.
domain_prevalence = (
    docs['domain']
    .value_counts()
    .rename_axis('domain')
    .reset_index(name='domain_prevalence')
)
domain_prevalence

Unnamed: 0,domain,domain_prevalence
0,theguardian.com,143965
1,bbc.co.uk,143688
2,washingtonpost.com,97218
3,nytimes.com,89578
4,reuters.com,79417
...,...,...
262327,freedoge.co.in,1
262328,freedirt.com,1
262329,farawayuniverse.wordpress.com,1
262330,therevolvingbook.com,1


In [17]:
# Toxicity scores, one `toxicity` value per `md5_hash`-identified row of the CSV.
toxicity = pd.read_csv(DATA_DIR.joinpath('openwebtext', 'owtc_toxicity_scores.csv'))
toxicity

Unnamed: 0,toxicity,md5_hash
0,0.197121,7ee548bdb7d2e04aa3d44bbc5757add0
1,0.352377,1d1585dcadd8ffcda7bc1842db9dc1ce
2,0.090887,c17d759681f301aaad8bde44ac80dde3
3,0.182207,88f94bb7fddfc12c16407397ac6f84c4
4,0.106864,585e6b698c1d84618c51acdc37b23678
...,...,...
7770358,0.212587,cc918b210f66c17a1f8790dbe1cc4ffc
7770359,0.210010,09b831fa0de6f99e3ed496b820e3562c
7770360,0.094110,5e9bb7748d6949ee94ccf288df2f6645
7770361,0.184078,76ddc4a3d542b0eb6018068c01a26646


In [67]:
# Subscriber count per subreddit, read from a JSON-lines file.
# `subscribers` is cast to the nullable Int64 dtype so rows without a count
# stay integer-typed with a missing value.
# NOTE(review): the explicit cast repeats the dtype already requested from
# read_json - presumably the hint alone was not honoured; confirm before removing.
subscriber_counts = pd.read_json(
    DATA_DIR / 'reddit' / 'reddit_subscriber_counts.jsonl',
    lines=True,
    dtype={'subreddit': str, 'subscribers': pd.Int64Dtype()},
).astype({'subscribers': pd.Int64Dtype()})
subscriber_counts

Unnamed: 0,subreddit,subscribers
0,nsfw,1880887
1,features,3868
2,request,1588
3,olympics,417898
4,de,275903
...,...,...
914061,varnlife,1
914062,wetlegends,1
914063,u_cvgfvcdfgvc,
914064,Mercbenzking,1


In [18]:
# Word count per document, keyed by `md5_hash`.
word_counts = pd.read_csv(DATA_DIR.joinpath('openwebtext', 'openwebtext_word_counts.csv'))
word_counts

Unnamed: 0,md5_hash,word_count
0,7ee548bdb7d2e04aa3d44bbc5757add0,841
1,1d1585dcadd8ffcda7bc1842db9dc1ce,450
2,c17d759681f301aaad8bde44ac80dde3,721
3,88f94bb7fddfc12c16407397ac6f84c4,463
4,585e6b698c1d84618c51acdc37b23678,402
...,...,...
8013764,09b831fa0de6f99e3ed496b820e3562c,1118
8013765,5e9bb7748d6949ee94ccf288df2f6645,265
8013766,6a137932a53619627fd7acd6829be268,4435
8013767,76ddc4a3d542b0eb6018068c01a26646,1514


In [115]:
# Assemble one wide frame per (document, subreddit) posting.
# Join keys are spelled out so that adding a column to any input frame cannot
# silently change which columns the merge matches on.
df = (
    docs
    # Inner joins (the merge default): a posting is kept only if its document
    # has a toxicity score and a word count, and its domain has a prevalence.
    .merge(toxicity, on='md5_hash')
    .merge(word_counts, on='md5_hash')
    .merge(domain_prevalence, on='domain')
    # Inner join as well: postings whose domain is absent from `mbfc` are
    # dropped, so factual_reliability / bias are never null because of this join.
    .merge(mbfc, on='domain', how='inner')
    # Left joins: null is allowed here - ban_reason is missing for subreddits
    # not listed in `banned_subs`, subscribers for those not in `subscriber_counts`.
    .merge(banned_subs, on='subreddit', how='left')
    .merge(subscriber_counts, on='subreddit', how='left')
)

df

Unnamed: 0,url,md5_hash,domain,subreddit,karma,toxicity,word_count,domain_prevalence,factual_reliability,bias,ban_reason,subscribers
0,http://www.businessinsider.com/donald-trump-be...,666ccf7cd57ac845c9f50c543cc8c70c,businessinsider.com,TrueProgressive,12,0.180235,508,29833,high,left-center,,1175
1,http://www.businessinsider.com/donald-trump-be...,666ccf7cd57ac845c9f50c543cc8c70c,businessinsider.com,politics,249,0.180235,508,29833,high,left-center,,6201779
2,http://www.businessinsider.com/donald-trump-be...,666ccf7cd57ac845c9f50c543cc8c70c,businessinsider.com,uspolitics,23,0.180235,508,29833,high,left-center,,19038
3,http://www.businessinsider.com/isis-loses-oil-...,20c802dbc52a155743e127543363843e,businessinsider.com,oil,38,0.106787,1363,29833,high,left-center,,12004
4,http://www.businessinsider.com/donald-trump-ma...,1ca0d9c33b1975dcc810e611e80b6f1d,businessinsider.com,politics,34,0.252859,827,29833,high,left-center,,6201779
...,...,...,...,...,...,...,...,...,...,...,...,...
1532015,http://www.theburningspear.com/2009/12/challen...,70465f8a23ff427ad77921c1302edff1,theburningspear.com,communism,20,0.415754,1328,1,high,left,,145286
1532016,http://blackpigeonspeaks.com/2017/07/the-educa...,7ced66019289465973def5f96f8f9123,blackpigeonspeaks.com,The_Donald,11,0.216780,1047,1,low,extreme-right,ban_wave,
1532017,http://aceflashman.wordpress.com/2009/12/14/my...,082560717a07cdb0923f21fd1873fab4,aceflashman.wordpress.com,conspiratard,41,0.209468,722,1,low,center,,65845
1532018,http://mtstandard.com/news/local/whitehall-gol...,822007dc724ef3b82b84987f24ccd3a8,mtstandard.com,Montana,19,0.041162,268,1,high,center,,23092


In [116]:
# Build the export table: one row per document, camelCase column names,
# then a fixed-size random sample.

# Sorting by karma (descending) first means drop_duplicates, which keeps the
# first occurrence, retains the highest-karma posting for each md5_hash.
out = df.sort_values('karma', ascending=False).drop_duplicates(subset='md5_hash')

out = out.rename(columns={
    'md5_hash': 'id',
    'subreddit': 'subredditName',
    'factual_reliability': 'factualReliability',
    'domain_prevalence': 'domainPrevalence',
    'word_count': 'numberOfWords',
    'subscribers': 'subredditPopularity',
})

# A subreddit counts as banned/quarantined exactly when the left join with
# `banned_subs` produced a ban_reason for it.
out['isSubredditBannedOrQuarantined'] = out['ban_reason'].notna()
out = out.drop(columns='ban_reason')

# FIXME: use correct values here
out['documentText'] = None

out = out[['id', 'documentText', 'numberOfWords', 'url', 'domain', 'domainPrevalence',
           'subredditName', 'subredditPopularity', 'isSubredditBannedOrQuarantined',
           'toxicity', 'factualReliability', 'bias']]

# Seeded so that Restart & Run All exports the same 5,000 documents each time;
# without random_state every run writes a different sample.
out = out.sample(5_000, random_state=42)
out

Unnamed: 0,id,documentText,numberOfWords,url,domain,domainPrevalence,subredditName,subredditPopularity,isSubredditBannedOrQuarantined,toxicity,factualReliability,bias
596165,7a6f18df8a32746d4d55147608ad78b3,,855,http://blogs.timesofisrael.com/why-the-murder-...,timesofisrael.com,8586,Israel,63501,False,0.358226,high,left-center
1478274,e613d406abc4cfd037256c43612b5d6c,,428,http://www.financialexpress.com/industry/techn...,financialexpress.com,777,artificial,112110,False,0.019548,high,center
1085831,00c6b3e70d2d312b2536e4293bdb8e02,,297,http://www.patheos.com/blogs/exploringourmatri...,patheos.com,12418,atheismrebooted,17411,False,0.118141,mixed,center
1058852,1050a005f637efbeadc1a241f0230437,,553,http://www.orlandosentinel.com/sports/college/...,orlandosentinel.com,3840,CFB,852901,False,0.112448,high,right-center
1438203,4889871ba6a72e90f4457a335c4e035f,,1753,http://www.rappler.com/newsbreak/64529-inc-lob...,rappler.com,1520,Philippines,327515,False,0.112330,mixed,left
...,...,...,...,...,...,...,...,...,...,...,...,...
1322465,c6660301cbbaafb07c805b0c8cd8ba6e,,1101,http://www.providencejournal.com/article/20150...,providencejournal.com,963,providence,13296,False,0.211597,high,left-center
169316,52a3b666f3a7becebe89c2af7a95d43d,,631,http://www.breitbart.com/big-government/2017/0...,breitbart.com,42650,The_Donald,,True,0.090975,mixed,right
335039,e744e50d3c6c6054a3d2a03f7b40e9af,,438,http://thehill.com/blogs/ballot-box/gop-primar...,thehill.com,43688,politics,6201779,False,0.174114,mixed,left-center
294574,b8831be0a84e10e56669f59e19433067,,2301,http://www.politico.com/story/2016/12/michael-...,politico.com,38252,politics,6201779,False,0.237182,high,left-center


In [118]:
# Write the sampled table as a JSON array with one object per row.
out.to_json(path_or_buf='rtp_corpus_viz_data_v6.json', orient='records')