In [12]:
import pandas as pd
from pathlib import Path

# Root directory that every input file in this notebook is read from.
DATA_DIR = Path('/data').joinpath('language-model-toxicity', 'data')

In [13]:
# Subreddit ban list. Where a row has a non-null `ban_subreason` it replaces
# the `ban_reason`; otherwise the original `ban_reason` is kept. The
# `ban_subreason` column is dropped once it has been folded in.
banned_subs = (
    pd.read_csv(DATA_DIR.joinpath('reddit', 'banned_subreddits.csv'))
    .assign(ban_reason=lambda frame: frame['ban_subreason'].combine_first(frame['ban_reason']))
    .drop(columns='ban_subreason')
)
banned_subs

Unnamed: 0,subreddit,ban_reason
0,1040808DChess,violent_content
1,1350IsPrettyNifty,violent_content
2,2meirl4cubed_meirl,violent_content
3,3rdMusic,violent_content
4,AccidentaISuicide,violent_content
...,...,...
3222,VaccineSnakeOil,quarantined
3223,vaccinetruths,quarantined
3224,VaccineUniversity,quarantined
3225,VaccineVictimSlander,quarantined


In [14]:
# Per-domain ratings from mbfc.csv. The raw URL and reference columns are
# discarded, and the remaining columns are renamed so that `domain` lines up
# with the column of the same name in the document table.
mbfc = (
    pd.read_csv(DATA_DIR.joinpath('mbfc.csv'))
    .drop(columns=['source_url', 'ref'])
    .rename(columns={
        'source_url_normalized': 'domain',
        'fact': 'factual_reliability',
    })
)
mbfc

Unnamed: 0,domain,factual_reliability,bias
0,villagevoice.com,high,left
1,insideclimatenews.org,high,left-center
2,fury.news,low,extreme-right
3,now8news.com,low,center
4,constitution.com,low,extreme-right
...,...,...,...
1061,fusion.kinja.com,mixed,left-center
1062,aptnnews.ca,high,center
1063,wsws.org,high,left
1064,spin.com-death-and-taxes,high,left


In [15]:
# Document/subreddit table; rows that are identical in every column are
# collapsed to a single row.
docs = (
    pd.read_csv(DATA_DIR.joinpath('openwebtext', 'openwebtext_subreddits.csv'))
    .drop_duplicates()
)
docs

Unnamed: 0,url,md5_hash,domain,subreddit,karma
0,http://fsutorch.com/2016/06/25/ferris-water-te...,2d70235438926114f4e518ccc10532b7,fsutorch.com,Michigan,21
1,http://www.huffingtonpost.com/tara-hedman/what...,ba3c686567ca9510d303bb16d2ac89e7,huffingtonpost.com,daddit,3
2,http://www.atlasobscura.com/articles/the-stran...,6bb504c50d730ba73438971fc75104db,atlasobscura.com,Libraries,23
3,https://propakistani.pk/2016/06/27/nadra-launc...,51b97a36e8e3da611a674a53c0fe03a2,propakistani.pk,pakistan,14
4,http://www.localmemphis.com/news/local-news/on...,3ab39f04111eb90901b80647f2bfe539,localmemphis.com,Bad_Cop_No_Donut,82
...,...,...,...,...,...
7356587,http://www.recode.net/2016/6/8/11883518/app-bo...,ddbc49ca678fcfb816056902864b8f48,recode.net,economy,3
7356588,http://www.bbc.com/future/story/20160623-polya...,ec2ad1f4e1be48a45775356aa11eb6b1,bbc.com,tangentiallyspeaking,6
7356589,https://majorleaguefantasysports.com/2016/06/2...,adc4ef5a825cccc55b5e166ef7c85b00,majorleaguefantasysports.com,Browns,15
7356590,https://www.washingtonpost.com/sports/olympics...,e15344e2d85640feba1e76524cfbbd88,washingtonpost.com,Swimming,62


In [16]:
# Number of rows in `docs` per domain, most frequent first.
#
# The column names are set explicitly instead of renaming whatever
# `value_counts()` happens to produce: pandas < 2.0 yields columns
# ('index', 'domain') after `reset_index()`, while pandas >= 2.0 yields
# ('domain', 'count'). Naming the axis and the value column directly gives
# ('domain', 'domain_prevalence') on both.
domain_prevalence = (
    docs['domain']
    .value_counts()
    .rename_axis('domain')
    .reset_index(name='domain_prevalence')
)
domain_prevalence

Unnamed: 0,domain,domain_prevalence
0,theguardian.com,143965
1,bbc.co.uk,143688
2,washingtonpost.com,97218
3,nytimes.com,89578
4,reuters.com,79417
...,...,...
262327,freedoge.co.in,1
262328,freedirt.com,1
262329,farawayuniverse.wordpress.com,1
262330,therevolvingbook.com,1


In [17]:
# Per-document toxicity scores, keyed by `md5_hash`.
toxicity = pd.read_csv(DATA_DIR.joinpath('openwebtext', 'owtc_toxicity_scores.csv'))
toxicity

Unnamed: 0,toxicity,md5_hash
0,0.197121,7ee548bdb7d2e04aa3d44bbc5757add0
1,0.352377,1d1585dcadd8ffcda7bc1842db9dc1ce
2,0.090887,c17d759681f301aaad8bde44ac80dde3
3,0.182207,88f94bb7fddfc12c16407397ac6f84c4
4,0.106864,585e6b698c1d84618c51acdc37b23678
...,...,...
7770358,0.212587,cc918b210f66c17a1f8790dbe1cc4ffc
7770359,0.210010,09b831fa0de6f99e3ed496b820e3562c
7770360,0.094110,5e9bb7748d6949ee94ccf288df2f6645
7770361,0.184078,76ddc4a3d542b0eb6018068c01a26646


In [67]:
# Subscriber count per subreddit, read from a JSON-lines file.
# `subscribers` is cast to the nullable Int64 dtype so that missing counts
# stay null instead of forcing the column to float.
# NOTE(review): the cast repeats the dtype already requested from read_json;
# presumably the reader's dtype hint alone was not enough -- confirm.
subscriber_counts = pd.read_json(
    DATA_DIR.joinpath('reddit', 'reddit_subscriber_counts.jsonl'),
    lines=True,
    dtype={'subreddit': str, 'subscribers': pd.Int64Dtype()},
).astype({'subscribers': pd.Int64Dtype()})
subscriber_counts

Unnamed: 0,subreddit,subscribers
0,nsfw,1880887
1,features,3868
2,request,1588
3,olympics,417898
4,de,275903
...,...,...
914061,varnlife,1
914062,wetlegends,1
914063,u_cvgfvcdfgvc,
914064,Mercbenzking,1


In [18]:
# Per-document word counts, keyed by `md5_hash`.
word_counts = pd.read_csv(DATA_DIR.joinpath('openwebtext', 'openwebtext_word_counts.csv'))
word_counts

Unnamed: 0,md5_hash,word_count
0,7ee548bdb7d2e04aa3d44bbc5757add0,841
1,1d1585dcadd8ffcda7bc1842db9dc1ce,450
2,c17d759681f301aaad8bde44ac80dde3,721
3,88f94bb7fddfc12c16407397ac6f84c4,463
4,585e6b698c1d84618c51acdc37b23678,402
...,...,...
8013764,09b831fa0de6f99e3ed496b820e3562c,1118
8013765,5e9bb7748d6949ee94ccf288df2f6645,265
8013766,6a137932a53619627fd7acd6829be268,4435
8013767,76ddc4a3d542b0eb6018068c01a26646,1514


In [102]:
# Assemble one wide table with a row per (document, subreddit) pair.
#
# Join keys are spelled out rather than left to pandas' "all shared columns"
# default, so a column added to any input later cannot silently change
# which rows match.
df = (
    docs
    # Required attributes: inner joins, so rows without a toxicity score,
    # a word count or a domain count are dropped.
    .merge(toxicity, on='md5_hash')
    .merge(word_counts, on='md5_hash')
    .merge(domain_prevalence, on='domain')
    # Allow null for this data: left joins keep every row and leave the
    # added columns null where there is no match.
    .merge(mbfc, on='domain', how='left')
    .merge(banned_subs, on='subreddit', how='left')
    .merge(subscriber_counts, on='subreddit', how='left')
)

df

Unnamed: 0,url,md5_hash,domain,subreddit,karma,toxicity,word_count,domain_prevalence,factual_reliability,bias,ban_reason,subscribers
0,http://fsutorch.com/2016/06/25/ferris-water-te...,2d70235438926114f4e518ccc10532b7,fsutorch.com,Michigan,21,0.059421,1061,5,,,,97439
1,http://fsutorch.com/2016/06/25/ferris-water-te...,2d70235438926114f4e518ccc10532b7,fsutorch.com,news,7,0.059421,1061,5,,,,20935015
2,http://www.fsutorch.com/2013/01/16/news/ferris...,c2d86066536c3a40e6ef27033ad0d229,fsutorch.com,FerrisStateUniversity,3,0.087065,526,5,,,,344
3,http://fsutorch.com/2016/10/28/one-third-party/,1c2563a6a6d318da34b84a44521b9f45,fsutorch.com,FerrisStateUniversity,5,0.112653,650,5,,,,344
4,http://fsutorch.com/2016/10/28/one-third-party/,1c2563a6a6d318da34b84a44521b9f45,fsutorch.com,jillstein,9,0.112653,650,5,,,,7376
...,...,...,...,...,...,...,...,...,...,...,...,...
7129154,http://www.ilovekent.net/2016/05/16/kent-is-18...,247a53c2ee8c9f87c2be0a43d7596aef,ilovekent.net,CityofKent,3,0.064415,348,2,,,,
7129155,http://www.ilovekent.net/2016/05/16/kent-is-18...,247a53c2ee8c9f87c2be0a43d7596aef,ilovekent.net,Seattle,161,0.064415,348,2,,,,218214
7129156,http://coldreadingworld.com/what-is-cold-reading/,f1739f2f78763eae25a99c512ea1adf9,coldreadingworld.com,SocialEngineering,40,0.121936,1226,1,,,,131894
7129157,https://www.organicweb.com.au/20209/general-te...,054b9ccaa46bfee9b5d2e19c0532820f,organicweb.com.au,howto,3,0.077501,177,1,,,,1576727


In [103]:
# Build the export table: one row per document, renamed to the camelCase
# field names used in the output file, then down-sampled.
SAMPLE_SIZE = 5_000
SAMPLE_SEED = 0  # fixed so that re-running the notebook exports the same rows

# `df` can hold several rows per document (one per subreddit it appears
# under). Keep the row with the highest karma. The sort is stable
# (mergesort) so that ties on karma always resolve to the same row.
out = (
    df
    .sort_values('karma', ascending=False, kind='mergesort')
    .drop_duplicates(subset='md5_hash')
    .rename(columns={
        'md5_hash': 'id',
        'subreddit': 'subredditName',
        'factual_reliability': 'factualReliability',
        'domain_prevalence': 'domainPrevalence',
        'word_count': 'numberOfWords',
        'subscribers': 'subredditPopularity',
    })
)

# A subreddit counts as banned/quarantined when the left join against the
# ban list found a reason for it.
out['isSubredditBannedOrQuarantined'] = out['ban_reason'].notna()
out = out.drop(columns='ban_reason')

# FIXME: use correct values here
out['documentText'] = None

# Select and order the exported columns (this also drops `karma`).
out = out[['id', 'documentText', 'numberOfWords', 'url', 'domain', 'domainPrevalence',
           'subredditName', 'subredditPopularity', 'isSubredditBannedOrQuarantined',
           'toxicity', 'factualReliability', 'bias']]

# Seeded random sample; raises ValueError if fewer than SAMPLE_SIZE rows exist.
out = out.sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)
out

Unnamed: 0,id,documentText,numberOfWords,url,domain,domainPrevalence,subredditName,subredditPopularity,isSubredditBannedOrQuarantined,toxicity,factualReliability,bias
5260356,226802f1e16e4237b6ac88710eebe0e8,,955,http://onforb.es/12N473P,onforb.es,213,Cheese,78872,False,0.049576,,
6199793,f85d98d8b47e09d1e39dbc30101fcbd7,,489,http://www.journalism.co.uk/news/leveson-inqui...,journalism.co.uk,66,ukpolitics,262734,False,0.053709,,
5563723,15e9af1f7b28ee1e05266ce8abdc8b73,,682,http://londonist.com/2016/08/london-s-secret-c...,londonist.com,492,london,237944,False,0.079937,,
3213487,449be42c835c2179d7184b01d72904c8,,519,http://www.japantimes.co.jp/news/2014/06/19/na...,japantimes.co.jp,4044,japan,245507,False,0.185898,high,center
470529,f5842297deb8f1334a7b29a6781d5c87,,844,http://www.cbc.ca/news/canada/british-columbia...,cbc.ca,62571,news,20935015,False,0.099134,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1829918,8dc03d8a52405aa711e073340795e54c,,996,http://www.dailymail.co.uk/sport/football/arti...,dailymail.co.uk,60392,soccer,2066434,False,0.150059,mixed,right
2746359,eb8ed1c245a2296ff6000fc563e72cab,,395,http://www.theatlantic.com/politics/archive/20...,theatlantic.com,32461,politics,6201779,False,0.213492,high,left-center
2500736,f0e67ac0532d48c7c82752b2103d5632,,365,http://popwatch.ew.com/2011/09/22/law-order-sv...,ew.com,9321,SVU,9800,False,0.208396,,
4110897,97444315fbfc5500024263013f7de21d,,325,http://www.washingtoncitypaper.com/blogs/sexis...,washingtoncitypaper.com,725,reddit.com,,False,0.264824,,


In [114]:
# Write the sampled table to disk as a JSON array with one object per row.
out.to_json(path_or_buf='rtp_corpus_viz_data_v5.json', orient='records')