In [1]:
import pandas as pd
from pathlib import Path

# Root directory under which every input CSV used by this notebook is looked up
# (the `reddit/` and `openwebtext/` sub-folders plus `mbfc.csv`).
# NOTE(review): this is a hard-coded absolute path; it has to exist on the
# machine running the notebook, so adjust it before a fresh run.
DATA_DIR = Path('/data/language-model-toxicity/data')

In [11]:
# Load the banned-subreddit list and collapse the two reason columns into one:
# `ban_subreason` wins wherever it is non-null, otherwise the original
# `ban_reason` is kept. The now-redundant `ban_subreason` column is dropped.
banned_subs = (
    pd.read_csv(DATA_DIR / 'reddit' / 'banned_subreddits.csv')
    .assign(ban_reason=lambda frame: frame['ban_subreason'].combine_first(frame['ban_reason']))
    .drop(columns='ban_subreason')
)
banned_subs

Unnamed: 0,subreddit,ban_reason
0,1040808DChess,violent_content
1,1350IsPrettyNifty,violent_content
2,2meirl4cubed_meirl,violent_content
3,3rdMusic,violent_content
4,AccidentaISuicide,violent_content
...,...,...
3222,VaccineSnakeOil,quarantined
3223,vaccinetruths,quarantined
3224,VaccineUniversity,quarantined
3225,VaccineVictimSlander,quarantined


In [30]:
# Per-domain ratings table (presumably Media Bias/Fact Check, going by the file
# name -- TODO confirm). The raw URL and reference columns are discarded and the
# remaining columns get the names used by the joins further down.
mbfc = (
    pd.read_csv(DATA_DIR / 'mbfc.csv')
    .drop(columns=['source_url', 'ref'])
    .rename(columns={'source_url_normalized': 'domain', 'fact': 'factual_reliability'})
)

# Disabled: ordinal encodings for `bias` and `factual_reliability`.
# The string labels are kept as-is; the code is left here for reference only.
# bias_to_int = ['extreme-left', 'left', 'left-center', 'center', 'right-center', 'right', 'extreme-right']
# bias_to_int = {k: v for k, v in zip(bias_to_int, range(len(bias_to_int)))}
# mbfc.bias = mbfc.bias.apply(lambda x: bias_to_int[x])

# reliability_to_int = {'low': 0, 'mixed': 1, 'high': 2}
# mbfc.factual_reliability = mbfc.factual_reliability.apply(lambda x: reliability_to_int[x])

mbfc

Unnamed: 0,domain,factual_reliability,bias
0,villagevoice.com,high,left
1,insideclimatenews.org,high,left-center
2,fury.news,low,extreme-right
3,now8news.com,low,center
4,constitution.com,low,extreme-right
...,...,...,...
1061,fusion.kinja.com,mixed,left-center
1062,aptnnews.ca,high,center
1063,wsws.org,high,left
1064,spin.com-death-and-taxes,high,left


In [12]:
# One row per (document, subreddit) submission; rows that are identical in
# every column are collapsed to a single row.
docs = pd.read_csv(
    DATA_DIR / 'openwebtext' / 'openwebtext_subreddits.csv'
).drop_duplicates()
docs

Unnamed: 0,url,md5_hash,domain,subreddit,karma
0,http://fsutorch.com/2016/06/25/ferris-water-te...,2d70235438926114f4e518ccc10532b7,fsutorch.com,Michigan,21
1,http://www.huffingtonpost.com/tara-hedman/what...,ba3c686567ca9510d303bb16d2ac89e7,huffingtonpost.com,daddit,3
2,http://www.atlasobscura.com/articles/the-stran...,6bb504c50d730ba73438971fc75104db,atlasobscura.com,Libraries,23
3,https://propakistani.pk/2016/06/27/nadra-launc...,51b97a36e8e3da611a674a53c0fe03a2,propakistani.pk,pakistan,14
4,http://www.localmemphis.com/news/local-news/on...,3ab39f04111eb90901b80647f2bfe539,localmemphis.com,Bad_Cop_No_Donut,82
...,...,...,...,...,...
7356587,http://www.recode.net/2016/6/8/11883518/app-bo...,ddbc49ca678fcfb816056902864b8f48,recode.net,economy,3
7356588,http://www.bbc.com/future/story/20160623-polya...,ec2ad1f4e1be48a45775356aa11eb6b1,bbc.com,tangentiallyspeaking,6
7356589,https://majorleaguefantasysports.com/2016/06/2...,adc4ef5a825cccc55b5e166ef7c85b00,majorleaguefantasysports.com,Browns,15
7356590,https://www.washingtonpost.com/sports/olympics...,e15344e2d85640feba1e76524cfbbd88,washingtonpost.com,Swimming,62


In [58]:
# Number of rows in `docs` per domain, as a two-column frame
# (`domain`, `domain_prevalence`) sorted from most to least frequent.
#
# The column names are set explicitly instead of renaming whatever
# `value_counts()` happens to produce: before pandas 2.0 the result was named
# after the source column with an unnamed index, from 2.0 on it is named
# 'count' with the index named after the source column. Naming the axis and
# the value column ourselves gives the same frame on both.
domain_prevalence = (
    docs['domain']
    .value_counts()
    .rename_axis('domain')
    .reset_index(name='domain_prevalence')
)
domain_prevalence

Unnamed: 0,domain,domain_prevalence
0,theguardian.com,143965
1,bbc.co.uk,143688
2,washingtonpost.com,97218
3,nytimes.com,89578
4,reuters.com,79417
...,...,...
262327,soffer801.wordpress.com,1
262328,simmerandboyle.com,1
262329,moviemavericks.com,1
262330,hurricanezone.net,1


In [32]:
# Per-document toxicity scores, keyed by the document's `md5_hash`.
toxicity = pd.read_csv(DATA_DIR.joinpath('openwebtext', 'owtc_toxicity_scores.csv'))
toxicity

Unnamed: 0,toxicity,md5_hash
0,0.197121,7ee548bdb7d2e04aa3d44bbc5757add0
1,0.352377,1d1585dcadd8ffcda7bc1842db9dc1ce
2,0.090887,c17d759681f301aaad8bde44ac80dde3
3,0.182207,88f94bb7fddfc12c16407397ac6f84c4
4,0.106864,585e6b698c1d84618c51acdc37b23678
...,...,...
7770358,0.212587,cc918b210f66c17a1f8790dbe1cc4ffc
7770359,0.210010,09b831fa0de6f99e3ed496b820e3562c
7770360,0.094110,5e9bb7748d6949ee94ccf288df2f6645
7770361,0.184078,76ddc4a3d542b0eb6018068c01a26646


In [86]:
# Assemble the per-submission table by joining every auxiliary frame onto `docs`.
# Join keys are spelled out so that an unrelated column that happens to share a
# name across two frames can never silently become part of the key.
#
# - toxicity, mbfc, domain_prevalence: inner joins -- rows without a toxicity
#   score or without a rated domain are dropped.
# - banned_subs: left join -- subreddits that are not in the list keep their
#   rows and get a null `ban_reason`.
# NOTE(review): the subreddit join is an exact, case-sensitive string match;
# confirm both sources use the same casing for subreddit names.
df = (
    docs
    .merge(toxicity, on='md5_hash', how='inner')
    .merge(mbfc, on='domain', how='inner')
    .merge(banned_subs, on='subreddit', how='left')
    .merge(domain_prevalence, on='domain', how='inner')
)
df

Unnamed: 0,url,md5_hash,domain,subreddit,karma,toxicity,factual_reliability,bias,ban_reason,domain_prevalence
0,http://www.businessinsider.com/donald-trump-be...,666ccf7cd57ac845c9f50c543cc8c70c,businessinsider.com,TrueProgressive,12,0.180235,high,left-center,,29833
1,http://www.businessinsider.com/donald-trump-be...,666ccf7cd57ac845c9f50c543cc8c70c,businessinsider.com,politics,249,0.180235,high,left-center,,29833
2,http://www.businessinsider.com/donald-trump-be...,666ccf7cd57ac845c9f50c543cc8c70c,businessinsider.com,uspolitics,23,0.180235,high,left-center,,29833
3,http://www.businessinsider.com/isis-loses-oil-...,20c802dbc52a155743e127543363843e,businessinsider.com,oil,38,0.106787,high,left-center,,29833
4,http://www.businessinsider.com/donald-trump-ma...,1ca0d9c33b1975dcc810e611e80b6f1d,businessinsider.com,politics,34,0.252859,high,left-center,,29833
...,...,...,...,...,...,...,...,...,...,...
1532015,http://www.theburningspear.com/2009/12/challen...,70465f8a23ff427ad77921c1302edff1,theburningspear.com,communism,20,0.415754,high,left,,1
1532016,http://blackpigeonspeaks.com/2017/07/the-educa...,7ced66019289465973def5f96f8f9123,blackpigeonspeaks.com,The_Donald,11,0.216780,low,extreme-right,ban_wave,1
1532017,http://aceflashman.wordpress.com/2009/12/14/my...,082560717a07cdb0923f21fd1873fab4,aceflashman.wordpress.com,conspiratard,41,0.209468,low,center,,1
1532018,http://mtstandard.com/news/local/whitehall-gol...,822007dc724ef3b82b84987f24ccd3a8,mtstandard.com,Montana,19,0.041162,high,center,,1


In [87]:
# Build the export table: one row per document, camelCase column names, and
# boolean ban/quarantine flags derived from `ban_reason`.

# A document can have been submitted to several subreddits; keep only the
# submission with the highest karma (sort descending, then keep the first row
# seen for each hash).
out = (
    df.sort_values('karma', ascending=False)
    .drop_duplicates(subset='md5_hash')
    .rename(columns={
        'md5_hash': 'id',
        'subreddit': 'subredditName',
        'factual_reliability': 'factualReliability',
        'domain_prevalence': 'domainPrevalence',
    })
)

# Vectorised flags instead of a Python lambda per row:
# - quarantined: `ban_reason` is exactly 'quarantined' (nulls compare False);
# - banned: `ban_reason` is present and is anything other than 'quarantined'.
# The last three columns are placeholders (constant "TEMP" / 0); presumably
# they are filled in by a later step -- TODO confirm.
out = out.assign(
    isSubredditQuarantined=lambda frame: frame['ban_reason'].eq('quarantined'),
    isSubredditBanned=lambda frame: frame['ban_reason'].notna() & frame['ban_reason'].ne('quarantined'),
    documentText="TEMP",
    numberOfWords=0,
    subredditPopularity=0,
).drop(columns='ban_reason')

# Fix the column order of the exported records; columns not listed here
# (e.g. `karma`) are left out of the export.
out = out[[
    'id', 'documentText', 'numberOfWords',
    'url', 'domain', 'domainPrevalence',
    'subredditName', 'subredditPopularity', 'isSubredditBanned', 'isSubredditQuarantined',
    'toxicity', 'factualReliability', 'bias',
]]

out

Unnamed: 0,id,documentText,numberOfWords,url,domain,domainPrevalence,subredditName,subredditPopularity,isSubredditBanned,isSubredditQuarantined,toxicity,factualReliability,bias
449572,85ca0cdcebe2c8cab3fd416f71e05113,TEMP,0,https://motherboard.vice.com/en_us/article/wjz...,vice.com,26037,technology,0,False,False,0.336055,mixed,left-center
451310,5854b82feb68247ba54b8d62f9164e0d,TEMP,0,https://motherboard.vice.com/en_us/article/vbz...,vice.com,26037,science,0,False,False,0.255494,mixed,left-center
87593,9cf71d19128a95b5c574ed555eb15b2b,TEMP,0,http://www.cnn.com/2014/05/29/tech/innovation/...,cnn.com,70411,todayilearned,0,False,False,0.123934,mixed,left
150957,306087c104c7b9f1c10076688df68593,TEMP,0,http://www.cnn.com/2016/12/08/health/dinosaur-...,cnn.com,70411,todayilearned,0,False,False,0.076750,mixed,left
1335896,bb47ed89c4a7096c5b5173290fd3a339,TEMP,0,https://townhall.com/columnists/bobbarr/2017/1...,townhall.com,5945,technology,0,False,False,0.213993,mixed,right
...,...,...,...,...,...,...,...,...,...,...,...,...,...
139291,001ec9731d92cb4bd37cb06c15983478,TEMP,0,http://edition.cnn.com/2014/06/21/world/asia/c...,cnn.com,70411,worldnews,0,False,False,0.071771,mixed,left
1075001,e523980c1802cfbe45358ebdac2b17ba,TEMP,0,http://blogs.scientificamerican.com/guest-blog...,scientificamerican.com,11944,TrueReddit,0,False,False,0.293412,high,center
1382119,ad6398d02191ef108bf7b4e06f3c18ce,TEMP,0,http://www.hindustantimes.com/india-news/short...,hindustantimes.com,6121,india,0,False,False,0.159805,high,left-center
1074999,25823fa2a960f78d109074ebe90662a3,TEMP,0,http://www.scientificamerican.com/article.cfm?...,scientificamerican.com,11944,Anthropology,0,False,False,0.311517,high,center


In [88]:
# Write the export table as a JSON array with one object per row.
# NOTE(review): the `.gz` suffix relies on pandas inferring gzip compression
# from the file name -- confirm for the pandas version in use.
out.to_json(path_or_buf='rtp_corpus_viz_data_v3.json.gz', orient='records')