In [1]:
import os
import typing as t
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Register tqdm's pandas integration (makes `progress_apply` & co. available on pandas objects).
# NOTE(review): no `progress_*` call appears in the cells visible here — confirm it is still needed.
tqdm.pandas()

In [3]:
# Root of the local data tree, resolved under the current user's home directory.
DATA_DIR = Path(os.path.expanduser('~/jigsaw-toxic/data'))

# Source directories (the Jigsaw one is not referenced by the cells visible in this notebook).
RUDDIT_SRC_DIR = DATA_DIR.joinpath('ruddit')
JIGSAW_TOXIC_SEVERITY_RATING_DIR = DATA_DIR.joinpath('jigsaw-toxic-severity-rating')

# Destination directory for the Ruddit CSVs produced by this notebook.
RUDDIT_DATASET_DIR = DATA_DIR.joinpath('datasets', 'ruddit')

In [5]:
# List the contents of the Ruddit source tree's Dataset/ folder (shell escape; `$NAME` is
# interpolated by IPython from the Python variable of that name).
!ls -la $RUDDIT_SRC_DIR/Dataset

total 32520
drwxrwxr-x 2 1001 1001     4096 Jan 28 19:41 .
drwxrwxr-x 4 1001 1001     4096 Jan 28 19:41 ..
-rw-rw-r-- 1 1001 1001     8754 Jan  1 09:15 create_dataset_variants.py
-rw-rw-r-- 1 1001 1001      454 Jan  1 09:15 identityterms_group.txt
-rw-rw-r-- 1 1001 1001     1354 Jan  1 09:15 load_node_dictionary.py
-rw-rw-r-- 1 1001 1001 15674990 Jan  1 09:15 node_dictionary.npy
-rw-rw-r-- 1 1001 1001       52 Jan  1 09:15 post_with_issues.csv
-rw-rw-r-- 1 1001 1001      265 Jan  1 09:15 ReadMe.md
-rw-rw-r-- 1 1001 1001   128343 Jan  1 09:15 Ruddit.csv
-rw-rw-r-- 1 1001 1001  4077200 Jan  1 09:15 Ruddit_individual_annotations.csv
-rw-rw-r-- 1 1001 1001  1835021 Jan  1 09:15 ruddit_with_text.csv
-rw-rw-r-- 1 1001 1001   115216 Jan  1 09:15 sample_input_file.csv
-rw-rw-r-- 1 1001 1001 11419592 Jan  1 09:15 Thread_structure.txt


In [10]:
# Read the three Ruddit CSVs from the source tree's Dataset/ folder:
# per-annotator labels, per-comment offensiveness scores, and scores joined with comment text.
_ruddit_csv_dir = RUDDIT_SRC_DIR / 'Dataset'
individual_annotations_df = pd.read_csv(_ruddit_csv_dir / 'Ruddit_individual_annotations.csv')
off_score_df = pd.read_csv(_ruddit_csv_dir / 'Ruddit.csv')
text_df = pd.read_csv(_ruddit_csv_dir / 'ruddit_with_text.csv')

In [11]:
# Preview the frame loaded from Ruddit.csv (ids plus offensiveness_score, no text).
off_score_df

Unnamed: 0,comment_id,post_id,offensiveness_score
0,cza1q49,42g75o,-0.083
1,cza1wdh,42g75o,-0.022
2,cza23qx,42g75o,0.167
3,cza2bw8,42g75o,-0.146
4,cza2iji,42g75o,-0.083
...,...,...,...
5995,f0i0mqp,cu67co,0.064
5996,f80wlxq,cganu1,0.458
5997,f8uksbp,cu67co,-0.292
5998,fa6nc1r,cganu1,0.333


In [12]:
# Preview the frame loaded from ruddit_with_text.csv (adds the comment text and URL columns).
text_df

Unnamed: 0,post_id,comment_id,txt,url,offensiveness_score
0,42g75o,cza1q49,> The difference in average earnings between m...,https://www.reddit.com/r/changemyview/comments...,-0.083
1,42g75o,cza1wdh,"The myth is that the ""gap"" is entirely based o...",https://www.reddit.com/r/changemyview/comments...,-0.022
2,42g75o,cza23qx,[deleted],https://www.reddit.com/r/changemyview/comments...,0.167
3,42g75o,cza2bw8,The assertion is that women get paid less for ...,https://www.reddit.com/r/changemyview/comments...,-0.146
4,42g75o,cza2iji,You said in the OP that's not what they're mea...,https://www.reddit.com/r/changemyview/comments...,-0.083
...,...,...,...,...,...
5833,cu67co,f0i0mqp,They should only censor things that talk badly...,https://i.redd.it/kfsmqzxae3i31.jpg/f0i0mqp/,0.064
5834,cganu1,f80wlxq,> and one of them is a woman. \n\nOH SHIT we b...,https://www.reddit.com/r/worldpolitics/comment...,0.458
5835,cu67co,f8uksbp,how is this flared as US politics,https://i.redd.it/kfsmqzxae3i31.jpg/f8uksbp/,-0.292
5836,cganu1,fa6nc1r,People in Hong Kong must decide if they are go...,https://www.reddit.com/r/worldpolitics/comment...,0.333


In [14]:
# Number of distinct comment ids (missing values, if any, are counted as one value).
text_df['comment_id'].nunique(dropna=False)

5838

In [19]:
# How many comments have the literal placeholder text '[deleted]'.
print(int((text_df['txt'] == '[deleted]').sum()))

116


In [26]:
# Build the training frame from text_df:
#   1. drop comments whose text is the '[deleted]' placeholder,
#   2. min-max scale offensiveness_score to [0, 1] (min/max taken over the remaining rows),
#   3. keep only the text (renamed to 'comment_text') and the scaled 'score'.
# The copy is taken AFTER filtering so the column assignment below writes into an independent
# frame; assigning into a boolean-filtered slice can raise pandas' SettingWithCopyWarning.
ruddit_score_df = text_df.loc[text_df['txt'] != '[deleted]'].copy()
score_sr = ruddit_score_df['offensiveness_score']
ruddit_score_df['score'] = (score_sr - score_sr.min()) / (score_sr.max() - score_sr.min())
ruddit_score_df = ruddit_score_df.rename(columns={'txt': 'comment_text'})[['comment_text', 'score']]

In [27]:
# Preview the training frame: comment text plus the min-max-scaled score.
ruddit_score_df

Unnamed: 0,comment_text,score
0,> The difference in average earnings between m...,0.431478
1,"The myth is that the ""gap"" is entirely based o...",0.464133
3,The assertion is that women get paid less for ...,0.397752
4,You said in the OP that's not what they're mea...,0.431478
5,>Men and women are not payed less for the same...,0.453426
...,...,...
5833,They should only censor things that talk badly...,0.510171
5834,> and one of them is a woman. \n\nOH SHIT we b...,0.721092
5835,how is this flared as US politics,0.319593
5836,People in Hong Kong must decide if they are go...,0.654176


In [29]:
# DataFrame.to_csv does not create missing parent directories, so make sure the output
# directory exists first (a no-op when it is already there).
RUDDIT_DATASET_DIR.mkdir(parents=True, exist_ok=True)
# Write the training set without the pandas index column.
ruddit_score_df.to_csv(RUDDIT_DATASET_DIR / 'train.csv', index=False)

In [31]:
def _remove_quote(text: str) -> str:
    if not text.startswith('>'):
        return text
    return '\n'.join(text.split('\n')[1:])

# Same rows and scores as ruddit_score_df, with _remove_quote applied to every comment text.
# assign() returns a new frame, so ruddit_score_df itself is left untouched.
no_quote_ruddit_score_df = ruddit_score_df.assign(
    comment_text=lambda frame: frame['comment_text'].map(_remove_quote)
)

In [32]:
# DataFrame.to_csv does not create missing parent directories, so make sure the output
# directory exists first (a no-op when it is already there).
RUDDIT_DATASET_DIR.mkdir(parents=True, exist_ok=True)
# Write the quote-stripped training set without the pandas index column.
no_quote_ruddit_score_df.to_csv(RUDDIT_DATASET_DIR / 'train_no_quote.csv', index=False)