In [14]:
import sys
sys.path.append('/Users/avgalichin/Documents/kaggle/Jigsaw2022')
import os
from pathlib import Path
import pandas as pd
from transformers import AutoTokenizer

from src.data_utils import (clean, rank_comments, get_groups,
                            get_folds, encode_df)

In [2]:
# Root folder that holds the project's datasets; the input CSV is read from
# its 'Initial' subfolder.
# NOTE(review): this is a machine-specific absolute path — anyone else running
# the notebook has to edit it.
main_dir = Path('/Users/avgalichin/Documents/kaggle/Jigsaw2022/Datasets')
data_dir = main_dir.joinpath('Initial')

In [3]:
# Read the Toxic Comment Classification CSV and expose the comment body under
# the shorter column name 'text', which the later cells refer to.
data = (
    pd.read_csv(data_dir/'Data_ToxicCommentClassification.csv')
      .rename(columns={'comment_text': 'text'})
)
print(data.shape)
data.head()

(159571, 8)


Unnamed: 0,id,text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


#### Downloading and cleaning data

In [4]:
# Run the project's `clean` helper on the 'text' column, handing it a copy of
# the frame.
# NOTE(review): min_len=15 presumably drops comments shorter than 15
# characters (the row count shrinks after this step) — confirm against
# src.data_utils.clean.
data = clean(
    data.copy(),
    col_name='text',
    min_len=15,
)
print(data.shape)
data.head()

(159140, 8)


Unnamed: 0,id,text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation Why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,""" More I can't make any real suggestions on im...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


#### Rank comments

In [5]:
# Per-column weights used when the toxicity flags are aggregated into a single
# score. Keeping column and weight side by side makes the pairing explicit;
# the two lists passed on to `rank_comments` are derived from this mapping
# (dicts preserve insertion order, so both lists stay aligned).
toxicity_weights = {
    'toxic': 0.3,
    'severe_toxic': 0.5,
    'obscene': 0.05,
    'threat': 0.05,
    'insult': 0.05,
    'identity_hate': 0.05,
}

weights = list(toxicity_weights.values())
toxicity_cols = list(toxicity_weights.keys())

# Name of the column that receives the aggregated score.
out_col = 'label'

In [6]:
# Aggregate the toxicity columns into `out_col` with the project's
# `rank_comments` helper, then keep only the id, the text and that score.
# NOTE(review): the score is presumably a weighted sum of the binary toxicity
# flags — confirm against src.data_utils.rank_comments.
data = rank_comments(
    data.copy(),
    aggregate_cols=toxicity_cols,
    weights=weights,
    out_col=out_col,
)
data = data[['id', 'text', out_col]]
print(data.shape)
data.head()

(159140, 3)


Unnamed: 0,id,text,label
0,0000997932d777bf,Explanation Why the edits made under my userna...,0.0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0.0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0.0
3,0001b41b1c6bb37e,""" More I can't make any real suggestions on im...",0.0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0.0


In [7]:
# Sanity check: list the distinct values of the aggregated score.
data['label'].unique()

array([0.  , 0.9 , 0.3 , 0.45, 0.4 , 0.35, 0.5 , 0.05, 0.85, 0.95, 1.  ,
       0.1 , 0.15, 0.8 ])

#### Split data into folds

In [8]:
# Assign every row to one of 5 folds via the project's `get_folds` helper,
# splitting on the 'label' column; the result gains a 'fold' column.
# NOTE(review): shuffle=True is requested but no seed is passed here — confirm
# that get_folds fixes a random state internally, otherwise the fold
# assignment changes between runs.
data = get_folds(
    data.copy(),
    split_by='label',
    n_folds=5,
    shuffle=True,
)
data.head()

Unnamed: 0,id,text,label,fold
0,0000997932d777bf,Explanation Why the edits made under my userna...,0.0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0.0,4
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0.0,4
3,0001b41b1c6bb37e,""" More I can't make any real suggestions on im...",0.0,2
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0.0,3


#### Extract input_ids and attention masks for specified tokenizer

In [11]:
# Sequence-length setting, presumably the limit meant for the encoding step
# — TODO confirm it is the value actually passed to encode_df.
max_length = 256

# Pretrained checkpoint whose tokenizer is loaded through AutoTokenizer.
tokenizer_type = 'roberta-base'  # change if needed
tokenizer = AutoTokenizer.from_pretrained(tokenizer_type)

In [12]:
# Tokenize the 'text' column with the project's `encode_df` helper; the result
# carries 'input_ids' and 'attention_mask' columns.
# The length limit comes from the `max_length` setting defined alongside the
# tokenizer instead of a repeated literal, so changing that setting actually
# changes the encoding.
# NOTE(review): drop=True presumably removes the raw 'text' column afterwards
# — confirm against src.data_utils.encode_df.
data = encode_df(data.copy(), tokenizer,
                 col_name='text', max_length=max_length, drop=True)
data.head()

Unnamed: 0,id,label,fold,input_ids,attention_mask
0,0000997932d777bf,0.0,0,"[0, 9089, 11181, 1258, 2612, 5, 39708, 156, 22...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,000103f0d9cfb60f,0.0,4,"[0, 495, 108, 1584, 605, 328, 91, 2856, 42, 36...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,000113f07ec002fd,0.0,4,"[0, 13368, 313, 6, 38, 437, 269, 45, 667, 7, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,0001b41b1c6bb37e,0.0,2,"[0, 113, 901, 38, 64, 75, 146, 143, 588, 9622,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,0001d958c54c6e35,0.0,3,"[0, 1185, 6, 21958, 6, 32, 127, 6132, 4, 5053,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


#### Split data into train and val

In [15]:
# Output folders for the train and validation splits.
train_dir = main_dir/'train'
val_dir = main_dir/'val'

# Create each folder independently and idempotently. Checking only `train_dir`
# would leave `val_dir` missing when just `train_dir` exists, and would raise
# FileExistsError when just `val_dir` exists.
for split_dir in (train_dir, val_dir):
    split_dir.mkdir(parents=True, exist_ok=True)

In [16]:
# Build the TCC train/val split: one fold is held out for validation and the
# remaining folds form the training set. Writing the splits to disk is
# currently disabled (see the commented-out `to_csv` calls at the end).

val_fold = 4

# `columns=` replaces the positional axis argument (`drop('fold', 1)`), which
# pandas deprecated in 1.3 and no longer accepts from 2.0 on.
data_train = data[data.fold != val_fold].drop(columns='fold')
data_val = data[data.fold == val_fold].drop(columns='fold')

# data_train.to_csv(train_dir/'tcc_data_ruddit.csv', index=False)
# data_val.to_csv(val_dir/'tcc_data_ruddit.csv', index=False)