In [32]:
import sys
sys.path.append('/Users/avgalichin/Documents/kaggle/Jigsaw2022')
import os
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer

from src.data_utils import (clean, rank_comments, get_groups,
                            get_folds, encode_df)

#### Downloading and cleaning data

In [2]:
# Root folder that holds every dataset variant produced by this notebook.
# NOTE: this is an absolute, machine-specific path; adjust it before running elsewhere.
main_dir = Path('/Users/avgalichin/Documents/kaggle/Jigsaw2022/Datasets')
# Raw (unprocessed) input files live in the `Initial` sub-folder.
data_dir = main_dir.joinpath('Initial')

In [3]:
# Keep only the id, the comment text and the score, under shorter column names.
kept_columns = ['comment_id', 'txt', 'offensiveness_score']
short_names = {'txt': 'text', 'offensiveness_score': 'label'}

data = (
    pd.read_csv(data_dir/'ruddit_with_text.csv')
      .loc[:, kept_columns]
      .rename(columns=short_names)
)
print(data.shape)
data.head()

(5838, 3)


Unnamed: 0,comment_id,text,label
0,cza1q49,> The difference in average earnings between m...,-0.083
1,cza1wdh,"The myth is that the ""gap"" is entirely based o...",-0.022
2,cza23qx,[deleted],0.167
3,cza2bw8,The assertion is that women get paid less for ...,-0.146
4,cza2iji,You said in the OP that's not what they're mea...,-0.083


In [4]:
# Generic cleaning from the project helper (operates on the `text` column).
data = clean(data.copy(), col_name='text', min_len=15)

# Ruddit-specific clean-up: remove '>' characters from both ends of each comment,
# then trim the surrounding whitespace that is left over.
trimmed_text = data['text'].str.strip('>').str.strip()
data.loc[:, 'text'] = trimmed_text

print(data.shape)
data.head()

(5707, 3)


Unnamed: 0,comment_id,text,label
0,cza1q49,The difference in average earnings between men...,-0.083
1,cza1wdh,"The myth is that the ""gap"" is entirely based o...",-0.022
2,cza2bw8,The assertion is that women get paid less for ...,-0.146
3,cza2iji,You said in the OP that's not what they're mea...,-0.083
4,cza2jj3,Men and women are not payed less for the same ...,-0.042


#### Split data into folds

In [5]:
# Step 1: bucket the comments by `label` (adds the `label_group` column shown below).
data_grouped = get_groups(data.copy(), n_groups=15, col_name='label')
# Step 2: assign each row to one of 10 folds, split by `label_group` (adds `fold`).
data = get_folds(data_grouped.copy(), split_by='label_group', n_folds=10, shuffle=True)
data.head()

Effective number of groups: 15
Quantiles: [-0.889, -0.489, -0.375, -0.312, -0.25, -0.188, -0.146, -0.083, -0.04200000000000001, 0.021, 0.083, 0.146, 0.22899999999999998, 0.354, 0.562]


Unnamed: 0,comment_id,text,label,label_group,fold
0,cza1q49,The difference in average earnings between men...,-0.083,7.0,5
1,cza1wdh,"The myth is that the ""gap"" is entirely based o...",-0.022,8.0,5
2,cza2bw8,The assertion is that women get paid less for ...,-0.146,6.0,8
3,cza2iji,You said in the OP that's not what they're mea...,-0.083,7.0,1
4,cza2jj3,Men and women are not payed less for the same ...,-0.042,8.0,4


#### Get end text 

In [6]:
# Number of trailing *characters* kept per comment.
# NOTE: the same constant is passed later as `max_length` when `end_text` is encoded.
end_length = 64

tail_texts = [comment[-end_length:] for comment in data['text'].values]
data.loc[:, 'end_text'] = tail_texts
data.head()

Unnamed: 0,comment_id,text,label,label_group,fold,end_text
0,cza1q49,The difference in average earnings between men...,-0.083,7.0,5,"tors. So it isn't a myth, you just feel that y..."
1,cza1wdh,"The myth is that the ""gap"" is entirely based o...",-0.022,8.0,5,"th is that the ""gap"" is entirely based on the ..."
2,cza2bw8,The assertion is that women get paid less for ...,-0.146,6.0,8,"ame* jobs, and that they get paid less *becaus..."
3,cza2iji,You said in the OP that's not what they're mea...,-0.083,7.0,1,"rs, women make less. Isn't that what JLaw was ..."
4,cza2jj3,Men and women are not payed less for the same ...,-0.042,8.0,4,ecause averages show the bigger picture. Edit:...


#### Extract input_ids and attention masks for specified tokenizer

In [79]:
# Checkpoint whose tokenizer is bound to `tokenizer`; change if needed.
# NOTE(review): the recorded outputs of the `full_data`/`end_data` cell below start
# with id 0 and pad with 1, while the re-encoded benchmark pairs start with id 101,
# so this cell was presumably re-run with a different checkpoint in between
# (its execution count is 79, the encoding cell's is 14) — confirm before Run All.
tokenizer_type = 'distilbert-base-uncased' # change if needed

tokenizer = AutoTokenizer.from_pretrained(tokenizer_type)

# Maximum sequence length passed to `encode_df` for the full texts.
max_length = 128

In [14]:
# `full_data` / `end_data` are written to `roberta_*.csv` files and their
# `input_ids` are decoded with a RoBERTa tokenizer further down, so they must be
# encoded with RoBERTa. Using the configurable `tokenizer` here made the result
# depend on cell execution order (on a fresh Run All it is DistilBERT).
roberta_tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# `drop(columns=...)` replaces the positional `axis` argument, which pandas 2.0
# no longer accepts.
full_data = encode_df(data.drop(columns='end_text'), roberta_tokenizer,
                      col_name='text', max_length=max_length,
                      drop=True)
# NOTE: `end_length` was a character count when `end_text` was sliced; here it is
# reused as the maximum sequence length handed to `encode_df`.
end_data = encode_df(data.drop(columns='text'), roberta_tokenizer,
                     col_name='end_text', max_length=end_length,
                     drop=True)

In [15]:
# Preview the encoded full-text frame (text column replaced by input_ids / attention_mask).
full_data.head()

Unnamed: 0,comment_id,label,label_group,fold,input_ids,attention_mask
0,cza1q49,-0.083,7.0,5,"[0, 133, 2249, 11, 674, 1107, 227, 604, 8, 390...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,cza1wdh,-0.022,8.0,5,"[0, 133, 17721, 16, 14, 5, 22, 29183, 113, 16,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,cza2bw8,-0.146,6.0,8,"[0, 133, 19395, 16, 14, 390, 120, 1199, 540, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,cza2iji,-0.083,7.0,1,"[0, 1185, 26, 11, 5, 24839, 14, 18, 45, 99, 51...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,cza2jj3,-0.042,8.0,4,"[0, 17762, 8, 390, 32, 45, 582, 196, 540, 13, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [16]:
# Preview the encoded end-of-text frame (same layout as `full_data`).
end_data.head()

Unnamed: 0,comment_id,label,label_group,fold,input_ids,attention_mask
0,cza1q49,-0.083,7.0,5,"[0, 90, 994, 4, 407, 24, 965, 75, 10, 17721, 6...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,cza1wdh,-0.022,8.0,5,"[0, 212, 16, 14, 5, 22, 29183, 113, 16, 4378, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,cza2bw8,-0.146,6.0,8,"[0, 4344, 3226, 1315, 6, 8, 14, 51, 120, 1199,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,cza2iji,-0.083,7.0,1,"[0, 4926, 6, 390, 146, 540, 4, 26421, 75, 14, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,cza2jj3,-0.042,8.0,4,"[0, 49414, 15613, 311, 5, 2671, 2170, 4, 39391...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


#### Split into train and val and save data

In [17]:
def split_and_save(df, val_fold, main_dir, save_name):
    """Split `df` by fold and write both parts to CSV.

    Rows whose `fold` equals `val_fold` are written to
    `<main_dir>/val/<save_name>`; all remaining rows are written to
    `<main_dir>/train/<save_name>`. The `fold` column is kept in both files
    and the index is not written.

    Args:
        df: DataFrame containing a `fold` column.
        val_fold: fold value that forms the validation part.
        main_dir: root output directory (`pathlib.Path` or path string).
        save_name: file name used inside both sub-directories.
    """
    main_dir = Path(main_dir)
    train_dir = main_dir/'train'
    val_dir = main_dir/'val'

    # Create each directory on its own. Previously `val` was only created when
    # `train` was missing, so a partially prepared folder broke the call.
    for out_dir in (train_dir, val_dir):
        os.makedirs(out_dir, exist_ok=True)

    df_train = df[df.fold != val_fold]
    df_val = df[df.fold == val_fold]

    df_train.to_csv(train_dir/save_name, index=False)
    df_val.to_csv(val_dir/save_name, index=False)

In [18]:
# Write the train/val split of both encoded frames; fold 9 is held out for validation.

val_fold = 9

frames_to_save = (
    (full_data, 'roberta_ruddit_data.csv'),
    (end_data, 'roberta_ruddit_end_data.csv'),
)
for encoded_frame, file_name in frames_to_save:
    split_and_save(encoded_frame.copy(), val_fold, main_dir, save_name=file_name)

#### The cells above create the dataset for training

#### Here we will create a subset of datasets and make pairs for benchmarking

Logic:

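    Split the data into N groups based on the toxicity scores.
    For each comment in group i, randomly choose (N - 1) comments from the other groups, one per group (without repeats where possible). These are our pairs. Note: the code below currently samples from all N groups, including the comment's own group.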

In [41]:
# Reuses the split on 'label' from the earlier preprocessing.
# NOTE(review): `N` is not referenced by the pair-building cell below, which
# iterates over the unique `label_group` values instead. Those groups come from
# `get_groups(..., n_groups=15)`, whereas 10 is the `n_folds` value given to
# `get_folds` — confirm what N is meant to be.
N = 10

# The benchmark is built from the validation fold only.
benchmark_data = full_data[full_data.fold == val_fold].copy()
print(benchmark_data.shape)
benchmark_data.head()

(570, 6)


Unnamed: 0,comment_id,label,label_group,fold,input_ids,attention_mask
5,cza31e2,-0.021,8.0,9,"[0, 3684, 5, 5007, 4044, 16, 6, 16, 41, 674, 9...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
11,cza47xu,-0.062,7.0,9,"[0, 100, 218, 75, 206, 51, 214, 143, 430, 4, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
14,cza4ldq,-0.083,7.0,9,"[0, 387, 35948, 5550, 1395, 3922, 5, 3926, 52,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
50,czjg9g7,-0.261,3.0,9,"[0, 46069, 16, 341, 11, 5, 18805, 9, 10, 37201...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
51,czjgxgu,-0.146,6.0,9,"[0, 100, 437, 45, 5, 621, 47, 214, 6827, 7, 4,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [42]:
# Columns taken from `benchmark_data` for each side of a pair.
init_columns = ['label', 'input_ids', 'attention_mask']

# Layout of the resulting pairs frame: left comment (`_1`), sampled partner (`_2`).
columns = ['label_1', 'input_ids_1', 'attention_mask_1',
           'label_2', 'input_ids_2', 'attention_mask_2']

groups = set(benchmark_data.label_group.unique())

# Slice every group out of `benchmark_data` once instead of re-filtering the
# whole frame for each (group, sgroup) combination.
group_frames = {
    group: benchmark_data.loc[benchmark_data.label_group == group, init_columns]
    for group in groups
}

# Chunks are collected in a list and concatenated once at the end; growing a
# DataFrame with `pd.concat` inside the loop is quadratic and, when seeded with
# an empty frame, can degrade the column dtypes to `object`.
pair_chunks = []

for group in tqdm(groups):

    # Left-hand side of the pairs: every comment of the current group.
    group_elems = group_frames[group].reset_index(drop=True)
    group_size = group_elems.shape[0]

    group_elems = group_elems.rename(
        columns={
            'label': 'label_1',
            'input_ids': 'input_ids_1',
            'attention_mask': 'attention_mask_1'
        })

    # The comment's own group is kept as a sampling source, so every comment
    # gets one partner per group. Use `groups - {group}` to exclude it.
    sample_groups = groups

    for sgroup in sample_groups:
        candidates = group_frames[sgroup]

        # Sample without repeats whenever the source group is large enough and
        # fall back to sampling with replacement only when it is too small.
        # This replaces a bare `except:` that retried on *any* error.
        needs_replacement = group_size > candidates.shape[0]
        cur_sample = candidates.sample(n=group_size, replace=needs_replacement)

        cur_sample = cur_sample.rename(
            columns={
                'label': 'label_2',
                'input_ids': 'input_ids_2',
                'attention_mask': 'attention_mask_2'
            }).reset_index(drop=True)

        # Both sides carry a fresh 0..group_size-1 index, so they align row by row.
        cur_pairs_data = pd.concat([group_elems, cur_sample], axis=1)
        pair_chunks.append(cur_pairs_data)

# The per-chunk index (0..group_size-1, repeated) is kept as-is.
if pair_chunks:
    pairs_data = pd.concat(pair_chunks, axis=0)
else:
    pairs_data = pd.DataFrame(columns=columns)

100%|██████████| 15/15 [00:01<00:00, 11.23it/s]


In [74]:
# Sanity check: one row per (comment, source group) combination; in the recorded
# run that is 570 validation comments x 15 groups = 8550 rows.
print(pairs_data.shape)
pairs_data.head()

(8550, 6)


Unnamed: 0,label_1,input_ids_1,attention_mask_1,label_2,input_ids_2,attention_mask_2
0,-0.574,"[0, 12582, 8936, 3529, 106, 6, 89, 13, 34558, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...",-0.562,"[0, 45435, 6, 26730, 6828, 33, 1102, 4, 2, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ..."
1,-0.625,"[0, 2264, 247, 32, 209, 1110, 341, 116, 2, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...",-0.812,"[0, 100, 581, 109, 24, 13, 481, 328, 2, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ..."
2,-0.521,"[0, 100, 74, 129, 5848, 14, 21285, 6353, 817, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",-0.574,"[0, 7199, 686, 114, 24, 18, 1687, 7735, 6, 53,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,-0.604,"[0, 100, 460, 1137, 82, 45, 7, 21587, 7, 907, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",-0.688,"[0, 37167, 7, 2067, 1577, 346, 9, 360, 454, 10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,-0.5,"[0, 42271, 328, 19576, 1457, 14, 203, 4, 38, 9...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",-0.562,"[0, 9904, 6, 24, 18, 65, 5, 1609, 2746, 1315, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [75]:
# Tokenizer used below to turn the stored `input_ids` back into text.
# NOTE(review): assumes the ids in `pairs_data` were produced with the RoBERTa
# vocabulary — confirm against the tokenizer used when `full_data` was encoded.
roberta_tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [83]:
def _roberta_ids_to_text(ids):
    """Decode `ids`, drop the first three characters (presumably the `<s>` marker)
    and keep only what precedes the first `</s>`."""
    decoded = roberta_tokenizer.decode(ids)
    body = decoded[3:]
    return body.split('</s>')[0]


# Rebuild plain text for both sides of every pair, then discard the
# tokenizer-specific columns so the pairs can be re-encoded.
pairs_data_distil = pairs_data.copy()
for side in ('1', '2'):
    decoded_texts = pairs_data_distil[f'input_ids_{side}'].apply(_roberta_ids_to_text)
    pairs_data_distil.loc[:, f'text_{side}'] = decoded_texts

pairs_data_distil = pairs_data_distil.drop(
    columns=['input_ids_1', 'input_ids_2', 'attention_mask_1', 'attention_mask_2'])

In [84]:
# Check the decoded texts next to their labels.
pairs_data_distil.head()

Unnamed: 0,label_1,label_2,text_1,text_2
0,-0.574,-0.562,"Spiders eat them, there for spiders win.","Eh, Stranger Things have happened."
1,-0.625,-0.812,What country are these terms used?,I'll do it for free!
2,-0.521,-0.574,I would only argue that whip cream makes for a...,"Not sure if it's considered weird, but *Popula..."
3,-0.604,-0.688,I always tell people not to hesitate to buy a ...,having to wait X number of days until a new ep...
4,-0.5,-0.562,Same! Except double that much. I just bought t...,"Yes, it's one the highest paying jobs per hour..."


In [85]:
# Encode each side of the pair with `tokenizer`. The generic `input_ids` /
# `attention_mask` columns are given the side's suffix straight away so the
# second pass does not collide with the first.
for side in ('1', '2'):
    pairs_data_distil = encode_df(pairs_data_distil.copy(), tokenizer,
                                  col_name=f'text_{side}', max_length=max_length,
                                  drop=True)
    pairs_data_distil = pairs_data_distil.rename(columns={
        'input_ids': f'input_ids_{side}',
        'attention_mask': f'attention_mask_{side}',
    })
pairs_data_distil.head()

Unnamed: 0,label_1,label_2,input_ids_1,attention_mask_1,input_ids_2,attention_mask_2
0,-0.574,-0.562,"[101, 14160, 4521, 2068, 1010, 2045, 2005, 141...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...","[101, 15501, 1010, 7985, 2477, 2031, 3047, 101...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ..."
1,-0.625,-0.812,"[101, 2054, 2406, 2024, 2122, 3408, 2109, 1029...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[101, 1045, 1005, 2222, 2079, 2009, 2005, 2489...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ..."
2,-0.521,-0.574,"[101, 1045, 2052, 2069, 7475, 2008, 11473, 694...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2025, 2469, 2065, 2009, 1005, 1055, 2641...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,-0.604,-0.688,"[101, 1045, 2467, 2425, 2111, 2025, 2000, 1639...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2383, 2000, 3524, 1060, 2193, 1997, 2420...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,-0.5,-0.562,"[101, 2168, 999, 3272, 3313, 2008, 2172, 1012,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2748, 1010, 2009, 1005, 1055, 2028, 1996...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [87]:
# save benchmark data
save_dir = main_dir/'benchmark'
# `to_csv` does not create missing folders, so make sure the target exists
# (a no-op when it is already there).
save_dir.mkdir(parents=True, exist_ok=True)

pairs_data_distil.to_csv(save_dir/'distil_ruddit_benchmark_data.csv', index=False)