In [1]:
import sys
sys.path.append('/Users/avgalichin/Documents/kaggle/Jigsaw2022')
import os
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer

from src.data_utils import (clean, rank_comments, get_groups,
                            get_folds, encode_df)

#### Loading and cleaning data

In [2]:
# Root directory holding the project's dataset files. The original location stays
# the default; set JIGSAW_DATASETS_DIR to run the notebook on another machine.
main_dir = Path(os.environ.get(
    'JIGSAW_DATASETS_DIR',
    '/Users/avgalichin/Documents/kaggle/Jigsaw2022/Datasets',
))
# Source CSVs (e.g. ruddit_with_text.csv) are read from <main_dir>/Initial.
data_dir = main_dir / 'Initial'

In [3]:
# Keep only the id, comment text and score columns, renamed to the
# 'text' / 'label' names used by the rest of the notebook.
column_map = {'txt': 'text', 'offensiveness_score': 'label'}
data = (
    pd.read_csv(data_dir / 'ruddit_with_text.csv')
      .loc[:, ['comment_id', 'txt', 'offensiveness_score']]
      .rename(columns=column_map)
)
print(data.shape)
data.head()

(5838, 3)


Unnamed: 0,comment_id,text,label
0,cza1q49,> The difference in average earnings between m...,-0.083
1,cza1wdh,"The myth is that the ""gap"" is entirely based o...",-0.022
2,cza23qx,[deleted],0.167
3,cza2bw8,The assertion is that women get paid less for ...,-0.146
4,cza2iji,You said in the OP that's not what they're mea...,-0.083


In [4]:
# Generic cleaning via the project helper, applied to a copy of the frame.
data = clean(data.copy(), col_name='text', min_len=5)

# Ruddit-specific cleanup: strip '>' characters from both ends of every
# comment, then strip the surrounding whitespace.
text_stripped = data['text'].str.strip('>').str.strip()
data.loc[:, 'text'] = text_stripped

print(data.shape)
data.head()

(5710, 3)


  df.loc[:, col_name] = df[col_name].str.replace('https?://\S+|www\.\S+', ' social medium ')
  df.loc[:, col_name] = df[col_name].str.replace('\s+', ' ')


Unnamed: 0,comment_id,text,label
0,cza1q49,The difference in average earnings between men...,-0.083
1,cza1wdh,"The myth is that the ""gap"" is entirely based o...",-0.022
2,cza2bw8,The assertion is that women get paid less for ...,-0.146
3,cza2iji,You said in the OP that's not what they're mea...,-0.083
4,cza2jj3,Men and women are not payed less for the same ...,-0.042


#### Split data into folds

In [5]:
id2fold_path = main_dir / "id2fold.csv"

try:
    # Reuse a previously saved comment_id -> fold mapping so the split stays
    # the same across runs.
    id2fold = pd.read_csv(id2fold_path)
except FileNotFoundError:
    print("No id2fold file found. Creating new split")

    # get_groups is followed by a split on 'label_group' (presumably the
    # column get_groups adds from 'label' -- confirm in src.data_utils).
    data = get_groups(data.copy(), n_groups=15, col_name='label')
    data = get_folds(data.copy(), split_by='label_group', n_folds=10, shuffle=True)

    # Persist the mapping so that later runs take the branch above.
    data[['comment_id', 'fold']].to_csv(id2fold_path, index=False)
else:
    rows_before = len(data)
    # Default (inner) join: comments without an entry in the saved mapping are
    # dropped, and duplicated ids in the mapping would multiply rows.
    data = data.merge(id2fold, on='comment_id')
    if len(data) != rows_before:
        print(f"Warning: row count changed from {rows_before} to {len(data)} "
              f"after merging {id2fold_path.name}")
    # NOTE(review): this branch adds only `fold`; the benchmark cells further
    # down read `label_group`, which is not part of the saved mapping.

In [6]:
# Preview the frame after fold assignment (a `fold` column is now present).
data.head()

Unnamed: 0,comment_id,text,label,fold
0,cza1q49,The difference in average earnings between men...,-0.083,5
1,cza1wdh,"The myth is that the ""gap"" is entirely based o...",-0.022,1
2,cza2bw8,The assertion is that women get paid less for ...,-0.146,2
3,cza2iji,You said in the OP that's not what they're mea...,-0.083,5
4,cza2jj3,Men and women are not payed less for the same ...,-0.042,6


#### Extract input_ids and attention masks for specified tokenizer

In [7]:
# Maximum sequence length handed to encode_df in the next cell.
max_length = 256

# Checkpoint whose tokenizer is used for encoding; change the name if another
# (cased) model is needed.
tokenizer_type = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_type)

In [8]:
# Tokenize the 'text' column. With drop=True the raw text column is removed,
# leaving `input_ids` and `attention_mask` in its place.
data = encode_df(
    data,
    tokenizer,
    col_name='text',
    max_length=max_length,
    drop=True,
)

  df.drop(col_name, 1, inplace=True)


In [9]:
# Preview the encoded frame: `text` is gone, `input_ids` and `attention_mask` were added.
data.head()

Unnamed: 0,comment_id,label,fold,input_ids,attention_mask
0,cza1q49,-0.083,5,"[101, 1109, 3719, 1107, 1903, 18155, 1206, 144...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,cza1wdh,-0.022,1,"[101, 1109, 12849, 1110, 1115, 1103, 107, 7275...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,cza2bw8,-0.146,2,"[101, 1109, 26878, 1110, 1115, 1535, 1243, 300...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,cza2iji,-0.083,5,"[101, 1192, 1163, 1107, 1103, 152, 2101, 1115,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,cza2jj3,-0.042,6,"[101, 3401, 1105, 1535, 1132, 1136, 2653, 1174...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


#### Split into train and val sets and save the data

In [10]:
def split_and_save(df, val_fold, main_dir, save_name):
    """Write the train/validation split of ``df`` as CSV files under ``main_dir``.

    Rows whose ``fold`` equals ``val_fold`` are written to
    ``<main_dir>/val/<save_name>``; all remaining rows are written to
    ``<main_dir>/train/<save_name>``. The ``fold`` column is kept in both
    files and the index is not written.

    Args:
        df: DataFrame with a ``fold`` column.
        val_fold: fold value held out for validation.
        main_dir: ``pathlib.Path`` of the root output directory.
        save_name: file name used for both CSV files.
    """
    train_dir = main_dir / 'train'
    val_dir = main_dir / 'val'

    # Create each directory independently. Previously `val` was only created
    # when `train` was missing, so a pre-existing `train` without `val` failed
    # on write, and a missing `train` next to an existing `val` raised
    # FileExistsError.
    for out_dir in (train_dir, val_dir):
        os.makedirs(out_dir, exist_ok=True)

    df_train = df[df.fold != val_fold]
    df_val = df[df.fold == val_fold]

    df_train.to_csv(train_dir / save_name, index=False)
    df_val.to_csv(val_dir / save_name, index=False)

In [11]:
# Fold held out as the validation set; every other fold is written to train.
val_fold = 9

split_and_save(
    data.copy(),
    val_fold,
    main_dir,
    save_name='bert_ruddit_data.csv',
)

#### The cells above create the dataset for training

#### Here we create a subset of the dataset and build comment pairs for benchmarking

Logic:

    Split the data into N groups based on the comments' toxicity scores.
    For each comment in group i, randomly choose (N - 1) comments from the other groups (without repeats where possible). These are our pairs.

In [None]:
# Reuses the split on 'label' made during preprocessing -> N = 10.
# NOTE(review): N is not read by the pair-building cell below -- confirm whether
# it is still needed.
N = 10

# `full_data` is never defined in this notebook (NameError on a fresh kernel);
# the encoded frame that carries the `fold` column is `data`.
# NOTE(review): the next cell also reads `label_group`, which `data` only has
# when the folds were created in this run rather than loaded from id2fold.csv.
benchmark_data = data[data.fold == val_fold].copy()
print(benchmark_data.shape)
benchmark_data.head()

In [None]:
# Columns carried over from benchmark_data for each side of a pair.
init_columns = ['label', 'input_ids', 'attention_mask']

columns = ['label_1', 'input_ids_1', 'attention_mask_1',
           'label_2', 'input_ids_2', 'attention_mask_2']

# Column renames for the first and second element of a pair.
first_names = {
    'label': 'label_1',
    'input_ids': 'input_ids_1',
    'attention_mask': 'attention_mask_1',
}
second_names = {
    'label': 'label_2',
    'input_ids': 'input_ids_2',
    'attention_mask': 'attention_mask_2',
}

groups = set(benchmark_data.label_group.unique())

# Select the rows of every group once instead of re-filtering inside the loops.
group_pools = {
    group: benchmark_data.loc[benchmark_data.label_group == group, init_columns]
    for group in groups
}

# Collect the per-(group, sampled group) frames and concatenate once at the
# end; concatenating inside the loop copies all previous rows every iteration.
pair_frames = []

for group in tqdm(groups):

    group_elems = group_pools[group].reset_index(drop=True).rename(columns=first_names)
    group_size = group_elems.shape[0]

    # NOTE(review): the notebook text says partners come from *other* groups,
    # but the exclusion of the current group is commented out, so every group
    # (including `group` itself) is sampled. Left unchanged -- confirm intent.
    sample_groups = groups# - set([group])

    for sgroup in sample_groups:
        pool = group_pools[sgroup]
        # Sample without repeats when the pool is large enough; otherwise fall
        # back to sampling with replacement. This replaces a bare `except:`
        # that retried with replace=True after any error whatsoever.
        # NOTE(review): no random_state is set, so pairs differ between runs.
        cur_sample = (
            pool.sample(n=group_size, replace=len(pool) < group_size)
                .rename(columns=second_names)
                .reset_index(drop=True)
        )

        # Both frames have a 0..group_size-1 index, so axis=1 pairs them row by row.
        pair_frames.append(pd.concat([group_elems, cur_sample], axis=1))

if pair_frames:
    # ignore_index gives the result a unique 0..n-1 index instead of a
    # repeated per-group index.
    pairs_data = pd.concat(pair_frames, axis=0, ignore_index=True)
else:
    pairs_data = pd.DataFrame(columns=columns)

In [None]:
# Sanity check: number of generated pairs and a preview of the first rows.
print(pairs_data.shape)
pairs_data.head()

In [None]:
# Tokenizer used by the next cell to decode `input_ids_*` back into text.
# NOTE(review): in this notebook the ids are produced with the 'bert-base-cased'
# tokenizer; decoding them with the RoBERTa vocabulary only makes sense if the
# encoding cells were run with tokenizer_type='roberta-base' -- confirm.
roberta_tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [None]:
def _decode_roberta_text(token_ids):
    """Decode ids with roberta_tokenizer, drop the first 3 characters and cut at the first '</s>'."""
    decoded = roberta_tokenizer.decode(token_ids)
    without_prefix = decoded[3:]
    return without_prefix.split('</s>')[0]


# Work on a copy so that pairs_data keeps its token-level columns.
pairs_data_distil = pairs_data.copy()
pairs_data_distil.loc[:, 'text_1'] = pairs_data_distil.input_ids_1.apply(_decode_roberta_text)
pairs_data_distil.loc[:, 'text_2'] = pairs_data_distil.input_ids_2.apply(_decode_roberta_text)

# The token-level columns are replaced by the decoded text.
pairs_data_distil = pairs_data_distil.drop(
    ['input_ids_1', 'input_ids_2', 'attention_mask_1', 'attention_mask_2'],
    axis=1,
)

In [None]:
# Preview the pairs with decoded text in place of the token columns.
pairs_data_distil.head()

In [None]:
# Re-encode each side of the pair with `tokenizer`. encode_df's output columns
# `input_ids` / `attention_mask` are renamed to the per-side names after each
# pass so that the second pass does not collide with the first.
pair_sides = (
    ('text_1', 'input_ids_1', 'attention_mask_1'),
    ('text_2', 'input_ids_2', 'attention_mask_2'),
)
for text_col, ids_col, mask_col in pair_sides:
    pairs_data_distil = encode_df(
        pairs_data_distil.copy(),
        tokenizer,
        col_name=text_col,
        max_length=max_length,
        drop=True,
    )
    pairs_data_distil = pairs_data_distil.rename(
        columns={'input_ids': ids_col, 'attention_mask': mask_col}
    )
pairs_data_distil.head()

In [None]:
# save benchmark data
save_dir = main_dir / 'benchmark'
# No visible cell creates this directory, and to_csv does not create missing
# parent folders, so make sure it exists before writing.
save_dir.mkdir(parents=True, exist_ok=True)

pairs_data_distil.to_csv(save_dir / 'distil_ruddit_benchmark_data.csv', index=False)

In [2]:
from transformers import AutoTokenizer
import numpy as np
import pandas as pd

In [20]:
# Load the tokenizer used for the encode_plus experiments below.
tokenizer_type = 'bert-base-cased'  # checkpoint name passed to from_pretrained
tokenizer = AutoTokenizer.from_pretrained(tokenizer_type)

In [15]:
# Two toy sentences for inspecting how a sentence pair is encoded.
s1 = "This is first sentence"
s2 = "This is second sentence"

# Encode the pair as one sequence, padded/truncated to 256 tokens.
encoded_s = tokenizer.encode_plus(
    s1,
    s2,
    add_special_tokens=True,
    truncation=True,
    padding='max_length',
    max_length=256,
)

In [26]:
# Single-sentence encoding of s1, padded/truncated to 15 tokens.
p = tokenizer.encode_plus(
    s1,
    add_special_tokens=True,
    truncation=True,
    padding='max_length',
    max_length=15,
)

In [28]:
# Display the full encoding of the (s1, s2) pair.
encoded_s

{'input_ids': [101, 1188, 1110, 1148, 5650, 102, 1188, 1110, 1248, 5650, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [20]:
# The pair's input_ids viewed as a NumPy array.
np.array(encoded_s['input_ids'])

array([ 101, 1188, 1110, 1148, 5650,  102, 1188, 1110, 1248, 5650,  102,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [1]:
import sys
sys.path.append('/Users/avgalichin/Documents/kaggle/Jigsaw2022')

from src.dataset import JigsawDatasetPaired

In [2]:
import tensorflow as tf
import pandas as pd

In [3]:
# Validation CSV (same file name and `val` folder that split_and_save writes to
# in the preprocessing cells). The list-valued columns come back from CSV as
# strings; a later cell parses `input_ids`.
path = "/Users/avgalichin/Documents/kaggle/Jigsaw2022/Datasets/val/bert_ruddit_data.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,comment_id,label,fold,input_ids,attention_mask
0,cza5maz,-0.188,9,"[101, 1135, 112, 188, 1103, 3719, 1206, 1103, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,cza6wrd,-0.104,9,"[101, 2119, 117, 1191, 1240, 1676, 8204, 1107,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,czagngx,0.292,9,"[101, 1573, 1184, 1202, 1128, 1294, 1104, 1115...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,czjep9r,-0.146,9,"[101, 8147, 117, 178, 2834, 1341, 1104, 1625, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,czjfc9s,0.25,9,"[101, 2857, 1128, 1294, 1126, 170, 3101, 3161,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [4]:
# Build the paired dataset from the validation CSV using positional arguments.
# NOTE(review): the meaning of each position is defined by JigsawDatasetPaired's
# signature in src.dataset, which is not visible here; the hints below are
# guesses based on the keyword call later in the notebook -- confirm.
ds = JigsawDatasetPaired(["/Users/avgalichin/Documents/kaggle/Jigsaw2022/Datasets/val/bert_ruddit_data.csv"],
                    ['label'],
                    512,
                    0,    # presumably pad_token (keyword call uses pad_token=0) -- TODO confirm
                    102,  # presumably split_token (keyword call uses split_token=102) -- TODO confirm
                    32,
                    None,
                    False,
                    None)

In [6]:
# Pull a single element from ds.dataset; `ex` keeps it after the loop for inspection.
for ex in ds.dataset.take(1):
    pass

In [5]:
def _parse_id_list(cell):
    """Turn a stringified list such as "[101, 1135, 112]" into a list of ints.

    Values that are not strings are returned unchanged, so re-running this
    cell on an already parsed column is a no-op instead of an error. Splitting
    on ',' (int() ignores surrounding spaces) also accepts "[1,2]" and "[]".
    """
    if not isinstance(cell, str):
        return cell
    return [int(tok) for tok in cell.strip('[]').split(',') if tok.strip()]


df.loc[:, 'input_ids'] = df.input_ids.apply(_parse_id_list)

In [25]:
# Same dataset class built with keyword arguments, this time with shuffling
# enabled and a shuffle buffer of 5000.
# NOTE(review): `data` here is a JigsawDatasetPaired, not a DataFrame.
data = JigsawDatasetPaired(
    data_path=[path],
    label_columns=["label"],
    pad_token=0,
    split_token=102,
    shuffle=True,
    shuffle_buffer_size=5000)

In [26]:
# Fetch only the first element of data.dataset; `ex` holds it after the loop exits.
for ex in data.dataset:
    break