### Preparation of pairs for benchmarking

The validation dataset from the competition is unstable, so a custom benchmark dataset needs to be prepared.

For that purpose we will take approx. 10-20% of the created validation data and build pairs of less toxic and more toxic comments. These pairs will be evaluated separately from the competition data.
    
`!` The Unintended Bias dataset works pretty well, so we will work with it.

In [1]:
from pathlib import Path
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from transformers import AutoTokenizer

In [2]:
# Root folder with the competition datasets. It can be overridden through the
# JIGSAW_DATA_DIR environment variable so the notebook is not tied to a single
# machine; the original author's location is kept as the default.
main_dir = Path(os.environ.get(
    'JIGSAW_DATA_DIR',
    '/Users/avgalichin/Documents/kaggle/Jigsaw2022/Datasets',
))
# Sub-folder that holds the validation data used below.
data_dir = main_dir / 'val'

#### Loading data

In [3]:
# Read the CSV from the validation folder, report its (rows, columns) shape
# and preview the first records.
data = pd.read_csv(data_dir.joinpath('ub_data.csv'))
print(data.shape)
data.head()

(53454, 4)


Unnamed: 0,id,label,input_ids,attention_mask
0,239583,0.6,"[0, 243, 18, 10861, 14, 209, 1669, 32, 145, 37...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,239592,0.5,"[0, 25441, 1506, 25668, 6, 34629, 1952, 8, 148...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,239655,0.361111,"[0, 42735, 55, 385, 9683, 366, 6, 27023, 4, 2,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ..."
3,239671,0.0,"[0, 44468, 154, 1114, 4, 38, 3805, 7, 304, 138...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,239743,0.166667,"[0, 104, 8774, 6, 117, 55, 295, 1113, 295, 111...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


#### Get a stratified split into 10 folds, then take 1 fold as the benchmark

In [4]:
def get_groups(df, n_groups=10, col_name='target'):
    """
    Bucket the rows of `df` by quantiles of `col_name`.

    The quantile levels 0, 1/n_groups, ..., (n_groups - 1)/n_groups are
    evaluated on `df[col_name]`; duplicate quantile values are collapsed, so
    fewer than `n_groups` buckets may remain (the effective number and the
    thresholds are printed).

    The bucket id is written into a 'group' column of `df` itself (the frame
    is modified in place) and the same frame is returned.
    """
    step = 1. / n_groups
    levels = [step * k for k in range(n_groups)]

    # Distinct quantile values in ascending order serve as lower bucket bounds.
    thresholds = df[col_name].quantile(levels).values
    thresholds = sorted(set(thresholds))

    print(f'Effective number of groups: {len(thresholds)}')
    print(thresholds)

    # Walk the thresholds upwards: each row ends up with the id of the highest
    # threshold it reaches, because later assignments overwrite earlier ones.
    group_id = 0
    for lower_bound in thresholds:
        reaches_bound = df[col_name] >= lower_bound
        df.loc[reaches_bound, 'group'] = group_id
        group_id += 1

    return df


def get_folds(df, n_folds=5, shuffle=True, split_by='group', random_state=None):
    """
    Assign every row of `df` to one of `n_folds` stratified folds.

    !: Use it only if `split_by` has a comparably small number of distinct
       values, since each distinct value is treated as a class to stratify on.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    n_folds : int
        Number of folds.
    shuffle : bool
        Whether the splitter shuffles rows before splitting.
    split_by : str
        Column whose values define the strata.
    random_state : int or None
        Seed for the shuffle. Ignored when `shuffle` is False.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with an extra uint8 'fold' column (values 0..n_folds-1).

    Raises
    ------
    ValueError
        If `split_by` contains missing values.
    """
    if df[split_by].isna().any():
        raise ValueError(f"Column '{split_by}' contains missing values; cannot stratify on it")

    # The splitter rejects a seed when shuffling is disabled, so forward it
    # only when it has an effect.
    skf = StratifiedKFold(
        n_splits=n_folds,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )

    # Integer class codes, numbered in order of first appearance.
    encoded_label, _ = pd.factorize(df[split_by])

    # The splitter yields row *positions*, so collect fold ids positionally.
    # Writing them through label-based `.loc` would be wrong for any frame
    # whose index is not 0..n-1.
    fold_ids = np.zeros(len(df), dtype=np.uint8)
    for fold, (_, val_idx) in enumerate(skf.split(X=df, y=encoded_label)):
        fold_ids[val_idx] = fold

    result = df.copy()
    result['fold'] = fold_ids

    return result

In [5]:
# Seed NumPy's global RNG before splitting: a shuffled StratifiedKFold that is
# given no explicit random_state falls back to it, so without a seed the fold
# assignment (and therefore the benchmark subset) changes on every kernel restart.
np.random.seed(42)

# Bucket rows by quantiles of the toxicity label, then stratify 10 folds on those buckets.
data = get_groups(data.copy(), col_name='label')
data = get_folds(data.copy(), n_folds=10)
# Share of each quantile group inside every fold; these should be near-identical across folds.
data.groupby('fold').group.value_counts(normalize=True)

Effective number of groups: 7
[0.0, 0.16666666666666666, 0.2, 0.3, 0.4, 0.5, 0.6973684210526315]


fold  group
0     1.0      0.259259
      5.0      0.158623
      0.0      0.147774
      3.0      0.120090
      2.0      0.107744
      4.0      0.106435
      6.0      0.100075
1     1.0      0.259259
      5.0      0.158436
      0.0      0.147774
      3.0      0.120277
      2.0      0.107744
      4.0      0.106435
      6.0      0.100075
2     1.0      0.259259
      5.0      0.158436
      0.0      0.147774
      3.0      0.120277
      2.0      0.107744
      4.0      0.106435
      6.0      0.100075
3     1.0      0.259259
      5.0      0.158436
      0.0      0.147774
      3.0      0.120277
      2.0      0.107744
      4.0      0.106435
      6.0      0.100075
4     1.0      0.259495
      5.0      0.158466
                 ...   
5     4.0      0.106268
      6.0      0.100094
6     1.0      0.259308
      5.0      0.158466
      0.0      0.147615
      3.0      0.120299
      2.0      0.107951
      4.0      0.106268
      6.0      0.100094
7     1.0      0.259308
    

In [None]:
# Every fold except fold 9 stays in the validation set; copy so later edits do not touch `data`.
validation_data = data.loc[data['fold'].ne(9)].copy()


In [8]:
# Fold 9 is held out as the benchmark set; copy so later edits do not touch `data`.
benchmark_data = data[data.fold == 9].copy()
# The fold id is constant here, so drop it. `columns=` is used because passing
# `axis` positionally (`drop('fold', 1)`) is rejected by pandas >= 2.0.
benchmark_data = benchmark_data.drop(columns='fold')
print(benchmark_data.shape)
benchmark_data.head()

(5345, 5)


Unnamed: 0,id,label,input_ids,attention_mask,group
4,239743,0.166667,"[0, 104, 8774, 6, 117, 55, 295, 1113, 295, 111...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1.0
21,242329,0.166667,"[0, 243, 18, 41, 2679, 41143, 119, 2133, 4, 38...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1.0
25,242561,0.4,"[0, 14783, 6, 14, 27726, 11054, 1819, 473, 176...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4.0
28,242755,0.2,"[0, 7605, 42, 1804, 31140, 6, 52, 218, 75, 216...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2.0
35,243843,0.7,"[0, 1121, 127, 2979, 6, 5, 2448, 8, 384, 4186,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",6.0
