In [1]:
from pathlib import Path
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from transformers import AutoTokenizer

In [2]:
# Root folder of all competition data. Set the JIGSAW_DATA_DIR environment variable to run the
# notebook on another machine; the fallback is the original hard-coded location.
main_dir = Path(os.environ.get('JIGSAW_DATA_DIR', '/Users/avgalichin/Documents/kaggle/Jigsaw2022/Datasets'))
# The raw csv files are read from the 'Initial' sub-folder.
data_dir = main_dir/'Initial'

### Preparation of ToxicCommentClassification dataset

#### Downloading and cleaning the data

In [3]:
# read the raw ToxicCommentClassification csv and use the shared 'text' column name
tcc_data = (
    pd.read_csv(data_dir/'Data_ToxicCommentClassification.csv')
    .rename(columns={'comment_text': 'text'})
)
print(tcc_data.shape)
tcc_data.head()

(159571, 8)


Unnamed: 0,id,text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
def clean_fn(df, col_name='text'):
    """
    Normalise the text column and drop rows that are too short or duplicated.

    Steps, in order:
      1. replace URLs (http://..., https://..., www....) with ' social medium '
      2. collapse every whitespace run into one space and strip both ends
      3. keep only texts longer than 15 characters
      4. drop every text that occurs more than once (no copy is kept)

    df: DataFrame holding a string column `col_name`
    col_name: name of the text column

    Returns a new DataFrame with a fresh 0..n-1 index; `df` itself is not modified.
    """
    # TODO: Expand (maybe not)

    # regex=True is explicit: pandas >= 2.0 would otherwise treat the patterns as literal text
    text = (
        df[col_name]
        .str.replace(r'https?://\S+|www\.\S+', ' social medium ', regex=True)
        .str.replace(r'\s+', ' ', regex=True)  # remove more than 1 white space
        .str.strip()
    )
    df = df.assign(**{col_name: text})

    df = df[df[col_name].str.len() > 15]
    # also drop duplicated texts (note: maybe better to mean their labels and take it as score)
    df = df.drop_duplicates(subset=col_name, keep=False)

    return df.reset_index(drop=True)

# normalise the comment text and drop short / duplicated comments (works on a copy)
tcc_data = tcc_data.copy().pipe(clean_fn)
print(tcc_data.shape)
tcc_data.head()

(159140, 8)


Unnamed: 0,id,text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation Why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,""" More I can't make any real suggestions on im...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


#### Get toxicity rankings for each comment 

In [13]:
def _rank_comments_v1(df):
    """Taken from: https://www.kaggle.com/andrej0marinchenko/jigsaw-ensemble-0-86#Create-3-versions-of-clean-data"""
    
    df.loc[:, 'severe_toxic'] = df.severe_toxic * 2
    df.loc[:, 'label'] = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1)
    df['label'] = df.label.astype(np.int32)
    df.loc[:, 'label'] = df.label / df.label.max()
    
    return df


def _rank_comments_v2(df, weights=[0.3, 0.5, 0.05, 0.05, 0.05, 0.05]):
    """Taken from: https://www.kaggle.com/c/jigsaw-toxic-severity-rating/discussion/288757"""
    
    df.loc[:, 'label'] = (weights * df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]).sum(axis=1)
    
    return df


def get_ranks(df, rtype=1, **kwargs):
    """
    Get comment ranks based on one of the rank methods

    1: Ranks comments in range [0; 1] based on overall dataset toxicity rates
    2: Ranks comments in range [0; 1] independently
    3: ?

    df: DataFrame handed to the selected ranking function
    rtype: rank method id (see above)
    kwargs: 'weights' (optional) is forwarded to method 2; other keys are ignored

    Returns whatever the selected ranking function returns.
    Raises NotImplementedError for any rtype other than 1 or 2.
    """

    if rtype == 1:
        return _rank_comments_v1(df)

    if rtype == 2:
        # weights are optional: without them the ranking function's own default applies
        if 'weights' in kwargs:
            return _rank_comments_v2(df, kwargs['weights'])
        return _rank_comments_v2(df)

    raise NotImplementedError(f'Rank type {rtype!r} is not implemented')

In [14]:
# score every comment with the chosen rank method, then keep only rows with a positive score

rank_type = 2

weights = [0.1, 0.5, 0.2, 0.04, 0.12, 0.04]

tcc_data = (
    get_ranks(tcc_data.copy(), rtype=rank_type, weights=weights)
    [['id', 'text', 'label']]
    .loc[lambda frame: frame.label > 0]
    .reset_index(drop=True)
)
print(tcc_data.shape)
tcc_data.head()

(16151, 3)


Unnamed: 0,id,text,label
0,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,0.92
1,0005c987bdfc9d4b,Hey... what is it.. @ | talk . What is it... a...,0.1
2,0007e25b2121310b,"Bye! Don't look, come or think of comming back...",0.1
3,001810bf8c45bf5f,You are gay or antisemmitian? Archangel WHite ...,0.46
4,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",0.42


In [15]:
# TODO: Highly imbalanced... Think about it
# share of comments per distinct score value

tcc_data['label'].value_counts(normalize=True)

0.10    0.349700
0.42    0.234103
0.30    0.108352
0.22    0.074980
0.92    0.060863
0.46    0.045694
0.96    0.020370
0.20    0.019565
0.12    0.018575
0.14    0.015355
0.32    0.011207
0.80    0.009783
0.26    0.009225
0.04    0.004644
0.50    0.003467
0.34    0.002848
0.60    0.002477
0.16    0.001919
1.00    0.001919
0.36    0.001238
0.72    0.000867
0.64    0.000743
0.84    0.000619
0.76    0.000495
0.18    0.000433
0.24    0.000310
0.30    0.000186
0.68    0.000062
Name: label, dtype: float64

#### Get cross-validation folds

In [16]:
def get_folds(df, n_folds=5, shuffle=True, random_state=None):
    """
    Get stratified folds

    !: Use it only if 'labels' have a comparably small number of values

    df: DataFrame with a 'label' column; a uint8 'fold' column is added to it in place
    n_folds: number of folds
    shuffle: forwarded to StratifiedKFold
    random_state: seed for a reproducible shuffle (only used when shuffle is True)

    Returns the same DataFrame; row r belongs to the validation part of fold df.fold[r].
    """
    # every distinct label value becomes one integer class code used for stratification
    class_codes, _ = pd.factorize(df['label'])

    skf = StratifiedKFold(n_splits=n_folds, shuffle=shuffle,
                          random_state=random_state if shuffle else None)

    # split() yields row positions, so collect the fold ids in a positional array instead of
    # writing through the label-based .loc (which is only correct for a 0..n-1 index)
    fold_of_row = np.zeros(len(df), dtype=np.uint8)
    for fold, (_, val_idx) in enumerate(skf.split(X=df, y=class_codes)):
        fold_of_row[val_idx] = fold

    df['fold'] = fold_of_row

    return df

# assign folds on a copy, then check that every fold has the same score distribution
tcc_data = tcc_data.copy().pipe(get_folds)
tcc_data.groupby('fold')['label'].value_counts(normalize=True)



fold  label
0     0.10     0.349737
      0.42     0.233983
      0.30     0.108326
      0.22     0.074899
      0.92     0.060972
      0.46     0.045497
      0.96     0.020427
      0.20     0.019808
      0.12     0.018570
      0.14     0.015475
      0.32     0.011142
      0.80     0.009595
      0.26     0.009285
      0.04     0.004643
      0.50     0.003405
      0.34     0.002786
      0.60     0.002476
      0.16     0.001857
      1.00     0.001857
      0.36     0.001238
      0.64     0.000929
      0.72     0.000929
      0.84     0.000619
      0.18     0.000310
      0.24     0.000310
      0.30     0.000310
      0.68     0.000310
      0.76     0.000310
1     0.10     0.349536
      0.42     0.234056
                 ...   
3     0.64     0.000619
      0.76     0.000619
      0.84     0.000619
      0.24     0.000310
4     0.10     0.349845
      0.42     0.234365
      0.30     0.108359
      0.22     0.074923
      0.92     0.060681
      0.46     0.045511
    

### Preparation of UnintendedBiasInToxicityClassification dataset

#### Downloading and cleaning the data

In [33]:
# for V1 keep only toxicity rates and toxicity annotator counts. Maybe make use of other features later
keep_list = ['id', 'text', 'target',
             'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit',
             'toxicity_annotator_count']

# read the raw csv, use the shared 'text' name, keep the columns above, and call the
# original 'target' column 'toxicity'
ub_data = (
    pd.read_csv(data_dir/'Data_UnintendedBiasInToxicityClassification.csv')
    .rename(columns={'comment_text': 'text'})
    [keep_list]
    .rename(columns={'target': 'toxicity'})
)

print(ub_data.shape)
ub_data.head()

(1804874, 10)


Unnamed: 0,id,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit,toxicity_annotator_count
0,59848,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
1,59849,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2,59852,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
3,59855,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
4,59856,haha you guys are a bunch of losers.,0.893617,0.021277,0.0,0.0,0.87234,0.021277,0.0,47


In [34]:
def clean_fn_ub(df, col_name='text', min_annot_cnt=5, col_annot='toxicity_annotator_count'):
    """
    Clean text and keep only comments annotated by more than 'min_annot_cnt' annotators.

    df: DataFrame with a text column `col_name` and an annotator-count column `col_annot`
    col_name: text column, forwarded to clean_fn
    min_annot_cnt: rows whose annotator count is this value or lower are removed
    col_annot: name of the annotator-count column

    Returns a new DataFrame with a fresh 0..n-1 index.
    """

    # clean_fn already removes every duplicated text, and filtering rows afterwards cannot
    # create new duplicates, so no second de-duplication is needed here
    df = clean_fn(df.copy(), col_name=col_name)
    df = df[df[col_annot] > min_annot_cnt]

    return df.reset_index(drop=True)

# clean the text and drop comments with too few annotators (works on a copy)
ub_data = ub_data.copy().pipe(clean_fn_ub)
print(ub_data.shape)
ub_data.head()

(526144, 10)


Unnamed: 0,id,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit,toxicity_annotator_count
0,59856,haha you guys are a bunch of losers.,0.893617,0.021277,0.0,0.0,0.87234,0.021277,0.0,47
1,59859,ur a sh*tty comment.,0.666667,0.047619,0.638095,0.0,0.333333,0.0,0.009524,105
2,59861,hahahahahahahahhha suck it.,0.457627,0.050847,0.305085,0.0,0.254237,0.0,0.220339,59
3,239579,This is a great story. Man. I wonder if the pe...,0.44,0.0,0.293333,0.0,0.32,0.0,0.04,75
4,239583,It's ridiculous that these guys are being call...,0.6,0.0,0.1,0.1,0.6,0.0,0.0,10


#### Get target score

    The original target is of little use for our task, so we build a new one.
    
        V1: Build it the same way as v2 for the TCC data (weighted sum of the toxicity columns)
        V2: In progress

In [35]:
def _rank_comments_ub_v1(df):
    """Taken from: https://www.kaggle.com/andrej0marinchenko/jigsaw-ensemble-0-86#Create-3-versions-of-clean-data"""
    
    df.loc[:, 'severe_toxicity'] = df.severe_toxicity * 2
    df.loc[:, 'target'] = df[['severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack']].sum(axis=1)
    df.loc[:, 'target'] = df.target / df.target.max()
    
    return df


def _rank_comments_ub_v2(df, weights=[0.5, 0.1, 0.1, 0.1, 0.1]):
    """Taken from: https://www.kaggle.com/c/jigsaw-toxic-severity-rating/discussion/288757"""
    
    cols = ['toxicity', 'severe_toxicity', 'obscene', 'threat',
            'insult', 'identity_attack', 'sexual_explicit']
    df.loc[:, 'target'] = (weights * df[cols]).sum(axis=1)
    
    return df

In [36]:
# compute the weighted score, keep only id, text and target, and drop rows scoring 0.1 or less

weights = [0.03, 0.867, 0.26, 0.065, 0.1, 0.07, 0.04]

ub_data = (
    _rank_comments_ub_v2(ub_data.copy(), weights=weights)
    [['id', 'text', 'target']]
    .loc[lambda frame: frame['target'] > 0.1]
    .reset_index(drop=True)
)
print(ub_data.shape)
ub_data.head()

(119693, 3)


Unnamed: 0,id,text,target
0,59856,haha you guys are a bunch of losers.,0.133979
1,59859,ur a sh*tty comment.,0.260905
2,59861,hahahahahahahahhha suck it.,0.171373
3,239579,This is a great story. Man. I wonder if the pe...,0.123067
4,239583,It's ridiculous that these guys are being call...,0.1105


#### Get cross-validation folds

`!` Here the targets take many distinct values. So first split the targets into groups, then do stratified folding based on these groups

In [37]:
def get_groups(df, n_groups=10, col_name='target'):
    """
    Bucket rows by quantile thresholds of `col_name`.

    The thresholds are the distinct values of the quantiles 0, 1/n_groups, ..., (n_groups-1)/n_groups.
    Each row gets, in column 'label', the position of the highest threshold its value reaches.
    The number of distinct thresholds and the thresholds themselves are printed.

    df: DataFrame, modified in place ('label' is written)
    n_groups: requested number of groups
    col_name: column the quantiles are taken from

    Returns the same DataFrame.
    """

    step = 1. / n_groups
    levels = [step * k for k in range(n_groups)]
    thresholds = sorted(set(df[col_name].quantile(levels).values))

    print(f'Effective number of groups: {len(thresholds)}')
    print(thresholds)

    # thresholds ascend, so a higher threshold overwrites the group id set by a lower one
    for group_id, threshold in enumerate(thresholds):
        reached = df[col_name] >= threshold
        df.loc[reached, 'label'] = group_id

    return df

# bucket the targets into groups, then stratify the 10 folds on those groups
ub_data = ub_data.copy().pipe(get_groups)
ub_data = ub_data.copy().pipe(get_folds, n_folds=10)
ub_data.groupby('fold')['label'].value_counts(normalize=True)

Effective number of groups: 10
[0.10000000000000002, 0.10900000000000001, 0.11961250000000001, 0.13, 0.1422, 0.15295588235294116, 0.16616666666666666, 0.18120000000000003, 0.20677956521739133, 0.2557]


fold  label
0     3.0      0.104094
      6.0      0.101003
      1.0      0.100418
      4.0      0.100167
      7.0      0.100167
      9.0      0.100084
      8.0      0.099916
      0.0      0.099582
      5.0      0.098914
      2.0      0.095656
1     3.0      0.104094
      6.0      0.101003
      1.0      0.100418
      4.0      0.100167
      7.0      0.100167
      9.0      0.100084
      8.0      0.099916
      0.0      0.099582
      5.0      0.098914
      2.0      0.095656
2     3.0      0.104094
      6.0      0.101003
      1.0      0.100418
      4.0      0.100251
      7.0      0.100084
      9.0      0.100084
      8.0      0.099916
      0.0      0.099666
      5.0      0.098830
      2.0      0.095656
                 ...   
7     3.0      0.104019
      6.0      0.100927
      1.0      0.100343
      4.0      0.100259
      7.0      0.100175
      9.0      0.100175
      8.0      0.099925
      0.0      0.099591
      5.0      0.098839
      2.0      0.095747
8   

In [39]:
# NOTE(review): the drop/rename below is disabled in this cell; the same two steps are executed
# in a later cell, after the frame has been copied to ub_data_sorted.
# remove 'label' column and rename 'target' to 'label'
# ub_data = ub_data.drop('label', 1)
# ub_data = ub_data.rename(columns={'target': 'label'})
ub_data.head()

Unnamed: 0,id,text,target,label,fold
0,59856,haha you guys are a bunch of losers.,0.133979,3.0,5
1,59859,ur a sh*tty comment.,0.260905,9.0,0
2,59861,hahahahahahahahhha suck it.,0.171373,6.0,9
3,239579,This is a great story. Man. I wonder if the pe...,0.123067,2.0,9
4,239583,It's ridiculous that these guys are being call...,0.1105,1.0,4


### Save all datasets prepared (train and val). Also save id and folds as another csv (to remember fold sep.)

`!` Extract input_ids and attention masks for specified tokenizer

In [40]:
tokenizer_type = 'roberta-base'  # change if needed

# load the tokenizer for the checkpoint named above
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=tokenizer_type)

In [41]:
def encode_df(df, tokenizer, col_name='text', max_length=128):
    """
    Tokenize the text column and replace it with 'input_ids' and 'attention_mask' columns.

    df: DataFrame with a text column `col_name`. It is modified in place: the two new
        columns are appended and `col_name` is dropped.
    tokenizer: object with an `encode_plus` method returning 'input_ids' and 'attention_mask'
    col_name: name of the text column to encode
    max_length: every text is truncated / padded to this many tokens

    Returns the same DataFrame.
    """

    input_ids_list, attn_mask_list = [], []

    for text in tqdm(df[col_name].values, total=df.shape[0]):
        output = tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length'
        )

        input_ids_list.append(output['input_ids'])
        attn_mask_list.append(output['attention_mask'])

    # append the encodings after the existing columns
    col_len = df.shape[1]

    df.insert(col_len, 'input_ids', input_ids_list)
    df.insert(col_len + 1, 'attention_mask', attn_mask_list)

    df.drop(columns=[col_name], inplace=True)

    return df

In [42]:
train_dir = main_dir/'train'
val_dir = main_dir/'val'

# Create each output folder on its own: previously 'val' was only created when 'train' was
# missing, and creation failed if 'val' already existed.
train_dir.mkdir(parents=True, exist_ok=True)
val_dir.mkdir(parents=True, exist_ok=True)

In [43]:
# replace the text of both datasets with token ids and attention masks (works on copies)
tcc_data = tcc_data.copy().pipe(encode_df, tokenizer)
ub_data = ub_data.copy().pipe(encode_df, tokenizer)

100%|██████████| 16151/16151 [00:12<00:00, 1247.71it/s]
100%|██████████| 119693/119693 [01:01<00:00, 1941.98it/s]


In [44]:
# keep a copy that still has the group column ('label') for the balanced export further below
ub_data_sorted = ub_data.copy()
# here the group ids are dropped and the continuous score becomes 'label'
# (columns= keyword: the positional axis argument of drop() is gone in pandas >= 2.0)
ub_data = ub_data.drop(columns='label')
ub_data = ub_data.rename(columns={'target': 'label'})

In [46]:
# save tcc train/val data

val_fold_tcc = 4

# rows of the chosen fold form the validation set, all other rows the training set
# (columns= keyword: the positional axis argument of drop() is gone in pandas >= 2.0)
tcc_data_train = tcc_data[tcc_data.fold != val_fold_tcc].drop(columns='fold')
tcc_data_val = tcc_data[tcc_data.fold == val_fold_tcc].drop(columns='fold')

tcc_data_train.to_csv(train_dir/'tcc_data_ruddit.csv', index=False)
tcc_data_val.to_csv(val_dir/'tcc_data_ruddit.csv', index=False)

In [47]:
# save ub train/val data

val_fold_ub = 9

# rows of the chosen fold form the validation set, all other rows the training set
# (columns= keyword: the positional axis argument of drop() is gone in pandas >= 2.0)
ub_data_train = ub_data[ub_data.fold != val_fold_ub].drop(columns='fold')
ub_data_val = ub_data[ub_data.fold == val_fold_ub].drop(columns='fold')

ub_data_train.to_csv(train_dir/'ub_data_ruddit.csv', index=False)
ub_data_val.to_csv(val_dir/'ub_data_ruddit.csv', index=False)

In [48]:
# save folds distribution (id -> fold), so the split can be restored later

tcc_data_folds = tcc_data.loc[:, ['id', 'fold']]
ub_data_folds = ub_data.loc[:, ['id', 'fold']]

tcc_data_folds.to_csv(data_dir/'tcc_folds_ruddit.csv', index=False)
ub_data_folds.to_csv(data_dir/'ub_folds_ruddit.csv', index=False)

#### Save ub_data sorted + sizes of groups (another version of dataset for better shuffling)

In [49]:
# save sorted version of ub_data.csv for better shuffling

ub_data_sorted = ub_data_sorted.sort_values(by='target')

# (columns= keyword: the positional axis argument of drop() is gone in pandas >= 2.0)
ub_data_train_sorted = ub_data_sorted[ub_data_sorted.fold != val_fold_ub].drop(columns='fold')
ub_data_val_sorted = ub_data_sorted[ub_data_sorted.fold == val_fold_ub].drop(columns='fold')

# number of rows per group, largest group first (value_counts sorts by count, descending)
train_group_sizes = ub_data_train_sorted.label.value_counts().values
val_group_sizes = ub_data_val_sorted.label.value_counts().values

In [50]:
# reduce each class to smallest class size by random picking

# .min() does not depend on the order in which the group sizes were produced
reduce_size_train = train_group_sizes.min()
reduce_size_val = val_group_sizes.min()


def _downsample_groups(frame, group_size, group_col='label'):
    """Randomly keep `group_size` rows of every `group_col` value, groups in order of first appearance."""
    parts = [
        frame[frame[group_col] == value].sample(n=group_size)
        for value in frame[group_col].unique()
    ]
    if not parts:
        return frame.iloc[0:0]
    # one concat over all groups: column dtypes are preserved and the data is copied only once
    return pd.concat(parts, axis=0)


ub_data_train_sorted = _downsample_groups(ub_data_train_sorted, reduce_size_train)
ub_data_val_sorted = _downsample_groups(ub_data_val_sorted, reduce_size_val)

In [51]:
# drop the group ids and expose the continuous score as 'label'
# (columns= keyword: the positional axis argument of drop() is gone in pandas >= 2.0)
ub_data_train_sorted = ub_data_train_sorted.drop(columns='label')
ub_data_val_sorted = ub_data_val_sorted.drop(columns='label')

ub_data_train_sorted = ub_data_train_sorted.rename(columns={'target': 'label'})
ub_data_val_sorted = ub_data_val_sorted.rename(columns={'target': 'label'})

In [52]:
# write the balanced train / val frames under the same file name in their own folders
for balanced_frame, out_dir in ((ub_data_train_sorted, train_dir), (ub_data_val_sorted, val_dir)):
    balanced_frame.to_csv(out_dir/'ub_data_sorted_ruddit.csv', index=False)

In [113]:
# Report the number of rows kept per group in the train and val sets.
# NOTE(review): this cell's execution count (113) is out of sequence and the same statement is
# repeated in the next cell with different printed sizes — presumably a stale run; confirm.
print(f'Group shape:\n\nTrain: {reduce_size_train}\nVal: {reduce_size_val}')

Group shape:

Train: 38788
Val: 4310


In [53]:
# Report the number of rows kept per group in the train and val sets.
# NOTE(review): identical to the statement in the previous cell; one of the two can probably go.
print(f'Group shape:\n\nTrain: {reduce_size_train}\nVal: {reduce_size_val}')

Group shape:

Train: 10308
Val: 1145


In [None]:
# NOTE(review): unexecuted leftover — the path is an empty string, so this read fails when the
# notebook is run top to bottom; fill in the intended csv path or remove the cell.
data = pd.read_csv('')