In [2]:
import os
import random
import gc
from pprint import pprint
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid')
from sklearn.model_selection import StratifiedKFold
%matplotlib inline
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AutoModel
from transformers import AutoTokenizer
from transformers import AutoConfig

In [3]:
# Load the two Jigsaw training CSVs. The paths are relative to the notebook's
# working directory, so the competition folder must sit one level above it.
data1=pd.read_csv("../jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
data2=pd.read_csv("../jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv")

In [4]:
# Row count of the first training set.
print(len(data1))
# Collapse the six label columns into one 'score' column by summing them row-wise
# (this adds the column to data1 in place).
data1['score']=data1[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1)
print(data1.head())
# Keep only the id, the comment text and the summed score.
pre_data1=data1[['id','comment_text','score']]
# Last expression of the cell: summary statistics of the numeric columns.
pre_data1.describe()

223549
                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  score  
0             0        0       0       0              0      0  
1             0        0       0       0              0      0  
2             0        0       0       0              0      0  
3             0        0       0       0              0      0  
4             0        0       0       0              0      0  


Unnamed: 0,score
count,223549.0
mean,0.221857
std,0.75419
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,6.0


In [7]:
def data_preprocess(df, random_state=None):
    """Build a balanced frame: every scored row plus an equal number of zero-score rows.

    Args:
        df: DataFrame with a numeric ``score`` column.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            drawn zero-score rows are reproducible. ``None`` keeps pandas'
            default (unseeded) sampling.

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding all rows with
        ``score > 0`` followed by the sampled rows with ``score == 0``.

    Side effects:
        Prints the number of non-zero rows, zero rows and output rows.
    """
    value_not_zero=df[df.score>0]
    value_zero=df[df.score==0]
    # Draw the negatives from the zero-score rows only. Sampling from the whole
    # frame would pull in rows that are already in value_not_zero, duplicating
    # positives and leaving the result unbalanced.
    # Cap the draw so a frame with fewer zero rows than scored rows does not
    # make sample() raise.
    n_zero = min(len(value_not_zero), len(value_zero))
    subset = value_zero.sample(n=n_zero, random_state=random_state)
    op_data=pd.concat([value_not_zero, subset], ignore_index=True)
    print(len(value_not_zero))
    print(len(value_zero))
    print(len(op_data))
    return op_data

In [8]:
# Row count of the second (unintended-bias) training set.
print(len(data2))
# Same row-wise label sum as for data1; note the column names differ here
# ('severe_toxicity', 'identity_attack'). Adds 'score' to data2 in place.
data2['score']=data2[['toxic', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack']].sum(axis=1)
print(data2.head())
# Keep only the comment text and the summed score (no id column for this set).
pre_data2=data2[['comment_text','score']]
# Last expression of the cell: summary statistics of the numeric columns.
pre_data2.describe()

1902194
      id                                       comment_text     toxic  \
0  59848  This is so cool. It's like, 'would you want yo...  0.000000   
1  59849  Thank you!! This would make my life a lot less...  0.000000   
2  59852  This is such an urgent design problem; kudos t...  0.000000   
3  59855  Is this something I'll be able to install on m...  0.000000   
4  59856               haha you guys are a bunch of losers.  0.893617   

   severe_toxicity  obscene  identity_attack   insult  threat  asian  atheist  \
0         0.000000      0.0         0.000000  0.00000     0.0    NaN      NaN   
1         0.000000      0.0         0.000000  0.00000     0.0    NaN      NaN   
2         0.000000      0.0         0.000000  0.00000     0.0    NaN      NaN   
3         0.000000      0.0         0.000000  0.00000     0.0    NaN      NaN   
4         0.021277      0.0         0.021277  0.87234     0.0    0.0      0.0   

   ...    rating  funny  wow  sad  likes  disagree  sexual_explici

Unnamed: 0,id,score
count,1902194.0,1902194.0
mean,3912771.0,0.234544
std,2497349.0,0.460042
min,59848.0,0.0
25%,827354.2,0.0
50%,5282205.0,0.0
75%,5862735.0,0.3333333
max,7194639.0,5.255702


In [9]:
# Rebalance each dataset; data_preprocess prints the non-zero, zero and output
# row counts for every call.
pre_data1_op=data_preprocess(pre_data1)
pre_data2_op=data_preprocess(pre_data2)

22468
201081
44936
590294
1311900
1180588


In [10]:
# Stack both rebalanced frames into a single training frame with a fresh
# 0..n-1 index (ignore_index=True).
all_data=pd.concat([pre_data1_op, pre_data2_op], ignore_index=True)
print(len(all_data))

1225524


In [11]:
# Switch for the optional `if DEBUG:` probe cells further down the notebook.
DEBUG = False
def seed_everything(seed, deterministic=False):
    """Seed Python, NumPy and PyTorch RNGs and configure cuDNN.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random``, the
            ``PYTHONHASHSEED`` environment variable and torch (CPU and all
            CUDA devices).
        deterministic: When True, cuDNN is put in deterministic mode and the
            autotuner is disabled, trading speed for repeatable GPU results.
            The default (False) keeps the previous configuration: the cuDNN
            autotuner is enabled and deterministic mode is off, so GPU runs
            may differ slightly between executions even with the same seed.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = bool(deterministic)
    # The autotuner picks algorithms by timing them, which is itself a source
    # of run-to-run variation, so it is only enabled in the fast mode.
    torch.backends.cudnn.benchmark = not deterministic

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device: ', device.type)


# Global seed: used here, by the fold splitter and again at the start of main().
SEED = 2022
seed_everything(SEED)

Device:  cuda


In [12]:
# Hugging Face checkpoint names for the backbones this notebook knows about.
# BERT
BERT = 'bert-base-uncased'
# Distilbert
DISTILBERT = 'distilbert-base-uncased'
# Roberta
ROBERTA = 'roberta-base'
# Shared configuration dict; later cells add 'tokenizer', 'dl_train', 'dl_val',
# 'model', 'optim' and 'scheduler' entries.
cfg ={}
# Backbone used for both the tokenizer and the model.
ARCH_PATH = ROBERTA
# NOTE: cfg['train'] is replaced by a later cell that also sets 'n_epochs'.
cfg['train'] = {'n_folds': 5}

In [13]:
def get_bin_stratified(df, n_bins=20, n_splits=5):
    """Add stratified cross-validation fold ids to ``df`` in place.

    The continuous ``score`` column is cut into ``n_bins`` equal-width bins
    (stored in a new ``bin`` column) and those bins are used as the
    stratification label for a shuffled ``StratifiedKFold`` seeded with the
    module-level ``SEED``. The validation fold of every row is stored in a new
    int8 ``fold`` column.

    Args:
        df: DataFrame with ``score`` and ``comment_text`` columns; modified in place.
        n_bins: Number of equal-width score bins used for stratification.
        n_splits: Number of folds.

    Returns:
        None. The result is the ``bin`` and ``fold`` columns added to ``df``.
    """
    df['bin'] = pd.cut(df.score, n_bins, labels=list(range(n_bins)))

    skf = StratifiedKFold(n_splits=n_splits, random_state=SEED, shuffle=True)

    # split() yields *positional* row numbers. Collect them in a plain array
    # instead of writing through df.loc, which is label-based and would hit
    # the wrong rows (or create new ones) when df does not carry a default
    # 0..n-1 index.
    folds = np.full(len(df), -1, dtype='int8')
    for fold, (_, idx_val) in enumerate(skf.split(df.comment_text, y=df.bin)):
        folds[idx_val] = fold

    df['fold'] = folds

In [21]:
# Tokenizer settings: checkpoint name and the fixed sequence length every
# comment is padded/truncated to.
cfg['tokenizer'] ={'name': ARCH_PATH, 
                   'max_length': 210}
# Downloads (or loads from cache) the tokenizer files for the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(cfg['tokenizer']['name'])

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [22]:
# Optional sanity check: tokenize one comment and dump every field the
# tokenizer returns.
if DEBUG:
    # Read from all_data: no frame named `df` exists yet at this point of a
    # top-to-bottom run, so the previous `df.loc[...]` lookup raised NameError.
    # all_data was built with ignore_index=True, so label SEED is row number SEED.
    text = all_data.loc[SEED, 'comment_text']
    print('Text Length ', len(text.split(' ')))
    text_tokenized = tokenizer.encode_plus(
                        text,
                        add_special_tokens=True,
                        padding='max_length',
                        max_length=cfg['tokenizer']['max_length'], 
                        truncation=True
                        )
    
    for key, value in text_tokenized.items():
        print(key, type(value))
        print(value)
        print()

In [23]:
class jigsawDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Each item is a pair of dicts:
        * ``{'ids', 'mask', 'token_type_ids'}`` - model inputs, and
        * ``{'target'}`` - the row's ``score`` as a float32 tensor of shape (1,).

    Args:
        df: DataFrame with ``comment_text`` and ``score`` columns.
        tokenizer: Object exposing ``encode_plus`` with the Hugging Face
            keyword arguments used below.
        max_len: Length every sequence is padded/truncated to.
    """

    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # DataLoader hands over positions in [0, len), so index by position.
        # Label-based .loc only worked because callers reset the index first.
        text = self.df['comment_text'].iloc[index]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_len,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        # Use segment ids when the tokenizer produced them. Otherwise keep the
        # scalar placeholder so the batch still carries a 'token_type_ids' key
        # for the model's forward signature. Asking the tokenizer output
        # directly avoids depending on the global cfg dict.
        if 'token_type_ids' in inputs:
            token_type_ids = inputs['token_type_ids']
        else:
            token_type_ids = 1.

        # Shape (1,) so a collated batch is (batch, 1), matching the model's
        # single-unit regression head.
        target = [float(self.df['score'].iloc[index])]

        return {
            'ids': torch.LongTensor(ids),
            'mask': torch.LongTensor(mask),
            'token_type_ids': torch.tensor(token_type_ids)
            },{
            'target': torch.tensor(target, dtype=torch.float32)
        }

In [24]:
# DataLoader keyword arguments for training: shuffled, smaller batches on CPU,
# one worker per CPU core reported by os.cpu_count().
cfg['dl_train'] = {
    'batch_size': 8 if device.type=='cpu' else 32, 
    'shuffle': True, 
    'num_workers': os.cpu_count(), 
    'pin_memory': True
}

# DataLoader keyword arguments for validation: fixed order and a larger GPU
# batch than training.
cfg['dl_val'] = {
    'batch_size': 8 if device.type=='cpu' else 64, 
    'shuffle': False, 
    'num_workers': os.cpu_count(), 
    'pin_memory': True
}

In [25]:
# Checkpoint name passed to jigsawBERT; same backbone as the tokenizer.
cfg['model'] = {'name': ARCH_PATH}

In [26]:
class jigsawBERT(nn.Module):
    """Transformer backbone with a single-output regression head.

    A pooled sentence vector is passed through LayerNorm -> Dropout -> Linear
    to produce one score per input sequence.

    Args:
        name: Hugging Face checkpoint name given to ``AutoModel.from_pretrained``.
        dropout: When True (default) a dropout of p=0.2 is applied before the
            final linear layer; when False the dropout step is a no-op.
    """

    def __init__(self, name, dropout=True):
        super(jigsawBERT, self).__init__()
        self.bert = AutoModel.from_pretrained(name)
        self.name = name

        if name == BERT:
            self.in_features = self.bert.pooler.dense.out_features
        elif name == DISTILBERT:
            self.in_features = self.bert.transformer.layer[5].output_layer_norm.normalized_shape[0]
        elif name == ROBERTA:
            self.in_features = self.bert.pooler.dense.out_features
        else:
            # Unknown backbone: read the width from its config and fall back to
            # the former hard-coded 768 when the config has no such field.
            self.in_features = getattr(self.bert.config, 'hidden_size', 768)

        self.fc = nn.Linear(self.in_features, 1)
        # dense + tanh build a pooled vector from the first token on the paths
        # that do not take the backbone's own pooled output (see forward).
        self.dense = nn.Linear(self.in_features, self.in_features)
        self.activation = nn.Tanh()
        # Honour the `dropout` flag, which used to be accepted but ignored.
        self.dropout = nn.Dropout(p=0.2) if dropout else nn.Identity()
        self.layer_norm = nn.LayerNorm(self.in_features)

        torch.nn.init.kaiming_normal_(self.dense.weight)
        torch.nn.init.kaiming_normal_(self.fc.weight)

    def forward(self, ids, mask, token_type_ids):
        """Return one prediction per sequence, shape (batch, 1).

        Args:
            ids: Token id tensor.
            mask: Attention mask tensor.
            token_type_ids: Segment ids; only forwarded to the backbone on the
                BERT path and ignored otherwise.
        """
        if self.name == BERT:
            _, output = self.bert(ids,
                                  attention_mask=mask,
                                  token_type_ids=token_type_ids,
                                  return_dict=False)
        elif self.name == ROBERTA:
            _, output = self.bert(ids,
                                  attention_mask=mask,
                                  return_dict=False)
        else:
            # DistilBERT and any other backbone: pool the first token of the
            # last hidden state through dense + tanh. Previously only
            # DistilBERT was handled, and any other checkpoint name reached
            # layer_norm with `output` unassigned.
            last_hidden_state = self.bert(ids,
                                          attention_mask=mask,
                                          return_dict=False)[0]
            output = self.activation(self.dense(last_hidden_state[:, 0]))

        output = self.layer_norm(output)
        output = self.dropout(output)
        output = self.fc(output)
        return output


In [27]:
def jigsawMetric(y_pred, y_gt):
    """Root-mean-squared error between two tensors of identical shape.

    Args:
        y_pred: Predicted values.
        y_gt: Ground-truth values; must have exactly the same size as ``y_pred``.

    Returns:
        A scalar tensor holding sqrt(mean((y_pred - y_gt) ** 2)).

    Raises:
        AssertionError: If the two tensors differ in size.
    """
    assert y_pred.size() == y_gt.size()
    mean_squared = nn.functional.mse_loss(y_pred, y_gt)
    return mean_squared.sqrt()

In [28]:
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR

from transformers import get_cosine_schedule_with_warmup
from transformers import AdamW

# Keyword arguments for the AdamW optimiser (weight decay is currently left at
# the optimiser's default).
cfg['optim'] = {'lr': 8e-6, 
#                 'weight_decay': 0.01
               }
# Keyword arguments for get_cosine_schedule_with_warmup. main() steps the
# scheduler once per epoch, so these counts are in epochs, not batches.
# NOTE(review): 'num_training_steps' (7) is far below cfg['train']['n_epochs']
# (100) - confirm the schedule behaves as intended if training runs past it.
cfg['scheduler'] = {'num_warmup_steps': 3, 
                    'num_training_steps': 7, 
                   }

In [29]:
from torch.cuda.amp import GradScaler
from torch.cuda.amp import autocast
# Training loop settings; this replaces the earlier cfg['train'] entry.
# 'n_epochs' is an upper bound - main() also stops early via EarlyStopping.
cfg['train'] ={
    'n_folds': 5,
    'n_epochs': 100
}

class StoreLoss:
    """Collects per-epoch loss statistics for one fold and plots them.

    Four parallel lists are kept, one entry per recorded epoch: training mean,
    training std, validation mean and validation std.
    """

    def __init__(self, fold):
        self.fold = fold

        self.loss_train_mean = []
        self.loss_train_std = []
        self.loss_val_mean = []
        self.loss_val_std = []

    def get_loss(self, loss_train, loss_val):
        """Record one epoch; each argument is an indexable (mean, std) pair."""
        train_mean, train_std = loss_train[0], loss_train[1]
        val_mean, val_std = loss_val[0], loss_val[1]

        self.loss_train_mean.append(train_mean)
        self.loss_train_std.append(train_std)
        self.loss_val_mean.append(val_mean)
        self.loss_val_std.append(val_std)

    def plot_loss(self):
        """Draw two side-by-side panels (mean and std) of train vs. val loss."""
        fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 4))

        panels = (
            ('mean', self.loss_train_mean, self.loss_val_mean),
            ('std', self.loss_train_std, self.loss_val_std),
        )

        for axis, (title, train_curve, val_curve) in zip(axes, panels):
            axis.plot(train_curve, marker='o', label='train')
            axis.plot(val_curve, marker='x', label='val')
            axis.set_xlabel('Epoch')
            axis.set_ylabel(f'RMSE ({title})')
            axis.set_title(f'RMSE({title}) vs Epoch at fold {self.fold}')
            axis.legend()

        fig.show()

In [30]:
# Optional smoke test for StoreLoss: feed ten epochs of random (mean, std)
# pairs and render the two-panel plot.
if DEBUG:
    store = StoreLoss(fold=0)
    
    for epoch in range(10):
        loss_train = np.random.rand(2)
        loss_val = np.random.rand(2)
        
        store.get_loss(loss_train, loss_val)
    
    store.plot_loss()

In [31]:
def train_fn(model, dl, criterion, optim, scheduler):
    """Run one mixed-precision training pass over ``dl``.

    Args:
        model: Network to train; moved to the global ``device`` and set to train mode.
        dl: DataLoader yielding ``(inputs_dict, {'target': tensor})`` batches.
        criterion: Callable ``(outputs, targets) -> loss`` tensor.
        optim: Optimiser stepped once per batch through a ``GradScaler``.
        scheduler: Accepted for interface symmetry; not stepped here.

    Returns:
        Tuple ``(mean, std)`` of the per-batch loss values.
    """
    grad_scaler = GradScaler()

    model.train()
    model.to(device)

    batch_losses = []
    running_sum = 0

    bar = tqdm(dl, desc='train')

    for step, batch in enumerate(bar):
        optim.zero_grad()

        features = {name: tensor.to(device) for name, tensor in batch[0].items()}
        labels = batch[1]['target'].to(device)

        with autocast():
            predictions = model(**features)
            loss = criterion(predictions, labels)

        grad_scaler.scale(loss).backward()

        loss_value = loss.item()
        batch_losses.append(loss_value)
        running_sum += loss_value

        # Show the learning rate currently in effect next to the losses.
        current_lr = optim.param_groups[0]['lr']
        bar.set_postfix({'RMSE(batch)': loss_value,
                         'RMSE(ave)': running_sum / (step + 1),
                         'lr': current_lr})

        grad_scaler.step(optim)
        grad_scaler.update()

    return np.mean(batch_losses), np.std(batch_losses)

def val_fn(model, dl):
    """Evaluate ``model`` on ``dl`` and report the RMSE over the whole set.

    Args:
        model: Network to evaluate; moved to the global ``device`` and set to eval mode.
        dl: DataLoader yielding ``(inputs_dict, {'target': tensor})`` batches,
            i.e. the same batch layout train_fn consumes.

    Returns:
        Tuple ``(overall_rmse, std_of_batch_rmse)``. ``overall_rmse`` is NaN
        when the loader yields no samples.

    Side effects:
        Prints the overall RMSE.
    """
    loss_val = []
    loss_total = 0
    # Running sum of squared errors and number of target values seen, so the
    # overall RMSE is exact even when the last batch is smaller than the rest.
    squared_error_sum = 0.0
    n_values = 0

    model.eval()
    model.to(device)

    progress_bar = tqdm(dl, desc='val')

    with torch.no_grad():
        for i, data in enumerate(progress_bar):
            inputs = {key: value.to(device) for key, value in data[0].items()}
            # The dataset stores labels under 'target'; the former 'score'
            # lookup raised KeyError on the first validation batch.
            targets = data[1]['target'].to(device)

            with autocast():
                outputs = model(**inputs)
                loss = jigsawMetric(outputs, targets)

            batch_rmse = loss.item()
            batch_values = targets.numel()

            loss_val.append(batch_rmse)
            loss_total += batch_rmse
            # rmse**2 is the batch's mean squared error; multiplying by the
            # element count recovers its sum of squared errors.
            squared_error_sum += batch_rmse ** 2 * batch_values
            n_values += batch_values

            progress_bar.set_postfix({'RMSE(batch)': batch_rmse, 'RMSE(ave)': loss_total / (i+1)})

    rmse_overall = np.sqrt(squared_error_sum / n_values) if n_values else float('nan')
    print('RMSE for validation set overall: ', rmse_overall)

    return rmse_overall, np.std(loss_val)

In [32]:
def run_one_epoch(model, train_dl, val_dl, criterion, optim, scheduler):
    """Train for one epoch, then validate.

    Returns:
        Tuple ``(train_stats, val_stats)`` holding whatever ``train_fn`` and
        ``val_fn`` return, in that order.
    """
    train_stats = train_fn(model=model,
                           dl=train_dl,
                           criterion=criterion,
                           optim=optim,
                           scheduler=scheduler)
    val_stats = val_fn(model=model, dl=val_dl)

    return train_stats, val_stats

In [33]:
def get_dls_for_n_fold(df, fold, tokenizer):
    """Build the training and validation DataLoaders for one fold.

    Rows whose ``fold`` column equals ``fold`` form the validation set; all
    other rows form the training set. Both subsets get a fresh 0..n-1 index.

    Returns:
        Tuple ``(train_dl, val_dl)`` configured from ``cfg['dl_train']`` and
        ``cfg['dl_val']``.
    """
    max_len = cfg['tokenizer']['max_length']

    def _as_dataset(frame):
        # Re-index so the dataset can address rows 0..len-1.
        return jigsawDataset(frame.reset_index(drop=True),
                             tokenizer=tokenizer,
                             max_len=max_len)

    train_ds = _as_dataset(df.loc[df.fold != fold])
    val_ds = _as_dataset(df.loc[df.fold == fold])

    return (DataLoader(train_ds, **cfg['dl_train']),
            DataLoader(val_ds, **cfg['dl_val']))

In [34]:
class EarlyStopping:
    """Tracks the best validation loss, checkpoints on improvement and flags when to stop.

    ``stop`` becomes True once the number of consecutive non-improving calls
    exceeds ``patience`` (i.e. on call number ``patience + 1`` without
    improvement). ``seq`` is accepted but not used.
    """

    def __init__(self, patience=2, seq=False):
        self.patience = patience
        self.best_score = None
        self.counter = 0
        self.stop = False

    def __call__(self, loss, model, optim, cfg, path):
        """Register one epoch's loss; save a checkpoint to ``path`` if it is the best so far."""
        is_first = self.best_score is None

        if is_first or loss < self.best_score:
            if not is_first:
                # Announce real improvements only; the very first call is silent.
                print(f'Loss decreased {self.best_score} -> {loss}.')
                self.counter = 0
            self.best_score = loss
            self.save_checkpoint(model, optim, cfg, path)
            return

        self.counter += 1
        if self.counter > self.patience:
            self.stop = True

    def save_checkpoint(self, model, optim, cfg, path):
        """Write the model weights and the config dict to ``path`` (``optim`` is not saved)."""
        checkpoint = {'model': model.state_dict(),
                      'cfg': cfg}
        torch.save(checkpoint, path)

In [35]:
def main():
    """Train one model per cross-validation fold and checkpoint the best epoch of each.

    Reads the module-level ``all_data`` frame and ``cfg`` dict. For every fold
    a fresh model, optimiser and scheduler are built, epochs are run until
    ``cfg['train']['n_epochs']`` or until EarlyStopping fires, the best weights
    are written to ``jigsawBERT_fold{fold}.tar`` and the loss curves are plotted.
    """
    seed_everything(SEED)
        
    # NOTE: df is the same object as all_data; get_bin_stratified adds its
    # columns to it in place.
    df = all_data
    get_bin_stratified(df, n_splits=cfg['train']['n_folds'])

    tokenizer = AutoTokenizer.from_pretrained(cfg['tokenizer']['name'])
    
    for fold in range(cfg['train']['n_folds']):
        print('Fold:', fold)
        store = StoreLoss(fold=fold)
        es = EarlyStopping()

        train_dl, val_dl = get_dls_for_n_fold(df, fold, tokenizer)

        model = jigsawBERT(name=cfg['model']['name'])
        # The training loss is the same RMSE that is reported for validation.
        criterion =jigsawMetric
        optim = AdamW(model.parameters(), **cfg['optim'])
        scheduler = get_cosine_schedule_with_warmup(optim, **cfg['scheduler'])
        # If the schedule starts at a learning rate of exactly 0, advance it one
        # step so the first epoch does not train with lr == 0. optim.step() is
        # called first, presumably to keep the optimiser/scheduler call order
        # PyTorch expects - TODO confirm.
        if optim.param_groups[0]['lr']==0:
            optim.step()
            scheduler.step()

        inputs = {'model': model,
                  'train_dl': train_dl,
                  'val_dl': val_dl,
                  'criterion': criterion,
                  'optim': optim,
                  'scheduler': scheduler}

        for epoch in range(cfg['train']['n_epochs']):
            loss_train, loss_val = run_one_epoch(**inputs)
            
            store.get_loss(loss_train, loss_val)
            
            # Early stopping and checkpointing are driven by the first element
            # of the validation result (the overall validation RMSE).
            es(loss_val[0], model, optim, cfg, path=f'jigsawBERT_fold{fold}.tar')
            if es.stop:
                print('Early Stop !')
                break

            # The scheduler advances once per epoch, not once per batch.
            scheduler.step()
            
        store.plot_loss()
        
        # Drop the references before the next fold builds a new model.
        del model, optim
        gc.collect()

In [None]:
%%time
# Run the full cross-validated training; %%time reports how long the cell took.
main()



Fold: 0


Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
train:   2%|▏         | 475/30639 [05:19<5:36:36,  1.49it/s, RMSE(batch)=0.771, RMSE(ave)=0.875, lr=2.67e-6]

In [None]:
# Checkpoint file prefix; must match the 'jigsawBERT_fold{fold}.tar' paths
# written by main().
MODEL_NAME = 'jigsawBERT'

def val_fn_cv(model, dl):
    """Return the model's predictions for every sample in ``dl``.

    Args:
        model: Trained network; moved to the global ``device`` and set to eval mode.
        dl: DataLoader whose batches carry the model-input dict as their first
            element. Labels are not read, so unlabeled data works too.

    Returns:
        NumPy array with the per-batch outputs concatenated along the first
        axis, in loader order.
    """
    model.eval()
    model.to(device)

    preds = []

    with torch.no_grad():
        for data in tqdm(dl, desc='cv'):
            inputs = {key: value.to(device) for key, value in data[0].items()}

            # Inference only: no GradScaler and no target transfer are needed.
            with autocast():
                outputs = model(**inputs)

            preds.append(outputs.detach().cpu().numpy())

    return np.concatenate(preds)

def main_cv():
    """Compute out-of-fold predictions with the checkpoints saved by training.

    Recreates the fold assignment on the module-level ``all_data`` frame, loads
    ``{MODEL_NAME}_fold{fold}.tar`` for every fold, predicts that fold's
    validation rows and stores the result in a new ``oof`` column.

    Returns:
        The ``all_data`` frame (modified in place) including the ``oof`` column.
    """
    seed_everything(SEED)

    df = all_data
    get_bin_stratified(df, n_splits=cfg['train']['n_folds'])
    df['oof'] = np.nan

    tokenizer = AutoTokenizer.from_pretrained(cfg['tokenizer']['name'])

    for fold in range(cfg['train']['n_folds']):
        # Only the validation loader is needed here.
        _, val_dl = get_dls_for_n_fold(df, fold, tokenizer)

        model = jigsawBERT(name=cfg['model']['name'])
        PATH = MODEL_NAME + f'_fold{fold}.tar'
        saved_contents = torch.load(PATH, map_location=device)

        model.load_state_dict(saved_contents['model'])
        if fold==0:
            cfg_for_train = saved_contents['cfg']
            print('Configuration for training:')
            print()
            pprint(cfg_for_train)
            print()

        print('Fold:', fold)

        preds = val_fn_cv(model=model, dl=val_dl)
        # The regression head emits one value per row, i.e. an (N, 1) array.
        # Flatten it so the assignment to a single column is unambiguous.
        # Row order matches: the validation loader is unshuffled and was built
        # from the same df.fold == fold selection.
        df.loc[df.fold==fold, 'oof'] = np.asarray(preds, dtype='float64').reshape(-1)

        # Release this fold's model before the next one is loaded.
        del model
        gc.collect()

    return df

In [None]:
from sklearn.metrics import mean_squared_error

# Collect out-of-fold predictions for every row and persist them.
df = main_cv()
df.to_csv('oof_df.csv', index=False)

# Overall cross-validation RMSE between the target and the out-of-fold predictions.
mse = mean_squared_error(df['score'], df['oof'])
rmse = np.sqrt(mse)
print('CV score: ', rmse)

In [None]:
# Reference line y = x. Its span is taken from the data instead of the former
# hard-coded [-3.5, 1.5], which did not cover the score range of this dataset
# (scores start at 0 and go up to about 6).
_line_lo = min(df['score'].min(), df['oof'].min())
_line_hi = max(df['score'].max(), df['oof'].max())

temp_df = pd.DataFrame()
temp_df['x'] = np.linspace(_line_lo, _line_hi, 10)
temp_df['y'] = temp_df['x']

# Points on the orange line are perfect predictions.
plt.figure(figsize=(8, 8))
sns.scatterplot(data=df, x='score', y='oof', label='oof vs target')
sns.lineplot(data=temp_df, x='x', y='y', color='orange', label='y = x')
plt.title('OOF Prediction vs Target')
plt.legend()
plt.show()