In [1]:
import os
import gc
import copy
import time
import random
import string

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

# Utils
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold

# For Transformer Models
from transformers import AutoTokenizer, AutoModel, AdamW
from transformers import BertTokenizer, BertModel, BertConfig

# Suppress warnings
import warnings
# warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Config

In [2]:
# Central experiment settings; later cells read their hyper-parameters from here.
CONFIG = dict(
    seed=42,
    max_length=128,                      # token budget per comment (pad / truncate)
    train_batch_size=32,
    valid_batch_size=32,
    num_classes=6,                       # one output per toxicity label
    model_name='distilbert-base-uncased',
    n_accumulate=1,                      # gradient-accumulation steps
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    learning_rate=1e-4,
    weight_decay=1e-6,
    scheduler=None,                      # name looked up by fetch_scheduler
    epochs=2,
    train_split=0.8,                     # leading fraction of rows used for training
)

# Keep the tokenizer next to the other settings so datasets and helpers share one instance.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [3]:
def set_seed(seed=42):
    '''Seed every random number generator the notebook uses, for REPRODUCIBILITY.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU and all CUDA
    devices), and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    '''
    # Python's own RNG is imported by the notebook, so it is seeded as well.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed.
    # NOTE: hash randomisation is fixed at interpreter start-up, so this should
    # only influence Python processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Seed the random number generators once, up front, with the value configured in CONFIG.
set_seed(CONFIG['seed'])

# Data

In [4]:
# Read the labelled training comments straight from the zipped competition CSV,
# then report the frame's shape and preview its first rows.
df = pd.read_csv(
    '/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
)
print(df.shape)
df.head()

(159571, 8)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
class ToxicDataset(Dataset):
    """Torch dataset that tokenizes one comment per item and returns its labels.

    Args:
        df: DataFrame with a ``comment_text`` column and, in ``"train"`` mode,
            the six label columns listed in ``TARGET_COLS``.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``input_ids`` and ``attention_mask``.
        max_length: Length every comment is padded / truncated to.
        mode: ``"train"`` reads labels from ``df``; any other value returns an
            all-zero placeholder target.

    Each item is a dict with keys ``toxic_ids``, ``toxic_mask``,
    ``token_type_ids`` and ``target``.
    """

    # Label columns, defined once instead of on every __getitem__ call.
    TARGET_COLS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    def __init__(self, df, tokenizer, max_length, mode="train"):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.mode = mode

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Positional lookup: works whatever labels the frame's index carries,
        # so callers do not have to reset the index first.
        row = self.df.iloc[index]

        # Collapse all runs of whitespace (including newlines) to single spaces.
        comment_text = " ".join(str(row['comment_text']).split())
        comment_text_tokenized = self.tokenizer.encode_plus(
            comment_text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length'
        )
        toxic_ids = comment_text_tokenized['input_ids']
        toxic_mask = comment_text_tokenized['attention_mask']
        # Placeholder only: the tokenizer's token_type_ids are not read here.
        token_type_ids = 0

        if self.mode == "train":
            # Convert explicitly to a float array rather than handing a pandas
            # Series to torch.tensor.
            target = row[self.TARGET_COLS].to_numpy(dtype="float32")
        else:
            target = [0.0] * len(self.TARGET_COLS)

        return {
            'toxic_ids': torch.tensor(toxic_ids, dtype=torch.long),
            'toxic_mask': torch.tensor(toxic_mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'target': torch.tensor(target, dtype=torch.float)
        }

In [6]:
# Sequential hold-out: the leading `train_split` fraction of rows trains the
# model and the remaining tail validates it. Both frames get a fresh 0..n-1 index.
df_train = df.iloc[: int(df.shape[0] * CONFIG['train_split'])].reset_index(drop=True)
df_valid = df.iloc[int(df.shape[0] * CONFIG['train_split']) :].reset_index(drop=True)

train_dataset = ToxicDataset(
    df_train,
    tokenizer=CONFIG['tokenizer'],
    max_length=CONFIG['max_length'],
)
valid_dataset = ToxicDataset(
    df_valid,
    tokenizer=CONFIG['tokenizer'],
    max_length=CONFIG['max_length'],
)

# Training batches are shuffled and the incomplete last batch is dropped;
# validation keeps the original order and every row.
train_loader = DataLoader(
    train_dataset,
    batch_size=CONFIG['train_batch_size'],
    num_workers=2,
    shuffle=True,
    pin_memory=True,
    drop_last=True,
)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=CONFIG['valid_batch_size'],
    num_workers=2,
    shuffle=False,
    pin_memory=True,
)

# Model

In [7]:
class ToxicModel(nn.Module):
    """Pretrained transformer encoder with a dropout + linear head.

    The head emits ``CONFIG['num_classes']`` raw logits per comment; no sigmoid
    is applied here.

    Args:
        model_name: Checkpoint name or path passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.3)
        # Take the width from the loaded backbone instead of hard-coding 768,
        # so checkpoints with a different hidden size also work.
        self.fc = nn.Linear(self.model.config.hidden_size, CONFIG['num_classes'])

    def forward(self, comment_ids, comment_mask, token_type_ids=None):
        """Return logits of shape ``(batch, num_classes)``.

        Args:
            comment_ids: Token id tensor fed to the backbone as ``input_ids``.
            comment_mask: Attention mask tensor matching ``comment_ids``.
            token_type_ids: Accepted for call-site compatibility but not
                forwarded to the backbone; optional so two-argument calls work.
        """
        encoded = self.model(
            input_ids=comment_ids,
            attention_mask=comment_mask,
            output_hidden_states=False,
        )
        # encoded[0] is the last hidden state; keep only the first token
        # position of each sequence as the pooled representation.
        first_token_state = encoded[0][:, 0]
        return self.fc(self.drop(first_token_state))

In [8]:
def criterion(pred, target):
    """Mean binary cross-entropy between raw logits ``pred`` and 0/1 ``target``.

    The sigmoid is applied inside the loss, so ``pred`` must be un-activated
    model outputs.
    """
    return F.binary_cross_entropy_with_logits(pred, target)

# Training

In [9]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train ``model`` for one pass over ``dataloader`` and return the mean loss.

    Gradients are accumulated over ``CONFIG['n_accumulate']`` batches before
    each optimizer step.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)``.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Optional LR scheduler, stepped after every optimizer step.
        dataloader: Yields dicts with ``toxic_ids``, ``toxic_mask``,
            ``token_type_ids`` and ``target`` tensors.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, shown in the progress bar only.

    Returns:
        Sample-weighted mean of the un-scaled batch losses, or ``nan`` when
        ``dataloader`` yields no batches.
    """
    model.train()

    n_accumulate = CONFIG['n_accumulate']
    num_steps = len(dataloader)
    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty dataloader does not raise UnboundLocalError.
    epoch_loss = float('nan')

    # Start from clean gradients in case a previous run was interrupted mid-window.
    optimizer.zero_grad()

    bar = tqdm(enumerate(dataloader), total=num_steps)
    for step, data in bar:
        toxic_ids = data['toxic_ids'].to(device, dtype=torch.long)
        toxic_mask = data['toxic_mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.float)

        batch_size = toxic_ids.size(0)

        pred = model(toxic_ids, toxic_mask, token_type_ids)

        loss = criterion(pred, targets)
        # Only the gradient is scaled for accumulation; the reported loss
        # below stays on the original, comparable scale.
        (loss / n_accumulate).backward()

        # Also step on the final batch so a trailing partial accumulation
        # window is applied instead of leaking into the next epoch.
        if (step + 1) % n_accumulate == 0 or (step + 1) == num_steps:
            optimizer.step()

            # zero the parameter gradients
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

        running_loss += loss.item() * batch_size
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])
    gc.collect()

    return epoch_loss

In [10]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch, optimizer=None):
    """Evaluate ``model`` on ``dataloader`` without gradients; return the mean loss.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)``.
        dataloader: Yields dicts with ``toxic_ids``, ``toxic_mask``,
            ``token_type_ids`` and ``target`` tensors.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, shown in the progress bar only.
        optimizer: Optional optimizer whose learning rate is shown in the
            progress bar. When omitted, a notebook-level ``optimizer`` global
            is used if one exists (the previous implicit behaviour);
            otherwise the LR is simply not displayed.

    Returns:
        Sample-weighted mean batch loss, or ``nan`` when ``dataloader`` yields
        no batches.
    """
    model.eval()

    # Backward-compatible fallback to the notebook global this function used
    # to read implicitly; avoids a NameError when no such global exists.
    if optimizer is None:
        optimizer = globals().get('optimizer')

    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty dataloader does not raise UnboundLocalError.
    epoch_loss = float('nan')

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        toxic_ids = data['toxic_ids'].to(device, dtype=torch.long)
        toxic_mask = data['toxic_mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.float)

        batch_size = toxic_ids.size(0)

        pred = model(toxic_ids, toxic_mask, token_type_ids)

        loss = criterion(pred, targets)

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        postfix = {'Epoch': epoch, 'Valid_Loss': epoch_loss}
        if optimizer is not None:
            postfix['LR'] = optimizer.param_groups[0]['lr']
        bar.set_postfix(**postfix)

    gc.collect()

    return epoch_loss

In [11]:
def run_training(model, optimizer, scheduler, device, num_epochs):
    """Train and validate for ``num_epochs`` epochs, keeping the best weights.

    Reads the notebook-level ``train_loader`` and ``valid_loader``. Whenever the
    validation loss does not get worse, the weights are remembered in memory
    and written to ``Epoch-<n>.bin`` in the current directory.

    Args:
        model: Model to train; the best weights are loaded back into it.
        optimizer: Optimizer passed to ``train_one_epoch``.
        scheduler: Optional LR scheduler passed to ``train_one_epoch``.
        device: Device used for both training and validation batches.
        num_epochs: Number of epochs to run.

    Returns:
        ``(model, history)`` where ``history`` maps ``'Train Loss'`` and
        ``'Valid Loss'`` to per-epoch lists.
    """
    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect()
        # Honour the `device` argument rather than silently using CONFIG['device'].
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler,
                                           dataloader=train_loader,
                                           device=device, epoch=epoch)

        val_epoch_loss = valid_one_epoch(model, valid_loader, device=device,
                                         epoch=epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)

        # deep copy the model
        if val_epoch_loss <= best_epoch_loss:
            print(f"Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})")
            best_epoch_loss = val_epoch_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = f"Epoch-{epoch}.bin"
            # Save a model file from the current directory
            torch.save(model.state_dict(), PATH)
            print("Model Saved")

        print()

    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Loss: {:.4f}".format(best_epoch_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history

In [12]:
def fetch_scheduler(optimizer):
    """Build the LR scheduler named by ``CONFIG['scheduler']``.

    Args:
        optimizer: Optimizer the scheduler will drive.

    Returns:
        A ``torch.optim.lr_scheduler`` instance, or ``None`` when
        ``CONFIG['scheduler']`` is ``None``.

    Raises:
        ValueError: If ``CONFIG['scheduler']`` names an unsupported scheduler.
        KeyError: If the chosen scheduler's settings (``T_max`` / ``T_0`` /
            ``min_lr``) are missing from ``CONFIG``.
    """
    name = CONFIG['scheduler']

    if name is None:
        return None
    if name == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer, T_max=CONFIG['T_max'],
                                              eta_min=CONFIG['min_lr'])
    if name == 'CosineAnnealingWarmRestarts':
        return lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=CONFIG['T_0'],
                                                        eta_min=CONFIG['min_lr'])

    # Previously an unknown name fell through to `return scheduler` and failed
    # with an UnboundLocalError; fail with an actionable message instead.
    raise ValueError(
        f"Unsupported scheduler {name!r}; expected None, 'CosineAnnealingLR' "
        f"or 'CosineAnnealingWarmRestarts'."
    )

In [13]:
# Build the model and move it to the configured device.
model = ToxicModel(CONFIG['model_name']).to(CONFIG['device'])

# Define Optimizer and Scheduler
optimizer = AdamW(
    model.parameters(),
    lr=CONFIG['learning_rate'],
    weight_decay=CONFIG['weight_decay'],
)
scheduler = fetch_scheduler(optimizer)

# Run the full training loop; `model` comes back holding the best weights seen.
model, history = run_training(
    model,
    optimizer,
    scheduler,
    device=CONFIG['device'],
    num_epochs=CONFIG['epochs'],
)

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU: Tesla P100-PCIE-16GB



100%|██████████| 3989/3989 [15:06<00:00,  4.40it/s, Epoch=1, LR=0.0001, Train_Loss=0.0495]
100%|██████████| 998/998 [01:14<00:00, 13.33it/s, Epoch=1, LR=0.0001, Valid_Loss=0.0437]


Validation Loss Improved (inf ---> 0.04374783788248138)
Model Saved



100%|██████████| 3989/3989 [15:07<00:00,  4.40it/s, Epoch=2, LR=0.0001, Train_Loss=0.0389]
100%|██████████| 998/998 [01:15<00:00, 13.28it/s, Epoch=2, LR=0.0001, Valid_Loss=0.0427]


Validation Loss Improved (0.04374783788248138 ---> 0.0427050941625936)
Model Saved

Training complete in 0h 32m 46s
Best Loss: 0.0427


# Testing

In [14]:
# Read the unlabelled test comments from the zipped competition CSV and display the frame.
df_test = pd.read_csv(
    "/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip"
)
df_test

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu..."
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ..."
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the..."


In [15]:
# mode="test" makes the dataset return placeholder targets instead of reading label columns.
test_dataset = ToxicDataset(
    df_test,
    tokenizer=CONFIG['tokenizer'],
    max_length=CONFIG['max_length'],
    mode="test",
)
# Keep the original row order so predictions line up with the test rows.
test_loader = DataLoader(
    test_dataset,
    batch_size=CONFIG['valid_batch_size'],
    num_workers=2,
    shuffle=False,
    pin_memory=True,
)

In [16]:
@torch.no_grad()
def test_fn(model, dataloader, device):
    """Run inference over ``dataloader`` and collect sigmoid probabilities.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)``.
        dataloader: Yields dicts with ``toxic_ids``, ``toxic_mask`` and
            ``token_type_ids`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        A list with one NumPy array of probabilities per batch, in loader order.
    """
    model.eval()

    batch_probabilities = []

    for batch in tqdm(dataloader, total=len(dataloader)):
        ids = batch['toxic_ids'].to(device, dtype=torch.long)
        mask = batch['toxic_mask'].to(device, dtype=torch.long)
        type_ids = batch['token_type_ids'].to(device, dtype=torch.long)

        logits = model(ids, mask, type_ids)
        probabilities = torch.sigmoid(logits).cpu().numpy()
        batch_probabilities.append(probabilities)

    gc.collect()

    return batch_probabilities

In [17]:
# Rebuild the architecture and restore the checkpoint written during training.
model = ToxicModel(CONFIG['model_name'])
model.to(CONFIG['device'])
# map_location lets a checkpoint saved on GPU load on whichever device is configured now.
model.load_state_dict(
    torch.load('/kaggle/working/Epoch-2.bin', map_location=CONFIG['device'])
)

preds = test_fn(model, test_loader, CONFIG['device'])

# `preds` is a list of per-batch arrays and the last batch can be shorter.
# np.array() on such a ragged list is an error on recent NumPy releases, so the
# batches are stored explicitly in a 1-D object array (one element per batch).
final_preds = np.empty(len(preds), dtype=object)
for batch_idx, batch_preds in enumerate(preds):
    final_preds[batch_idx] = batch_preds

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 4787/4787 [05:33<00:00, 14.33it/s]
  


In [18]:
# Stack the per-batch prediction arrays row-wise into a single
# (num_rows, num_labels) array in one vectorised call instead of appending
# every row in a Python loop.
op = np.concatenate(list(final_preds), axis=0)

In [19]:
def pred_one_input(ct, model):
    """Predict label probabilities for a single comment.

    Uses ``CONFIG['tokenizer']``, ``CONFIG['max_length']`` and
    ``CONFIG['device']``. Switches ``model`` to eval mode and runs without
    gradient tracking.

    Args:
        ct: Raw comment text.
        model: Module called as ``model(ids, mask, token_type_ids)``.

    Returns:
        NumPy array of shape ``(1, num_classes)`` with sigmoid probabilities,
        matching what ``test_fn`` produces per row.
    """
    # Same whitespace normalisation ToxicDataset applies before tokenizing.
    ct = " ".join(str(ct).split())
    ct_tokenized = CONFIG['tokenizer'].encode_plus(
        ct,
        truncation=True,
        add_special_tokens=True,
        max_length=CONFIG['max_length'],
        padding='max_length'
    )

    device = CONFIG['device']
    # Wrap in a list to add the batch dimension.
    ct_in = torch.tensor([ct_tokenized['input_ids']], dtype=torch.long, device=device)
    ct_mk = torch.tensor([ct_tokenized['attention_mask']], dtype=torch.long, device=device)
    # The model's forward takes a third positional argument; pass the same
    # zero placeholder the dataset emits so the call does not raise TypeError.
    token_type_ids = torch.tensor([0], dtype=torch.long, device=device)

    model.eval()  # disable dropout for a deterministic prediction
    with torch.no_grad():
        logits = model(ct_in, ct_mk, token_type_ids)

    return torch.sigmoid(logits).cpu().numpy()

In [20]:
# Start from the competition's sample submission so ids and column layout match.
sub_df = pd.read_csv(
    '/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip'
)

# Fill all six label columns at once from the batched predictions in `op`
# (an earlier per-row variant went through pred_one_input instead).
sub_df[
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
] = op

In [21]:
# Display the filled-in submission frame as a visual sanity check.
sub_df

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.994255,0.443665,0.968408,0.065068,0.924833,0.759309
1,0000247867823ef7,0.003549,0.000026,0.000418,0.000029,0.000624,0.000140
2,00013b17ad220c46,0.001099,0.000028,0.000153,0.000026,0.000242,0.000066
3,00017563c3f7919a,0.000371,0.000082,0.000124,0.000054,0.000168,0.000085
4,00017695ad8997eb,0.008662,0.000040,0.001144,0.000046,0.001481,0.000337
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.699978,0.007321,0.300883,0.004638,0.158828,0.026868
153160,fffd7a9a6eb32c16,0.037457,0.000164,0.003789,0.000304,0.006145,0.003659
153161,fffda9e8d6fafa9e,0.000383,0.000076,0.000127,0.000051,0.000162,0.000081
153162,fffe8f1340a79fc2,0.000445,0.000057,0.000125,0.000040,0.000163,0.000071


In [22]:
# Write the submission CSV to the current directory; index=False leaves out the DataFrame index column.
sub_df.to_csv('toxic_submission.csv', index=False)