In [14]:
import pandas as pd
import numpy as np
import re

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, RandomSampler

from transformers import BertTokenizer, BertModel

from tqdm import tqdm
import warnings

warnings.filterwarnings('ignore')

device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda:0')

PATH = "../data/inheritim/"

SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Data preparation

In [15]:
train_df = pd.read_csv(PATH + 'jigsaw.csv', index_col=0)

In [16]:
train_df

Unnamed: 0,id,comment_text,toxic,non_toxic,length
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,1,43
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,1,17
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,1,42
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,1,113
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,1,13
...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,1,47
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,1,18
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,1,12
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,1,25


In [17]:
from sklearn.model_selection import train_test_split

df_train, df_valid = train_test_split(train_df, test_size=0.2, stratify=train_df["toxic"], random_state=SEED)

let's take equal number of toxic and non-toxic comments since our data imbalanced

In [18]:
df_train = pd.concat(
    [
        df_train[df_train["toxic"] == 0].sample(
            10000, random_state=199, replace=False
        ),
        df_train[df_train["toxic"] == 1].sample(
            10000, random_state=199, replace=False
        ),
    ]
)
df_valid = pd.concat(
    [
        df_valid[df_valid["toxic"] == 0].sample(
            1000, random_state=199, replace=False
        ),
        df_valid[df_valid["toxic"] == 1].sample(
            1000, random_state=199, replace=False
        ),
    ]
)

In [19]:
df_train.shape, df_valid.shape

((20000, 5), (2000, 5))

## Custom Dataset and load it to Dataloader 

In [20]:
def clear_str(string: str) -> str:
    """
    Tokenization/string cleaning for text
    """
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ." 
    string = re.sub(r"([?.!,¿])", r" \1 ", string)
    string = re.sub(r'[" "]+', " ", string)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    string = re.sub(r"[^a-zA-Z?.!,¿]+", " ", string)
    return string.strip().lower()

class ToxicDataset(Dataset):

    def __init__(self, tokenizer: BertTokenizer, dataframe: pd.DataFrame):
        self.tokenizer = tokenizer
        self.pad_idx = tokenizer.pad_token_id
        self.df = dataframe

    def row_to_tensor(self, tokenizer: BertTokenizer, row: pd.Series, max_len: int=120):
        tokens = tokenizer.encode(clear_str(row["comment_text"]), add_special_tokens=True, max_length=max_len, truncation=True)
        x = torch.LongTensor(tokens)

        labels = ['non_toxic', 'toxic']
        y = torch.FloatTensor(row[labels])
        return x, y

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index: int):
        return self.row_to_tensor(self.tokenizer, self.df.iloc[index])

In [21]:
from functools import partial
from torch.nn.utils.rnn import pad_sequence


def collate_fn(batch: torch.Tensor, device: torch.device) -> (torch.Tensor, torch.Tensor):
    """
    merges a list of samples to form a mini-batch of Tensors
    :param batch: batch of data
    :param device: cpu or gpu
    :return: 
    """
    x, y = list(zip(*batch))
    x = pad_sequence(x, batch_first=True, padding_value=0)
    y = torch.stack(y)
    return x.to(device), y.to(device)

collate_fn = partial(collate_fn, device=device)

In [22]:
bert_model_name = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

train_dataset = ToxicDataset(tokenizer, df_train)
val_dataset = ToxicDataset(tokenizer, df_valid)

BATCH_SIZE = 32
train_sampler = RandomSampler(train_dataset)
dev_sampler = RandomSampler(val_dataset)

train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                            sampler=train_sampler, collate_fn=collate_fn)

val_iterator = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                          sampler=dev_sampler, collate_fn=collate_fn)

# Build the model

In [23]:
class BertClassifier(nn.Module):
    def __init__(self, bert: BertModel, num_classes: int):
        super().__init__()
        self.bert = bert
        self.classifier = nn.Linear(bert.config.hidden_size, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask=None):
        x = self.bert(input_ids, attention_mask=attention_mask)
        cls_x = x[1] 
        cls_x = self.classifier(cls_x)
        out = self.softmax(cls_x)
        return out

# Training

In [24]:
from sklearn.metrics import roc_auc_score

def train(model, iterator, optimizer, scheduler, loss_func):
    model.train()
    total_loss = 0

    iteration = 0
    for x, y in tqdm(iterator):
        iteration += 1
        optimizer.zero_grad()
        mask = (x != 0).float()
        outputs = model(x, attention_mask=mask)
        loss = loss_func(outputs, y)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()

    print(f"Train loss {total_loss / len(iterator)}\n")

def evaluate(model, iterator, loss_func, verbose=True):
    model.eval()
    pred = []
    true = []
    with torch.no_grad():
        total_loss = 0
        for x, y in iterator:
            mask = (x != 0).float()
            outputs = model(x, attention_mask=mask)
            loss = loss_func(outputs, y)
            total_loss += loss
            outputs = torch.argmax(outputs, dim=1)
            y = torch.argmax(y, dim=1)
            true += y.cpu().numpy().tolist()
            pred += outputs.cpu().numpy().tolist()
    true = np.array(true)
    pred = np.array(pred)

    labels = ['toxic']
    if verbose:
        print("\nROC_AUC for labels:")
        for i, name in enumerate(labels):
            print(f" * {name} - {roc_auc_score(true, pred)}")
        print(f"\nEVALUATE LOSS -  {total_loss / len(iterator)}\n")

    return true, pred

In [25]:
from transformers import get_linear_schedule_with_warmup

EPOCH_NUM = 1
lr = 2e-5

bert      = BertModel.from_pretrained(bert_model_name)
model     = BertClassifier(bert, 2).to(device)
loss_func = nn.BCELoss()
optimizer = optim.AdamW(model.parameters(), lr=lr)

# The triangle learning rate advances linearly until half of the first epoch, then linearly decays.
w_steps = 10 ** 3
t_steps = len(train_iterator) * EPOCH_NUM - w_steps
scheduler = get_linear_schedule_with_warmup(optimizer, w_steps, t_steps)

In [26]:
torch.cuda.empty_cache()

for i in range(EPOCH_NUM):
    print('=' * 50, f"EPOCH {i + 1}", '=' * 50)
    print("\nTRAINING\n")
    train(model, train_iterator, optimizer, scheduler, loss_func)
    print("\nEVALUATING\n")
    evaluate(model, val_iterator, loss_func)


TRAINING


100%|██████████| 625/625 [03:58<00:00,  2.62it/s]


Train loss 0.3446442441403866


EVALUATING

ROC_AUC for labels:
 * toxic - 0.9089999999999999

EVALUATE LOSS -  0.20584282279014587


# Save model

In [27]:
PATH_OUT = "../models/bert-detoxification/"

torch.save(model.state_dict(), PATH_OUT + "model-final.pt")