In [None]:
# Mount Google Drive into the Colab runtime so the project folder is reachable.
from google.colab import drive
drive.mount('/content/drive')

# Make the project folder the working directory; later cells build the dataset
# and checkpoint paths from os.getcwd().
%cd /content/drive/MyDrive/Toxic_Comment_Classification

Mounted at /content/drive
/content/drive/MyDrive/Toxic_Comment_Classification


In [None]:
# Install the third-party packages imported by the next cell.
# NOTE(review): versions are not pinned; the recorded output below shows
# transformers 4.0.1 being installed — consider pinning for reproducibility.
!pip install torch
!pip install transformers
!pip install pandas
!pip install scikit-learn

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/db/98c3ea1a78190dac41c0127a063abf92bd01b4b0b6970a6db1c2f5b66fa0/transformers-4.0.1-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 9.1MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 16.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 40.6MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=2d7eb8db79889aedad6

In [None]:
import os
import time

import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import torch
from torch import nn 
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from transformers import BertForSequenceClassification
from transformers import AdamW
from transformers import BertTokenizer
from transformers import get_linear_schedule_with_warmup

In [None]:
# TensorBoard writer; the training loop logs the per-step training loss to it
# and flushes/closes it once training finishes.
writer = SummaryWriter()

In [None]:
# Training configuration shared by the cells below.
epochs = 4  # number of full passes over the training DataLoader
batch_size = 16  # batch size handed to the DataLoader
max_token_len = 256  # max_length used by the dataset when padding/truncating each comment
log_interval = 10  # print a progress line every this many batches
store_interval = 500  # save a model checkpoint every this many global steps
# Checkpoints are written under "<current working directory>/checkpoints".
checkpoint_storing_path = os.path.join(os.getcwd(), "checkpoints")

In [None]:
# Prefer the first CUDA device when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
def compute_metrics(logits, labels, threshold=0.5, average=None):
    """Compute accuracy, F1, precision and recall from raw model logits.

    The logits are passed through a sigmoid and binarised at ``threshold``,
    then compared against ``labels`` with scikit-learn.

    Args:
        logits: Tensor of raw (pre-sigmoid) scores, either 1-D for a single
            binary task or 2-D ``(n_samples, n_labels)`` for multi-label.
        labels: Tensor of 0/1 targets with the same shape as ``logits``.
        threshold: Probability cut-off at or above which a label is predicted
            positive. Defaults to 0.5.
        average: Averaging mode forwarded to
            ``precision_recall_fscore_support``. When ``None`` it is chosen
            from the target shape: ``'binary'`` for 1-D targets and
            ``'micro'`` for 2-D (multi-label) targets, because ``'binary'``
            is rejected by scikit-learn for multi-label input.

    Returns:
        dict with keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``. For multi-label targets ``accuracy_score`` is the
        subset accuracy (every label of a sample must match).
    """
    # detach() so tensors that still carry autograd history can be converted,
    # and cpu() so CUDA tensors can be turned into NumPy arrays.
    probs = torch.sigmoid(logits.detach().float())
    preds_np = (probs >= threshold).long().cpu().numpy()
    labels_np = labels.detach().long().cpu().numpy()

    if average is None:
        average = 'binary' if labels_np.ndim == 1 else 'micro'

    # Score the thresholded predictions, not the continuous logits.
    precision, recall, f1, _ = precision_recall_fscore_support(labels_np, preds_np, average=average)
    acc = accuracy_score(labels_np, preds_np)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
class ToxicCommentDataset(Dataset):
    """Map-style dataset over a toxic-comment CSV file.

    The CSV is read once in ``__init__``; it must contain a ``comment_text``
    column and one column per entry of ``self.list_classes``. Comments are
    tokenized lazily, one at a time, in ``__getitem__``.

    Args:
        csv_path: Path of the CSV file to load.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            padding="max_length", truncation=True, max_length=...)`` whose
            result is indexable by ``'input_ids'`` and ``'attention_mask'``.
        max_token_len: Length every comment is padded/truncated to.
        device: Device the returned tensors are moved to.
        transform: Optional callable applied to each sample dict before it is
            returned.
    """

    def __init__(self, csv_path, tokenizer, max_token_len=256, device="cpu", transform=None):
        self.list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

        dataset_df = pd.read_csv(csv_path)
        self.sentence_list = dataset_df["comment_text"].tolist()
        # Use the instance's own class list rather than a notebook-level global,
        # so the dataset works regardless of which other cells have been run.
        self.labels = torch.from_numpy(dataset_df[self.list_classes].values).float()

        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        self.device = device
        self.transform = transform

    def __len__(self):
        """Return the number of comments loaded from the CSV."""
        return len(self.sentence_list)

    def __getitem__(self, idx):
        """Tokenize comment ``idx`` and return it with its label vector.

        Returns:
            dict with flattened ``'input_ids'`` and ``'attention_mask'``
            tensors and the float ``'label'`` row for this comment, all moved
            to ``self.device`` (then passed through ``self.transform`` if set).
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Use the tokenizer and device given to the constructor instead of
        # whatever globals named `tokenizer` / `device` happen to exist.
        encoding = self.tokenizer(
            self.sentence_list[idx],
            return_tensors='pt',
            padding="max_length",
            truncation=True,
            max_length=self.max_token_len
        )
        sample = {
            # view(-1) drops the leading batch dimension added by return_tensors='pt'.
            'input_ids': encoding['input_ids'].view(-1).to(self.device),
            'attention_mask': encoding['attention_mask'].view(-1).to(self.device),
            'label': self.labels[idx].to(self.device)
        }

        if self.transform:
            sample = self.transform(sample)

        return sample

In [None]:
# Load the pretrained tokenizer that matches the 'bert-base-uncased' checkpoint
# used for the model; do_lower_case=True requests lower-casing of the input text.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased', 
    do_lower_case=True, 
)

# encoding = tokenizer(list_sentences_train[:10], return_tensors='pt', padding=True, truncation=True)
# input_ids = encoding['input_ids']
# attention_mask = encoding['attention_mask']

# print(input_ids.shape)
# print(attention_mask.shape)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
# Label columns of the training CSV; the model gets one output per entry.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_csv_path = os.path.join(os.getcwd(), 'dataset', 'train.csv')
train_dataset = ToxicCommentDataset(
    train_csv_path,
    tokenizer,
    max_token_len,
    device
)

train_data_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True
)
# Ask the loader for its own length instead of computing
# int(len / batch_size) + 1, which counts one batch too many whenever the
# dataset size is an exact multiple of batch_size (and would then skew the
# scheduler's total step count and the global step numbering).
num_batches = len(train_data_loader)

In [None]:
# Load pretrained BERT with a sequence-classification head that has one output
# per entry of list_classes. return_dict=True lets the training loop read
# `outputs.logits` by attribute.
# NOTE(review): output_attentions=True is not used by the training loop shown
# in this notebook — presumably it is for the later analysis section; confirm.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', 
    # 'nlpaueb/legal-bert-small-uncased',
    return_dict=True, 
    output_attentions=True,
    num_labels=len(list_classes)
)
# Move the model's parameters to the selected device (GPU when available).
model.to(device)

In [None]:
# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
# Two parameter groups: everything else is decayed with 0.01, while biases and
# LayerNorm weights are left undecayed.
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)

In [None]:
# Multi-label objective: binary cross-entropy applied to the raw logits
# (the sigmoid is part of the loss), compared against the float label vectors.
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Learning-rate schedule with warmup; the training loop calls scheduler.step()
# once per batch, so the total step count is epochs * batches-per-epoch.
num_warmup_steps = 200
num_train_steps = epochs*num_batches
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_train_steps)

In [None]:
# Fine-tuning loop: one optimizer and scheduler step per batch, the per-step
# loss logged to TensorBoard, periodic checkpoints, and a console progress line
# every `log_interval` batches.
model.train()
for epoch in range(epochs):
    # Running totals for the current logging window; reset after each print.
    losses = 0.
    num_trained_seq = 0
    start_time = time.time()
    for batch_idx, batch in enumerate(train_data_loader):
        optimizer.zero_grad()

        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        label = batch['label']

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, label)

        loss.backward()
        optimizer.step()

        # Read the scalar once; logging a plain float avoids handing the
        # graph-attached loss tensor to the writer.
        loss_value = loss.item()

        # Global 1-based step counter across epochs.
        current_step = epoch*num_batches + (batch_idx + 1)
        writer.add_scalar("Loss/train", loss_value, current_step)

        if current_step % store_interval == 0:
            # NOTE(review): save_pretrained is given a path ending in ".bin";
            # presumably it is created as a directory — confirm against the
            # installed transformers version.
            checkpoint_full_path = os.path.join(checkpoint_storing_path, f"checkpoint_{current_step}.bin")
            model.save_pretrained(checkpoint_full_path)

        # `batch` is a dict, so len(batch) would be its number of keys (3), not
        # the number of sequences; take the batch size from the label tensor so
        # a smaller final batch is weighted correctly in the running average.
        current_batch_size = label.size(0)
        num_trained_seq += current_batch_size
        losses += current_batch_size*loss_value

        if (batch_idx + 1) % log_interval == 0:
            current_loss = losses / num_trained_seq
            elapsed = time.time() - start_time
            print('epoch: {:3d} | step: {:5d} | batch: {:5d} | lr: {:5.6f} | ms/batch: {:5.2f} | loss: {:5.3f}'.format(
                epoch,
                current_step,
                (batch_idx + 1),
                optimizer.param_groups[0]['lr'],
                elapsed * 1000 / log_interval,
                current_loss
            ))

            losses = 0.
            num_trained_seq = 0
            start_time = time.time()

        # Advance the learning-rate schedule after the optimizer step.
        scheduler.step()

writer.flush()
writer.close()

Read outputs of the fine-tuned model
