In [1]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/Toxic_Comment_Classification

Mounted at /content/drive
/content/drive/MyDrive/Toxic_Comment_Classification


In [2]:
!pip install torch
!pip install transformers
!pip install pandas
!pip install scikit-learn

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/db/98c3ea1a78190dac41c0127a063abf92bd01b4b0b6970a6db1c2f5b66fa0/transformers-4.0.1-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 8.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 30.1MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 54.6MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=c98d4cfe9323f

In [3]:
import os
import time
import tqdm

import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import torch
from torch import nn 
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from transformers import BertForSequenceClassification
from transformers import AdamW
from transformers import BertTokenizer
from transformers import get_linear_schedule_with_warmup

In [4]:
batch_size = 16
max_token_len = 256
log_interval = 10

checkpoint_name = 'checkpoint_39500.bin'
checkpoint_storing_path = os.path.join(os.getcwd(), "checkpoints")
checkpoint_full_path = os.path.join(checkpoint_storing_path, checkpoint_name)

DATASET_PATH = os.path.join(os.getcwd(), 'dataset')

In [5]:
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [6]:
class ToxicCommentDataset(Dataset):
    def __init__(self, csv_path, tokenizer, max_token_len=256, device="cpu", mode="train", transform=None):
        self.list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

        dataset_df = pd.read_csv(csv_path)
        self.sentence_list = dataset_df["comment_text"].tolist()
        self.sentence_ids = dataset_df["id"].tolist()

        if mode in ["train", "val"]:
            self.use_label = True
        elif mode == "test":
            self.use_label = False
        else:
            raise ValueError("Argument: 'mode' should be 'train', 'val', or 'test'")

        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        self.device = device
        self.transform = transform

    def __len__(self):
        return len(self.sentence_list)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        encoding = tokenizer(self.sentence_list[idx], return_tensors='pt', padding="max_length", truncation=True, max_length=self.max_token_len)
        sample = {
            # 'input_text': self.sentence_list[idx],
            'input_ids': encoding['input_ids'].view(-1).to(device), 
            'attention_mask': encoding['attention_mask'].view(-1).to(device),
            'sentence_id': self.sentence_ids[idx],
        }
        if self.use_label:
            sample.update({'label': self.labels[idx].to(device)})

        if self.transform:
            sample = self.transform(sample)

        return sample

In [7]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased', 
    do_lower_case=True, 
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [8]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
test_csv_path = os.path.join(DATASET_PATH, 'test.csv')
test_dataset = ToxicCommentDataset(
    test_csv_path, 
    tokenizer, 
    max_token_len, 
    device,
    mode="test"
)

test_data_loader = DataLoader(
    dataset=test_dataset, 
    batch_size=batch_size,
    shuffle=False
)
num_batches = int(len(test_dataset)/batch_size) + 1

In [9]:
criterion = nn.BCEWithLogitsLoss()

Read outputs of the fine-tuned model


In [10]:
model = BertForSequenceClassification.from_pretrained(
    checkpoint_full_path, 
    output_attentions=False,
    return_dict=True, 
    num_labels=len(list_classes)
)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [12]:
sample_submission = pd.read_csv(os.path.join(DATASET_PATH, "sample_submission.csv"), index_col='id')
model.eval()

with torch.no_grad():
    for batch_idx, batch in enumerate(test_data_loader):

        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        sentence_ids = batch['sentence_id']

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.sigmoid(outputs.logits).cpu().numpy()

        sample_submission.loc[sentence_ids, list_classes] = predictions

        print(batch_idx, num_batches)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
4571 9573
4572 9573
4573 9573
4574 9573
4575 9573
4576 9573
4577 9573
4578 9573
4579 9573
4580 9573
4581 9573
4582 9573
4583 9573
4584 9573
4585 9573
4586 9573
4587 9573
4588 9573
4589 9573
4590 9573
4591 9573
4592 9573
4593 9573
4594 9573
4595 9573
4596 9573
4597 9573
4598 9573
4599 9573
4600 9573
4601 9573
4602 9573
4603 9573
4604 9573
4605 9573
4606 9573
4607 9573
4608 9573
4609 9573
4610 9573
4611 9573
4612 9573
4613 9573
4614 9573
4615 9573
4616 9573
4617 9573
4618 9573
4619 9573
4620 9573
4621 9573
4622 9573
4623 9573
4624 9573
4625 9573
4626 9573
4627 9573
4628 9573
4629 9573
4630 9573
4631 9573
4632 9573
4633 9573
4634 9573
4635 9573
4636 9573
4637 9573
4638 9573
4639 9573
4640 9573
4641 9573
4642 9573
4643 9573
4644 9573
4645 9573
4646 9573
4647 9573
4648 9573
4649 9573
4650 9573
4651 9573
4652 9573
4653 9573
4654 9573
4655 9573
4656 9573
4657 9573
4658 9573
4659 9573
4660 9573
4661 9573
4662 9573
4663 9573
4664 

In [14]:
sample_submission.to_csv("submission.csv")

In [None]:
# kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f submission.csv -m "Message"