In [1]:
# Mount Google Drive so the project folder is reachable from the Colab VM.
from google.colab import drive
drive.mount('/content/drive')

# Switch into the project folder: later cells build their paths from os.getcwd().
%cd /content/drive/MyDrive/Toxic_Comment_Classification

Mounted at /content/drive
/content/drive/MyDrive/Toxic_Comment_Classification


In [2]:
!pip install torch
!pip install transformers
!pip install pandas
!pip install scikit-learn

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/db/98c3ea1a78190dac41c0127a063abf92bd01b4b0b6970a6db1c2f5b66fa0/transformers-4.0.1-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 5.8MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 30.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 34.9MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=66589059977fc800515

In [3]:
import json
import os
import time

import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from transformers import BertForSequenceClassification
from transformers import AdamW
from transformers import BertTokenizer
from transformers import get_linear_schedule_with_warmup

In [4]:
# --- Data loading / tokenisation settings ---
batch_size = 1          # samples per DataLoader batch
max_token_len = 256     # sequences are padded/truncated to this many tokens
log_interval = 10

# --- Fine-tuned checkpoint location (relative to the working directory) ---
checkpoint_name = 'checkpoint_39500.bin'
checkpoint_storing_path = os.path.join(os.getcwd(), "checkpoints")
checkpoint_full_path = os.path.join(os.getcwd(), "checkpoints", checkpoint_name)

# --- Output of the analysis: JSON file name and the dict that gets dumped ---
DATA_FILENAME = "toxic_comment_details.json"
toxic_comment_details = dict()

In [5]:
# Run on the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [6]:
def compute_metrics(logits, labels, threshold=0.5, average='binary'):
    """Compute accuracy, precision, recall and F1 for thresholded predictions.

    Args:
        logits: Tensor of model scores. Values are compared against
            ``threshold`` as-is; no sigmoid is applied here.
            NOTE(review): if raw logits are passed, apply ``torch.sigmoid``
            first or use ``threshold=0.0`` -- confirm against the caller.
        labels: Tensor of ground-truth 0/1 targets with the same shape as
            ``logits``.
        threshold: Scores greater than or equal to this value are predicted
            as 1, everything else as 0. Defaults to 0.5.
        average: Forwarded to ``precision_recall_fscore_support``. The
            default ``'binary'`` only works for single-label targets; pass
            e.g. ``'micro'`` or ``'macro'`` for multi-label targets.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    # detach() + cpu() so tensors that track gradients or live on the GPU
    # can be converted to NumPy.
    scores = logits.detach().cpu()
    labels_np = labels.detach().cpu().numpy()
    preds_np = (scores >= threshold).to(scores.dtype).numpy()

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels_np, preds_np, average=average
    )
    # Accuracy must be computed on the thresholded predictions; comparing the
    # labels with the continuous scores is rejected by scikit-learn.
    acc = accuracy_score(labels_np, preds_np)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [7]:
class ToxicCommentDataset(Dataset):
    """Dataset of comments and their multi-label toxicity targets read from a CSV.

    The CSV must contain a ``comment_text`` column and one column per entry
    of ``self.list_classes``.

    Args:
        csv_path: Path of the CSV file to load.
        tokenizer: Callable used to encode a comment. It is called with
            ``return_tensors='pt'``, ``padding="max_length"``,
            ``truncation=True`` and ``max_length=max_token_len`` and must
            return a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_token_len: Length every encoded comment is padded/truncated to.
        device: Device the tensors of each sample are moved to.
        transform: Optional callable applied to the sample dict before it is
            returned.
    """

    def __init__(self, csv_path, tokenizer, max_token_len=256, device="cpu", transform=None):
        self.list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

        dataset_df = pd.read_csv(csv_path)
        self.sentence_list = dataset_df["comment_text"].tolist()
        # Use the class's own label list rather than a notebook-level global,
        # so the dataset does not depend on cell execution order.
        self.labels = torch.from_numpy(dataset_df[self.list_classes].values).float()

        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        self.device = device
        self.transform = transform

    def __len__(self):
        """Return the number of comments in the dataset."""
        return len(self.sentence_list)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict.

        Keys: ``'input_text'`` (the raw comment), ``'input_ids'`` and
        ``'attention_mask'`` (flattened to 1-D) and ``'label'`` (float tensor
        with one entry per class). Tensors are moved to ``self.device``.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        text = self.sentence_list[idx]
        # Encode with the tokenizer handed to the constructor (not a global).
        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            padding="max_length",
            truncation=True,
            max_length=self.max_token_len,
        )
        sample = {
            'input_text': text,
            # The tokenizer output has a leading batch dimension of 1; flatten it.
            'input_ids': encoding['input_ids'].view(-1).to(self.device),
            'attention_mask': encoding['attention_mask'].view(-1).to(self.device),
            'label': self.labels[idx].to(self.device)
        }

        if self.transform:
            sample = self.transform(sample)

        return sample

In [8]:
# Load the pretrained uncased BERT tokenizer; do_lower_case lower-cases the
# input text before tokenisation.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [9]:
# Label columns of the CSV; also used to size the classifier head.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Build the dataset from <cwd>/dataset/train.csv.
train_csv_path = os.path.join(os.getcwd(), 'dataset', 'train.csv')
train_dataset = ToxicCommentDataset(
    train_csv_path,
    tokenizer,
    max_token_len,
    device
)

train_data_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True
)

# len(DataLoader) is ceil(len(dataset) / batch_size) when drop_last is False.
# Computing int(n / batch_size) + 1 instead reports one batch too many
# whenever n is an exact multiple of batch_size (always for batch_size=1).
num_batches = len(train_data_loader)

In [10]:
# Binary cross-entropy over independent labels. BCEWithLogitsLoss applies the
# sigmoid internally, so it must be fed raw logits, not probabilities.
criterion = nn.BCEWithLogitsLoss()

Read outputs of the fine-tuned model


In [None]:
# Restore the fine-tuned classifier from the checkpoint path. The head has
# one output per entry of list_classes; attention maps are not returned.
model = BertForSequenceClassification.from_pretrained(
    checkpoint_full_path,
    num_labels=len(list_classes),
    return_dict=True,
    output_attentions=False,
)
model.to(device)

In [13]:
model.eval()

# Per-batch results are gathered in lists and concatenated once after the
# loop; growing a tensor with torch.cat inside the loop would copy it on
# every iteration.
logit_chunks = []
label_chunks = []
with torch.no_grad():
    for batch in train_data_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        label = batch['label']

        outputs = model(input_ids, attention_mask=attention_mask)

        # Keep the results on the CPU so the collected tensors do not hold
        # on to GPU memory.
        logit_chunks.append(outputs.logits.cpu())
        label_chunks.append(label.cpu())

        # NOTE(review): disabled attention export, kept for reference. It
        # reads outputs.attentions, which is only populated when the model is
        # loaded with output_attentions=True, and it uses `toxic_index`,
        # which is not defined in the visible cells.
        #
        # loss = criterion(outputs.logits, label)
        #
        # tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
        # num_non_padding = len(tokens) - tokens.count('[PAD]')
        # tokens = tokens[: num_non_padding]
        # attention_holder = outputs.attentions[-3:]
        #
        # toxic_comment_details[f"{toxic_index}"] = {
        #     "attention_weights": [torch.squeeze(att)[:, : num_non_padding, : num_non_padding].tolist() for att in attention_holder],
        #     "tokens": tokens,
        #     "y_logit": torch.sigmoid(torch.squeeze(outputs.logits)).tolist(),
        #     "y": torch.squeeze(label).tolist(),
        # }
        #
        #     toxic_index = toxic_index + 1
        #
        # if (toxic_index + 1) == 15:
        #     break
        #
        # print(toxic_index + 1)

# Both tensors end up with shape (num_samples, num_labels); an empty loader
# yields empty tensors with the same number of columns.
num_labels = len(list_classes)
logits = torch.cat(logit_chunks) if logit_chunks else torch.zeros([0, num_labels])
labels = torch.cat(label_chunks) if label_chunks else torch.zeros([0, num_labels])

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24


Write the collected details to a JSON file


In [14]:
# Round every float to 3 decimals by round-tripping through JSON: parse_float
# receives the string form of each float that json.dumps produced.
# This runs before the file is opened so that a serialisation error cannot
# leave a truncated output file behind.
toxic_comment_details = json.loads(
    json.dumps(toxic_comment_details), parse_float=lambda x: round(float(x), 3)
)

# DATA_FILENAME is the output file name defined in the configuration cell.
with open(DATA_FILENAME, "w") as f:
    json.dump(toxic_comment_details, f, indent=2)

Tensorboard


In [15]:
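# !pip install tensorboard
# A bare `tensorboard --logdir=...` line is not valid Python and raises a
# SyntaxError in a notebook cell; use the notebook magic instead:
# %load_ext tensorboard
# %tensorboard --logdir /content/drive/MyDrive/Toxic_Comment_Classification/runs/Dec15_08-45-00_cd7758b89179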

SyntaxError: ignored