<a href="https://colab.research.google.com/github/bcmin1018/NLP/blob/main/Classification/Toxic_comment_classification/Notebooks/Toxic_comment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.19.1-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 20.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 45.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 19.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 3.8 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-hub-0

In [2]:
import torch
import torch.nn as nn
import transformers
import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook

In [3]:
import os, sys 
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!gdown --id 1-0kJHNDTDLvx8U7hrN9kq9T3aLaWI6wW
!gdown --id 1fh2Wtzjx7t9mIv6-ntUB-dv27nfJfFba

Downloading...
From: https://drive.google.com/uc?id=1-0kJHNDTDLvx8U7hrN9kq9T3aLaWI6wW
To: /content/jigsaw-unintended-bias-train(clean).csv
100% 1.43G/1.43G [00:26<00:00, 54.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1fh2Wtzjx7t9mIv6-ntUB-dv27nfJfFba
To: /content/validation.csv
100% 3.18M/3.18M [00:00<00:00, 251MB/s]


In [5]:
train_data_path= "./jigsaw-unintended-bias-train(clean).csv"
valid_data_path= "./validation.csv"
train_data = pd.read_csv(train_data_path, lineterminator='\n', usecols=['clean_comment_text','toxic'])
valid_data = pd.read_csv(valid_data_path, usecols=['comment_text','toxic'])

In [6]:
df_train = train_data[0:16]
df_valid = valid_data[0:16]

In [7]:
# dataset.py
class BERTDataset:
  def __init__(self, comment_text, target):
    self.comment_text = comment_text
    self.target = target
    self.tokenizer = TOKENIZER
    self.max_len = MAX_LEN

  def __len__(self):
    return len(self.comment_text)

  def __getitem__(self, item):
    comment_text = str(self.comment_text[item])
    comment_text = " ".join(comment_text.split())

    inputs = self.tokenizer.encode_plus(
        comment_text,
        None,
        add_special_tokens=True,
        truncation=True,
        max_length = self.max_len
    )

    ids = inputs["input_ids"]
    mask = inputs['attention_mask']
    token_type_ids = inputs["token_type_ids"]

    padding_length = self.max_len - len(ids)
    ids = ids + ([0] * padding_length)
    mask = mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([0] * padding_length)

    return {
        'ids' : torch.tensor(ids, dtype=torch.long),
        'mask' : torch.tensor(mask, dtype=torch.long),
        'token_type_ids' : torch.tensor(token_type_ids, dtype=torch.long),
        'target' : torch.tensor(self.target[item], dtype=torch.float)
    }

In [8]:
# model.py

class BERTBaseUncased(nn.Module):
  def __init__(self):
    super(BERTBaseUncased, self).__init__()
    self.bert = transformers.BertModel.from_pretrained('bert-base-multilingual-cased')
    self.bert_drop = nn.Dropout(0.3)
    #mean, max pooling (768 * 2)
    self.out = nn.Linear(768, 1)

  def forward(self, ids, mask, token_type_ids):
    outputs = self.bert(
        ids,
        attention_mask = mask,
        token_type_ids = token_type_ids
    )

    # mean_pooling = torch.mean(outputs[1], 1)
    # max_pooling, _ = torch.max(outputs[1], 1)
    # cat = torch.cat((mean_pooling, max_pooling), 1)

    # bo = self.bert_drop(cat)
    # output = self.out(bo)
    output = self.out(outputs[1])
    return output

In [9]:
#engine.py

def loss_fn(outputs, targets):
  return F.cross_entropy(outputs, targets.view(-1, 1).float())

def train(data_loader, model, optimizer, device, scheduler):
  model.train()

  for batch_idx, data in tqdm(enumerate(data_loader), total=len(data_loader)):
    ids = data['ids']
    token_type_ids = data['token_type_ids']
    mask = data['mask']
    targets = data['target']

    ids = ids.to(device, dtype=torch.long)
    token_type_ids = token_type_ids.to(device, dtype=torch.long)
    mask = mask.to(device, dtype=torch.long)
    targets = targets.to(device, dtype=torch.long)

    optimizer.zero_grad()
    outputs = model(
        ids = ids,
        mask = mask,
        token_type_ids = token_type_ids
    )

    loss = loss_fn(outputs, targets)
    loss.backward()
    optimizer.step()
    scheduler.step()

def eval(data_loader, model, device):
  model.eval()
  fin_targets = []
  fin_outputs = []
  with torch.no_grad():
    for batch_idx, data in tqdm(enumerate(data_loader), total=len(data_loader)):
      ids = data['ids']
      token_type_ids = data['token_type_ids']
      mask = data['mask']
      targets = data['target']

      ids = ids.to(device, dtype=torch.long)
      token_type_ids = token_type_ids.to(device, dtype=torch.long)
      mask = mask.to(device, dtype=torch.long)
      targets = targets.to(device, dtype=torch.long)

      outputs = model(
        ids = ids,
        mask = mask,
        token_type_ids = token_type_ids
      )

      fin_targets.extend(targets.cpu().detach().numpy().tolist())
      fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

  return fin_outputs, fin_targets

In [15]:
# config.py

MAX_LEN = 100
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 5
LR = 3e-5
MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/model.pt"
TOKENIZER = transformers.BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)

In [11]:
# train.py

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, Dataset
from torch.nn import functional as F

In [12]:
train_dataset = BERTDataset(
    comment_text = df_train.clean_comment_text.values,
    target = df_train.toxic.values
)

train_data_loader = DataLoader(
    train_dataset,
    batch_size = TRAIN_BATCH_SIZE,
    num_workers = 2
)
valid_dataset = BERTDataset(
    comment_text = df_valid.comment_text.values,
    target = df_valid.toxic.values
)

valid_data_loader = DataLoader(
    valid_dataset,
    batch_size = VALID_BATCH_SIZE,
    num_workers = 1
)

In [17]:
device = torch.device("cuda")
model = BERTBaseUncased()
model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
  {'params' : [p for n, p in param_optimizer if not any (nd in n for nd in no_decay)], 'weight_decay': 0.001},
  {'params' : [p for n, p in param_optimizer if any (nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

num_train_steps = int(len(df_train) / TRAIN_BATCH_SIZE * EPOCHS)
optimizer = AdamW(optimizer_parameters, lr = LR)
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = num_train_steps)

best_accuracy = 0
for epoch in range(EPOCHS):
  train(train_data_loader, model, optimizer, device, scheduler)
  outputs, targets = eval(valid_data_loader, model, device)
  # targets = np.array(targets) >= 0.5
  accuracy = metrics.roc_auc_score(targets, outputs)
  print((f"AUC Score = {accuracy}"))
  if accuracy > best_accuracy:
    torch.save(model.state_dict(), MODEL_PATH)
    best_accuarcy = accuracy

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1/1 [00:00<00:00,  1.95it/s]
100%|██████████| 1/1 [00:00<00:00,  4.71it/s]


AUC Score = 0.33333333333333337


100%|██████████| 1/1 [00:00<00:00,  2.11it/s]
100%|██████████| 1/1 [00:00<00:00,  5.36it/s]


AUC Score = 0.33333333333333337


100%|██████████| 1/1 [00:00<00:00,  2.07it/s]
100%|██████████| 1/1 [00:00<00:00,  5.17it/s]


AUC Score = 0.33333333333333337


100%|██████████| 1/1 [00:00<00:00,  2.00it/s]
100%|██████████| 1/1 [00:00<00:00,  4.93it/s]


AUC Score = 0.33333333333333337


100%|██████████| 1/1 [00:00<00:00,  2.21it/s]
100%|██████████| 1/1 [00:00<00:00,  4.77it/s]


AUC Score = 0.33333333333333337


In [117]:
def run():
  # # df1 = pd.read_csv
  # df_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/jigsaw-unintended-bias-train(clean).csv',lineterminator='\n', usecols=['clean_comment_text','toxic'])
  # df_train = df_train.iloc[0:100]

  # # df_train = pad.concat([df1, df2], axis=0, reset_index(drop=True)

  # df_valid = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/validation.csv')

  train_dataset = BERTDataset(
      comment_text = df_train.clean_comment_text.values,
      target = df_train.toxic.values
  )

  train_data_loader = DataLoader(
      train_dataset,
      batch_size = TRAIN_BATCH_SIZE,
      num_workers = 2
  )

  valid_dataset = BERTDataset(
     comment_text = df_valid.comment_text.values,
     target = df_valid.toxic.values
  )

  valid_data_loader = DataLoader(
      valid_dataset,
      batch_size = VALID_BATCH_SIZE,
      num_workers = 1
  )

  device = torch.device("cuda")
  model = BERTBaseUncased()
  model.to(device)

  param_optimizer = list(model.named_parameters())
  no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
  optimizer_parameters = [
    {'params' : [p for n, p in param_optimizer if not any (nd in n for nd in no_decay)], 'weight_decay': 0.001},
    {'params' : [p for n, p in param_optimizer if any (nd in n for nd in no_decay)], 'weight_decay': 0.0}
  ]

  num_train_steps = int(len(df_train) / TRAIN_BATCH_SIZE * EPOCHS)
  optimizer = AdamW(optimizer_parameters, lr = LR)
  scheduler = get_linear_schedule_with_warmup(optimizer, 
                                              num_warmup_steps = 0,
                                              num_training_steps = num_train_steps)
  
  best_accuracy = 0
  for epoch in range(EPOCHS):
    train(train_data_loader, model, optimizer, device, scheduler)
    outputs, targets = eval(valid_data_loader, model, device)
    targets = np.array(targets) >= 0.5
    accuracy = metrics.roc_auc_score(targets, outputs)
    print((f"AUC Score = {accuracy}"))
    if accuracy > best_accuracy:
      torch.save(model.state_dict(), MODEL_PATH)
      best_accuarcy = accuracy

In [118]:
run()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/1 [00:00<?, ?it/s]


RuntimeError: ignored

테스트코드

In [88]:
bert = transformers.BertModel.from_pretrained('bert-base-multilingual-cased').to(device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [121]:
for batch_idx, data in tqdm(enumerate(train_data_loader), total=len(train_data_loader)):
   ids = data['ids']
   token_type_ids = data['token_type_ids']
   mask = data['mask']
   targets = data['target']

   ids = ids.to(device, dtype=torch.long)
   token_type_ids = token_type_ids.to(device, dtype=torch.long)
   mask = mask.to(device, dtype=torch.long)
   targets = targets.to(device, dtype=torch.long)
   break

  0%|          | 0/1 [00:00<?, ?it/s]


In [124]:
optimizer.zero_grad()
outputs = model(
    ids = ids,
    mask = mask,
    token_type_ids = token_type_ids
)

In [125]:
outputs

tensor([[-0.2003],
        [-0.1755],
        [-0.2037],
        [-0.1443],
        [-0.2161],
        [-0.1625],
        [-0.2250],
        [-0.2844],
        [-0.3807],
        [-0.1863],
        [-0.2765],
        [-0.2312],
        [-0.2147],
        [-0.2694],
        [-0.3177],
        [-0.1931]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [127]:
targets.view(-1,1)

tensor([[0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0]], device='cuda:0')

In [130]:
loss = loss_fn(outputs, targets.float())

In [131]:
loss

tensor(-0., device='cuda:0', grad_fn=<DivBackward1>)

In [90]:
outputs = bert(
        ids,
        attention_mask = mask,
        token_type_ids = token_type_ids
)

In [101]:
out = nn.Linear(768, 1).to(device)

In [102]:
out(outputs[1])

tensor([[0.4839],
        [0.4113],
        [0.4250],
        [0.3696],
        [0.3300],
        [0.2585],
        [0.2929],
        [0.2718],
        [0.4765],
        [0.4958],
        [0.4319],
        [0.3376],
        [0.3643],
        [0.4480],
        [0.2355],
        [0.3274]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [106]:
device = torch.device("cuda")
model = BERTBaseUncased().to(device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [51]:
# bert = transformers.BertModel.from_pretrained('bert-base-multilingual-cased').to(device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [107]:
outputs = model(
        ids = ids,
        mask = mask,
        token_type_ids = token_type_ids)

In [109]:
len(outputs)

16

In [111]:
outputs, targets = eval(valid_data_loader, model, device)

100%|██████████| 1/1 [00:00<00:00,  2.27it/s]


In [112]:
metrics.roc_auc_score(targets, outputs)

0.4

In [87]:
max_indices

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0], device='cuda:0')

In [63]:
np.array(targets) >= 0.5

array([False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False])

In [60]:
targets = np.array(targets) >= 0.5
    accuracy = metrics.roc_auc_score(targets, outputs)

AttributeError: ignored

In [None]:
out = nn.Linear(768, 1).to(device)

In [None]:
output = out(outputs[0])

In [None]:
o = torch.sigmoid(output[0]).cpu().detach().numpy().tolist()

In [None]:
t = targets.cpu().detach().numpy().tolist()
t

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [None]:
train_loss = F.cross_entropy(output, targets.view(-1, 1))

In [None]:
t2 = np.array(t) >= 0.5
t2

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False])

In [None]:
accuracy = metrics.roc_auc_score(t2, o)

ValueError: ignored

In [57]:
accuracy = metrics.roc_auc_score(targets, outputs)

ValueError: ignored