<a href="https://colab.research.google.com/github/bcmin1018/NLP/blob/main/Classification/Toxic_comment_classification/Notebooks/Toxic_comment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [2]:
import torch
import torch.nn as nn
import transformers
import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook
from transformers import BertTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, Dataset
from torch.nn import functional as F

In [3]:
import os, sys 
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#train
!gdown --id 1-0kJHNDTDLvx8U7hrN9kq9T3aLaWI6wW
#valid
!gdown --id 1fh2Wtzjx7t9mIv6-ntUB-dv27nfJfFba
#test
!gdown --id 1K6bu85ANf1niUOUP1j4Zdd1mEN7wdIVT
#sample_submission
!gdown --id 1gyVdX4syHrDCum7e6qiDOFkXVcbY2tGO

Downloading...
From: https://drive.google.com/uc?id=1-0kJHNDTDLvx8U7hrN9kq9T3aLaWI6wW
To: /content/jigsaw-unintended-bias-train(clean).csv
100% 1.43G/1.43G [00:12<00:00, 116MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1fh2Wtzjx7t9mIv6-ntUB-dv27nfJfFba
To: /content/validation.csv
100% 3.18M/3.18M [00:00<00:00, 136MB/s]
Downloading...
From: https://drive.google.com/uc?id=1K6bu85ANf1niUOUP1j4Zdd1mEN7wdIVT
To: /content/test.csv
100% 28.8M/28.8M [00:00<00:00, 165MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1gyVdX4syHrDCum7e6qiDOFkXVcbY2tGO
To: /content/sample_submission.csv
100% 627k/627k [00:00<00:00, 92.2MB/s]


In [5]:
train_data_path= "./jigsaw-unintended-bias-train(clean).csv"
valid_data_path= "./validation.csv"
test_data_path ="./test.csv"
sample_submission_path = "./sample_submission.csv"

train_data = pd.read_csv(train_data_path, lineterminator='\n', usecols=['clean_comment_text','toxic'])
valid_data = pd.read_csv(valid_data_path, usecols=['comment_text','toxic'])
test_data = pd.read_csv(test_data_path, usecols=['content'])
sample_submission = pd.read_csv(sample_submission_path)

In [6]:
print(len(train_data))
print(len(valid_data))
print(len(test_data))

1902194
8000
63812


In [7]:
df_train = train_data[0:50000]
df_valid = valid_data
df_test = test_data

In [8]:
# dataset.py
class BERTDataset:
  def __init__(self, comment_text, target):
    self.comment_text = comment_text
    self.target = target
    self.tokenizer = TOKENIZER
    self.max_len = MAX_LEN

  def __len__(self):
    return len(self.comment_text)

  def __getitem__(self, item):
    comment_text = str(self.comment_text[item])
    comment_text = " ".join(comment_text.split())

    inputs = self.tokenizer.encode_plus(
        comment_text,
        None,
        add_special_tokens=True,
        truncation=True,
        max_length = self.max_len
    )

    ids = inputs["input_ids"]
    mask = inputs['attention_mask']
    token_type_ids = inputs["token_type_ids"]

    padding_length = self.max_len - len(ids)
    ids = ids + ([0] * padding_length)
    mask = mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([0] * padding_length)

    return {
        'ids' : torch.tensor(ids, dtype=torch.long),
        'mask' : torch.tensor(mask, dtype=torch.long),
        'token_type_ids' : torch.tensor(token_type_ids, dtype=torch.long),
        'target' : torch.tensor(self.target[item], dtype=torch.float)
    }

In [9]:
# model.py

class BERTBaseUncased(nn.Module):
  def __init__(self):
    super(BERTBaseUncased, self).__init__()
    self.bert = transformers.BertModel.from_pretrained('bert-base-multilingual-cased')
    self.bert_drop = nn.Dropout(0.3)
    # mean, max pooling (768 * 2)
    self.out = nn.Linear(768 * 2, 1)

  def forward(self, ids, mask, token_type_ids):
    outputs = self.bert(
        ids,
        attention_mask = mask,
        token_type_ids = token_type_ids
    )

    mean_pooling = torch.mean(outputs[0], 1)
    max_pooling, _ = torch.max(outputs[0], 1)
    cat = torch.cat((mean_pooling, max_pooling), 1)

    bo = self.bert_drop(cat)
    output = self.out(bo)
    return output

In [10]:
#engine.py

def loss_fn(outputs, targets):
  return nn.BCEWithLogitsLoss()(outputs, targets.float().view(-1, 1))

def train(data_loader, model, optimizer, device, scheduler):
  model.train()
  avg_loss = 0
  for batch_idx, data in enumerate(data_loader):
    ids = data['ids']
    token_type_ids = data['token_type_ids']
    mask = data['mask']
    targets = data['target']

    ids = ids.to(device, dtype=torch.long)
    token_type_ids = token_type_ids.to(device, dtype=torch.long)
    mask = mask.to(device, dtype=torch.long)
    targets = targets.to(device, dtype=torch.long)

    optimizer.zero_grad()
    outputs = model(
        ids = ids,
        mask = mask,
        token_type_ids = token_type_ids
    )

    loss = loss_fn(outputs, targets)
    if batch_idx % 10 == 0:
      print(f'train_batch_idx={batch_idx}, train_loss={loss}')
    
    loss.backward()
    optimizer.step()
    scheduler.step()


def eval(data_loader, model, device):
  model.eval()
  fin_targets = []
  fin_outputs = []
  with torch.no_grad():
    for batch_idx, data in enumerate(data_loader):
      ids = data['ids']
      token_type_ids = data['token_type_ids']
      mask = data['mask']
      targets = data['target']

      ids = ids.to(device, dtype=torch.long)
      token_type_ids = token_type_ids.to(device, dtype=torch.long)
      mask = mask.to(device, dtype=torch.long)
      targets = targets.to(device, dtype=torch.long)

      outputs = model(
        ids = ids,
        mask = mask,
        token_type_ids = token_type_ids
      )

      fin_targets.extend(targets.cpu().detach().numpy().tolist())
      fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

  return fin_outputs, fin_targets

In [11]:
# train.py
def run(train_data_loader, valid_data_loader):
  device = torch.device("cuda")
  model = BERTBaseUncased()
  model.to(device)

  param_optimizer = list(model.named_parameters())
  no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
  optimizer_parameters = [
    {'params' : [p for n, p in param_optimizer if not any (nd in n for nd in no_decay)], 'weight_decay': 0.001},
    {'params' : [p for n, p in param_optimizer if any (nd in n for nd in no_decay)], 'weight_decay': 0.0}
  ]

  num_train_steps = int(len(df_train) / TRAIN_BATCH_SIZE * EPOCHS)
  optimizer = AdamW(optimizer_parameters, lr = LR)
  scheduler = get_linear_schedule_with_warmup(optimizer, 
                                              num_warmup_steps = 0,
                                              num_training_steps = num_train_steps)

  best_accuracy = 0
  for epoch in range(EPOCHS):
    train(train_data_loader, model, optimizer, device, scheduler)
    outputs, targets = eval(valid_data_loader, model, device)
    # targets = np.array(targets) >= 0.5
    accuracy = metrics.roc_auc_score(targets, outputs)
    print((f"AUC Score = {accuracy}"))
    if accuracy > best_accuracy:
      torch.save(model.state_dict(), MODEL_PATH)
      best_accuarcy = accuracy

In [14]:
# config.py

MAX_LEN = 100
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 3
LR = 1e-5
MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/model.pt"
TOKENIZER = transformers.BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)

In [13]:
train_dataset = BERTDataset(
    comment_text = df_train.clean_comment_text.values,
    target = df_train.toxic.values
)
train_data_loader = DataLoader(
    train_dataset,
    batch_size = TRAIN_BATCH_SIZE,
    num_workers = 2,
    shuffle=True
)
valid_dataset = BERTDataset(
    comment_text = df_valid.comment_text.values,
    target = df_valid.toxic.values
)

valid_data_loader = DataLoader(
    valid_dataset,
    batch_size = VALID_BATCH_SIZE,
    num_workers = 1,
    shuffle=True
)

In [15]:
run(train_data_loader, valid_data_loader)

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


train_batch_idx=0, train_loss=0.20101475715637207
train_batch_idx=10, train_loss=0.012041833251714706
train_batch_idx=20, train_loss=0.0037121104542165995
train_batch_idx=30, train_loss=0.0019787410274147987
train_batch_idx=40, train_loss=0.001551773282699287
train_batch_idx=50, train_loss=0.0018560555763542652
train_batch_idx=60, train_loss=0.0016893008723855019
train_batch_idx=70, train_loss=0.4332113564014435
train_batch_idx=80, train_loss=0.0016852023545652628
train_batch_idx=90, train_loss=0.003285888582468033
train_batch_idx=100, train_loss=0.004842825699597597
train_batch_idx=110, train_loss=0.009270334616303444
train_batch_idx=120, train_loss=0.005157594569027424
train_batch_idx=130, train_loss=0.0031806884799152613
train_batch_idx=140, train_loss=0.002654857002198696
train_batch_idx=150, train_loss=0.0015433438820764422
train_batch_idx=160, train_loss=0.0013995054177939892
train_batch_idx=170, train_loss=0.0015293131582438946
train_batch_idx=180, train_loss=0.00187973119318485

In [16]:
#test_dataset.py
class BERTDatasetTest:
    def __init__(self, comment_text):
        self.comment_text = comment_text
        self.tokenizer = TOKENIZER
        self.max_length = MAX_LEN

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, item):
        comment_text = str(self.comment_text[item])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
        )
        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]
        
        padding_length = self.max_length - len(ids)
        
        ids = ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long)
        }

In [17]:
test_dataset = BERTDatasetTest(
    comment_text = df_test.content.values
)

test_data_loader = DataLoader(
    test_dataset,
    batch_size = VALID_BATCH_SIZE,
    num_workers = 1,
    shuffle=False
)

In [18]:
# prediction.py

def pred(data_loader):
  device = torch.device("cuda")
  model = BERTBaseUncased().to(device)
  model.load_state_dict(torch.load('/content/drive/MyDrive/Colab Notebooks/model.pt'))
  model.eval()
  fin_targets = []
  fin_outputs = []
  with torch.no_grad():
    for batch_idx, data in enumerate(data_loader):
      ids = data['ids']
      token_type_ids = data['token_type_ids']
      mask = data['mask']

      ids = ids.to(device, dtype=torch.long)
      token_type_ids = token_type_ids.to(device, dtype=torch.long)
      mask = mask.to(device, dtype=torch.long)

      outputs = model(
        ids = ids,
        mask = mask,
        token_type_ids = token_type_ids
      )

      fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
  return fin_outputs

In [19]:
prediction = pred(test_data_loader)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
sample_submission.loc[:, 'toxic'] = np.array(prediction)
sample_submission.to_csv('/content/drive/MyDrive/Colab Notebooks/sample_submission.csv', index=False)

# 테스트코드

In [None]:
device = torch.device("cuda")
bert = transformers.BertModel.from_pretrained('bert-base-multilingual-cased').to(device)

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
device = torch.device("cuda")
# device = torch.device('cpu')
model = BERTBaseUncased().to(device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: ignored

In [None]:
for batch_idx, data in tqdm(enumerate(valid_data_loader), total=len(valid_data_loader)):
   ids = data['ids']
   token_type_ids = data['token_type_ids']
   mask = data['mask']
   targets = data['target']

   ids = ids.to(device, dtype=torch.long)
   token_type_ids = token_type_ids.to(device, dtype=torch.long)
   mask = mask.to(device, dtype=torch.long)
   targets = targets.to(device, dtype=torch.long)
   break

  0%|          | 0/7 [00:00<?, ?it/s]


In [None]:
print(ids)
print(token_type_ids)
print(mask)
print(targets)

tensor([[   101,  10247,    120,  ...,      0,      0,      0],
        [   101,    173,  10237,  ...,      0,      0,      0],
        [   101,  13697, 100025,  ...,      0,      0,      0],
        ...,
        [   101,  21583,  10123,  ...,  10418,    102,      0],
        [   101,  10414,  53543,  ...,      0,      0,      0],
        [   101,  10294,  19325,  ...,      0,      0,      0]])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
tensor([0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0])


In [None]:
bert = transformers.BertModel.from_pretrained('bert-base-multilingual-cased').to(device)
outputs = bert(
        ids,
        attention_mask = mask,
        token_type_ids = token_type_ids
    )

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
outputs[0].shape

torch.Size([16, 100, 768])

In [None]:
torch.max(outputs[0], 1)

torch.return_types.max(values=tensor([[0.8341, 0.8097, 1.5347,  ..., 1.4086, 0.9169, 0.8884],
        [0.5375, 0.4361, 1.7747,  ..., 0.7926, 0.3058, 0.3963],
        [0.7145, 0.3961, 1.2925,  ..., 1.0917, 0.5830, 1.3070],
        ...,
        [0.2507, 1.0089, 1.3007,  ..., 0.7810, 0.5252, 1.1897],
        [0.4364, 0.8533, 1.5533,  ..., 1.3696, 0.9986, 1.4865],
        [0.5254, 0.4245, 1.7083,  ..., 1.2749, 1.0420, 1.2905]],
       device='cuda:0', grad_fn=<MaxBackward0>), ...)

In [None]:
mean_pooling = torch.mean(outputs[0], 1)
max_pooling = torch.max(outputs[0], 1)[0]
print(mean_pooling)
print(max_pooling)

tensor([[ 0.0586, -0.2672, -0.1470,  ...,  0.0343,  0.0246,  0.3047],
        [-0.5249,  0.1883,  0.0353,  ...,  0.3419,  0.2138,  0.0042],
        [-0.1869, -0.5305,  0.3124,  ...,  0.0755, -0.0814,  0.4429],
        ...,
        [-0.0969, -0.1655,  0.3459,  ...,  0.3441,  0.0966,  0.3250],
        [ 0.0120, -0.3983,  0.1737,  ...,  0.0564,  0.1642,  0.0659],
        [-0.3242, -0.3836,  0.1232,  ...,  0.2428, -0.0313,  0.4149]],
       grad_fn=<MeanBackward1>)
tensor([[1.0278, 0.4594, 1.3534,  ..., 1.2225, 0.8621, 1.6045],
        [0.3587, 0.7318, 1.4748,  ..., 1.2207, 0.9245, 0.7660],
        [0.5443, 0.4104, 1.4537,  ..., 1.1046, 0.9373, 1.3056],
        ...,
        [0.9907, 1.0772, 1.5230,  ..., 1.4751, 1.0895, 1.1570],
        [0.8003, 0.5594, 1.3066,  ..., 1.1569, 1.0621, 0.9255],
        [0.4644, 0.5846, 1.5499,  ..., 1.6675, 1.2249, 1.4496]],
       grad_fn=<MaxBackward0>)


In [None]:
cat = torch.cat((mean_pooling, max_pooling),1)
print(cat)

tensor([[ 0.0586, -0.2672, -0.1470,  ...,  1.2225,  0.8621,  1.6045],
        [-0.5249,  0.1883,  0.0353,  ...,  1.2207,  0.9245,  0.7660],
        [-0.1869, -0.5305,  0.3124,  ...,  1.1046,  0.9373,  1.3056],
        ...,
        [-0.0969, -0.1655,  0.3459,  ...,  1.4751,  1.0895,  1.1570],
        [ 0.0120, -0.3983,  0.1737,  ...,  1.1569,  1.0621,  0.9255],
        [-0.3242, -0.3836,  0.1232,  ...,  1.6675,  1.2249,  1.4496]],
       grad_fn=<CatBackward0>)


In [None]:
out = nn.Linear(768*2, 1).to(device)

In [None]:
pred = out(cat)

In [None]:
pred

tensor([[0.3792],
        [0.5491],
        [0.2730],
        [0.3773],
        [0.4561],
        [0.2064],
        [0.2225],
        [0.3269],
        [0.2739],
        [0.5427],
        [0.3959],
        [0.5336],
        [0.5073],
        [0.6041],
        [0.2007],
        [0.4927]], grad_fn=<AddmmBackward0>)

In [None]:
def loss_fn(outputs, targets):
  return nn.BCEWithLogitsLoss()(outputs, targets.float().view(-1, 1))

In [None]:
loss_fn(pred, targets)

tensor(0.7774, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)

In [None]:
outputs = model(
    ids = ids,
    mask = mask,
    token_type_ids = token_type_ids
)

In [None]:
outputs

tensor([[-0.1379],
        [-0.1046],
        [-0.1857],
        [-0.0685],
        [-0.1720],
        [-0.1240],
        [-0.0849],
        [-0.0676],
        [-0.0986],
        [-0.1321],
        [-0.0929],
        [-0.1777],
        [-0.0874],
        [-0.2104],
        [-0.0668],
        [-0.0582]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [None]:
targets

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')

In [None]:
F.cross_entropy(targets.view(-1, 1).float(), outputs)

tensor(-0., device='cuda:0', grad_fn=<DivBackward1>)

In [None]:
loss = loss_fn(targets.float(), outputs)

RuntimeError: ignored

In [None]:
loss

tensor(-0., device='cuda:0', grad_fn=<DivBackward1>)

In [None]:
outputs = bert(
        ids,
        attention_mask = mask,
        token_type_ids = token_type_ids
)

In [None]:
out(outputs[1])

tensor([[0.4839],
        [0.4113],
        [0.4250],
        [0.3696],
        [0.3300],
        [0.2585],
        [0.2929],
        [0.2718],
        [0.4765],
        [0.4958],
        [0.4319],
        [0.3376],
        [0.3643],
        [0.4480],
        [0.2355],
        [0.3274]], device='cuda:0', grad_fn=<AddmmBackward0>)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# bert = transformers.BertModel.from_pretrained('bert-base-multilingual-cased').to(device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
outputs = model(
        ids = ids,
        mask = mask,
        token_type_ids = token_type_ids)

In [None]:
outputs

tensor([[-0.6342],
        [-0.5819],
        [-0.5279],
        [-0.5547],
        [ 0.1543],
        [-0.4455],
        [-0.5732],
        [-0.6738],
        [-0.4307],
        [-0.5857],
        [-0.3221],
        [-0.5132],
        [ 0.1230],
        [-0.4605],
        [-0.5499],
        [-0.6433]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [None]:
outputs, targets = eval(valid_data_loader, model, device)

100%|██████████| 13/13 [00:01<00:00,  9.99it/s]


In [None]:
outputs

[[0.4376993477344513],
 [0.5473430752754211],
 [0.4304685592651367],
 [0.3696940839290619],
 [0.3778122663497925],
 [0.434233158826828],
 [0.43217146396636963],
 [0.4821430742740631],
 [0.3803848922252655],
 [0.3944300413131714],
 [0.5571368336677551],
 [0.3973301947116852],
 [0.43737703561782837],
 [0.5570463538169861],
 [0.4305109679698944],
 [0.4362465739250183],
 [0.4069691002368927],
 [0.42568057775497437],
 [0.3767576813697815],
 [0.43503254652023315],
 [0.42154985666275024],
 [0.4401720464229584],
 [0.3826456367969513],
 [0.4007130265235901],
 [0.3964690566062927],
 [0.43989676237106323],
 [0.4022636413574219],
 [0.3940058946609497],
 [0.4127604365348816],
 [0.4376904368400574],
 [0.3904869258403778],
 [0.39702335000038147],
 [0.4382345974445343],
 [0.5560024380683899],
 [0.3747329115867615],
 [0.3790303170681],
 [0.43346527218818665],
 [0.4012080430984497],
 [0.4120001792907715],
 [0.5309929847717285],
 [0.4065152406692505],
 [0.5681859850883484],
 [0.38822999596595764],
 [0.42

In [None]:
metrics.roc_auc_score(targets, outputs)

0.4

In [None]:
max_indices

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0], device='cuda:0')

In [None]:
np.array(targets) >= 0.5

array([False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False])

In [None]:
targets = np.array(targets) >= 0.5
    accuracy = metrics.roc_auc_score(targets, outputs)

AttributeError: ignored

In [None]:
out = nn.Linear(768, 1).to(device)

In [None]:
output = out(outputs[0])

In [None]:
o = torch.sigmoid(output[0]).cpu().detach().numpy().tolist()

In [None]:
t = targets.cpu().detach().numpy().tolist()
t

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [None]:
train_loss = F.cross_entropy(output, targets.view(-1, 1))

In [None]:
t2 = np.array(t) >= 0.5
t2

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False])

In [None]:
accuracy = metrics.roc_auc_score(t2, o)

ValueError: ignored

In [None]:
accuracy = metrics.roc_auc_score(targets, outputs)

ValueError: ignored