<a href="https://colab.research.google.com/github/as9786/ML-DLPratice/blob/main/Pytorch/BERT_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Module

In [1]:
import torch
import torch.nn as nn

import tensorflow as tf
from tensorflow.keras.datasets import imdb

# 장치 설정

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using {}".format(device))

Using cuda


# Data

In [3]:
!git clone https://github.com/e9t/nsmc.git

fatal: destination path 'nsmc' already exists and is not an empty directory.


In [4]:
import pandas as pd

In [5]:
train_df = pd.read_csv('./nsmc/ratings_train.txt', sep='\t')
test_df = pd.read_csv('./nsmc/ratings_test.txt', sep='\t')

In [6]:
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

train_df = train_df.sample(frac=0.1, random_state=999)
test_df = test_df.sample(frac=0.1, random_state=999)

In [7]:
train_df.shape

(15000, 3)

In [8]:
from torch.utils.data import Dataset, DataLoader

In [9]:
class NsmcDataset(Dataset):
    ''' Naver Sentiment Movie Corpus Dataset '''
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.iloc[idx, 1]
        label = self.df.iloc[idx, 2]
        return text, label

In [10]:
nsmc_train_dataset = NsmcDataset(train_df)
train_loader = DataLoader(nsmc_train_dataset, batch_size=2, shuffle=True, num_workers=2)

# 모형

In [11]:
!pip install pytorch-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-transformers
  Downloading pytorch_transformers-1.2.0-py3-none-any.whl (176 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.4/176.4 KB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 KB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting boto3
  Downloading boto3-1.26.48-py3-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.7/132.7 KB[0m [31m17.2 MB/s[0m eta [36m

In [11]:
import numpy as np
from pytorch_transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from torch.optim import Adam
import torch.nn.functional as F

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')
model.to(device)

100%|██████████| 995526/995526 [00:00<00:00, 3200320.90B/s]
100%|██████████| 625/625 [00:00<00:00, 417692.80B/s]
100%|██████████| 714314041/714314041 [00:16<00:00, 43565416.48B/s]


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [13]:
optimizer = Adam(model.parameters(), lr=1e-6)

In [14]:
itr = 1
p_itr = 500
epochs = 1
total_loss = 0
total_len = 0
total_correct = 0


model.train()
for epoch in range(epochs):
    
    for text, label in train_loader:
        optimizer.zero_grad()
        
        # encoding and zero padding
        encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
        padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
        
        sample = torch.tensor(padded_list)
        sample, label = sample.to(device), label.to(device)
        labels = torch.tensor(label)
        outputs = model(sample, labels=labels)
        loss, logits = outputs

        pred = torch.argmax(F.softmax(logits), dim=1)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        
        if itr % p_itr == 0:
            print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, epochs, itr, total_loss/p_itr, total_correct/total_len))
            total_loss = 0
            total_len = 0
            total_correct = 0

        itr+=1

  labels = torch.tensor(label)
  pred = torch.argmax(F.softmax(logits), dim=1)


[Epoch 1/1] Iteration 500 -> Train Loss: 0.6941, Accuracy: 0.526
[Epoch 1/1] Iteration 1000 -> Train Loss: 0.6959, Accuracy: 0.505
[Epoch 1/1] Iteration 1500 -> Train Loss: 0.6944, Accuracy: 0.528
[Epoch 1/1] Iteration 2000 -> Train Loss: 0.6905, Accuracy: 0.536
[Epoch 1/1] Iteration 2500 -> Train Loss: 0.6855, Accuracy: 0.568
[Epoch 1/1] Iteration 3000 -> Train Loss: 0.6624, Accuracy: 0.604
[Epoch 1/1] Iteration 3500 -> Train Loss: 0.6304, Accuracy: 0.653
[Epoch 1/1] Iteration 4000 -> Train Loss: 0.6009, Accuracy: 0.671
[Epoch 1/1] Iteration 4500 -> Train Loss: 0.5631, Accuracy: 0.729
[Epoch 1/1] Iteration 5000 -> Train Loss: 0.5598, Accuracy: 0.716
[Epoch 1/1] Iteration 5500 -> Train Loss: 0.5544, Accuracy: 0.720
[Epoch 1/1] Iteration 6000 -> Train Loss: 0.5099, Accuracy: 0.746
[Epoch 1/1] Iteration 6500 -> Train Loss: 0.5172, Accuracy: 0.752
[Epoch 1/1] Iteration 7000 -> Train Loss: 0.5170, Accuracy: 0.750
[Epoch 1/1] Iteration 7500 -> Train Loss: 0.4956, Accuracy: 0.760


In [16]:
# evaluation
model.eval()

nsmc_eval_dataset = NsmcDataset(test_df)
eval_loader = DataLoader(nsmc_eval_dataset, batch_size=2, shuffle=False, num_workers=2)

total_loss = 0
total_len = 0
total_correct = 0

for text, label in eval_loader:
    encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
    padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
    sample = torch.tensor(padded_list)
    sample, label = sample.to(device), label.to(device)
    labels = torch.tensor(label)
    outputs = model(sample, labels=labels)
    _, logits = outputs

    pred = torch.argmax(F.softmax(logits), dim=1)
    correct = pred.eq(labels)
    total_correct += correct.sum().item()
    total_len += len(labels)

print('Test accuracy: ', total_correct / total_len)

  labels = torch.tensor(label)
  pred = torch.argmax(F.softmax(logits), dim=1)


Test accuracy:  0.7654
