# Bert_En_movie_review_finetuning
참고 : https://gmihaila.github.io/tutorial_notebooks/bert_inner_workings/

In [None]:
!pip install transformers

In [3]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader,Dataset
from transformers import BertModel, BertTokenizer, BertForSequenceClassification

import pandas as pd
import numpy as np
from IPython.display import display

In [5]:
from warnings import filterwarnings
filterwarnings('ignore')

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [7]:
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)
display(df.head())
print(df.shape)

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


(6920, 2)


In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df[0].values, df[1].values, random_state=42, test_size=0.2)

In [42]:
def encode(data, tokenizer):
    tokenized_text = tokenizer.encode_plus(data,
                                        max_length=50,
                                        add_special_tokens = True,
                                        pad_to_max_length=True,
                                        return_attention_mask=True)
    return tokenized_text['input_ids'], tokenized_text['attention_mask'], tokenized_text['token_type_ids']

In [43]:
X_train_encoded = pd.Series(X_train).map(lambda x:encode(x, tokenizer))
temp = np.concatenate(X_train_encoded.values).reshape(-1,3,50)
train_input_ids = temp[:,0,:]
train_attention_masks = temp[:,1,:]
train_token_type_ids = temp[:,2,:]

In [44]:
X_test_encoded = pd.Series(X_test).map(lambda x:encode(x, tokenizer))
temp = np.concatenate(X_test_encoded.values).reshape(-1,3,50)
test_input_ids = temp[:,0,:]
test_attention_masks = temp[:,1,:]
test_token_type_ids = temp[:,2,:]

In [45]:
class MapDataset(Dataset):
    def __init__(self, input_ids, attention_masks, token_type_ids, label):
        self.input_ids = torch.Tensor(input_ids).long()
        self.attention_masks = torch.Tensor(attention_masks).long()
        self.token_type_ids = torch.Tensor(token_type_ids).long()
        self.label = torch.Tensor(label).long()
    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self,idx):
        return (self.input_ids[idx],self.token_type_ids[idx],self.attention_masks[idx]), self.label[idx]

In [46]:
trainDS = MapDataset(train_input_ids, train_attention_masks, train_token_type_ids, y_train)
testDS = MapDataset(test_input_ids, test_attention_masks, test_token_type_ids, y_test)

In [48]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
#                     # You can increase this for multi-class tasks.   
    # output_attentions = False, # Whether the model returns attentions weights.
    # output_hidden_states = False, # Whether the model returns all hidden-states.
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [31]:
output = model(*[torch.Tensor(train_input_ids)[:4].long(),
                torch.Tensor(train_attention_masks)[:4].long(),
                torch.Tensor(train_token_type_ids)[:4].long()])
output['logits'].shape

torch.Size([4, 2])

In [50]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
batch_size = 32
epochs = 10
lr = 2e-5
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss().to(device)
trainloader = torch.utils.data.DataLoader(trainDS, batch_size=batch_size, shuffle=True)
testloader = torch.utils.data.DataLoader(testDS, batch_size=batch_size, shuffle=False)

In [51]:
model = model.to(device)

for epoch in range(epochs):
    train_correct = 0
    train_loss = 0
    model.train()
    for (input_ids, attention_mask, token_type_ids), targets in trainloader:
        input_ids, attention_mask, token_type_ids, targets = input_ids.to(device), attention_mask.to(device), token_type_ids.to(device), targets.to(device)
        optimizer.zero_grad()
        pred = model(*[input_ids,attention_mask,token_type_ids])['logits']   # <-- 이 부분 때문에 skorch 구현은 하지 않음
        loss = criterion(pred, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()*input_ids.size(0)
        train_correct += (pred.argmax(1) == targets).sum().item()
    train_loss /= len(trainloader.dataset)
    train_acc = train_correct / len(trainloader.dataset)

    val_correct = 0 
    val_loss = 0
    model.eval()
    with torch.no_grad():
        for (input_ids, attention_mask, token_type_ids), targets in testloader:
            input_ids, attention_mask, token_type_ids, targets = input_ids.to(device), attention_mask.to(device), token_type_ids.to(device), targets.to(device)
            pred = model(*[input_ids,attention_mask,token_type_ids])['logits']
            loss = criterion(pred, targets)
            val_loss += loss.item()*input_ids.size(0)
            val_correct += (pred.argmax(1) == targets).sum().item()
    val_loss /= len(testloader.dataset)
    val_acc = val_correct / len(testloader.dataset)

    print(f'Epoch {epoch+1} of {epochs}')
    print(f'Train Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}%')
    print(f'Val Loss: {val_loss:.4f} | Val Acc: {val_acc*100:.2f}%')
    print('---------------------------------')

Epoch 1 of 10
Train Loss: 0.6786 | Train Acc: 55.55%
Val Loss: 0.6360 | Val Acc: 64.81%
---------------------------------
Epoch 2 of 10
Train Loss: 0.6318 | Train Acc: 63.87%
Val Loss: 0.5610 | Val Acc: 75.14%
---------------------------------
Epoch 3 of 10
Train Loss: 0.5532 | Train Acc: 72.33%
Val Loss: 0.5341 | Val Acc: 70.01%
---------------------------------
Epoch 4 of 10
Train Loss: 0.4772 | Train Acc: 77.78%
Val Loss: 0.5598 | Val Acc: 69.22%
---------------------------------
Epoch 5 of 10
Train Loss: 0.4119 | Train Acc: 82.21%
Val Loss: 0.4510 | Val Acc: 80.20%
---------------------------------
Epoch 6 of 10
Train Loss: 0.3807 | Train Acc: 83.35%
Val Loss: 0.4948 | Val Acc: 77.96%
---------------------------------
Epoch 7 of 10
Train Loss: 0.3694 | Train Acc: 84.28%
Val Loss: 0.4334 | Val Acc: 81.43%
---------------------------------
Epoch 8 of 10
Train Loss: 0.3408 | Train Acc: 85.66%
Val Loss: 0.4618 | Val Acc: 80.13%
---------------------------------
Epoch 9 of 10
Train Loss