# fine-tune BERT MRPC

In [157]:
import re
import time
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from transformers import BertConfig, BertTokenizer, BertForSequenceClassification

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [2]:
## setup tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = BertForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")

In [3]:
num_params = sum(p.numel() for p in model.parameters())

In [4]:
print(f"{num_params:,}")

108,311,810


In [5]:
d_train = pd.read_csv('../data/text_clean/train.csv')
d_test = pd.read_csv('../data/text_clean/test.csv')

## inference using BERT MRPC

In [6]:
t1 = d_train.title_1_pre.to_list()
t2 = d_train.title_2_pre.to_list()

In [15]:
start = 0
end = 256
y_pred_batch = []
for batch_index in tqdm(range(int(len(t1)/256) + 1)):
    t1_batch = t1[start:end]
    t2_batch = t2[start:end]
    
    title_encode = tokenizer(t1_batch, t2_batch, return_tensors="pt", padding=True)
    
    y_pred = model(**title_encode)
    y_pred = torch.softmax(y_pred[0], dim=1).argmax(1)
    
    y_pred_batch.extend(y_pred.tolist())
    
    start += 256
    end += 256

100%|██████████| 40/40 [18:07<00:00, 27.20s/it]


In [16]:
y_pred = np.array(y_pred_batch)
y_true = d_train.Label.values

In [34]:
print("f score", f1_score(y_true, y_pred, average='macro'))

print("accuracy", accuracy_score(y_true, y_pred))

f score 0.4233435022802566
accuracy 0.4939581491305629


## Fine tune BERT

In [24]:
train, test = train_test_split(d_train)

In [29]:
train.Label.value_counts()

1    4430
0    3204
Name: Label, dtype: int64

In [28]:
test.Label.value_counts()

1    1413
0    1132
Name: Label, dtype: int64

In [104]:
class ShopeeDataset():
    def __init__(self, data, test):
        train, val = train_test_split(data, random_state=123)
        
        train.reset_index(drop=True, inplace=True)
        val.reset_index(drop=True, inplace=True)
        
        self.dataset = {
            'train': (train, train.shape[0]),
            'val': (val, val.shape[0]),
            'test': (test, test.shape[0])
        }
        
        self.set_split(split='train')
        
    def set_split(self, split='train'):
        self.data, self.length = self.dataset[split]
    
    def __getitem__(self, idx):
        t1 = self.data.loc[idx, 'title_1_pre']
        t2 = self.data.loc[idx, 'title_2_pre']
        label = self.data.loc[idx, 'Label']
        
        return t1, t2, label
        
    def __len__(self):
        return self.length

In [105]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

In [106]:
dataset = ShopeeDataset(train, test)

In [107]:
def tokenizer_encode(data):
    t1, t2, label = list(zip(*data))
    title_encode = tokenizer(t1, t2, return_tensors="pt", padding=True)
    label = torch.LongTensor(label)
    
    return title_encode, label

In [153]:
def calculate_accuracy(y, y_pred):
    
    n_correct = torch.eq(y, y_pred).sum().item()
    accuracy = (n_correct/y_pred.shape[0])*100
    
    return accuracy

In [165]:
for epoch in range(1, 101):
    running_loss = 0
    running_loss_v = 0
    running_acc = 0
    running_acc_v = 0
    
    start = time.time()
    
    model.train()
    dataset.set_split(split='train')
    data_gen = DataLoader(dataset, batch_size=128, collate_fn=tokenizer_encode)
    for batch_index, (X, y) in enumerate(data_gen, 1):
        optimizer.zero_grad()
        
        logit = model(**X)[0]
        y_pred = torch.softmax(logit, dim=1).argmax(1)
        
        loss = criterion(logit, y)
        running_loss += (loss.item() - running_loss) / batch_index
        
        accuracy = calculate_accuracy(y, y_pred)
        running_acc += (accuracy - running_acc) / batch_index
        
        loss.backward()
        optimizer.step()
        break
        
    model.eval()
    dataset.set_split(split='val')
    data_gen = DataLoader(dataset, batch_size=128, collate_fn=tokenizer_encode)
    for batch_index, (X, y) in enumerate(data_gen, 1):
        logit = model(**X)[0]
        y_pred = torch.softmax(logit, dim=1).argmax(1)
        
        loss = criterion(logit, y)
        running_loss_v += (loss.item() - running_loss_v) / batch_index
        
        accuracy = calculate_accuracy(y, y_pred)
        running_acc_v += (accuracy - running_acc_v) / batch_index
        break
        
    duration = time.time() - start
    print(f"epoch: {epoch} | time: {duration:.2f}s")
    print(f"\ttrain loss: {running_loss:.2f} | train acc: {running_acc:.2f}")
    print(f"\tval loss: {running_loss_v:.2f} | val acc: {running_acc_v:.2f}")

epoch: 1 | time: 49.43s
	train loss: 0.73 | train acc: 50.78
	val loss: 2.19 | val acc: 55.47
epoch: 2 | time: 52.15s
	train loss: 1.93 | train acc: 57.81
	val loss: 0.73 | val acc: 44.53
epoch: 3 | time: 51.79s
	train loss: 0.73 | train acc: 43.75
	val loss: 0.78 | val acc: 55.47
epoch: 4 | time: 53.00s
	train loss: 0.76 | train acc: 57.81
	val loss: 0.72 | val acc: 55.47
epoch: 5 | time: 51.87s
	train loss: 0.71 | train acc: 56.25
	val loss: 0.69 | val acc: 55.47
epoch: 6 | time: 53.33s
	train loss: 0.69 | train acc: 57.81
	val loss: 0.70 | val acc: 44.53
epoch: 7 | time: 52.16s
	train loss: 0.71 | train acc: 48.44
	val loss: 0.72 | val acc: 44.53
epoch: 8 | time: 53.05s
	train loss: 0.76 | train acc: 40.62
	val loss: 0.70 | val acc: 44.53
epoch: 9 | time: 52.73s
	train loss: 0.72 | train acc: 46.88
	val loss: 0.69 | val acc: 55.47
epoch: 10 | time: 53.31s
	train loss: 0.69 | train acc: 55.47
	val loss: 0.70 | val acc: 55.47
epoch: 11 | time: 52.04s
	train loss: 0.69 | train acc: 57.

KeyboardInterrupt: 