# Reference

- How to fine-tune BERT for text classification https://arxiv.org/pdf/1905.05583.pdf
- BERT fine-tuning tutorial with PyTorch https://mccormickml.com/2019/07/22/BERT-fine-tuning/

## Importing required modules

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch.nn as nn
import torch
import re
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertModel, BertTokenizer, get_linear_schedule_with_warmup
from transformers import RobertaTokenizer, RobertaModel
%matplotlib inline

## Reading data into memory and splitting it into a training set and a validation set

In [None]:
# Load the labelled tweets and hold out 20% of them for validation.
# The fixed random_state keeps the split identical from run to run.
train_pd = pd.read_csv('train.csv')
train_text, val_text, train_label, val_label = train_test_split(
    train_pd['text'],
    train_pd['target'],
    test_size = 0.2,
    random_state = 2020,
)

## Preprocessing data

In [None]:
def remove_url(text):
    """Return ``text`` with URLs deleted.

    A URL is anything starting with ``http://``, ``https://`` or ``www.``
    and running up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_html(text):
    """Return ``text`` with every ``<...>`` tag deleted.

    The match is non-greedy, so each tag is removed on its own and the
    text between two tags is kept.
    """
    return re.sub(r'<.*?>', '', text)

# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    """Return ``text`` with emoji and many other non-Latin symbols deleted.

    NOTE(review): several ranges below are very wide. ``U+24C2-U+1F251``
    and ``U+10000-U+10FFFF`` in particular also cover CJK, Hangul and
    every supplementary-plane character, so far more than emoji is removed.
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        "\U00002500-\U00002BEF"  # box drawing through misc. symbols and arrows
        "\U00002702-\U000027B0"  # dingbats
        "\U00002702-\U000027B0"  # (duplicate of the previous range)
        "\U000024C2-\U0001F251"  # very wide range, see the note in the docstring
        "\U0001f926-\U0001f937"
        "\U00010000-\U0010ffff"  # every supplementary-plane code point
        "\u2640-\u2642"          # gender signs
        "\u2600-\u2B55"
        "\u200d"                 # zero width joiner
        "\u23cf"
        "\u23e9"
        "\u231a"
        "\ufe0f"                 # variation selector-16
        "\u3030"
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub('', text)

def remove_hashtag(text):
    """Strip @mentions, ``scheme://`` URLs and all other special characters.

    Every character that is not an ASCII letter, digit, space or tab is
    removed. Despite the function name, the word of a hashtag is kept; only
    its leading ``#`` disappears because it is not alphanumeric.
    """
    return re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", '', text)

def data_cleaning(text):
    """Apply every cleaning step to ``text`` and return the result.

    The steps run in a fixed order: URLs, HTML tags, emoji, then
    mentions and remaining special characters.
    """
    for clean_step in (remove_url, remove_html, remove_emoji, remove_hashtag):
        text = clean_step(text)
    return text

In [None]:
# Pair every tweet with its label as a {'text', 'target'} record.
train_data = [{'text' : tweet, 'target' : target}
              for tweet, target in zip(train_text, train_label)]
val_data = [{'text' : tweet, 'target' : target}
            for tweet, target in zip(val_text, val_label)]

In [None]:
# Name of the pretrained checkpoint; it is used for the tokenizer here and
# is handed to NLPModel when the model is built.
pretraining = 'roberta-base'
# NOTE(review): do_lower_case looks like a leftover from the BERT variant
# below — confirm whether the RoBERTa tokenizer honours it at all.
encoder = RobertaTokenizer.from_pretrained(pretraining, do_lower_case = True)
# Alternative BERT setup, kept for reference:
#pretraining = 'bert-base-uncased'
#encoder = BertTokenizer.from_pretrained(pretraining, do_lower_case = True)

In [None]:
class disaster_data(Dataset):
    """Labelled tweets, tokenised on access, for training and validation.

    Parameters
    ----------
    dataset : list of dict
        Records with a 'text' entry (the tweet) and a 'target' entry
        (the label).
    encoder : callable
        Tokenizer used to encode each tweet. It is called with the text plus
        ``max_length``, ``truncation`` and ``padding`` and must return a
        mapping containing 'input_ids' and 'attention_mask'.
    max_length : int, optional
        Number of tokens every sample is truncated or padded to
        (default 30).
    """

    def __init__(self, dataset, encoder, max_length = 30):
        super().__init__()
        self.encoder = encoder
        self.data = dataset
        self.max_length = max_length
        self.text = [row['text'] for row in self.data]
        self.labels = [row['target'] for row in self.data]

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the token ids, attention mask and float label of one record.

        Returns a dict with 'embedding' and 'mask' (1-D tensors of length
        ``max_length``) and 'label' (a Python float).
        """
        # Use the tokenizer handed to the constructor rather than a global
        # one, and the `padding` argument instead of the deprecated
        # `pad_to_max_length` flag.
        encoded = self.encoder(self.text[index],
                               max_length = self.max_length,
                               truncation = True,
                               padding = 'max_length')
        return {
            'embedding' : torch.tensor(encoded['input_ids']),
            'mask' : torch.tensor(encoded['attention_mask']),
            'label' : float(self.labels[index]),
        }

In [None]:
class NLPModel(nn.Module):
    """Pretrained RoBERTa encoder followed by dropout and a one-unit linear head.

    Parameters
    ----------
    pretraining : str
        Checkpoint name passed to ``RobertaModel.from_pretrained``.
    """

    def __init__(self, pretraining):
        super().__init__()
        # BERT alternative, kept for reference:
        #self.Bert = BertModel.from_pretrained(pretraining)
        self.Bert = RobertaModel.from_pretrained(pretraining)
        self.hidden_size = self.Bert.config.hidden_size
        self.cls = nn.Linear(self.hidden_size, 1)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x, mask):
        """Return one raw score per input sequence.

        ``x`` and ``mask`` are passed positionally to the encoder; element 1
        of its output (presumably the pooled sequence representation —
        confirm against the encoder's documentation) goes through dropout
        and the linear head. No sigmoid is applied here.
        """
        encoder_outputs = self.Bert(x, mask)
        pooled = encoder_outputs[1]
        return self.cls(self.dropout(pooled))

In [None]:
# Base learning rate: used for the classifier head and the top encoder layer.
lr = 1e-5
# Multiplicative learning-rate decay applied per encoder layer, top layer downwards.
lr_bert_decay = 0.95
# Number of passes over the training set.
epochs = 3
# File the best model weights (lowest validation loss) are saved to.
path = 'model.pth'

In [None]:
# Datasets and loaders; only the training loader is shuffled.
trainSet = disaster_data(train_data, encoder)
valSet = disaster_data(val_data, encoder)
trainLoader = DataLoader(trainSet, batch_size = 16, shuffle = True)
valLoader = DataLoader(valSet, batch_size = 16, shuffle = False)

# Fall back to the CPU when no CUDA device is available instead of crashing.
gpu = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = NLPModel(pretraining).to(gpu)

# Layer-wise learning-rate decay: the top encoder layer trains at `lr`, and
# each layer below it at `lr_bert_decay` times the rate of the layer above.
lr_bert_config = [{'params' : layer.parameters(),
                   'lr' : lr * (lr_bert_decay ** depth)}
                  for depth, layer in enumerate(reversed(list(model.Bert.encoder.layer)))]

# The pooler and the embeddings used to be in neither optimizer, so they were
# never updated even though gradients were computed for them. The pooler sits
# directly under the classifier head and therefore trains at the base rate;
# the embeddings sit below every encoder layer and get the most decayed rate.
bert_pooler = getattr(model.Bert, 'pooler', None)
if bert_pooler is not None:
    lr_bert_config.append({'params' : bert_pooler.parameters(), 'lr' : lr})
lr_bert_config.append({'params' : model.Bert.embeddings.parameters(),
                       'lr' : lr * (lr_bert_decay ** len(model.Bert.encoder.layer))})

# Summed (not averaged) loss; train() and evaluate() divide by the dataset size.
criterion = nn.BCEWithLogitsLoss(reduction = 'sum')
optimizer_cls = AdamW(model.cls.parameters(), lr)
optimizer_bert = AdamW(lr_bert_config)
# Linear warm-up followed by linear decay; applied to the encoder optimizer only.
scheduler = get_linear_schedule_with_warmup(optimizer = optimizer_bert,
                                            num_warmup_steps = 150,
                                            num_training_steps = len(trainLoader) * epochs)

In [None]:
def train():
    """Run one optimisation pass over ``trainLoader``.

    Uses the module-level ``model``, ``criterion``, both optimizers and the
    scheduler. Returns the accumulated batch losses divided by
    ``len(trainSet)``, rounded to 5 decimals.
    """
    model.train()
    running_loss = 0

    for batch in trainLoader:
        input_ids = batch['embedding'].to(gpu)
        attention = batch['mask'].to(gpu)
        targets = batch['label'].to(gpu)

        # Cast the scores to double to match the labels' dtype — presumably
        # float64, since the dataset stores them as Python floats (verify).
        scores = model(input_ids, attention).double()
        loss = criterion(scores, targets[:, None])
        running_loss += loss.item()

        optimizers = (optimizer_bert, optimizer_cls)
        for optimizer in optimizers:
            optimizer.zero_grad()
        loss.backward()
        for optimizer in optimizers:
            optimizer.step()

        # The schedule advances once per batch.
        scheduler.step()

    return round(running_loss / len(trainSet), 5)

In [None]:
def evaluate():
    """Score the module-level ``model`` on ``valLoader``.

    Returns
    -------
    pred : numpy.ndarray of bool
        Predicted class for every validation sample, one row per sample.
    loss : float
        Accumulated batch losses divided by ``len(valSet)``, rounded to
        5 decimals.
    accuracy : float
        Fraction of correct predictions, rounded to 4 decimals.
    """
    model.eval()

    total_loss = 0
    logits = []
    labelSet = []

    with torch.no_grad():
        for batch in valLoader:
            emb = batch['embedding'].to(gpu)
            label = batch['label'].to(gpu)
            mask = batch['mask'].to(gpu)

            output = model(emb, mask).double()

            total_loss += criterion(output, label[:, None]).item()
            logits.extend(output.cpu().numpy().tolist())
            labelSet.extend(label.cpu().numpy().tolist())

    # The model emits raw logits (it is trained with BCEWithLogitsLoss), so a
    # probability of 0.5 corresponds to a logit of 0. Thresholding the logits
    # at 0.5 would put the decision boundary at a probability of about 0.62.
    pred = np.array(logits) >= 0.0
    # Flatten explicitly so the predictions match the 1-D label list.
    accuracy = accuracy_score(labelSet, pred.ravel())

    return pred, round(total_loss / len(valSet), 5), round(accuracy, 4)

In [None]:
def run():
    """Train for ``epochs`` epochs, validating and checkpointing after each.

    The model weights are saved to ``path`` whenever the validation loss
    drops below the best value seen so far. Returns a list with one dict per
    epoch holding 'epoch', 'train_loss', 'eval_loss' and 'best_eval_loss'.
    """
    history = []
    best_eval_loss = float('inf')

    for epoch in range(epochs):
        train_loss = train()
        print(f'Epoch {epoch + 1}')
        print(f'Training loss : {train_loss}')
        print('Evaluating...')

        # The per-sample predictions are not needed here.
        _, eval_loss, accuracy = evaluate()
        print(f'Validation loss : {eval_loss} | Accuracy : {accuracy}')

        improved = eval_loss < best_eval_loss
        if improved:
            best_eval_loss = eval_loss
            torch.save(model.state_dict(), path)
            print(f'New eval loss was generated, the current best one is {best_eval_loss}')

        epoch_record = {
            'epoch' : epoch + 1,
            'train_loss' : train_loss,
            'eval_loss' : eval_loss,
            'best_eval_loss' : best_eval_loss,
        }
        history.append(epoch_record)

    return history

In [None]:
def plot_error(log):
    """Plot training and validation loss per epoch, side by side.

    ``log`` is a list of dicts with 'epoch', 'train_loss' and 'eval_loss'
    entries, as returned by ``run``.
    """
    epoch_axis = [entry['epoch'] for entry in log]
    curves = (
        ('Training Error', [entry['train_loss'] for entry in log]),
        ('Validation Error', [entry['eval_loss'] for entry in log]),
    )

    plt.figure(figsize = (10, 5))

    # One subplot per curve: training on the left, validation on the right.
    for position, (y_label, losses) in enumerate(curves, start = 1):
        plt.subplot(1, 2, position)
        axis = sns.lineplot(x = epoch_axis, y = losses)
        axis.set(xlabel = 'Epochs', ylabel = y_label)

In [None]:
# Train the model (the best weights are saved to `path` inside run()) and
# plot the per-epoch training and validation losses.
log = run()
plot_error(log)

In [None]:
class Disaster_test_set(Dataset):
    """Unlabelled tweets, tokenised on access, for inference.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a 'text' column (the tweet) and an 'id' column.
    encoder : callable
        Tokenizer used to encode each tweet. It is called with the text plus
        ``max_length``, ``truncation`` and ``padding`` and must return a
        mapping containing 'input_ids' and 'attention_mask'.
    max_length : int, optional
        Number of tokens every sample is truncated or padded to
        (default 30).
    """

    def __init__(self, dataset, encoder, max_length = 30):
        super().__init__()
        self.encoder = encoder
        self.data = dataset
        self.max_length = max_length
        self.text = list(self.data['text'])
        # Keep the ids as a plain list so that __getitem__ looks them up by
        # position; indexing the Series directly is a label lookup and fails
        # when the frame does not carry the default 0..n-1 index.
        self.id = dataset['id'].tolist()

    def __len__(self):
        """Return the number of tweets."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the id, token ids and attention mask of one tweet.

        Returns a dict with 'id', 'embedding' and 'mask'; the latter two are
        1-D tensors of length ``max_length``.
        """
        # Use the tokenizer handed to the constructor rather than a global
        # one, and the `padding` argument instead of the deprecated
        # `pad_to_max_length` flag.
        encoded = self.encoder(self.text[index],
                               max_length = self.max_length,
                               truncation = True,
                               padding = 'max_length')
        return {
            'id' : self.id[index],
            'embedding' : torch.tensor(encoded['input_ids']),
            'mask' : torch.tensor(encoded['attention_mask']),
        }

In [None]:
# Load the unlabelled test tweets; order is preserved (no shuffling) so the
# predictions line up with the ids.
test_pd = pd.read_csv('test.csv')
testSet = Disaster_test_set(test_pd, encoder)
testLoader = DataLoader(testSet, batch_size = 32, shuffle = False)

# Rebuild the network and restore the best checkpoint written during training.
model = NLPModel(pretraining)
# Load the weights onto the CPU first so the checkpoint can be restored
# regardless of the device it was saved from; the model is moved afterwards.
model.load_state_dict(torch.load(path, map_location = 'cpu'))
model = model.to(gpu)
# A freshly built module is in training mode; switch to evaluation mode so
# that dropout is disabled during inference.
model.eval()

In [None]:
def predict():
    """Predict a class for every sample in ``testLoader``.

    Uses the module-level ``model`` and ``gpu``.

    Returns
    -------
    ids : list of int
        The 'id' of each sample, in loader order.
    prediction : list of int
        1 where the predicted probability is at least 0.5, else 0.
    """
    # Disable dropout and gradient tracking for inference.
    model.eval()

    ids = []
    prediction = []

    with torch.no_grad():
        for batch in testLoader:
            emb = batch['embedding'].to(gpu)
            mask = batch['mask'].to(gpu)
            ids.extend(batch['id'].tolist())

            # reshape(-1) keeps a 1-D result even for a batch of size 1,
            # where squeeze() would collapse the output to a scalar.
            logits = model(emb, mask).reshape(-1).cpu().numpy()
            # The model emits raw logits, so a probability of 0.5
            # corresponds to a logit of 0.
            prediction.extend((logits >= 0.0).astype(int).tolist())

    return ids, prediction

In [None]:
# Write the predictions as a two-column CSV (id, target) without the index.
# NOTE(review): `id` shadows the builtin and the file name is spelled
# 'submisson.csv'; both are kept as-is so that nothing relying on them breaks.
id, pred = predict()
testFrame = pd.DataFrame({'id' : id, 'target' : pred})
testFrame.to_csv('submisson.csv', index = False)