## Importing required modules

In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from transformers import BertModel, BertTokenizer

## Reading the data into memory and splitting it into training and validation sets

In [None]:
# Load the CSV and hold out 30% of the rows for validation.
train_pd = pd.read_csv('train.csv')
train_text, val_text, train_label, val_label = train_test_split(
    train_pd['text'],
    train_pd['target'],
    test_size=0.3,
    random_state=2020,  # fixed seed so the split is the same on every run
)

# Re-pack each split as a list of {'text', 'target'} records for the Dataset class.
train_data = [dict(text=sample, target=target) for sample, target in zip(train_text, train_label)]
val_data = [dict(text=sample, target=target) for sample, target in zip(val_text, val_label)]

In [None]:
# Name of the pretrained BERT checkpoint; the same name is later passed to NLPModel
# so the tokenizer and the model weights come from the same checkpoint.
pretraining = 'bert-base-uncased'
# Tokenizer loaded for that checkpoint; handed to the disaster_data datasets.
encoder = BertTokenizer.from_pretrained(pretraining)

In [None]:
class disaster_data(Dataset):
    """Dataset that tokenizes one text sample per lookup.

    Each item is a dict with:
        'embedding': 1-D tensor of token ids, padded/truncated to ``max_length``
        'mask':      1-D attention-mask tensor of the same length
        'label':     the sample's target as a Python float
    """

    def __init__(self, dataset, encoder, max_length=30):
        """
        Args:
            dataset: sequence of dicts, each with a 'text' and a 'target' key.
            encoder: tokenizer; called as ``encoder(text, max_length=...,
                truncation=True, padding='max_length')`` and expected to return
                a mapping with 'input_ids' and 'attention_mask'.
            max_length: fixed token length every sample is padded/truncated to.
        """
        super(disaster_data, self).__init__()
        self.encoder = encoder
        self.data = dataset
        self.max_length = max_length
        self.text = [row['text'] for row in self.data]
        self.labels = [row['target'] for row in self.data]

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the sample at ``index`` and return it with its label."""
        # Use the tokenizer given to this dataset rather than a module-level
        # global, so the `encoder` argument is actually honoured.
        # padding='max_length' is the supported spelling of the deprecated
        # pad_to_max_length=True flag.
        encoded = self.encoder(
            self.text[index],
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
        )
        return {
            'embedding': torch.tensor(encoded['input_ids']),
            'mask': torch.tensor(encoded['attention_mask']),
            'label': float(self.labels[index]),
        }

In [None]:
class NLPModel(nn.Module):
    """BERT encoder followed by a three-layer MLP that emits one logit per sample."""

    def __init__(self, pretraining, inter_size=(1024, 16)):
        """
        Args:
            pretraining: checkpoint name or path passed to ``BertModel.from_pretrained``.
            inter_size: list or tuple of exactly two ints, the widths of the two
                hidden layers of the classifier head.

        Raises:
            TypeError: if ``inter_size`` is not a list or tuple.
            ValueError: if ``inter_size`` does not have exactly two entries.
        """
        super(NLPModel, self).__init__()
        # Validate with real exceptions (asserts vanish under `python -O`) and
        # before the checkpoint is loaded, so bad arguments fail fast.
        if not isinstance(inter_size, (list, tuple)):
            raise TypeError('inter_size must be list')
        if len(inter_size) != 2:
            raise ValueError('The Length of inter_size must be 2')
        first_width, second_width = inter_size

        self.Bert = BertModel.from_pretrained(pretraining)
        self.hidden_size = self.Bert.config.hidden_size
        self.cls = nn.Sequential(
            nn.Linear(self.hidden_size, first_width),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(first_width, second_width),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(second_width, 1)
        )

    def forward(self, x, mask):
        """Return the classifier output for token ids ``x`` and attention mask ``mask``.

        The first element of the BERT output is sliced at sequence position 0
        (the [CLS] token when the inputs come from BertTokenizer) and fed to
        the classifier head. The result has one value per sample; no sigmoid
        is applied here.
        """
        cls_emb = self.Bert(x, attention_mask=mask)[0][:, 0, :]
        prediction = self.cls(cls_emb)
        return prediction

In [None]:
# Learning rate given to the optimizer, which is built over the classifier head's parameters only.
lr_cls = 0.0001
# Number of train/evaluate passes performed by the epoch loop.
epochs = 100

In [None]:
# Wrap both splits in datasets and loaders; only the training loader is shuffled.
trainSet = disaster_data(train_data, encoder)
valSet = disaster_data(val_data, encoder)
trainLoader = DataLoader(trainSet, batch_size=64, shuffle=True)
valLoader = DataLoader(valSet, batch_size=64, shuffle=False)
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA. The name `gpu` is kept because later cells use it.
gpu = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = NLPModel(pretraining).to(gpu)
# NOTE: the loss is unweighted; class weighting (compute_class_weight is
# imported above) is not applied.
criterion = nn.BCEWithLogitsLoss()
# Only the classifier head's parameters are handed to the optimizer.
optimizer = torch.optim.AdamW(model.cls.parameters(), lr=lr_cls)

In [None]:
# Freeze the BERT encoder so autograd does not compute gradients for it.
# The attribute must be spelled `requires_grad`; assigning to any other name
# just attaches an unused attribute and leaves gradient tracking switched on.
for param in model.Bert.parameters():
    param.requires_grad = False

In [None]:
def train():
    """Run one optimisation pass over ``trainLoader``.

    Returns:
        The mean per-batch loss, rounded to 5 decimal places.
    """
    model.train()

    batch_losses = []
    for batch in trainLoader:
        token_ids = batch['embedding'].to(gpu)
        attention = batch['mask'].to(gpu)
        targets = batch['label'].to(gpu)

        # Cast the logits to double so their dtype matches the targets
        # (presumably float64, since the dataset yields Python floats).
        logits = model(token_ids, attention).double()
        loss = criterion(logits, targets.unsqueeze(1))
        batch_losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return round(sum(batch_losses) / len(trainLoader), 5)

In [None]:
def evaluate():
    """Score the model on ``valLoader`` without updating any weights.

    Returns:
        A tuple ``(pred, loss, accuracy)``:
            pred:     1-D CPU float tensor of 0/1 predictions, in loader order.
            loss:     mean per-batch loss, rounded to 5 decimal places.
            accuracy: fraction of correctly classified samples, rounded to
                      2 decimal places.

    Raises:
        ZeroDivisionError: if ``valLoader`` yields no batches.
    """
    model.eval()

    total_loss = 0
    pred_chunks = []
    label_chunks = []

    with torch.no_grad():
        for batch in valLoader:
            emb = batch['embedding'].to(gpu)
            label = batch['label'].to(gpu)
            mask = batch['mask'].to(gpu)

            output = model(emb, mask).double()

            loss = criterion(output, label[:, None])
            total_loss += loss.item()
            # view(-1) instead of squeeze(): a final batch holding a single
            # sample must stay 1-D, otherwise torch.cat rejects the 0-d tensor.
            pred_chunks.append(torch.round(torch.sigmoid(output)).view(-1).float().cpu())
            label_chunks.append(label.float().cpu())

    pred = torch.cat(pred_chunks) if pred_chunks else torch.tensor([])
    labelSet = torch.cat(label_chunks) if label_chunks else torch.tensor([])

    # Accuracy is correct predictions over the number of SAMPLES; dividing by
    # the number of batches would inflate it by roughly the batch size.
    accuracy = torch.eq(pred, labelSet).sum().item() / labelSet.numel()
    return pred, round(total_loss / len(valLoader), 5), round(accuracy, 2)

In [None]:
# Best validation loss seen so far. It is initialised once, before the loop,
# so the comparison below is against earlier epochs rather than always
# against infinity.
best_eval_loss = float('inf')
for epoch in range(epochs):
    train_loss = train()
    print(f'Epoch {epoch + 1}')
    print(f'Training loss : {train_loss}')
    print('Evaluating...')
    pred, eval_loss, accuracy = evaluate()
    print(f'Validation loss : {eval_loss} | Accuracy : {accuracy}')
    if eval_loss < best_eval_loss:
        best_eval_loss = eval_loss
        print(f'New eval loss was generated, the current best one is {best_eval_loss}')