# NLP with Disaster Tweets

This notebook contains code for training BERT using Pytorch and Hugging Face on the dataset used in the Kaggle Competition Natural Language Processing with Disaster Tweets (https://www.kaggle.com/competitions/nlp-getting-started/)

# Dataset

## Files

train.csv - the training set

test.csv - the test set

sample_submission.csv - a sample submission file to submit the predictions on Kaggle

## Columns

id - id for each tweet

text - the text of the tweet

location - the location the tweet was sent from

keyword - a particular keyword from the tweet

target - in train.csv only, this denotes whether a tweet is about a real disaster (1) or not (0)

### The goal is to predict if the tweet is about a disaster or not.

In [1]:
# Import Libraries
import os
import gc
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.nn import functional as F
import transformers
from torch.utils.data import DataLoader, Dataset

In [2]:
#Model hyperparameter configs
class CONFIG:
    MODEL_PATH = './model/'
    SAVE_EVERT = 10
    EPOCHS = 10
    BATCH_SIZE = 16
    LEARNING_RATE = 1e-5
    TRAIN_TEST_SPLIT = 0.3

In [3]:
# BERT CLass
class BERT(nn.Module):
    def __init__(self, bert_model_name, num_labels, dropout=0.1, freeze_bert=True):
        super(BERT, self).__init__()
        # Import BERT from HuggingFace Transformer library
        self.bert = transformers.BertModel.from_pretrained(bert_model_name)
        # Freeze weights of Pretrained BERT Model. These weights will not be
        # updated during training.
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        # Add Dropout Layer
        self.dropout = nn.Dropout(dropout)
        # Add a Classifier Layer for finetune the model and get the output=number of labels.
        self.classifier = nn.Linear(768, num_labels)
        self.num_labels = num_labels
    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        # Forward Function to pass the data.
        # Pass the data thriugh BERT
        _, pooled_output = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, return_dict=False)
        # Add Dropout Layer to the BERT Output
        pooled_output = self.dropout(pooled_output)
        # Get Classifier Output Logits
        logits = self.classifier(pooled_output)
        return logits

In [4]:
class DisasterTweetsDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=512, train=True):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.df = df
        self.train = train
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Get Text Data
        text = str(self.df.iloc[idx]["text"])
        if self.train:
            # Get labels
            targets = torch.tensor(self.df.iloc[idx]["target"], dtype=torch.long)
        
        # Use the BERT Tokenizer on the Inputs.
        # Add special tokens for start and end of text, pad the input texts to get them in equal length,
        # Get attention mask on the padded inputs, get token_type_ids and return the outputs in the form 
        # of Pytorch Tensor.
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors="pt",
        )
        input_ids, attention_mask, token_type_ids = inputs["input_ids"], inputs["attention_mask"], inputs["token_type_ids"]
        # In case of training, there is an additional targets output.
        if self.train:
            return {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "token_type_ids": token_type_ids,
                "targets": targets,
                }
        else:
            return {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "token_type_ids": token_type_ids,
                }


In [5]:
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
print("loading data")
# Get Train and Validation Data. Get the Train-Val split from the CONFIG.
df = pd.read_csv("../input/nlp-getting-started/train.csv")
train_df, valid_df = train_test_split(df, test_size=CONFIG.TRAIN_TEST_SPLIT, random_state=42)
test_df = pd.read_csv("../input/nlp-getting-started/test.csv")
# Get the sample submission to submit the prediction to Kaggle.
sub = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")

In [6]:
# Load the bert model to the device. Number of labels is 1 because there is one output field that can be 
# 0 or 1 i.e. it is a disaster (1) or not (0).
model = BERT("../input/huggingface-bert/bert-base-cased", num_labels=1, dropout=0.1).to(device)
# Get tokenizer output.
tokenizer = transformers.BertTokenizer.from_pretrained("../input/huggingface-bert/bert-base-cased")
# Get train, valid and test datasets
train_dataset = DisasterTweetsDataset(train_df, tokenizer, max_len=512, train=True)
valid_dataset = DisasterTweetsDataset(valid_df, tokenizer, max_len=512, train=True)
test_dataset = DisasterTweetsDataset(test_df, tokenizer, max_len=512, train=False)
# Use AdamW Optimizer
optimizer = transformers.AdamW(model.parameters(), lr=2e-5)
# Use Learning rate scheduler
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataset) * 5,
)
# Loss Function
criterion = nn.BCEWithLogitsLoss()

In [8]:
def train_one_epoch(model, optimizer, scheduler, train_dataloader, device):
    # Call the model training function.
    model.train()
    # Training Loss
    total_loss = 0
    bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader))
    for i, data in bar:
        # Zero grad for optimizer before starting the training batch.
        optimizer.zero_grad()
        # Pass the input_ids, attention_mask, token_type_ids to the model.
        input_ids = data["input_ids"].to(device).squeeze(1)
        attention_mask = data["attention_mask"].to(device).squeeze(1)
        token_type_ids = data["token_type_ids"].to(device).squeeze(1)
        targets = data["targets"].to(device).unsqueeze(1)
        out = model(input_ids, attention_mask, token_type_ids)
        # Get the model loss.
        loss = criterion(out, targets.float())
        bar.set_postfix({
                "Train Loss": "{:.6f}".format(abs(loss)),}
        )
        # Backpropagate the loss.
        loss.backward()
        # Update the model parameters.
        optimizer.step()
        # Update the learning rate scheduler.
        scheduler.step()
        # Add the batch loss to the epoch loss.
        total_loss += loss.item()
    # Normalize the loss.
    return total_loss / len(train_dataloader)

In [9]:
def valid_one_epoch(model, valid_dataloader, device):
    # Change the model to eval mode.
    model.eval()
    # Validation Loss
    total_loss = 0
    with torch.no_grad():
        bar = tqdm(enumerate(valid_dataloader), total=len(valid_dataloader))
        for i, data in bar:
            # Pass the input_ids, attention_mask, token_type_ids to the model.
            input_ids = data["input_ids"].to(device).squeeze(1)
            attention_mask = data["attention_mask"].to(device).squeeze(1)
            token_type_ids = data["token_type_ids"].to(device).squeeze(1)
            targets = data["targets"].to(device).unsqueeze(1)
            out = model(input_ids, attention_mask, token_type_ids)
            # Get the model loss.
            loss = criterion(out, targets.float())
            bar.set_postfix({
                "Valid Loss": "{:.6f}".format(abs(loss)),}
            )
            # Add the batch loss to the epoch loss.
            total_loss += loss.item()
    # Normalize the loss.
    return total_loss / len(valid_dataloader)


In [10]:
def get_dataloader(train_dataset, valid_dataset, batch_size=CONFIG.BATCH_SIZE):
    # Get the Dataloader for Train and Validation datasets.
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    return train_dataloader, valid_dataloader


In [11]:
# Store train and valid losses after each epoch.
train_losses = []
valid_losses = []
def train():
    # Get the train and valid dataloader
    train_dataloader, valid_dataloader = get_dataloader(train_dataset, valid_dataset)
    # Update the best valid loss each
    best_valid_loss = float("inf")
    for epoch in range(1, CONFIG.EPOCHS + 1):
        # Get the train loss for the epoch.
        train_loss = train_one_epoch(model, optimizer, scheduler, train_dataloader, device)
        # Get the validation loss for the epoch.
        valid_loss = valid_one_epoch(model, valid_dataloader, device)
        # Store the train and valid loss in the list.
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)
        print(f"Epoch {epoch}: Training Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}")
        # Update the best valid loss each time a model gets better performance.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), "best_model.bin")    


In [12]:
train()

### Train Loss = 0.6576
### Validation Loss= 0.6454