In [None]:
# Mount Google Drive into the Colab VM at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Folder inside "My Drive" that holds this project; it is appended to
# sys.path and becomes the working directory below.
FOLDERNAME = 'Kaggle/Feedback_Prize_ELL/Silver'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Now that Drive is mounted, make Python files stored in the project folder
# importable from the Colab interpreter.
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))
# Switch the working directory to the project folder so that relative paths
# used later in the notebook resolve against it.
%cd /content/drive/My\ Drive/$FOLDERNAME

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Kaggle/Feedback_Prize_ELL/Silver


In [None]:
# Install the third-party packages this notebook imports.
# NOTE(review): transformers and wandb are installed unpinned, so a fresh
# runtime may pull different releases than the ones this notebook was written against.
!pip install transformers
!pip install -q iterative-stratification==0.1.7
!pip install wandb

In [None]:
import os
import gc
import sys
import random
import platform

import numpy as np
import pandas as pd
from rich import progress
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader

import transformers
from transformers import get_cosine_schedule_with_warmup

from sklearn.metrics import mean_squared_error
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import wandb
import warnings
warnings.simplefilter('ignore')

In [None]:
# CSV locations, relative to the current working directory.
TRAIN_PATH = "../input/train.csv"
TEST_PATH = "../input/test.csv"

In [None]:
# Load both CSV files into DataFrames.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

# Custom Functions

In [None]:
def wandb_log(**kwargs):
    """Log every keyword argument to Weights & Biases as its own metric.

    Args:
        **kwargs: ``name=value`` pairs; each pair is sent through a separate
            ``wandb.log({name: value})`` call, in the order given.
    """
    # dict exposes ``.items()``; the previous ``.item()`` raised AttributeError
    # on the first call.
    for k, v in kwargs.items():
        wandb.log({k: v})

def seed_everything(seed=42):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED``, and switches cuDNN to deterministic mode.

    Args:
        seed (int): Seed value applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; this is a no-op
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode autotunes kernels per run, which can pick different
    # (non-deterministic) algorithms; turn it off for reproducibility.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=42)

## MCRMSE - Mean Columnwise Root Mean Square Error
MCRMSE is the mean of the per-column RMSE values, where each column is a separate target variable.

$$MCRMSE = \frac{1}{m} \sum_{j=1}^m \sqrt{\frac{1}{n} \sum_{i=1}^{n} (y_{ij} - \hat{y}_{ij})^2}$$

where:
- $m$ is the number of predicted variables (number of columns)
- $n$ is number of test samples (number of rows)
- $y_{ij}$ - $i^{th}$ actual value of $j^{th}$ variable.
- $\hat{y}_{ij}$ - $i^{th}$ predicted value of $j^{th}$ variable.

The part inside the $\sqrt{(\cdot)}$ is the original RMSE of each column.

In [None]:
def MCRMSE(y_trues, y_preds):
    '''
    Mean Columnwise Root Mean Squared Error.

    Computes the RMSE of every target column separately and returns the
    mean of those per-column values.

    Args:
        y_trues: array-like of shape (N, M) with the ground-truth values,
            N samples by M target columns.
        y_preds: array-like of shape (N, M) with the predictions.

    Returns:
        The MCRMSE as a NumPy float.

    Raises:
        ValueError: if the inputs are not 2-D, are empty, or differ in shape.
    '''
    y_trues = np.asarray(y_trues, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)

    if y_trues.ndim != 2 or y_trues.shape != y_preds.shape:
        raise ValueError(
            'y_trues and y_preds must both be 2-D with the same shape, got {} and {}'.format(
                y_trues.shape, y_preds.shape
            )
        )
    if y_trues.size == 0:
        raise ValueError('y_trues and y_preds must not be empty')

    # RMSE per column (axis=0 averages over samples), computed directly in
    # NumPy so the metric does not depend on scikit-learn's `squared=False`
    # argument, which has been deprecated and removed in newer releases.
    column_rmse = np.sqrt(np.mean((y_trues - y_preds) ** 2, axis=0))
    return column_rmse.mean()


In [22]:
# One checkpoint name drives both the backbone and the tokenizer so the two
# can never drift apart (the tokenizer used to be hard-coded to 'roberta-base'
# while the model was 'roberta-large').
MODEL_NAME = 'roberta-large'

# Central hyper-parameter / resource dictionary read by the rest of the notebook.
Config = {
    # Batch sizes for the training and validation DataLoaders.
    'TRAIN_BS': 16,
    'VALID_BS': 16,
    'MODEL_NAME': MODEL_NAME,
    'TOKENIZER': transformers.AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True),
    'NUM_WORKERS': 8,
    # GradScaler scales the loss during mixed-precision training so that small
    # gradients do not underflow in half precision.
    'scaler': GradScaler(),
    'FILE_PATH': '../input/feedback-prize-english-language-learning/train.csv',
    # Loss used for the multi-target regression. SmoothL1 combines the
    # qualities of MSE and MAE: it is smooth around zero but less sensitive to
    # outliers than MSE, switching between an L2 and an L1 loss at a threshold.
    'LOSS': 'SmoothL1Loss',
    'EVAL_METRIC': 'MCRMSE',
    'NB_EPOCHS': 5,
    # Number of cross-validation folds.
    'SPLITS': 5,
    # T_0 and η_min are passed to CosineAnnealingWarmRestarts as T_0 / eta_min.
    'T_0': 20,
    'η_min': 1e-4,
    'fc_dropout': 0.2,
    'betas': (0.9, 0.999),
    # Maximum token length used for padding / truncation.
    'MAX_LEN': 200,
    # Number of regression targets predicted per essay.
    'N_LABELS': 6,
    # NOTE(review): 2e-4 looks high for fine-tuning a large transformer
    # backbone — confirm before a full run.
    'LR': 2e-4,
    'competition': 'feedback_3',
    '_wandb_kernel': 'tanaym',
}


Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## Weights and Biases

Weights & Biases (WandB) is a developer tool that helps teams turn deep learning research projects into deployed software by tracking their models, visualizing model performance, and automating training and model improvement. We will use it to log hyperparameters and output metrics from our runs, then visualize and compare results and share findings with colleagues.



In [None]:
# TODO

# Dataset

In [None]:
class FeedBackDataset(Dataset):
    """Torch dataset that tokenizes essays and (optionally) returns their scores.

    Args:
        data: DataFrame with a ``full_text`` column and, unless ``is_test`` is
            true, the six score columns listed in ``LABEL_COLS``.
        is_test (bool): When true no label columns are read and
            ``__getitem__`` returns only the tokenized inputs.
    """

    LABEL_COLS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

    def __init__(self, data, is_test=False):
        self.data = data
        self.is_test = is_test
        # Extract the underlying numpy arrays once instead of on every item
        # lookup. Labels are skipped in test mode because the test frame has
        # no score columns.
        self.texts = data['full_text'].values
        self.labels = None if is_test else data[self.LABEL_COLS].values

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(inputs, targets)`` for training data or ``inputs`` for test data."""
        inputs = self._tokenize_texts(self.texts[idx])

        if self.is_test:
            # Previously this branch fell through and returned None.
            return inputs

        targets = torch.tensor(self.labels[idx], dtype=torch.float)
        return inputs, targets

    def _tokenize_texts(self, text):
        """Tokenize one text to ``Config['MAX_LEN']`` tokens and return a dict of long tensors."""
        encoded = Config['TOKENIZER'](
            text,
            return_tensors=None,  # plain Python lists; converted to tensors below
            add_special_tokens=True,
            max_length=Config['MAX_LEN'],
            padding='max_length',  # replaces the deprecated pad_to_max_length=True
            truncation=True,
        )

        # Token ids and masks are integer data, stored as 64-bit tensors.
        return {k: torch.tensor(v, dtype=torch.long) for k, v in encoded.items()}


# Data preprocessing

Typical text-preprocessing steps such as removing stopwords and punctuation are generally unnecessary — and often discouraged — when training BERT and similar transformer-based models. BERT is designed to learn from raw text and to handle linguistic features such as stopwords and punctuation on its own, so traditional preprocessing can actually hurt the performance of transformer models. Here's why:

\\

 1. **Contextual Learning**: BERT is a contextual language model that learns representations by considering the surrounding words in a sentence. Removing stopwords can disrupt the context and structure of the sentence, which is crucial for BERT's understanding.

2. **Positional Information**: Transformer models rely on positional information of words in a sentence. Punctuation marks contribute to this positional information, and removing them might negatively impact the model's ability to understand sentence structure.

3. **Fine-tuning**: When fine-tuning BERT for specific downstream tasks (e.g., sentiment analysis, text classification), it's recommended to keep the input text as close to the original as possible. Removing stopwords might remove important information that the model needs to make accurate predictions.

4. **Efficiency**: Transformer models like BERT can handle a wide range of linguistic variations and complexities. Removing stopwords might not provide significant benefits in terms of model efficiency or performance improvement.

5. **Tokenization**: BERT tokenizes input text into subword tokens, including punctuation and special characters. Removing punctuation before tokenization could lead to tokenization mismatch between your preprocessed data and the model's tokenizer.

However, we can still apply light cleaning: lowercasing the text, removing links, and removing words that contain numbers.

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
def clean_text(text):
    '''Lowercase the text, then strip links and digit-bearing tokens.

    Steps, in order:
    1. Convert the input to ``str`` and lowercase it (so a missing value
       such as NaN becomes the string "nan").
    2. Remove links starting with ``http://``, ``https://`` or ``www.``.
    3. Remove every word that contains at least one digit (e.g. "running123"),
       to reduce noise for models that work on word-level semantics.

    Removed spans are replaced by the empty string, so surrounding whitespace
    is left untouched. Punctuation and bracketed text are NOT removed.

    Args:
        text: Value to clean; anything accepted by ``str()``.

    Returns:
        str: The cleaned, lowercased text.
    '''
    text = str(text).lower()
    # Raw strings keep `\S`, `\w` and `\d` as regex escapes instead of
    # invalid Python string escapes (a warning on current Python versions).
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Clean both splits with the same regex-based routine (faster than a manual
# loop with condition checks) and report the row count of each.
for frame in (train, test):
    frame['full_text'] = frame['full_text'].apply(clean_text)
    print(len(frame))

3911
3


# Model

The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.

In [None]:
class FeedBackModel(nn.Module):
    """Transformer backbone with mean pooling and a linear regression head.

    The backbone named by ``Config['MODEL_NAME']`` produces per-token hidden
    states; these are averaged over the non-padding tokens, passed through
    dropout, and projected to ``Config['N_LABELS']`` outputs.
    """

    def __init__(self):
        super(FeedBackModel, self).__init__()
        self.backbone = transformers.AutoModel.from_pretrained(Config['MODEL_NAME'])
        self.drop = nn.Dropout(0.3)
        # Read the embedding width from the loaded checkpoint instead of
        # guessing it from the model name (1024 for "large", 768 otherwise).
        self.fc = nn.Linear(self.backbone.config.hidden_size, Config['N_LABELS'])

    def forward(self, input_ids, attention_mask=None):
        '''
        Run the backbone and return one row of predictions per sequence.

        The backbone output holds, among others:
        (1). last_hidden_state: raw per-token hidden states.
        (2). pooler_output: pooled representation of the first token, which is
        commonly used for classification fine-tuning. Here masked average
        pooling over all tokens is used instead.

        Args:
            input_ids: Tensor of token ids, or a dict-like batch holding
                ``input_ids`` and ``attention_mask`` (so the model can also be
                called as ``model(batch)``).
            attention_mask: Tensor marking real tokens with 1 and padding
                with 0. Defaults to all ones when omitted.

        Returns:
            Tensor with one row per sequence and ``Config['N_LABELS']`` columns.
        '''
        # Accept a whole tokenizer batch passed as the single positional argument.
        if attention_mask is None and hasattr(input_ids, 'keys'):
            batch = input_ids
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # Get backbone outputs
        outputs_dict = self.backbone(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        hidden = outputs_dict['last_hidden_state']

        # Masked average pooling: zero out padding positions, sum over the
        # sequence axis and divide by the number of real tokens.
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(dim=1)
        # clamp guards against division by zero for an all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        avg_pooled_output = summed / token_counts

        output = self.drop(avg_pooled_output)
        output = self.fc(output)
        return output


# Optimizer

In [None]:
def yield_optimizer(model):
    """
    Build an AdamW optimizer with weight decay disabled for biases and LayerNorm.

    Parameters whose name contains "bias", "LayerNorm.bias" or
    "LayerNorm.weight" go into a group with ``weight_decay=0.0``; every other
    parameter uses ``weight_decay=0.003``.

    Args:
        model: ``nn.Module`` whose ``named_parameters()`` are optimized.

    Returns:
        ``torch.optim.AdamW`` configured with ``Config['LR']`` and, when
        present, ``Config['betas']``.
    """
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")
    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        if any(nd in name for nd in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)

    optimizer_parameters = [
        {"params": decay_params, "weight_decay": 0.003},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]
    # `transformers.AdamW` is deprecated (and missing from newer releases);
    # the PyTorch implementation is the supported replacement. eps=1e-6 keeps
    # the default the transformers optimizer used.
    return torch.optim.AdamW(
        optimizer_parameters,
        lr=Config['LR'],
        betas=Config.get('betas', (0.9, 0.999)),
        eps=1e-6,
    )

# Trainer Class

In [27]:
class Trainer:
    """Runs training and validation epochs for one model and tracks the best weights.

    Args:
        dataloaders: ``(train_loader, valid_loader)`` pair. Each loader yields
            ``(inputs, targets)`` batches where ``inputs`` is a dict holding
            ``input_ids`` and ``attention_mask``.
        optimizer: Optimizer stepped once per training batch.
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
            It is expected to already live on ``device``.
        loss_fns: ``(train_loss_fn, valid_loss_fn)`` pair.
        scheduler: Learning-rate scheduler stepped once per training batch.
        device: Device (string or ``torch.device``) batches are moved to.
    """

    def __init__(self, dataloaders, optimizer, model, loss_fns, scheduler, device='cuda:0'):
        # dataloaders
        self.train_loader, self.valid_loader = dataloaders
        # define loss functions
        self.train_loss_fn, self.valid_loss_fn = loss_fns

        self.scheduler = scheduler
        self.optimizer = optimizer
        self.model = model

        self.device = torch.device(device)

    # Helpers (previously nested inside train_one_epoch, where `self.<helper>`
    # lookups could not find them).
    def _tensor_check(self, x):
        """Return True when ``x`` is already a ``torch.Tensor``."""
        return isinstance(x, torch.Tensor)

    def _convert_if_not_tensor(self, x, dtype):
        """Move ``x`` to ``self.device`` as a tensor.

        ``dtype="infer"`` keeps a tensor's current dtype (or lets torch infer
        one for non-tensor input); any other value is used as the target dtype.
        """
        if self._tensor_check(x):
            if dtype == "infer":
                dtype = x.dtype
            return x.to(self.device, dtype=dtype)
        if dtype == "infer":
            return torch.as_tensor(x, device=self.device)
        return torch.tensor(x, dtype=dtype, device=self.device)

    def _forward(self, inputs):
        """Call the model with the two tensors it needs from the batch dict."""
        return self.model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
        )

    # Training procedure
    def train_one_epoch(self):
        '''
        Trains the model for 1 epoch.

        Returns:
            dict with ``train_preds`` and ``train_targets`` (lists of rows) and
            ``loss_history`` (mean batch loss of the epoch).
        '''
        self.model.train()

        # Mixed precision (autocast + GradScaler) only applies on CUDA.
        use_amp = self.device.type == 'cuda'

        # training loop for 1 epoch
        train_pbar = tqdm(enumerate(self.train_loader), total=len(self.train_loader))
        train_preds, train_targets = [], []
        total_loss = 0.0
        self.optimizer.zero_grad()

        for bnum, (inputs, targets) in train_pbar:
            inputs = {k: self._convert_if_not_tensor(v, dtype="infer") for k, v in inputs.items()}
            targets = self._convert_if_not_tensor(targets, dtype=torch.float)

            # Only the forward pass and the loss run under autocast; the
            # backward pass and optimizer step happen outside of it.
            with autocast(enabled=use_amp):
                outputs = self._forward(inputs)
                loss = self.train_loss_fn(outputs, targets)

            # backpropagation
            if use_amp:
                scaler = Config['scaler']
                scaler.scale(loss).backward()
                scaler.step(self.optimizer)
                scaler.update()
            else:
                loss.backward()
                self.optimizer.step()
            self.optimizer.zero_grad()
            # scheduler adjusts the learning rate after every batch
            self.scheduler.step()

            # accumulate batch loss
            batch_loss = loss.item()
            total_loss += batch_loss

            # Console log
            train_pbar.set_description('loss: {:.2f}'.format(batch_loss))

            train_targets.extend(targets.detach().cpu().numpy().tolist())
            train_preds.extend(outputs.detach().float().cpu().numpy().tolist())

        # tidy
        gc.collect()
        torch.cuda.empty_cache()

        n_batches = max(len(self.train_loader), 1)
        return {'train_preds': train_preds, 'train_targets': train_targets, 'loss_history': total_loss / n_batches}

    #-------------------------------------------
    def valid_one_epoch(self):
        '''
        Validate the model for 1 epoch.

        Returns:
            dict with ``valid_preds`` and ``valid_targets`` (lists of rows) and
            ``loss_history`` (mean batch loss of the epoch).
        '''
        self.model.eval()

        valid_pbar = tqdm(enumerate(self.valid_loader), total=len(self.valid_loader))
        valid_preds, valid_targets = [], []
        total_valid_loss = 0.0

        # No gradients are needed for evaluation; skipping them saves memory.
        with torch.no_grad():
            for idx, (inputs, targets) in valid_pbar:
                inputs = {k: self._convert_if_not_tensor(v, dtype='infer') for k, v in inputs.items()}
                targets = self._convert_if_not_tensor(targets, dtype=torch.float)

                outputs = self._forward(inputs)
                valid_loss = self.valid_loss_fn(outputs, targets)
                # accumulate batch loss
                batch_loss = valid_loss.item()
                total_valid_loss += batch_loss

                valid_pbar.set_description('val_loss: {:.2f}'.format(batch_loss))

                valid_targets.extend(targets.detach().cpu().numpy().tolist())
                valid_preds.extend(outputs.detach().float().cpu().numpy().tolist())

        # tidy
        gc.collect()
        torch.cuda.empty_cache()

        n_batches = max(len(self.valid_loader), 1)
        return {'valid_preds': valid_preds, 'valid_targets': valid_targets, 'loss_history': total_valid_loss / n_batches}

    #---------------------------------------------

    def fit(self, epochs=10, save_dir='Models'):
        '''
        Complete training and validation process.

        After every epoch the validation MCRMSE is compared with the best one
        seen so far; on improvement a CPU copy of the weights is kept and also
        written to ``save_dir``. When training ends the best weights are
        loaded back into ``self.model``.

        Args:
            epochs (int): Number of epochs to run.
            save_dir (str): Directory (created if missing) for checkpoints.

        Returns:
            dict with ``best_valid_mcrmse``, ``best_model`` (``self.model``
            holding the best weights, or None if no epoch improved),
            ``train_history`` and ``valid_history`` (mean loss per epoch).
        '''
        best_loss = float('inf')
        best_state = None
        # total_loss for train/ valid across n epochs
        train_history, valid_history = [], []

        os.makedirs(save_dir, exist_ok=True)

        # Training for each epoch
        for epx in range(epochs):
            print('Epoch: {}/{}'.format(epx+1, epochs))

            train_outputs = self.train_one_epoch()
            train_history.append(train_outputs['loss_history'])

            train_mcrmse = MCRMSE(train_outputs['train_targets'], train_outputs['train_preds'])
            print('Training MCRMSE: {:.4f}'.format(train_mcrmse))

            valid_outputs = self.valid_one_epoch()
            valid_history.append(valid_outputs['loss_history'])

            valid_mcrmse = MCRMSE(valid_outputs['valid_targets'], valid_outputs['valid_preds'])
            print('Validation MCRMSE: {:.4f}'.format(valid_mcrmse))

            if valid_mcrmse < best_loss:
                best_loss = valid_mcrmse
                # Snapshot the weights: keeping only a reference to the model
                # would let later epochs overwrite the "best" parameters.
                best_state = {k: v.detach().cpu().clone() for k, v in self.model.state_dict().items()}
                path = os.path.join(save_dir, 'val_loss{:.4f}.pt'.format(best_loss))
                torch.save(best_state, path)
                print('Model saved! Validation Loss: {:.4f}'.format(best_loss))

        np.save('train_history', train_history)
        np.save('valid_history', valid_history)

        best_model = None
        if best_state is not None:
            self.model.load_state_dict(best_state)
            best_model = self.model

        return {'best_valid_mcrmse': best_loss, # best across all epochs
                'best_model': best_model,
                'train_history': train_history,
                'valid_history': valid_history}


# Training

In [None]:
# Choose the compute device: the first CUDA GPU when one is present, else the CPU.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if DEVICE.type == 'cuda':
    print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))
else:
    print("\n[INFO] GPU not found. Using CPU: {}\n".format(platform.processor()))

# Shuffled copy of the training frame with a fresh 0..n-1 index.
data = train.sample(frac=1).reset_index(drop=True)


[INFO] GPU not found. Using CPU: x86_64



In [None]:
# Essay text as a single-column DataFrame, and the six score columns as targets.
text = data[['full_text']]
labels = data[['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']]
# Filled by the training loop with one result per fold.
per_fold_predictions = {}

In [None]:
# Multilabel-stratified K-fold splitter (Config['SPLITS'] shuffled folds), used for
# training and cross-validation because each sample carries several regression targets.
kf = MultilabelStratifiedKFold(n_splits=Config['SPLITS'], shuffle=True)

In [None]:
for fold, (train_idx, valid_idx) in enumerate(kf.split(X=text, y=labels.values)):
    # fold (int): current index of the fold
    # train_idx (np.array): 1D array of row positions used for training in this fold
    # valid_idx (np.array): 1D array of row positions used for validation in this fold
    print('Fold: {}'.format(fold))

    # Fresh model for every fold: reusing one model would carry weights that
    # already saw this fold's validation rows, and the model is deleted at the
    # end of each iteration.
    model = FeedBackModel().to(DEVICE)

    # Partition. The splitter yields positions, so index with .iloc
    # (`data.loc(...)` called the indexer instead of indexing with it).
    train_data = data.iloc[train_idx].reset_index(drop=True)
    valid_data = data.iloc[valid_idx].reset_index(drop=True)

    # DataLoader customizable class
    train_set = FeedBackDataset(train_data)
    valid_set = FeedBackDataset(valid_data)

    # Load to DataLoader
    train_loader = DataLoader(
        train_set,
        batch_size = Config['TRAIN_BS'],
        shuffle = True,
        num_workers = Config['NUM_WORKERS']
    )
    # Validation order does not affect the metric, so no shuffling is needed.
    valid_loader = DataLoader(
        valid_set,
        batch_size = Config['VALID_BS'],
        shuffle = False,
        num_workers = Config['NUM_WORKERS']
    )

    # Training Loop
    optimizer = yield_optimizer(model)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer,
        T_0 = Config['T_0'],
        eta_min = Config['η_min']
    )
    # Define loss function
    train_loss_fn, valid_loss_fn = nn.SmoothL1Loss(), nn.SmoothL1Loss()

    # Trainer module. Pass the detected device explicitly so batches land on
    # the same device as the model (the Trainer default is 'cuda:0').
    trainer = Trainer(
        dataloaders = (train_loader, valid_loader),
        loss_fns = (train_loss_fn, valid_loss_fn),
        optimizer = optimizer,
        model = model,
        scheduler = scheduler,
        device = DEVICE
    )

    best_predictor = trainer.fit(
        epochs = Config['NB_EPOCHS']
    )
    # Move the model back to cpu
    model.cpu()

    per_fold_predictions['fold_{}'.format(fold)] = best_predictor

    del best_predictor, trainer, train_loss_fn, valid_loss_fn, model, optimizer, scheduler
    del train_data, valid_data, train_set, valid_set, train_loader, valid_loader, train_idx, valid_idx

    gc.collect()
    torch.cuda.empty_cache()


# Testing

In [28]:
best_mcrmse = float('inf')
best_model = None

# Compare best_valid_mcrmse from best_model across K-Folds.
# Lower MCRMSE is better, so keep the fold whose score is the smallest seen so
# far (the comparison used to be inverted, which left best_model as None).
for _, predictor in per_fold_predictions.items():
    if predictor['best_valid_mcrmse'] < best_mcrmse:
        best_mcrmse = predictor['best_valid_mcrmse']
        best_model = predictor['best_model']


fold: 0, train_idx: (3129,), valid_idx: (782,)
fold: 1, train_idx: (3129,), valid_idx: (782,)
fold: 2, train_idx: (3128,), valid_idx: (783,)
fold: 3, train_idx: (3129,), valid_idx: (782,)
fold: 4, train_idx: (3129,), valid_idx: (782,)
