In [None]:
# !pip install transformers

In [None]:
import pandas as pd
import numpy as np
import sklearn

from category_encoders import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import (
    RobertaConfig, RobertaTokenizer, RobertaModel,
    get_linear_schedule_with_warmup
)

import time

import random
import os

In [None]:
transformers.__version__ # 2.11.0

In [None]:
sklearn.__version__ # 0.23.1

In [None]:
torch.__version__ # 1.5.0

# Recommendation - run on kaggle kernels as required packages come with its environment

# CUDA must be available in environment
> Because training was done with CUDA available

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
SEED = 23

random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True

### Import

In [2]:
df_train = pd.read_csv("data/Train.csv")
df_test = pd.read_csv("data/Test.csv")

sample_submission = pd.read_csv("data/SampleSubmission.csv")

### Convert texts to lowercase and remove duplicate texts

In [None]:
df_train.text = df_train.text.str.lower()
df_train = df_train.drop(df_train[df_train[["text", "label"]].duplicated()].index).reset_index(drop = True)

df_test.text = df_test.text.str.lower()

In [None]:
df_train.shape, df_test.shape

### Encode target labels

In [None]:
le = OrdinalEncoder(cols = ["label"], return_df = False, mapping = [{"col": "label", "mapping": {"Depression": 0, "Alcohol": 1, "Suicide": 2, "Drugs": 3}}])
df_train.label = le.fit_transform(df_train.label)[:,0]

### Obtain maximum word length of a sample in train and test

In [None]:
df_train['text'].apply(lambda x:len(str(x).split())).max(), df_test['text'].apply(lambda x:len(str(x).split())).max()

In [None]:
class Process_Data(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, task = "train"):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.task = task
        
    def __getitem__(self, index):
        text = self.data.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens = True,
            max_length = self.max_len,
            pad_to_max_length = True,
            return_token_type_ids = True
        )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        out = {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
        }
        
        # Set train data targets which is not applicable to test data
        if self.task == "train":
            out.update(
                {
                    'targets': torch.tensor(self.data.label[index], dtype = torch.long)
                }
            )
            
        return out
    
    def __len__(self):
        return self.len

In [None]:
def train_evaluate_predict(model, train_data, valid_data, test_data, loss_fn, lr, epochs, batch_size, warm_up_prop, device, n_samples_train, n_samples_val):
    
    train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True, num_workers = 0)
    valid_loader = DataLoader(valid_data, batch_size = batch_size, shuffle = False, num_workers = 0)
    test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = False, num_workers = 0)
    
    num_training_steps = epochs * len(train_loader)
    num_warmup_steps = int(warm_up_prop * num_training_steps)
    optimizer = transformers.AdamW(model.parameters(), lr = lr, weight_decay = 0.1)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)
    
    validation_loss = 0
    
    for epoch in range(epochs):
        # Train
        model.train()
        start_time = time.time()
        
        batch_losses_train = []
        n_correct = 0
        avg_loss = 0
        
        for batch_index, data in enumerate(train_loader, 0):
            
            input_ids = data["ids"].to(device, dtype = torch.long)
            attention_mask = data["mask"].to(device, dtype = torch.long)
            targets = data["targets"].to(device, dtype = torch.long)

            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, targets)
            
            batch_losses_train.append(loss.item())
            _, preds = torch.max(outputs, dim = 1)
            n_correct += torch.sum(preds == targets)
            
            loss.backward()            
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        epoch_loss = np.mean(batch_losses_train)
            
            
        # Evaluate
        model.eval()
        
        batch_losses_val = []
        n_correct_val = 0
        avg_val_loss = 0
        
        with torch.no_grad():
            for batch_index, data in enumerate(valid_loader, 0):
                input_ids = data['ids'].to(device, dtype = torch.long)
                attention_mask = data['mask'].to(device, dtype = torch.long)
                targets = data['targets'].to(device, dtype = torch.long)
                
                val_outputs = model(input_ids, attention_mask)
                loss = loss_fn(val_outputs, targets)
                
                batch_losses_val.append(loss.item())
                _, val_preds = torch.max(val_outputs, dim = 1)
                n_correct_val += torch.sum(val_preds == targets)
                
        epoch_loss_val = np.mean(batch_losses_val)
        
        if epoch == epochs - 1:
            # Store val_loss of last epoch to get final loss 
            validation_loss += epoch_loss_val
            
        
                
        dt = time.time() - start_time
        lr = scheduler.get_last_lr()[0]
        
        print(f'Epoch {epoch + 1}/{epochs} \t lr={lr:.1e} \t t={dt:.0f}s \t loss={epoch_loss:.4f}, acc={n_correct.double() / n_samples_train:.4f} \t val_loss={epoch_loss_val:.4f}, val_acc={n_correct_val.double() / n_samples_val:.4f}')

    # Predict on test set 
    batch_outputs_test = []
    
    with torch.no_grad():
        for batch_index, data in enumerate(test_loader, 0):
            ids = data["ids"].to(device, dtype = torch.long)
            mask = data["mask"].to(device, dtype = torch.long)

            test_outputs = model(ids, mask)
            test_outputs = F.softmax(test_outputs, dim = 1)
            test_outputs = test_outputs.cpu().detach().numpy()
            batch_outputs_test.append(test_outputs)
    
    return validation_loss, np.vstack(batch_outputs_test)

# Modelling

## Multiple (5) runs of 5 cv folds - to reduce variability - 25 total runs
> Each run with a different seed used in sampling data

## RoBERTa

In [None]:
class RoBERTaModel(torch.nn.Module):
    def __init__(self, freeze = False, n_layers = 12, n_attn_heads = 12):
        super(RoBERTaModel, self).__init__()
        
        self.config = RobertaConfig.from_pretrained("roberta-base")
        
        # Config
        self.config.num_hidden_layers = n_layers
        self.config.num_attention_heads = n_attn_heads
        self.config.output_hidden_states = False
        self.config.output_attentions = False
        
        # Roberta Model
        self.roberta = RobertaModel.from_pretrained("roberta-base", config = self.config)
        if freeze:
            for p in self.roberta.parameters():
                p.requires_grad = False
        
        # Dropout
        self.dropout = nn.Dropout(0.6)
        torch.manual_seed(SEED)
        
        # Classifier
        self.classifier = torch.nn.Linear(self.config.hidden_size, 4)
    
    def forward(self, ids, mask):
        sequence_outputs, pooled_output = self.roberta(ids, mask)
        
        # Output with Multi-sample dropout x5 - stacked
        output = torch.stack([self.dropout(pooled_output) for _ in range(5)], dim = 0)
        # Average dropouts
        output = torch.mean(output, dim = 0)
        torch.manual_seed(SEED)
        # Output logits
        logits = self.classifier(output)
        return logits

In [None]:
%%time

seeds = [16, 32, 42, 64, 128]

validation_losses_per_run = []
test_predictions_per_run = []

# 5 runs
for seed in seeds:
    run_count = seeds.index(seed) + 1
    print(f'Run {run_count}')

    # 5 cv folds
    kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = seed)
    splits = list(kfold.split(df_train, df_train.label))

    validation_losses_per_fold = []
    test_predicitons_per_fold = []  

    # Folds
    for i, (train_idx, valid_idx) in enumerate(splits):
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        # Use default RoBERTa params
        model = RoBERTaModel()
        model.to(device)
        loss_fn = torch.nn.CrossEntropyLoss().to(device)
        max_len = 35

        # Data split
        train_set = df_train.iloc[train_idx].reset_index(drop = True)
        valid_set = df_train.iloc[valid_idx].reset_index(drop = True)
        test_set = df_test.copy()

        # Process Data into format required by Transformer model
        train_set = Process_Data(train_set, tokenizer, max_len, "train")
        valid_set = Process_Data(valid_set, tokenizer, max_len, "train")
        test_set = Process_Data(test_set, tokenizer, max_len, "test")

        print(f'Fold {i + 1}')

        # Train, evaluate, predict
        validation_loss, test_prediciton = train_evaluate_predict(model, train_data = train_set, valid_data = valid_set, test_data = test_set, loss_fn = loss_fn, lr = 5e-5, epochs = 3, batch_size = 8, warm_up_prop = 0, device = device, n_samples_train = len(train_set), n_samples_val = len(valid_set))
        # Obtain validation result per fold
        validation_losses_per_fold.append(validation_loss)
        # Obtain test predictions per fold
        test_predicitons_per_fold.append(test_prediciton)

    # Obtain validation result per run
    validation_losses_per_run.append(np.mean(validation_losses_per_fold))
    # Obtain test predictions per run
    test_predictions_per_run.append(np.mean(test_predicitons_per_fold, axis = 0))
    
    # Print result per run
    print(f'Run {run_count} >>> Avg val_loss={np.mean(validation_losses_per_fold)}, S/Dev={np.std(validation_losses_per_fold)}')

print("=" * 100)
# Print summary validation result of all runs
print(f'Total avg val_loss={np.mean(validation_losses_per_run)}, S/Dev={np.std(validation_losses_per_run)}')    

In [None]:
# Total avg val_loss=0.38818032202124597, S/Dev=0.018028698830688163 - raw_data, 35max_len

## Aggregate test predictions

In [None]:
test_preds = np.mean(test_predictions_per_run, axis = 0)

In [None]:
sample_submission.Depression = test_preds[:,0]
sample_submission.Alcohol = test_preds[:,1]
sample_submission.Suicide = test_preds[:,2]
sample_submission.Drugs = test_preds[:,3]

In [None]:
sample_submission.head(10)

In [None]:
sample_submission.iloc[:,1:].idxmax(axis = 1).value_counts()

In [None]:
sample_submission.to_csv("submission.csv", index = False)