In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sun Aug 21 02:15:38 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import output
output.enable_custom_widget_manager()

In [3]:
!pip -V

pip 21.1.3 from /usr/local/lib/python3.7/dist-packages/pip (python 3.7)


In [4]:
!pip install --no-cache-dir transformers sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
import os

import logging
from types import SimpleNamespace
from pathlib import Path
from datetime import datetime
import math

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from transformers import TrainingArguments, Trainer
from tqdm import tqdm
from scipy.special import softmax
from IPython.core.display import display, HTML

from transformers import DataCollatorWithPadding
from datasets import Dataset, load_metric


# From this Gist: https://gist.github.com/ihoromi4/b681a9088f348942b01711f251e5f964
def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

In [7]:
import torch
from torch import nn
from transformers import AutoConfig, AutoModelForSequenceClassification
from transformers.models.deberta_v2.modeling_deberta_v2 import ContextPooler
from transformers.models.deberta_v2.modeling_deberta_v2 import StableDropout
from transformers.modeling_outputs import TokenClassifierOutput

In [8]:
device = 'cpu'
if torch.cuda.is_available:
    device = 'cuda'
print(device)

cuda


In [9]:
labels = ['Adequate', 'Effective', 'Ineffective']

In [10]:
base_path = 'drive/MyDrive/Comps/FeedbackPrize'

In [11]:
data_2021_path = 'drive/MyDrive/Comps/FeedbackPrize/feedback-prize-2021'
data_2022_path = 'drive/MyDrive/Comps/FeedbackPrize/feedback-prize-effectiveness'

train_df = pd.read_csv(os.path.join(data_2022_path, 'train.csv'))
test_df = pd.read_csv(os.path.join(data_2022_path, 'test.csv'))

## Using 2021 data predicting topics and join pred topics to 2022 data set

In [12]:
topic_pred_df = pd.read_csv('drive/MyDrive/Comps/FeedbackPrize/topics/topic_model_feedback.csv')
topic_pred_df = topic_pred_df.drop(columns={'prob'})
topic_pred_df = topic_pred_df.rename(columns={'id': 'essay_id'})

topic_meta_df = pd.read_csv('drive/MyDrive/Comps/FeedbackPrize/topics/topic_model_metadata.csv')
topic_meta_df = topic_meta_df.rename(columns={'Topic': 'topic', 'Name': 'topic_name'}).drop(columns=['Count'])
topic_meta_df.topic_name = topic_meta_df.topic_name.apply(lambda n: ' '.join(n.split('_')[1:]))

topic_pred_df = topic_pred_df.merge(topic_meta_df, on='topic', how='left')
topic_pred_df['essay_id'] = topic_pred_df.essay_id.apply(lambda x: x[6:])

train_df = train_df.merge(topic_pred_df, on='essay_id', how='left')

## Set Config

In [13]:
config = SimpleNamespace()

config.seed = 420
config.model_name = 'microsoft/deberta-v3-base'
config.output_path = Path('drive/MyDrive/Comps/FeedbackPrize/output')
config.input_path = Path('drive/MyDrive/Comps/FeedbackPrize/feedback-prize-effectiveness')

config.n_folds = 4
config.lr = 1e-5
config.weight_decay = 0.01
config.epochs = 4
config.batch_size = 16
config.gradient_accumulation_steps = 1
config.warm_up_ratio = 0.1
config.max_len = 384
config.hidden_dropout_prob = 0.1
config.label_smoothing_factor = 0.
config.eval_per_epoch = 2

logging.disable(logging.WARNING)

seed_everything(config.seed)

## Using CV

In [14]:
cv = StratifiedKFold(n_splits=config.n_folds, shuffle=True, random_state=config.seed)
train_df['fold'] = -1
for fold_num, (train_idxs, test_idxs) in enumerate(cv.split(train_df.index, train_df.discourse_effectiveness, train_df.essay_id)):
    train_df.loc[test_idxs, ['fold']] = fold_num

In [15]:
train_df.to_csv(config.output_path / 'train_folds.csv', index=False)

## Tokenizer

In [16]:
config.model_name

'microsoft/deberta-v3-base'

In [17]:
tokenizer = AutoTokenizer.from_pretrained(config.model_name, use_fast=True)
tokenizer.model_max_length = config.max_len

  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"


In [18]:
tokenizer

PreTrainedTokenizerFast(name_or_path='microsoft/deberta-v3-base', vocab_size=128000, model_max_len=384, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [19]:
def get_essay(essay_fns):
  essay_cache = {}

  output = []
  for essay_fn in essay_fns:
      if essay_fn not in essay_cache:
          essay_txt = open(essay_fn).read().strip().lower()
          essay_cache[essay_fn] = essay_txt
      output.append(essay_cache[essay_fn])

  return output

def tokenizer_func(x):
  return tokenizer(x["inputs"], get_essay(x['essay_fn']), truncation=True, max_length=config.max_len)

def add_inputs(df, basepath):
  df['essay_fn'] = basepath + '/' + df.essay_id + '.txt'
  df['inputs'] = df.discourse_type.str.lower() + ' ' + tokenizer.sep_token + ' ' + df.topic_name + ' ' + tokenizer.sep_token + ' ' + df.discourse_text.str.lower()
  return df

In [20]:
train_df = add_inputs(train_df, str(config.input_path / 'train'))

## Model

In [21]:
def get_dropouts(num, start_prob, increment):
    return [StableDropout(start_prob + (increment * i)) for i in range(num)]  

class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class CustomModel(nn.Module):
    def __init__(self, backbone):
        super(CustomModel, self).__init__()
        
        self.model = backbone
        self.config = self.model.config
        self.num_labels = self.config.num_labels

        # self.pooler = ContextPooler(self.config)
        self.pooler = MeanPooling()
        
        self.classifier = nn.Linear(self.config.hidden_size, self.num_labels)
    
        self.dropouts = get_dropouts(num=5, start_prob=config.hidden_dropout_prob - 0.02, increment=0.01)
    
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None
    ):
        outputs = self.model.deberta(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        
        encoder_layer = outputs[0]
        pooled_output = self.pooler(encoder_layer, attention_mask)
                      
        # Multi-sample dropout.
        num_dps = float(len(self.dropouts))
        for ii, drop in enumerate(self.dropouts):
            if ii == 0:
                logits = (self.classifier(drop(pooled_output)) / num_dps)
            else:
                logits += (self.classifier(drop(pooled_output)) / num_dps)

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            logits = logits.view(-1, self.num_labels)
            loss = loss_fn(logits, labels.view(-1))

        output = (logits,) + outputs[1:]

        return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)

In [22]:
def get_backbone_config():
    model_config = AutoConfig.from_pretrained(config.model_name, num_labels=3)
    model_config.hidden_dropout_prob = config.hidden_dropout_prob
    return model_config

In [23]:
def get_model():
    model_config = get_backbone_config()

    model = AutoModelForSequenceClassification.from_pretrained(
        config.model_name,
        config=model_config,
    )
    # print(model)
    return CustomModel(model)

In [24]:
backbone_config = get_backbone_config()
backbone_config.save_pretrained(os.path.join(base_path, 'backbone_config'))

In [25]:
model = get_model()

## Training

In [26]:
# add metric
metric = load_metric('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [27]:
# train_df = train_df.sample(n=150)
# config.epochs = 1

In [28]:
def do_fold(fold_num):
    train_data  = train_df.query(f'fold != {fold_num}').reset_index(drop=True)

    val_data  = train_df.query(f'fold == {fold_num}').reset_index(drop=True)
    
    # Add 2021 to train data.
    # train_data = pd.concat([train_data, train_2021_filt_df[['inputs', 'essay_fn', 'discourse_effectiveness']]]).sample(frac=1., random_state=config.seed).reset_index(drop=True)
    print(f'Train data size: {train_data.shape}')

    train_dataset = Dataset.from_pandas(train_data[['inputs', 'essay_fn', 'discourse_effectiveness']]).rename_column('discourse_effectiveness', 'label').class_encode_column("label")
    val_dataset = Dataset.from_pandas(val_data[['inputs', 'essay_fn', 'discourse_effectiveness']]).rename_column('discourse_effectiveness', 'label').class_encode_column("label")

    train_tok_dataset = train_dataset.map(tokenizer_func, batched=True, remove_columns=('inputs', 'essay_fn'))
    val_tok_dataset = val_dataset.map(tokenizer_func, batched=True, remove_columns=('inputs', 'essay_fn'))

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='longest')

    num_steps = len(train_data) / config.batch_size / config.gradient_accumulation_steps
    eval_steps = num_steps // config.eval_per_epoch
    print(f'Num steps: {num_steps}, eval steps: {eval_steps}')

    args = TrainingArguments(
        output_dir=config.output_path,
        learning_rate=config.lr,
        warmup_ratio=config.warm_up_ratio,
        lr_scheduler_type='cosine',
        fp16=True,
        per_device_train_batch_size=config.batch_size,
        per_device_eval_batch_size=config.batch_size * 2,
        num_train_epochs=config.epochs,
        weight_decay=config.weight_decay,
        # report_to="wandb",

        evaluation_strategy='steps',
        eval_steps=eval_steps, 
        save_strategy='steps',
        save_steps=eval_steps,
        
        load_best_model_at_end=True,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        label_smoothing_factor=config.label_smoothing_factor,
        save_total_limit=3  # Prevents running out of disk space.
    )

    model = get_model()

    trainer = Trainer(
        model,
        args,
        train_dataset=train_tok_dataset,
        eval_dataset=val_tok_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    
    trainer.save_model(config.output_path / f'fold_{fold_num}')
    
    outputs = trainer.predict(val_tok_dataset)

    val_data[labels] = softmax(outputs.predictions, axis=1)
    
    !rm -rf {config.output_path / 'checkpoint'}*
    
    return val_data

In [29]:
val_preds_df = pd.DataFrame()

val_data = do_fold(0)

val_preds_df = pd.concat([val_preds_df, val_data])

Train data size: (27573, 10)


Casting to class labels:   0%|          | 0/28 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/3 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/10 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

Num steps: 1723.3125, eval steps: 861.0




Step,Training Loss,Validation Loss,Accuracy
861,0.8439,0.708888,0.689513
1722,0.7116,0.695787,0.691362
2583,0.647,0.635075,0.721497
3444,0.6315,0.646345,0.715405
4305,0.549,0.610615,0.744887
5166,0.5272,0.580629,0.750979
6027,0.4679,0.595911,0.752502
6888,0.4645,0.600083,0.75272


In [30]:
for fold in range(1, config.n_folds):
    val_data = do_fold(fold)
    val_preds_df = pd.concat([val_preds_df, val_data])

Train data size: (27574, 10)


Casting to class labels:   0%|          | 0/28 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/3 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/10 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

Num steps: 1723.375, eval steps: 861.0




Step,Training Loss,Validation Loss,Accuracy
861,0.8613,0.727034,0.671309
1722,0.717,0.693855,0.692852
2583,0.6449,0.673546,0.708846
3444,0.6187,0.658143,0.712545
4305,0.5415,0.682671,0.718203
5166,0.5225,0.62637,0.738984
6027,0.47,0.670809,0.725057
6888,0.4723,0.652972,0.733435


Train data size: (27574, 10)


Casting to class labels:   0%|          | 0/28 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/3 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/10 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

Num steps: 1723.375, eval steps: 861.0




Step,Training Loss,Validation Loss,Accuracy
861,0.8479,0.732993,0.677293
1722,0.7386,0.763576,0.676314
2583,0.649,0.628702,0.727451
3444,0.6202,0.710415,0.693505
4305,0.5417,0.615064,0.746382
5166,0.5252,0.606988,0.744424
6027,0.4677,0.62933,0.741704
6888,0.4647,0.628013,0.742357


Train data size: (27574, 10)


Casting to class labels:   0%|          | 0/28 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/3 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/10 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

Num steps: 1723.375, eval steps: 861.0




Step,Training Loss,Validation Loss,Accuracy
861,0.8492,0.77425,0.651507
1722,0.7261,0.688508,0.695572
2583,0.6367,0.718843,0.692308
3444,0.6312,0.615051,0.735067
4305,0.5581,0.610693,0.738331
5166,0.5093,0.616982,0.743553
6027,0.4724,0.62929,0.731368
6888,0.4557,0.634217,0.737243


In [31]:
val_preds_df.drop(columns=['inputs']).to_csv(config.output_path / 'val_preds.csv', index=False)
val_preds_df = pd.read_csv(config.output_path / 'val_preds.csv')