In [1]:
import os
os.getcwd()

'/content'

In [2]:
os.listdir()

['.config', 'drive', 'sample_data']

In [3]:
!pip install sentencepiece
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
from tqdm.auto import tqdm
from pathlib import Path
from types import SimpleNamespace
import logging

from datasets import Dataset

import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from scipy.special import softmax
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss

# To work around the aggressive HuggingFace log spam.
logging.disable(logging.WARNING)

# From this Gist: https://gist.github.com/ihoromi4/b681a9088f348942b01711f251e5f964
def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

In [5]:
data_path_2021 = 'drive/MyDrive/Comps/FeedbackPrize/feedback-prize-2021'
data_path_2022 = 'drive/MyDrive/Comps/FeedbackPrize/feedback-prize-effectiveness'

train_2021_df = pd.read_csv(os.path.join(data_path_2021, 'train.csv'))
total_size_2021 = len(train_2021_df)
num_essays_2021 = train_2021_df.id.nunique()

train_df = pd.read_csv('drive/MyDrive/Comps/FeedbackPrize/output/train_folds.csv')
total_size_2022 = len(train_df)
num_essays_2022 = train_df.essay_id.nunique()

In [6]:
ids_2022 = set(train_df.essay_id.unique())
ids_2021 = set(train_2021_df.id.unique())

#number of new essays
print(len(ids_2021 - ids_2022))

11403


In [7]:
train_2021_df['in_2022'] = train_2021_df.id.apply(lambda x: x in ids_2022)
test_df = pd.read_csv(os.path.join(data_path_2022, 'test.csv'))

## Set Config Parameters

In [8]:
config = SimpleNamespace()

config.n_folds = 4
config.seed = 420
config.lr = 1e-5
config.weight_decay = 0.01
config.epochs = 4
config.batch_size = 16
config.warm_up_ratio = 0.1
config.max_len = 384
config.hidden_dropout_prob = 0.2
config.label_smoothing_factor = 0
config.config_path = 'drive/MyDrive/Comps/FeedbackPrize/backbone_config'
config.output_path = Path('drive/MyDrive/Comps/FeedbackPrize/output')
config.model_path = Path('drive/MyDrive/Comps/FeedbackPrize/output')
config.input_path = Path('drive/MyDrive/Comps/FeedbackPrize/feedback-prize-effectiveness')

In [9]:
transformers.logging.set_verbosity_error()
seed_everything(config.seed)

## Generate Topics

In [10]:
train_2021_df = train_2021_df.rename(columns={'id': 'essay_id'})
topic_pred_df = pd.read_csv('drive/MyDrive/Comps/FeedbackPrize/topics/topic_model_feedback.csv')
topic_pred_df = topic_pred_df.drop(columns={'prob'})
topic_pred_df = topic_pred_df.rename(columns={'id': 'essay_id'})

topic_meta_df = pd.read_csv('drive/MyDrive/Comps/FeedbackPrize/topics/topic_model_metadata.csv')
topic_meta_df = topic_meta_df.rename(columns={'Topic': 'topic', 'Name': 'topic_name'}).drop(columns=['Count'])
topic_meta_df.topic_name = topic_meta_df.topic_name.apply(lambda n: ' '.join(n.split('_')[1:]))

topic_pred_df = topic_pred_df.merge(topic_meta_df, on='topic', how='left')

In [11]:
topic_pred_df['essay_id'] = topic_pred_df.essay_id.apply(lambda x: x[6:])

In [12]:
train_2021_df = train_2021_df.merge(topic_pred_df, on='essay_id', how='left')

In [13]:
#train_2021_df

In [14]:
# topic_pred_df = pd.read_csv('drive/MyDrive/Comps/FeedbackPrize/topics/topic_model_feedback.csv')
# topic_pred_df = topic_pred_df.drop(columns={'prob'})
# topic_pred_df = topic_pred_df.rename(columns={'id': 'essay_id'})

# topic_meta_df = pd.read_csv('drive/MyDrive/Comps/FeedbackPrize/topics/topic_model_metadata.csv')
# topic_meta_df = topic_meta_df.rename(columns={'Topic': 'topic', 'Name': 'topic_name'}).drop(columns=['Count'])
# topic_meta_df.topic_name = topic_meta_df.topic_name.apply(lambda n: ' '.join(n.split('_')[1:]))

# topic_pred_df = topic_pred_df.merge(topic_meta_df, on='topic', how='left')
# topic_pred_df['essay_id'] = topic_pred_df.essay_id.apply(lambda x: x[6:])

# train_df = train_df.merge(topic_pred_df, on='essay_id', how='left')

## Prepare Data

In [15]:
labels = ['Adequate', 'Effective', 'Ineffective']
tokenizer = AutoTokenizer.from_pretrained(config.model_path / 'fold_0')

def tokenizer_func(x):
    return tokenizer(x["inputs"], get_essay(x['essay_fn']), truncation=True, return_overflowing_tokens=False)

def get_essay(essay_fns):
    essay_cache = {}

    output = []
    for essay_fn in essay_fns:
        if essay_fn not in essay_cache:
            essay_txt = open(essay_fn).read()
            essay_cache[essay_fn] = essay_txt
        output.append(essay_cache[essay_fn])

    return output

def add_inputs(df, basepath):
    df['essay_fn'] = basepath + '/' + df.essay_id + '.txt'
    df['inputs'] = df.discourse_type + ' ' + tokenizer.sep_token + ' ' + df.topic_name + ' ' + tokenizer.sep_token + ' ' + df.discourse_text
    return df

train_2021_df = add_inputs(train_2021_df, 'drive/MyDrive/Comps/FeedbackPrize/feedback-prize-2021/train')
train_df = add_inputs(train_df, str(config.input_path / 'train'))

In [16]:
import torch
from torch import nn
from transformers import AutoConfig, AutoModelForSequenceClassification
from transformers.models.deberta_v2.modeling_deberta_v2 import ContextPooler
from transformers.models.deberta_v2.modeling_deberta_v2 import StableDropout
from transformers.modeling_outputs import TokenClassifierOutput
from transformers import DebertaV2ForSequenceClassification

def get_dropouts(num, start_prob, increment):
    return [StableDropout(start_prob + (increment * i)) for i in range(num)]  

class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class CustomModel(nn.Module):
    def __init__(self, backbone):
        super(CustomModel, self).__init__()
        
        self.model = backbone
        self.config = self.model.config
        self.num_labels = self.config.num_labels

        # self.pooler = ContextPooler(self.config)
        self.pooler = MeanPooling()
        
        self.classifier = nn.Linear(self.config.hidden_size, self.num_labels)
    
        self.dropouts = get_dropouts(num=5, start_prob=config.hidden_dropout_prob - 0.02, increment=0.01)
    
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None
    ):
        outputs = self.model.deberta(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        
        encoder_layer = outputs[0]
        pooled_output = self.pooler(encoder_layer, attention_mask)
                      
        # Multi-sample dropout.
        num_dps = float(len(self.dropouts))
        for ii, drop in enumerate(self.dropouts):
            if ii == 0:
                logits = (self.classifier(drop(pooled_output)) / num_dps)
            else:
                logits += (self.classifier(drop(pooled_output)) / num_dps)

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            logits = logits.view(-1, self.num_labels)
            loss = loss_fn(logits, labels.view(-1))

        output = (logits,) + outputs[1:]

        return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)

In [17]:
def get_backbone_config():
    model_config = AutoConfig.from_pretrained(os.path.join(config.config_path, 'config.json'), num_labels=3)
    model_config.hidden_dropout_prob = config.hidden_dropout_prob
    return model_config

In [18]:
def get_model():
    model_config = AutoConfig.from_pretrained(os.path.join(config.config_path, 'config.json'))
    model = DebertaV2ForSequenceClassification(model_config)
    
    return CustomModel(model)

In [19]:
train_2021_df = train_2021_df.sample(n=100)
train_df = train_df.sample(n=100)

In [20]:
all_2021_data = np.zeros((config.n_folds, len(train_2021_df), len(labels)))
all_val_preds = []

for fold_num in range(config.n_folds):
    print(f'Do fold {fold_num}')

    tokenizer = AutoTokenizer.from_pretrained(config.model_path / f'fold_{fold_num}')
    tokenizer.model_max_length = config.max_len

    model = get_model()

    state_dict = torch.load(config.model_path / f'fold_{fold_num}/pytorch_model.bin')
    model.load_state_dict(state_dict)  

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='longest')

    args = TrainingArguments(
        output_dir=config.output_path,
        learning_rate=config.lr,
        lr_scheduler_type='cosine',
        fp16=True,
        evaluation_strategy='epoch',
        per_device_train_batch_size=config.batch_size,
        per_device_eval_batch_size=config.batch_size * 2,
        report_to="none",
        save_strategy='no',
        label_smoothing_factor=config.label_smoothing_factor
    )
    
    trainer = Trainer(
        model,
        args,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    # Make predictions on the OOF data (to verify my model works okay).
    val_df = train_df.query(f'fold == {fold_num}').reset_index(drop=True)
    val_dataset = Dataset.from_pandas(val_df[['inputs', 'essay_fn']])
    
    print('Predict on 2022 dataset')
    val_tok_dataset = val_dataset.map(tokenizer_func, batched=True, remove_columns=('inputs', 'essay_fn'))
    val_preds = trainer.predict(val_tok_dataset)
    val_preds_softmax = softmax(val_preds.predictions, axis=1)
    val_df[labels] = val_preds_softmax
    all_val_preds.append(val_df)
    
    # Make predictions on 2021 data
    print('Predict on 2021 dataset')
    val_dataset_2021 = Dataset.from_pandas(train_2021_df[['inputs', 'essay_fn']])
    val_tok_dataset_2021 = val_dataset_2021.map(tokenizer_func, batched=True, remove_columns=('inputs', 'essay_fn'))
    outputs_2021 = trainer.predict(val_tok_dataset_2021) 
    softmax_outputs_2021 = softmax(outputs_2021.predictions, axis=1)
    
    all_2021_data[fold_num] = softmax_outputs_2021

Do fold 0
Predict on 2022 dataset


  0%|          | 0/1 [00:00<?, ?ba/s]

Predict on 2021 dataset


  0%|          | 0/1 [00:00<?, ?ba/s]

Do fold 1
Predict on 2022 dataset


  0%|          | 0/1 [00:00<?, ?ba/s]

Predict on 2021 dataset


  0%|          | 0/1 [00:00<?, ?ba/s]

Do fold 2
Predict on 2022 dataset


  0%|          | 0/1 [00:00<?, ?ba/s]

Predict on 2021 dataset


  0%|          | 0/1 [00:00<?, ?ba/s]

Do fold 3
Predict on 2022 dataset


  0%|          | 0/1 [00:00<?, ?ba/s]

Predict on 2021 dataset


  0%|          | 0/1 [00:00<?, ?ba/s]

In [21]:
val_preds_df = pd.concat(all_val_preds)
log_loss(val_preds_df['discourse_effectiveness'], val_preds_df[labels])

0.6213690185546875

In [24]:
preds_2021 = np.mean(all_2021_data, axis=0)
train_2021_df = train_2021_df.rename(columns={'id': 'essay_id'})
train_2021_df_output = train_2021_df.drop(columns=['discourse_start', 'discourse_end', 'discourse_type_num', 'predictionstring', 'inputs'])
train_2021_df_output[labels] = preds_2021
train_2021_df_output['discourse_effectiveness'] = train_2021_df_output[labels].idxmax(axis=1)
train_2021_df_output.to_csv(os.path.join(config.output_path, 'train_2021_preds.csv'), index=False)

In [23]:
train_2021_df_output.head()

Unnamed: 0,essay_id,discourse_id,discourse_text,discourse_type,in_2022,topic,topic_name,essay_fn,Adequate,Effective,Ineffective,discourse_effectiveness
99386,F4DC1DBD3275,1622053000000.0,"In the electoral college system, even the majo...",Claim,False,15,electoral college electoral college vote,drive/MyDrive/Comps/FeedbackPrize/feedback-pri...,0.046257,0.952515,0.001269,Effective
5665,F773373F9AF3,1623084000000.0,Even though some people would say that it is v...,Counterclaim,False,13,driving phone phones cell,drive/MyDrive/Comps/FeedbackPrize/feedback-pri...,0.913696,0.002682,0.083504,Adequate
8732,DFB4C78A64F1,1622090000000.0,9When looking at the world from a birds eye vi...,Lead,False,5,car cars usage pollution,drive/MyDrive/Comps/FeedbackPrize/feedback-pri...,0.036198,0.94812,0.015279,Effective
3412,6973D20B45C0,1622809000000.0,It's good that they are cracking down on distr...,Concluding Statement,False,13,driving phone phones cell,drive/MyDrive/Comps/FeedbackPrize/feedback-pri...,0.777344,0.167755,0.054817,Adequate
108447,F654D80CEA0E,1620073000000.0,So I think you should go with Policy 1 because...,Claim,False,8,phones cell cell phones school,drive/MyDrive/Comps/FeedbackPrize/feedback-pri...,0.919434,0.056877,0.023487,Adequate
