In [3]:
import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
import numpy as np
from datasets import load_dataset, load_metric
from transformers import EvalPrediction

import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoModelForPreTraining,
    AutoTokenizer,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import is_main_process

logger = logging.getLogger(__name__)
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss, MSELoss
import pandas as pd
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import matplotlib.pyplot as plt
%matplotlib inline

# Custom dataset

In [None]:
# Load the EAT stories together with their cross-validation fold ids.
# NOTE: pickle files execute code on load -- only read files you created yourself.
eat_with_folds = pd.read_pickle('./eat_with_folds.pkl')

# Single-split setup for interactive experiments: fold 0 is held out for
# validation, every other fold is used for training.
fold = 0
print("fold n#{}".format(fold))
train = eat_with_folds.loc[eat_with_folds['fold'] != fold]
val = eat_with_folds.loc[eat_with_folds['fold'] == fold]

In [None]:
# # Loading a dataset from local csv files
# # datasets = load_dataset("csv", data_files={"train": 'eat_train.csv', "validation": 'eat_test.csv'})

# class EATDataset(Dataset):
#     """Custom EAT Dataset class"""

#     def __init__(self, df, tokenizer):
#         # Extracts the tokens and offsets(positions of A, B, and P)
#         self.tokens, self.y = [], []
# #         self.tokens = tokenizer(df['story'].apply(lambda x: ' '.join(x)).tolist(), padding='max_length',
# #                                         max_length=256, truncation=True)
#         self.y = df['breakpoint'].replace(-1, 0).values

#         for ix, row in df.iterrows():
# #             one_story = tokenizer(" ".join(row['story']), padding='max_length',
# #                                         max_length=256, truncation=True)
            
#             one_story = []
#             one_story_attentions = []
#             one_story_token_type_ids = []
            
#             for ix1 in range(1, 4):
#                 for ix2 in range(ix1, 5):
#                     if ix1 != ix2:
#                         sent = row['story'][:0]
#                         sent2 = row['story'][ix2]
#                         tokenized = tokenizer.encode_plus(sent, 
#                                                           sent2,
#                                         padding='max_length',
#                                         max_length=50, truncation=True)

#                         one_story.append(tokenized['input_ids'])
#                         one_story_attentions.append(tokenized['attention_mask'])

#             self.tokens.append(one_story)
    
# #         y = np.zeros(shape=(2,), dtype=bool)
# #         y[row['label']] = True
# #         self.y.append(y)

#     def __len__(self):
#         return len(self.y)

#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val) for key, val in self.tokens[idx].items()}
#         item['labels'] = torch.tensor(self.y[idx])
#         return item

In [None]:
# Loading a dataset from local csv files
# datasets = load_dataset("csv", data_files={"train": 'eat_train.csv', "validation": 'eat_test.csv'})

class EATDataset(Dataset):
    """Torch dataset that tokenizes EAT stories as sentence pairs.

    Each story (the ``story`` column holds a sequence of sentence strings) is
    split after its second sentence: the first two sentences, joined with a
    space, form the first segment and the remaining sentences form the second
    segment. Both segments are encoded together with ``tokenizer.encode_plus``
    and padded/truncated to ``max_length`` tokens.

    Labels are read from the ``breakpoint`` column with ``-1`` remapped to ``0``
    (presumably ``-1`` marks stories without a breakpoint -- confirm against
    the data).

    Args:
        df: DataFrame with a ``story`` and a ``breakpoint`` column.
        tokenizer: Object exposing ``encode_plus(text, text_pair, padding=...,
            max_length=..., truncation=...)`` and returning a mapping from
            feature name to a list of ints.
        max_length: Length every encoded pair is padded/truncated to.
            Defaults to 100, the value that used to be hard-coded.
    """

    def __init__(self, df, tokenizer, max_length=100):
        self.max_length = max_length
        # Fold -1 into class 0 so every label is a valid non-negative class id.
        self.y = df['breakpoint'].replace(-1, 0).values
        # One encoding per story, in the same row order as the labels.
        self.tokens = [
            tokenizer.encode_plus(
                " ".join(story[0:2]),
                " ".join(story[2:]),
                padding='max_length',
                max_length=max_length,
                truncation=True,
            )
            for story in df['story']
        ]

    def __len__(self):
        """Return the number of stories (one label per story)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the encoded features of story ``idx`` as tensors, plus ``labels``."""
        item = {key: torch.tensor(val) for key, val in self.tokens[idx].items()}
        item['labels'] = torch.tensor(self.y[idx])
        return item

In [None]:
# print(np.quantile([len(one_story['input_ids']) for one_story in self.tokens], 0.97))

In [None]:
# # Loading a dataset from local csv files
# # datasets = load_dataset("csv", data_files={"train": 'eat_train.csv', "validation": 'eat_test.csv'})

# class EATDataset(Dataset):
#     """Custom EAT Dataset class"""

#     def __init__(self, df, tokenizer):
#         # Extracts the tokens and offsets(positions of A, B, and P)
#         self.tokens, self.y = [], []
# #         self.tokens = tokenizer(df['story'].apply(lambda x: ' '.join(x)).tolist(), padding='max_length',
# #                                         max_length=256, truncation=True)
#         self.y = df['breakpoint'].replace(-1, 0).values
        
#         one_story_sentences = list()
#         for ix, row in df.iterrows():
#             for ix in range(5):
                
#                 first_sentence = row['story'][ix]
#                 rest = " ".join(row['story'][ix+1:])

#                 one_story = tokenizer.encode_plus(first_sentence, rest, padding='max_length',
#                                             max_length=256, truncation=True)
            
#                 one_story_sentences.append(one_story)
                
#             self.tokens.append(one_story_sentences)

#     def __len__(self):
#         return len(self.y)

#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val) for key, val in self.tokens[idx].items()}
#         item['labels'] = torch.tensor(self.y[idx])
#         return item

In [None]:
def compute_metrics(pred: EvalPrediction):
    """Compute accuracy and macro-averaged precision/recall/F1 for the Trainer.

    Args:
        pred: Evaluation output. ``label_ids`` holds the gold class ids;
            ``predictions`` holds the class scores, either as a single array
            or as a tuple whose first element is that array.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    # A model that returns extra outputs next to the logits (e.g. hidden
    # states) makes ``predictions`` a tuple; the logits come first.
    scores = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
    preds = scores.argmax(-1)
    # zero_division=0 yields the same numbers as the default ("warn") but
    # without a warning for every class that is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
# A useful fast method:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
model_name_or_path = 'roberta-large-mnli'

# Distinct class ids after folding -1 into 0, sorted in place so the label
# order is deterministic.
label_list = eat_with_folds['breakpoint'].replace(-1, 0).unique()
label_list.sort()
num_labels = len(label_list)

In [None]:
# Model configuration: the number of labels follows the label set, and hidden
# states are returned so the model can read an intermediate encoder layer.
config = AutoConfig.from_pretrained(model_name_or_path, num_labels=num_labels, output_hidden_states=True)

In [None]:
# Load the (fast) tokenizer that belongs to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

In [None]:
from transformers.modeling_albert import AlbertPreTrainedModel, AlbertModel
from torch import nn

In [None]:
from transformers.modeling_roberta import RobertaPreTrainedModel, RobertaModel, RobertaClassificationHead, SequenceClassifierOutput
from torch import nn

class RobertaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks.

    Layer sizes and dropout are taken from ``config`` (``hidden_size``,
    ``num_labels``, ``hidden_dropout_prob``) so the head always agrees with
    the encoder width and with the number of classes used by the loss.

    The output layer is stored as ``final``; keep that attribute name, since
    it is part of the state-dict keys of saved checkpoints.
    """

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.final = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        """Map per-token features to class logits.

        Args:
            features: Tensor indexed as ``[batch, token, hidden]``; only the
                first token of every sequence is used.

        Returns:
            Tensor of logits with ``config.num_labels`` entries per example.
        """
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.final(x)
        return x
    
    
class RobertaForSequenceClassification(RobertaPreTrainedModel):
    """RoBERTa encoder whose classification head reads an intermediate layer.

    The head is fed ``hidden_states[hidden_state_index]`` from the encoder
    rather than the encoder's final output.
    """

    _keys_to_ignore_on_load_missing = [r"position_ids"]
    # Index into the encoder's tuple of hidden states that feeds the head.
    # Override on a subclass or instance to probe a different layer.
    hidden_state_index = -7

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.classifier = RobertaClassificationHead(config)
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns a tuple ``(loss?, logits, ...)`` when ``return_dict`` is false, otherwise a
        ``SequenceClassifierOutput``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # Remember whether the caller wants the hidden states back. The encoder
        # itself is always asked for them, because the head needs one of them.
        return_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=True,
            return_dict=return_dict,
        )

        # Dict-style outputs drop empty fields, so positional indexing is only
        # reliable in tuple mode, where the hidden states sit at position 2
        # (relies on RobertaModel's tuple layout).
        all_hidden_states = outputs.hidden_states if return_dict else outputs[2]
        sequence_output = all_hidden_states[self.hidden_state_index]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            # Everything after the hidden states is passed through; the hidden
            # states themselves are not part of the tuple output.
            output = (logits,) + outputs[3:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states if return_hidden_states else None,
            attentions=outputs.attentions,
        )

In [None]:
# Build the custom classifier on top of the pretrained PyTorch checkpoint.
model = RobertaForSequenceClassification.from_pretrained(model_name_or_path, from_tf=False, config=config)

In [None]:
# model.classifier.out_proj = nn.Linear(in_features=1024, out_features=6, bias=True)
# model.init_weights()

In [None]:
# model = AutoModelForSequenceClassification.from_pretrained(
#     model_name_or_path,
#     from_tf=False,
#     config=config,
# )

In [None]:
# Tokenize both splits once, keyed by split name.
datasets = {
    'train': EATDataset(train, tokenizer),
    'val': EATDataset(val, tokenizer),
}

In [None]:
# Plain, unshuffled loader over the training split, used for manual inspection.
train_loader = DataLoader(
    datasets['train'], batch_size=16, shuffle=False,
    drop_last=False, num_workers=2, pin_memory=True,
)

In [None]:
# Encode one hand-written sentence pair as a smoke-test input for the model.
b = tokenizer.encode_plus(
    "I ate the apple",
    "I have thrown the apple away",
    padding='max_length',
    max_length=64,
    truncation=True,
)

In [None]:
# Pull a single batch to sanity-check the loader. It gets its own name so the
# single encoded example stored in `b` is not overwritten before the forward
# pass that consumes it (running the notebook top to bottom would otherwise
# feed a whole batch with an extra leading dimension into the model).
first_batch = next(iter(train_loader))

In [None]:
# weimerw

In [None]:
# Forward pass on the single encoded example, with a batch dimension of 1.
h = model(
    torch.tensor([b['input_ids']], dtype=torch.long),
    torch.tensor([b['attention_mask']], dtype=torch.long),
)

In [None]:
# Echo the model object so the notebook shows its repr.
model

In [None]:
# Echo the output of the smoke-test forward pass for inspection.
h

# Train

In [None]:
# non_label_column_names = [name for name in datasets["train"].column_names if name != "label"]
# sentence1_key, sentence2_key = non_label_column_names[0], None

In [None]:
# Padding configuration consumed by the padding-strategy cell below.
pad_to_max_length, max_seq_length = True, 256

In [None]:
# Padding strategy: either pad every example to max_seq_length up front, or
# leave padding to batch creation (dynamic, up to the longest item per batch).
padding, max_length = ("max_length", max_seq_length) if pad_to_max_length else (False, None)

In [None]:
# def preprocess_function(examples):
#     # Tokenize the texts
#     args = (
#         (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
#     )
#     result = tokenizer(*args, padding=padding, max_length=max_length, truncation=True)

#     # Map labels to IDs (not necessary for GLUE tasks)
#     if label_to_id is not None and "label" in examples:
#         result["label"] = [label_to_id[l] for l in examples["label"]]
#     return result

# datasets = preprocess_function() for i in datasets (, batched=True, load_from_cache_file=False)

# label_to_id = {v: i for i, v in enumerate(label_list)}

In [None]:
# Short aliases for the two splits handed to the Trainer.
train_dataset, eval_dataset = datasets["train"], datasets["val"]

In [None]:
# # Log a few random samples from the training set:
# for index in random.sample(range(len(train_dataset)), 3):
#     logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

# Own training

In [None]:
# for param in model.base_model.parameters():
#     param.requires_grad = False

In [None]:
# Hyper-parameters for the single-split training run.
# Per-fold alternative for the output directory: f'./roberta_fold{fold}_task2'
training_args = TrainingArguments(
    # Output / logging
    output_dir='./deleteme',
    overwrite_output_dir=True,
    logging_dir='runs/whataver',
    logging_first_step=False,
    logging_steps=100,
    # Schedule
    do_train=True,
    do_eval=True,
    evaluation_strategy='epoch',
    eval_steps=100,
    num_train_epochs=40.0,
    max_steps=-1,
    warmup_steps=0,
    # Batching
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    dataloader_num_workers=0,
    # Optimiser
    learning_rate=2e-07,
    weight_decay=0.0,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    max_grad_norm=1.0,
    # Model selection / reproducibility
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    seed=42,
)

In [None]:
####################### TRAIN ######################
# Wire model, data and metrics into a Trainer.
# The examples are already padded to a fixed length by the dataset, so the
# plain default collator is used instead of a padding collator.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Train from the freshly loaded weights (no checkpoint to resume from).
trainer.train()

# trainer.save_model()  # Saves the tokenizer too for easy upload

In [None]:
# F1 per encoder layer for both models, one record per layer.
to_picture = [
    {'layer': layer, 'bert': bert_f1, 'roberta': roberta_f1}
    for layer, bert_f1, roberta_f1 in (
        ('24', 0.37, 0.56),
        ('23', 0.3734, 0.69),
        ('22', 0.4, 0.71),
        ('21', 0.45, 0.7056),
        ('20', 0.38, 0.6815),
        ('19', 0.36, 0.5578),
        ('18', 0.34, 0.4897),
    )
]

In [None]:
# Turn the records into a frame indexed by layer, ordered by the layer label.
# Not idempotent: running this cell twice fails because 'layer' is then the index.
to_picture = pd.DataFrame(to_picture)
to_picture = to_picture.set_index('layer').sort_index()

In [None]:
with plt.style.context('ggplot'):
    f, ax = plt.subplots(1, 1, figsize=(16, 9), dpi=300)
    ax.plot(to_picture['roberta'], lw=1.5, color='tab:red')

    # Hide every tick mark but keep the tick labels on the bottom and left.
    ax.tick_params(axis="both", which="both", bottom=False, top=False,
                   labelbottom=True, left=False, right=False, labelleft=True)

    # Fade the frame around the axes.
    for side in ("top", "bottom", "right", "left"):
        ax.spines[side].set_alpha(.3)

    ax.set_title('Effect of layer choice on F1 score', fontsize=34)
    ax.set_xlabel('Layer', fontsize=22)
    ax.set_ylabel('F1 score', fontsize=22)
    plt.yticks(fontsize=22)
    plt.xticks(fontsize=22)

    # Written to disk at print resolution; the figure also renders inline.
    plt.savefig('layer_vs_f1_roberta.jpg', dpi=300)

In [None]:
# Scratch cell: constructs and displays an empty DataFrame.
pd.DataFrame()

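Accuracy and F1 by hidden-state index fed to the classifier (the F1 values match the `bert` column of `to_picture`):

- index -1: 0.548 acc, 0.37 F1
- index -2: 0.567 acc, 0.3734 F1
- index -3: 0.5769 acc, 0.4 F1
- index -4: 0.58 acc, 0.45 F1
- index -5: 0.54 acc, 0.38 F1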

In [None]:
# Evaluate the current trainer on the validation split and show the metrics.
trainer.evaluate(eval_dataset=eval_dataset)

# Full cross-validation

In [None]:
# Per-fold evaluation metrics, one dict per fold, in fold order.
res = []

# The tokenizer does not depend on the fold, so it is loaded once up front.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_fast=True,
)

# All folds are run: the summary cells below report mean and standard
# deviation over `res`, which is only meaningful with more than one fold.
# Shrink the range (e.g. range(0, 1)) for a quick smoke test.
for fold in range(0, 10):
    print("fold n#{}".format(fold))

    # Release the previous fold's model and trainer so they can be freed
    # before the next model is created.
    model = trainer = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Hold out the current fold for validation, train on all remaining folds.
    train = eat_with_folds[eat_with_folds['fold'] != fold]
    val = eat_with_folds[eat_with_folds['fold'] == fold]

    training_args = TrainingArguments(output_dir='tmp',
                                overwrite_output_dir=True,
                                do_train=True,
                                do_eval=True,
                                evaluation_strategy='epoch',
                                per_device_train_batch_size=8,
                                per_device_eval_batch_size=8,
                                gradient_accumulation_steps=1,
                                learning_rate=2e-06,
                                weight_decay=0.0,
                                adam_beta1=0.9,
                                adam_beta2=0.999,
                                adam_epsilon=1e-08,
                                max_grad_norm=1.0,
                                num_train_epochs=40.0,
                                max_steps=-1,
                                warmup_steps=0,
                                logging_dir='runs/whataver',
                                logging_first_step=False,
                                logging_steps=100,
                                load_best_model_at_end=True,
                                metric_for_best_model='f1',
                                seed=42,
                                eval_steps=100,
                                dataloader_num_workers=0)

    # Seed before the model is built so any weights that are not restored from
    # the checkpoint are initialised identically from run to run.
    set_seed(training_args.seed)

    # A fresh config per fold, because each model keeps a reference to its config.
    config = AutoConfig.from_pretrained(
        model_name_or_path,
        num_labels=num_labels,
        output_hidden_states = True
    )

    model = RobertaForSequenceClassification.from_pretrained(
        model_name_or_path,
        from_tf=False,
        config=config,
    )

    datasets = {}
    datasets['train'] = EATDataset(train, tokenizer)
    datasets['val'] = EATDataset(val, tokenizer)

    train_dataset = datasets["train"]
    eval_dataset = datasets["val"]

    ####################### TRAIN ######################
    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        # The dataset already pads to a fixed length, so the plain default
        # collator is used instead of a padding collator.
        data_collator=default_data_collator,
    )

    trainer.train()

    res.append(trainer.evaluate(eval_dataset=eval_dataset))
#     trainer.save_model(f'roberta_large_mnli_task_1_fold_{fold}')  # Saves the tokenizer too for easy upload

In [8]:
# Show the evaluation results collected in `res` so far.
res

NameError: name 'res' is not defined

In [None]:
# Mean and spread of validation accuracy over the entries of `res`.
print(f"Acc is {np.mean([fold_metrics['eval_accuracy'] for fold_metrics in res])}")
print(f"STD of acc is {np.std([fold_metrics['eval_accuracy'] for fold_metrics in res])}")

In [None]:
# Show the evaluation results collected in `res` so far.
res

# Load the best-performing checkpoints and re-evaluate

In [None]:
# Start from an empty result list so the summary cells below aggregate only the
# reloaded checkpoints, and so this cell also works on a fresh kernel.
res = []

# Evaluation-only arguments: nothing is trained in this cell.
eval_args = TrainingArguments(output_dir='tmp',
                              do_train=False,
                              do_eval=True,
                              per_device_eval_batch_size=8,
                              seed=42,
                              dataloader_num_workers=0)

for fold in range(0, 1):

    # Fine-tuned checkpoint for this fold, written by an earlier training run.
    # Kept in its own variable so the base checkpoint name in
    # `model_name_or_path` is not overwritten for the training cells.
    fold_model_dir = f'./roberta_fold{fold}_task2'

    print("fold n#{}".format(fold))
    train = eat_with_folds[eat_with_folds['fold'] != fold]
    val = eat_with_folds[eat_with_folds['fold'] == fold]

    config = AutoConfig.from_pretrained(
        fold_model_dir,
        num_labels=num_labels,
        # The custom forward() classifies from the encoder's hidden states,
        # so they have to be enabled.
        output_hidden_states=True,
    )

    # Load the tokenizer and the fine-tuned classifier saved for this fold.
    tokenizer = AutoTokenizer.from_pretrained(
        fold_model_dir,
        use_fast=True,
    )

    # The custom classifier class is required here: a bare encoder has no
    # classification head and returns neither a loss nor logits.
    model = RobertaForSequenceClassification.from_pretrained(
        fold_model_dir,
        from_tf=False,
        config=config,
    )

    datasets = {}
    datasets['train'] = EATDataset(train, tokenizer)
    datasets['val'] = EATDataset(val, tokenizer)

    train_dataset = datasets["train"]
    eval_dataset = datasets["val"]

    ####################### EVALUATE ######################
    trainer = Trainer(
        model=model,
        args=eval_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        # The dataset already pads to a fixed length, so the plain default
        # collator is used instead of a padding collator.
        data_collator=default_data_collator,
    )

    res.append(trainer.evaluate(eval_dataset=eval_dataset))

In [7]:
# Show the evaluation results collected in `res` so far.
res

NameError: name 'res' is not defined

In [None]:
# Mean and spread of macro precision over the entries of `res`.
print(f"Precision is {np.mean([fold_metrics['eval_precision'] for fold_metrics in res])}")
print(f"STD of precision is {np.std([fold_metrics['eval_precision'] for fold_metrics in res])}")

In [None]:
# Mean and spread of macro recall over the entries of `res`.
print(f"Recall is {np.mean([fold_metrics['eval_recall'] for fold_metrics in res])}")
print(f"STD of recall is {np.std([fold_metrics['eval_recall'] for fold_metrics in res])}")

In [None]:
# Mean and spread of macro F1 over the entries of `res`.
print(f"F1 is {np.mean([fold_metrics['eval_f1'] for fold_metrics in res])}")
print(f"STD of f1 is {np.std([fold_metrics['eval_f1'] for fold_metrics in res])}")

In [None]:
# Mean of `eval_f1` over the entries of `res`.
np.mean([i['eval_f1'] for i in res])

In [None]:
# Mean of `eval_f1` over the entries of `res` (same expression as the cell above).
np.mean([i['eval_f1'] for i in res])

# Label distribution (table for the figure)

In [6]:
# Share of examples per breakpoint label, ordered by label.
pd.DataFrame(
    {'% of examples': [0.5, 0.25, 0.1, 0.08, 0.05, 0.004]},
    index=pd.Index([-1, 4, 3, 2, 1, 5], name='Breakpoint'),
).sort_index()

Unnamed: 0_level_0,% of examples
Breakpoint,Unnamed: 1_level_1
-1,0.5
1,0.05
2,0.08
3,0.1
4,0.25
5,0.004


In [None]:
# Wrap `to_picture` in a DataFrame, rebinding the same name.
to_picture = pd.DataFrame(to_picture)