In [1]:
import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
import numpy as np
from datasets import load_dataset, load_metric
from transformers import EvalPrediction

import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoModelForPreTraining,
    AutoTokenizer,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
# from transformers.trainer_utils import is_main_process

logger = logging.getLogger(__name__)
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss, MSELoss
import pandas as pd
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer
from torch import nn
# from simpletransformers.t5 import T5Model
prefix = 'binary classification'

# Custom dataset

In [2]:
# Stories with their pre-assigned cross-validation fold ids.
eat_with_folds = pd.read_pickle('./eat_with_folds.pkl')

# Work on a single split: fold 0 is held out for validation and every other
# fold is used for training.
fold = 0
print("fold n#{}".format(fold))
held_out = eat_with_folds['fold'] == fold
train = eat_with_folds[~held_out]
val = eat_with_folds[held_out]

fold n#0


In [3]:
# Build one text-to-text record per training story: the first two story
# sentences go under "sentence1", the remaining ones under "sentence2", and
# the target is the label rendered as the string "True" or "False".
tokens = [
    {
        "input_text": "sentence1: " + " ".join(story[0:2]) + " sentence2: " + " ".join(story[2:]),
        'target_text': str(bool(label)),
        'prefix': prefix,
    }
    for story, label in zip(train['story'], train['label'])
]

In [4]:
# Collect the per-story training records into a DataFrame (one row per record).
true_train = pd.DataFrame(tokens)

In [None]:
# Build the validation frame from the held-out fold `val`. Reusing `tokens`
# here would make the validation set an exact copy of the training records,
# because `tokens` was filled from `train`.
true_val = pd.DataFrame(
    [
        {
            "input_text": "sentence1: " + " ".join(story[0:2]) + " sentence2: " + " ".join(story[2:]),
            'target_text': str(bool(label)),
            'prefix': prefix,
        }
        for story, label in zip(val['story'], val['label'])
    ],
    # Explicit columns keep the frame well-formed even if `val` is empty.
    columns=["input_text", "target_text", "prefix"],
)

In [None]:
# Aliases under the names used by the training call.
train_df, eval_df = true_train, true_val

In [None]:
# `T5Model` is otherwise only imported further down in this notebook, so it is
# imported here to keep a top-to-bottom run from failing with a NameError.
from simpletransformers.t5 import T5Model

# Each key appears exactly once; where the original dict repeated a key
# ('evaluation_strategy', 'num_train_epochs') the value that took effect
# (the last one) is the one kept.
model_args = {
    "max_seq_length": 100,
    "train_batch_size": 16,
    "eval_batch_size": 64,
    'evaluation_strategy': 'epoch',

    # Optimisation settings.
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 8,
    'gradient_accumulation_steps': 1,
    'learning_rate': 2e-07,
    'weight_decay': 0.0,
    'adam_beta1': 0.9,
    'adam_beta2': 0.999,
    'adam_epsilon': 1e-08,
    'max_grad_norm': 1.0,
    'num_train_epochs': 40.0,
    'max_steps': -1,
    'warmup_steps': 0,

    # Logging, evaluation cadence and model selection.
    'logging_dir': 'runs/whataver',
    'logging_first_step': False,
    'logging_steps': 100,
    'load_best_model_at_end': True,
    'metric_for_best_model': 'accuracy',
    'seed': 42,
    'eval_steps': 100,
    'dataloader_num_workers': 0,

    "evaluate_during_training": True,
    "evaluate_during_training_steps": 15000,
    "evaluate_during_training_verbose": True,

    "reprocess_input_data": True,
    "overwrite_output_dir": True,
}

model = T5Model("t5", "t5-base", args=model_args)

In [None]:
# Fine-tune on `train_df`, passing `eval_df` as the evaluation data.
model.train_model(train_df, eval_data=eval_df)

In [None]:
import json
from datetime import datetime
from pprint import pprint
from statistics import mean

import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr
from simpletransformers.t5 import T5Model
from sklearn.metrics import accuracy_score, f1_score
from transformers.data.metrics.squad_metrics import compute_exact, compute_f1


def f1(truths, preds):
    """Return the mean of ``compute_f1`` over position-wise (truth, pred) pairs."""
    scores = []
    for truth, pred in zip(truths, preds):
        scores.append(compute_f1(truth, pred))
    return mean(scores)


def exact(truths, preds):
    """Return the mean of ``compute_exact`` over position-wise (truth, pred) pairs."""
    scores = []
    for truth, pred in zip(truths, preds):
        scores.append(compute_exact(truth, pred))
    return mean(scores)


# Generation-time settings (sampling with top-k / top-p, three returned
# sequences per input).
# NOTE(review): the model-loading line that would consume this dict is
# commented out in this cell, so nothing visible here reads it — confirm.
model_args = dict(
    overwrite_output_dir=True,
    max_seq_length=100,
    eval_batch_size=32,
    num_train_epochs=1,
    use_multiprocessing=False,
    num_beams=None,
    do_sample=True,
    max_length=100,
    top_k=50,
    top_p=0.95,
    num_return_sequences=3,
)

# Load the trained model
# model = T5Model("t5", "outputs", args=model_args)

# Prepare the data for testing
# Ground-truth targets and task prefixes, one entry per validation row.
truth = true_val["target_text"].tolist()
tasks = true_val["prefix"].tolist()

# Prepend each row's task prefix to its input text, giving strings of the
# form "<prefix>: <input_text>".
to_predict = []
for task_prefix, story_text in zip(tasks, true_val["input_text"].tolist()):
    to_predict.append(task_prefix + ": " + str(story_text))

# Run the model on the prepared inputs.
preds = model.predict(to_predict)

In [None]:
def _as_binary(text):
    """Map a "True"/"False" string to 1/0 without evaluating it as code.

    The predictions are free-form model output, so they are compared as text
    instead of being passed to ``eval``. Anything that is not "true"
    (case-insensitive, surrounding whitespace ignored) counts as 0.
    """
    return int(str(text).strip().lower() == "true")


task_truth = [_as_binary(t) for t in truth]
task_preds = [_as_binary(p) for p in preds]
# Per-class precision, recall, F-score and support.
precision_recall_fscore_support(task_truth, task_preds)

In [5]:
# Loading a dataset from local csv files
# datasets = load_dataset("csv", data_files={"train": 'eat_train.csv', "validation": 'eat_test.csv'})

class EATDataset(Dataset):
    """Seq2seq dataset built from a DataFrame of stories and binary labels.

    Every row of ``df`` is turned into one tokenized source/target pair via
    ``tokenizer.prepare_seq2seq_batch``:

    * source: ``"<prefix>: sentence1: <first two story sentences> sentence2: <remaining sentences>"``
    * target: ``str(bool(label))``, i.e. ``"True"`` or ``"False"``

    Args:
        df: DataFrame with a ``story`` column (a sequence of sentence strings
            per row) and a ``label`` column.
        tokenizer: Object exposing ``prepare_seq2seq_batch``.
        prefix: Task prefix prepended to every source text.
        max_length: ``max_length`` forwarded to the tokenizer for the source.
        max_target_length: ``max_target_length`` forwarded to the tokenizer.

    NOTE(review): targets are padded with ``padding="max_length"`` and the
    padded label positions are not replaced by an ignore index anywhere in
    this class — confirm whether that is intended for the loss.
    """

    def __init__(self, df, tokenizer, prefix='binary classification', max_length=100, max_target_length=3):
        # Raw labels; only their count is used by this class (see __len__).
        self.y = df['label'].values
        # One tokenizer output (a batch of size one) per row of `df`.
        self.tokens = []

        for _, row in df.iterrows():
            first_sentence = " ".join(row['story'][0:2])
            rest = " ".join(row['story'][2:])
            input_text = "sentence1: " + first_sentence + " sentence2: " + rest
            target_text = str(bool(row['label']))

            batch = tokenizer.prepare_seq2seq_batch(
                src_texts=[prefix + ": " + input_text],
                tgt_texts=[target_text],
                max_length=max_length,
                max_target_length=max_target_length,
                padding="max_length",
                return_tensors="pt",
                truncation=True,
            )

            self.tokens.append(batch)

    def __len__(self):
        """Number of examples (rows of the source DataFrame)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict, dropping the leading batch axis of size one."""
        return {key: val[0] for key, val in self.tokens[idx].items()}

In [6]:
class CustomTrainer(Trainer):
    """``Trainer`` whose evaluation step keeps only the first model output as predictions.

    The model used in this notebook returns several outputs per forward pass;
    ``prediction_step`` is overridden so that only the first of them (after
    the loss, when labels are present) is handed on as the prediction.
    """

    def prediction_step(
        self,
        model: nn.Module,
        inputs: Dict[str, Union[torch.Tensor, Any]],
        prediction_loss_only: bool,
        ignore_keys: Optional[List[str]] = None,
    ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
        """
        Perform an evaluation step on :obj:`model` using :obj:`inputs`.

        Args:
            model (:obj:`nn.Module`):
                The model to evaluate.
            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary is unpacked before being fed to the model. Labels are looked up under the names
                listed in :obj:`self.label_names`.
            prediction_loss_only (:obj:`bool`):
                Whether or not to return the loss only.
            ignore_keys (:obj:`List[str]`, `optional`):
                A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                gathering predictions. Defaults to the model config's :obj:`keys_to_ignore_at_inference` when the
                model has a config, else to an empty list.

        Return:
            Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
            labels (each being optional).
        """
        has_labels = all(inputs.get(k) is not None for k in self.label_names)
        inputs = self._prepare_inputs(inputs)
        if ignore_keys is None:
            if hasattr(self.model, "config"):
                ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", [])
            else:
                ignore_keys = []

        # Mixed-precision evaluation: use torch's own autocast context when
        # fp16 is requested and this torch build provides it. Neither
        # `_use_native_amp` nor a bare `autocast` is defined in this notebook,
        # so referencing them raised NameError as soon as fp16 was enabled.
        use_autocast = bool(self.args.fp16) and hasattr(torch.cuda, "amp")

        with torch.no_grad():
            if use_autocast:
                with torch.cuda.amp.autocast():
                    outputs = model(**inputs)
            else:
                outputs = model(**inputs)

            if has_labels:
                if isinstance(outputs, dict):
                    loss = outputs["loss"].mean().detach()
                    logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"])
                else:
                    # Tuple output: the loss comes first, everything else follows.
                    loss = outputs[0].mean().detach()
                    logits = outputs[1:]
            else:
                loss = None
                if isinstance(outputs, dict):
                    logits = tuple(v for k, v in outputs.items() if k not in ignore_keys)
                else:
                    logits = outputs
            # TODO: this needs to be fixed and made cleaner later.
            if self.args.past_index >= 0:
                self._past = outputs[self.args.past_index if has_labels else self.args.past_index - 1]

        if prediction_loss_only:
            return (loss, None, None)

        # `logits` holds every remaining model output at this point; only the
        # first one is returned as the prediction.
        logits = logits[0]

        if has_labels:
            labels = tuple(inputs.get(name) for name in self.label_names)
            if len(labels) == 1:
                labels = labels[0]
        else:
            labels = None
        return (loss, logits, labels)

In [7]:
# A useful fast method:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
# Checkpoint name used for the config, tokenizer and model below.
model_name_or_path = 't5-large'

# Distinct labels as booleans, sorted so their order is deterministic.
label_list = np.sort(eat_with_folds['label'].astype(bool).unique())
num_labels = len(label_list)

In [9]:
# Load the configuration for `model_name_or_path`, overriding `use_cache` to False.
# NOTE(review): presumably this keeps cached key/value states out of the
# forward output that the custom prediction step slices — confirm.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    use_cache=False
)

In [10]:
# Load pretrained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_fast=True,
)

In [11]:
# Load the pretrained T5 conditional-generation model with the configuration
# built above. Note that this rebinds `model`, which earlier in the notebook
# referred to the simpletransformers model.
model = T5ForConditionalGeneration.from_pretrained(
    model_name_or_path,
    from_tf=False,
    config=config,
)

In [None]:
# model.classifier.out_proj = nn.Linear(in_features=1024, out_features=6, bias=True)
# model.init_weights()

In [None]:
# model = AutoModelForSequenceClassification.from_pretrained(
#     model_name_or_path,
#     from_tf=False,
#     config=config,
# )

In [12]:
# Tokenized training and validation splits for the current fold.
datasets = {
    'train': EATDataset(train, tokenizer),
    'val': EATDataset(val, tokenizer),
}

In [None]:
# Plain DataLoader over the training split, used by the manual inspection
# cells that follow (the Trainer further down is given the dataset itself,
# not this loader). Batches come in dataset order (shuffle=False) and the
# last, possibly smaller, batch is kept (drop_last=False).
train_loader = DataLoader(
    datasets['train'],
    batch_size=16,
    num_workers=2,
    pin_memory=True,
    shuffle=False,
    drop_last=False
)

In [None]:
# Pull the first batch out of the loader for manual inspection.
a = iter(train_loader)
b = next(a)

In [None]:
# weimerw

In [None]:
# Shape of the batched `input_ids` entry.
b['input_ids'].shape

In [None]:
# Shape of the batched `attention_mask` entry.
b['attention_mask'].shape

In [None]:
# Raw contents of the batched `input_ids` entry.
b['input_ids']

In [None]:
# Same shape check after casting the mask with `.long()`.
b['attention_mask'].long().shape

In [None]:
# Shape of the batched `labels` entry after casting with `.long()`.
b['labels'].long().shape

In [None]:
# Manual forward pass on the inspected batch. The batch `labels` are passed
# as `decoder_input_ids`, not as `labels`.
# NOTE(review): with no `labels=` argument, `h` presumably contains no loss — confirm.
h = model(b['input_ids'].long(), b['attention_mask'].long(), decoder_input_ids=b['labels'].long())

In [None]:
# Scratch inspection of the forward output `h`. `obj??` is IPython syntax
# that shows the source of `obj`; it only works inside an IPython kernel.
model.forward??

In [None]:
# Shape of the first element of the forward output.
h[0].shape

In [None]:
# Shape of the second element of the forward output.
h[1].shape

In [None]:
# Shape of the third element of the forward output.
h[2].shape

In [None]:
# Whether the loaded config asks the model to return attentions.
config.output_attentions

In [None]:
model.forward??

In [None]:
# Length of the third element of the forward output.
len(h[2])

In [None]:
# Number of elements in the forward output.
len(h)

In [None]:
h[1].shape

In [None]:
# Display the model's module structure.
model

In [None]:
# Display the full forward output.
h

# Train

In [None]:
# non_label_column_names = [name for name in datasets["train"].column_names if name != "label"]
# sentence1_key, sentence2_key = non_label_column_names[0], None

In [None]:
# Inputs to the padding-strategy cell that follows.
# NOTE(review): EATDataset passes its own lengths to the tokenizer and does
# not read these names — confirm whether they are still needed.
pad_to_max_length = True
max_seq_length = 256

In [None]:
# Padding strategy
if pad_to_max_length:
    padding = "max_length"
    max_length = max_seq_length
else:
    # We will pad later, dynamically at batch creation, to the max sequence length in each batch
    padding = False
    max_length = None

In [None]:
# def preprocess_function(examples):
#     # Tokenize the texts
#     args = (
#         (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
#     )
#     result = tokenizer(*args, padding=padding, max_length=max_length, truncation=True)

#     # Map labels to IDs (not necessary for GLUE tasks)
#     if label_to_id is not None and "label" in examples:
#         result["label"] = [label_to_id[l] for l in examples["label"]]
#     return result

# datasets = preprocess_function() for i in datasets (, batched=True, load_from_cache_file=False)

# label_to_id = {v: i for i, v in enumerate(label_list)}

In [13]:
# Short names for the two splits handed to the trainer.
train_dataset, eval_dataset = datasets["train"], datasets["val"]

In [14]:
# # Log a few random samples from the training set:
# for index in random.sample(range(len(train_dataset)), 3):
#     logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

# Own training

In [15]:
# for param in model.base_model.parameters():
#     param.requires_grad = False

In [16]:
# Training configuration for the single-fold run.
# - Evaluation is requested once per epoch (evaluation_strategy='epoch').
# - The best model according to the 'f1' metric reported by compute_metrics
#   is reloaded at the end (load_best_model_at_end / metric_for_best_model).
# NOTE(review): learning_rate=2e-07 is ten times smaller than the 2e-06 used
# in the cross-validation cell further down — confirm which one is intended.
# NOTE(review): eval_steps=100 is given although evaluation is per epoch;
# it presumably has no effect in that mode — confirm.
training_args = TrainingArguments(output_dir='./deleteme',#output_dir=f'./roberta_fold{fold}_task2',
                                overwrite_output_dir=True, 
                                do_train=True, 
                                do_eval=True,
                                evaluation_strategy='epoch',
                                per_device_train_batch_size=8,
                                per_device_eval_batch_size=8,
                                gradient_accumulation_steps=1,
                                learning_rate=2e-07,
                                weight_decay=0.0, 
                                adam_beta1=0.9, 
                                adam_beta2=0.999, 
                                adam_epsilon=1e-08, 
                                max_grad_norm=1.0, 
                                num_train_epochs=40.0,
                                max_steps=-1, 
                                warmup_steps=0,
                                logging_dir='runs/whataver', 
                                logging_first_step=False, 
                                logging_steps=100, 
                                load_best_model_at_end=True,
                                metric_for_best_model='f1',
                                seed=42, 
                                eval_steps=100,
                                dataloader_num_workers=0)

In [17]:
def compute_metrics(pred: EvalPrediction):
    """Decode labels and predictions to booleans and score them.

    Label token ids and the arg-max of the predicted token scores are decoded
    with the notebook-level ``tokenizer``. A decoded string counts as ``True``
    when it contains ``"tr"`` (case-insensitive) and as ``False`` otherwise,
    so partially generated answers such as "Tru" are still recognised.

    Args:
        pred: Evaluation output carrying ``label_ids`` and ``predictions``.

    Returns:
        dict with ``accuracy`` and macro-averaged ``f1``, ``precision`` and
        ``recall``.
    """

    def _decodes_to_true(token_ids):
        # Booleans are produced directly; no string needs to go through eval().
        text = tokenizer.decode(
            token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        return 'tr' in text.lower()

    labels = [_decodes_to_true(ids) for ids in pred.label_ids]
    # Most likely token at every target position for each example.
    preds = [_decodes_to_true(scores.argmax(-1)) for scores in pred.predictions]

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [18]:
####################### TRAIN ######################
# Initialize our Trainer
# CustomTrainer is used so that only the first model output reaches
# compute_metrics as `predictions`.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    # Examples are already padded to a fixed length by EATDataset, so the
    # plain default collator is used instead of a padding collator.
    data_collator=default_data_collator,
)

In [20]:
# Train from the already loaded weights. The explicit `model_path=None` is
# dropped: None is the default, and the keyword is no longer accepted by
# recent versions of `Trainer.train`.
trainer.train()

# trainer.save_model()  # Saves the tokenizer too for easy upload

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 15.75 GiB total capacity; 14.24 GiB already allocated; 19.56 MiB free; 14.63 GiB reserved in total by PyTorch)

-0.54807 acc, 0.37 f1
-1 0.567 acc, 0.3734 f1
-2 0.5769 acc, 0.4 f1
- 3 0.58 acc, 0.45 f1



54%, 0.38 f1

In [None]:
# Run evaluation on the validation split and display the returned metrics.
trainer.evaluate(eval_dataset=eval_dataset)

# Full cross-validation

In [None]:
# 10-fold cross-validation: each fold is held out once for validation while a
# fresh model is fine-tuned on the remaining folds. The pipeline mirrors the
# single-fold run (T5 conditional generation + CustomTrainer), because
# EATDataset yields seq2seq token labels and compute_metrics decodes token
# scores; a sequence-classification head with the stock Trainer does not fit
# either of them.
res = []
for fold in range(0, 10):
    print("fold n#{}".format(fold))
    train = eat_with_folds[eat_with_folds['fold'] != fold]
    val = eat_with_folds[eat_with_folds['fold'] == fold]

    # Same configuration as the single-fold run.
    config = AutoConfig.from_pretrained(
        model_name_or_path,
        use_cache=False
    )

    # Load pretrained model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path,
        use_fast=True,
    )

    # Start every fold from the pretrained weights.
    model = T5ForConditionalGeneration.from_pretrained(
        model_name_or_path,
        from_tf=False,
        config=config,
    )

    train_dataset = EATDataset(train, tokenizer)
    eval_dataset = EATDataset(val, tokenizer)
    datasets = {'train': train_dataset, 'val': eval_dataset}

    training_args = TrainingArguments(
        output_dir='tmp',
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        evaluation_strategy='epoch',
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=1,
        learning_rate=2e-06,
        weight_decay=0.0,
        adam_beta1=0.9,
        adam_beta2=0.999,
        adam_epsilon=1e-08,
        max_grad_norm=1.0,
        num_train_epochs=40.0,
        max_steps=-1,
        warmup_steps=0,
        logging_dir='runs/whataver',
        logging_first_step=False,
        logging_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        seed=42,
        eval_steps=100,
        dataloader_num_workers=0,
    )

    ####################### TRAIN ######################
    # CustomTrainer hands only the first model output to compute_metrics.
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        # Examples are already padded by EATDataset, so no padding collator is needed.
        data_collator=default_data_collator,
    )

    # `model_path=None` was the default and is not accepted by recent
    # versions of `Trainer.train`, so it is simply omitted.
    trainer.train()

    # One metrics dict per fold.
    res.append(trainer.evaluate(eval_dataset=eval_dataset))
    
#     trainer.save_model(f'roberta_large_mnli_task_1_fold_{fold}')  # Saves the tokenizer too for easy upload

In [None]:
# Mean and standard deviation of the accuracy over the collected fold results.
fold_accuracies = [fold_metrics['eval_accuracy'] for fold_metrics in res]
print(f"Acc is {np.mean(fold_accuracies)}")
print(f"STD of acc is {np.std(fold_accuracies)}")

In [None]:
# Display the metric dicts collected by the cross-validation loop.
res

# Load best performing

In [None]:
# Evaluate previously fine-tuned checkpoints without training again.
# Relies on `training_args`, `compute_metrics` and `res` defined by earlier cells.
for fold in range(0, 1):

    # Directory of the fine-tuned checkpoint for this fold. A separate name is
    # used so the notebook-level `model_name_or_path` keeps pointing at the
    # base model if earlier cells are re-run afterwards.
    checkpoint_path = f'./roberta_fold{fold}_task2'

    print("fold n#{}".format(fold))
    train = eat_with_folds[eat_with_folds['fold'] != fold]
    val = eat_with_folds[eat_with_folds['fold'] == fold]

    # Same configuration override as the training run.
    config = AutoConfig.from_pretrained(
        checkpoint_path,
        use_cache=False
    )

    # Load the tokenizer stored with the checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(
        checkpoint_path,
        use_fast=True,
    )

    # The conditional-generation class is required here: `AutoModel` is not
    # imported in this notebook, and a bare encoder-decoder without the
    # language-model head would yield neither a loss nor token scores for
    # compute_metrics.
    model = T5ForConditionalGeneration.from_pretrained(
        checkpoint_path,
        from_tf=False,
        config=config,
    )

    train_dataset = EATDataset(train, tokenizer)
    eval_dataset = EATDataset(val, tokenizer)
    datasets = {'train': train_dataset, 'val': eval_dataset}

    # CustomTrainer hands only the first model output to compute_metrics.
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        # Examples are already padded by EATDataset, so no padding collator is needed.
        data_collator=default_data_collator,
    )

    # Appended to the same `res` list the summary cells below read from.
    res.append(trainer.evaluate(eval_dataset=eval_dataset))

In [None]:
# Display the collected metric dicts, including the entries just appended.
res

In [None]:
# Mean and standard deviation of the precision over the collected results.
fold_precisions = [fold_metrics['eval_precision'] for fold_metrics in res]
print(f"Precision is {np.mean(fold_precisions)}")
print(f"STD of precision is {np.std(fold_precisions)}")

In [None]:
# Mean and standard deviation of the recall over the collected results.
fold_recalls = [fold_metrics['eval_recall'] for fold_metrics in res]
print(f"Recall is {np.mean(fold_recalls)}")
print(f"STD of recall is {np.std(fold_recalls)}")

In [None]:
# Mean and standard deviation of the F1 score over the collected results.
fold_f1_scores = [fold_metrics['eval_f1'] for fold_metrics in res]
print(f"F1 is {np.mean(fold_f1_scores)}")
print(f"STD of f1 is {np.std(fold_f1_scores)}")

In [None]:
# Mean of the 'eval_f1' entries in `res`, shown as the cell output.
np.mean([i['eval_f1'] for i in res])

In [None]:
# Same computation as the previous cell (mean of the 'eval_f1' entries in `res`).
np.mean([i['eval_f1'] for i in res])