In [1]:
import os

import numpy as np
import pandas as pd
from skmultilearn.model_selection import IterativeStratification

In [2]:
# Load the EAT training data; later cells read its 'label', 'breakpoint' and 'story' columns.
eat = pd.read_json('eat_train.json')

The labels are balanced:

In [3]:
# Class counts for the 'label' column.
eat['label'].value_counts()

1    522
0    522
Name: label, dtype: int64

The breakpoints, however, are far from balanced:

In [4]:
# Relative frequency of each 'breakpoint' value (normalize=True yields proportions).
eat['breakpoint'].value_counts(normalize=True)

-1    0.500000
 4    0.250958
 3    0.104406
 2    0.083333
 1    0.056513
 5    0.004789
Name: breakpoint, dtype: float64

In [5]:
# Earlier single-target alternative, kept for reference (10 folds because the data is very small):
# kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# IterativeStratification is used instead; the split call below passes more than
# one target column ('label' and 'breakpoint') to stratify on.
kfold = IterativeStratification(n_splits=10, random_state=42)



In [6]:
# Build the folds once, persist them, and always reuse the saved assignment.
#
# IterativeStratification expects `y` to be a binary indicator matrix, so the raw
# breakpoint values (-1, 1..5) would all collapse into one "label present" flag
# and the breakpoint distribution would not be balanced across folds. One-hot
# encode both targets so every label/breakpoint value gets its own column.
stratify_targets = pd.get_dummies(
    eat[['label', 'breakpoint']], columns=['label', 'breakpoint']
).to_numpy(dtype=int)

# split() yields positional indices, so assign by position (not by index label)
# into an integer Series; -1 marks rows that have not been placed yet.
fold_ids = pd.Series(-1, index=eat.index, dtype=int)
for fold, (_, val_index) in enumerate(kfold.split(X=eat, y=stratify_targets)):
    fold_ids.iloc[val_index] = fold  # fold in which these rows are held out

assert (fold_ids >= 0).all(), "every row must be assigned to a fold"
eat['fold'] = fold_ids
eat.to_pickle('eat_with_folds.pkl')

In [None]:
# How many stories there are of each length (len() of every 'story' entry).
eat['story'].map(len).value_counts()

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# Chance-level baseline: score uniformly random guesses over classes 0..5
# (breakpoint -1 is mapped to class 0) with macro-averaged precision/recall/F1.
# A seeded generator keeps the baseline reproducible across kernel restarts.
baseline_rng = np.random.default_rng(42)
precision_recall_fscore_support(
    eat['breakpoint'].replace(-1, 0),
    baseline_rng.integers(0, 6, size=len(eat)),
    average='macro',
)

In [None]:
# Class counts after mapping breakpoint -1 to class 0.
eat['breakpoint'].replace(-1, 0).value_counts()

In [None]:
import numpy as np

In [None]:
# Start from an empty frame; the following cells add the columns that are later written to CSV.
eat_for_transformers = pd.DataFrame()

In [None]:
# Flatten each story into one string by joining its parts with single spaces.
eat_for_transformers['sentence1'] = eat['story'].map(' '.join)

In [None]:
# The target written out as 'label' is the breakpoint value, copied unchanged (including -1).
eat_for_transformers['label'] = eat['breakpoint']

In [None]:
# Carry over the fold assignment so per-fold splits can be cut from this frame.
eat_for_transformers['fold'] = eat['fold']

In [None]:
# Write one train/test CSV pair per fold: fold k is the held-out test set and
# all remaining folds form its training set.
os.makedirs('./eat_folded', exist_ok=True)  # to_csv does not create missing directories

# Iterate over the folds actually present instead of a hard-coded range(10),
# so the export stays correct if the number of splits changes.
for fold in sorted(eat_for_transformers['fold'].unique()):
    is_held_out = eat_for_transformers['fold'] == fold
    current_train = eat_for_transformers.loc[~is_held_out, ['sentence1', 'label']]
    current_test = eat_for_transformers.loc[is_held_out, ['sentence1', 'label']]

    current_train.to_csv(f'./eat_folded/eat_train_breakpoint_{fold}.csv', index=False)
    current_test.to_csv(f'./eat_folded/eat_test_breakpoint_{fold}.csv', index=False)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Single 90/10 hold-out split. The seed makes the split reproducible, and
# stratifying on 'label' (the breakpoint class) keeps the rare classes
# represented in both parts despite the strong class imbalance.
eat_train, eat_test = train_test_split(
    eat_for_transformers,
    test_size=0.1,
    random_state=42,
    stratify=eat_for_transformers['label'],
)

In [None]:
# Persist the hold-out split without the index; all columns of the frames are written.
eat_train.to_csv('eat_train.csv', index=False)
eat_test.to_csv('eat_test.csv', index=False)

In [None]:
# eat[eat['fold'] == 9]['breakpoint'].value_counts()

# Model!

In [None]:
# GLUE task identifiers (this list is not referenced again in the visible cells).
GLUE_TASKS = ["cola", "mnli", "mnli-mm", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"]

In [None]:
# Run configuration read by the cells below.
task = "rte"  # which GLUE task to load and fine-tune on
model_checkpoint = "roberta-large"  # pretrained checkpoint for both tokenizer and model
batch_size = 8  # per-device batch size for training and evaluation

In [None]:
from datasets import load_dataset, load_metric

In [None]:
# "mnli-mm" is loaded through the "mnli" dataset/metric configuration; only the
# validation split chosen later (validation_key) differs between the two.
actual_task = "mnli" if task == "mnli-mm" else task
dataset = load_dataset("glue", actual_task)
metric = load_metric('glue', actual_task)

In [None]:
# Display the loaded dataset object.
dataset

In [None]:
# Peek at a single training example (index 1).
dataset["train"][1]

In [None]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Render a random sample of rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            positions, and a ``features`` mapping of column name to feature type.
        num_examples: Number of distinct rows to show; must not exceed
            ``len(dataset)``.

    Raises:
        AssertionError: If ``num_examples`` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct positions directly, replacing the manual
    # draw-and-retry loop (which could spin forever if the assert is stripped).
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Show human-readable class names instead of integer class ids.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [None]:
# Show random training examples (num_examples defaults to 10).
show_random_elements(dataset["train"])

In [None]:
import numpy as np

# Sanity-check the metric object on 64 random binary predictions and labels.
fake_preds = np.random.randint(0, 2, size=(64,))
fake_labels = np.random.randint(0, 2, size=(64,))
metric.compute(predictions=fake_preds, references=fake_labels)

In [None]:
from transformers import AutoTokenizer
    
# Load the tokenizer for model_checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [None]:
# Example: encode a sentence pair to inspect what the tokenizer returns.
tokenizer("Hello, this one sentence!", "And this sentence goes with it.")

In [None]:
# Maps each GLUE task to the dataset column name(s) that hold its text input(s).
# The second entry is None for tasks with a single text column.
task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mnli-mm": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

In [None]:
# Resolve the text column(s) for the selected task and print them for the
# first training example as a quick check.
sentence1_key, sentence2_key = task_to_keys[task]
first_example = dataset['train'][0]
if sentence2_key is not None:
    print(f"Sentence 1: {first_example[sentence1_key]}")
    print(f"Sentence 2: {first_example[sentence2_key]}")
else:
    print(f"Sentence: {first_example[sentence1_key]}")

In [None]:
def preprocess_function(examples):
    """Tokenize the task's text column(s) from ``examples`` with truncation.

    Uses the notebook-level ``sentence1_key`` / ``sentence2_key`` to pick the
    columns; the second column is passed to the tokenizer only when the task
    has one.
    """
    text_columns = [examples[sentence1_key]]
    if sentence2_key is not None:
        text_columns.append(examples[sentence2_key])
    return tokenizer(*text_columns, truncation=True)

In [None]:
# Try the preprocessing on the first five training examples.
preprocess_function(dataset['train'][:5])

In [None]:
# Apply the tokenization to the whole dataset; batched=True hands
# preprocess_function batches of examples rather than single rows.
encoded_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Pick the output size for the task: 3 for the MNLI variants, 1 for STS-B,
# and 2 for everything else, then load the checkpoint with that setting.
if task.startswith("mnli"):
    num_labels = 3
elif task == "stsb":
    num_labels = 1
else:
    num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

In [None]:
# Metric used to select the best checkpoint: task-specific for STS-B and CoLA,
# accuracy for every other task.
metric_name = {"stsb": "pearson", "cola": "matthews_correlation"}.get(task, "accuracy")

# Training setup: evaluate once per epoch and reload the best model
# (according to metric_name) when training finishes.
args = TrainingArguments(
    "test-glue",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=20,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [None]:
def compute_metrics(eval_pred):
    """Turn raw model outputs into predictions and score them with ``metric``.

    Args:
        eval_pred: A ``(predictions, labels)`` pair.

    Returns:
        Whatever ``metric.compute`` returns for the decoded predictions.
    """
    raw_outputs, references = eval_pred
    if task == "stsb":
        # STS-B: take the first output column as the prediction.
        decoded = raw_outputs[:, 0]
    else:
        # All other tasks: the highest-scoring column is the predicted class.
        decoded = np.argmax(raw_outputs, axis=1)
    return metric.compute(predictions=decoded, references=references)

In [None]:
# The MNLI variants use dedicated validation splits; every other task uses "validation".
validation_key = {
    "mnli-mm": "validation_mismatched",
    "mnli": "validation_matched",
}.get(task, "validation")

# Wire model, training arguments, tokenized splits and metric function together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Evaluate on the eval_dataset that was passed to the Trainer.
trainer.evaluate()

Ideas:
1. Use classification with the optimized weights.

# Performance

In [21]:
# Hard-coded evaluation metrics, one dict per run (ten entries), pasted in so the
# summary statistics below can be computed without re-running training.
# NOTE(review): presumably one entry per cross-validation fold — confirm.
# NOTE(review): 'epoch' is 40.0 here although the TrainingArguments cell sets
# num_train_epochs=20, so these numbers likely come from a different run — confirm.
res = [{'eval_loss': 2.0373501777648926,
  'eval_accuracy': 0.7115384615384616,
  'eval_f1': 0.5514998807990646,
  'eval_precision': 0.7200458365164246,
  'eval_recall': 0.5672506738544475,
  'epoch': 40.0},
 {'eval_loss': 1.5406006574630737,
  'eval_accuracy': 0.75,
  'eval_f1': 0.7124412372847513,
  'eval_precision': 0.7448672911787666,
  'eval_recall': 0.6977142857142857,
  'epoch': 40.0},
 {'eval_loss': 1.8844071626663208,
  'eval_accuracy': 0.7115384615384616,
  'eval_f1': 0.6245996914962432,
  'eval_precision': 0.6476724137931035,
  'eval_recall': 0.6280820830356434,
  'epoch': 40.0},
 {'eval_loss': 1.6027127504348755,
  'eval_accuracy': 0.6923076923076923,
  'eval_f1': 0.5363919681573355,
  'eval_precision': 0.5607755662319834,
  'eval_recall': 0.573976948976949,
  'epoch': 40.0},
 {'eval_loss': 1.7633825540542603,
  'eval_accuracy': 0.7238095238095238,
  'eval_f1': 0.5526622228709516,
  'eval_precision': 0.5930845456462155,
  'eval_recall': 0.5464850882531954,
  'epoch': 40.0},
 {'eval_loss': 1.9273165464401245,
  'eval_accuracy': 0.7307692307692307,
  'eval_f1': 0.5422210395036482,
  'eval_precision': 0.5827683615819209,
  'eval_recall': 0.5182652492435101,
  'epoch': 40.0},
 {'eval_loss': 2.000366687774658,
  'eval_accuracy': 0.6571428571428571,
  'eval_f1': 0.5143304560566792,
  'eval_precision': 0.5412987012987013,
  'eval_recall': 0.5005772005772006,
  'epoch': 40.0},
 {'eval_loss': 2.160609722137451,
  'eval_accuracy': 0.6285714285714286,
  'eval_f1': 0.41717426092426096,
  'eval_precision': 0.4426544128623236,
  'eval_recall': 0.4519230769230769,
  'epoch': 40.0},
 {'eval_loss': 2.5876944065093994,
  'eval_accuracy': 0.6346153846153846,
  'eval_f1': 0.5251370851370851,
  'eval_precision': 0.5153571428571428,
  'eval_recall': 0.5546153846153845,
  'epoch': 40.0},
 {'eval_loss': 1.3912707567214966,
  'eval_accuracy': 0.7523809523809524,
  'eval_f1': 0.6682106782106783,
  'eval_precision': 0.6967159277504106,
  'eval_recall': 0.6548733874820831,
  'epoch': 40.0}]

In [22]:
import numpy as np

In [23]:
# Mean and standard deviation of 'eval_precision' over the entries in res.
precision_scores = [run_metrics['eval_precision'] for run_metrics in res]
print(f"Precision is {np.mean(precision_scores)}")
print(f"STD of precision is {np.std(precision_scores)}")

Precision is 0.6045240199716992
STD of precision is 0.0916900734381751


In [24]:
# Mean and standard deviation of 'eval_recall' over the entries in res.
recall_scores = [run_metrics['eval_recall'] for run_metrics in res]
print(f"Recall is {np.mean(recall_scores)}")
print(f"STD of recall is {np.std(recall_scores)}")

Recall is 0.5693763378675777
STD of recall is 0.07009654140159816


In [25]:
# Mean and standard deviation of 'eval_f1' over the entries in res.
f1_scores = [run_metrics['eval_f1'] for run_metrics in res]
print(f"F1 is {np.mean(f1_scores)}")
print(f"STD of f1 is {np.std(f1_scores)}")

F1 is 0.5644668520440697
STD of f1 is 0.07976297486678649
