# Deberta

In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import load_from_disk
import sklearn.metrics as metrics

In [2]:
model_id = "microsoft/deberta-v3-large"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=4)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['pooler.dense.bias', 'classifier.bias', 'pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
datasets = load_from_disk("/home/mlynatom/data/gug/gug_hf")
datasets["train"][0]

{'Id': 1,
 'sentence': 'If the teacher once entered in to the class she should be well preaperd of what she is going to explain.',
 'label': 'Comprehensible'}

In [5]:
rename_dict = {
    "Incomprehensible": 0,
    "Somewhat Comprehensible": 1,
    "Comprehensible": 2,
    "Perfect": 3,
}

def map_labels(sample):
    label = sample["label"]
    sample["label"] = rename_dict[label]
    return sample

datasets = datasets.map(map_labels)
datasets["train"][0]

Map:   0%|          | 0/1518 [00:00<?, ? examples/s]

Map:   0%|          | 0/747 [00:00<?, ? examples/s]

Map:   0%|          | 0/754 [00:00<?, ? examples/s]

{'Id': 1,
 'sentence': 'If the teacher once entered in to the class she should be well preaperd of what she is going to explain.',
 'label': 2}

In [8]:
def tokenize_function(example):
    return tokenizer(example["sentence"], truncation=True)

tokenized_datasets = datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

Map:   0%|          | 0/1518 [00:00<?, ? examples/s]

In [17]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(axis=1)
    accuracy = metrics.accuracy_score(labels, preds)
    f1 = metrics.f1_score(labels, preds, average="weighted")
    return {
        "accuracy": accuracy,
        "f1": f1
        }

In [12]:
learning_rate = 5e-5
batch_size = 9
num_epochs = 5
warmup_ratio = 0.1

In [13]:
%env WANDB_PROJECT=deberta_fluency

env: WANDB_PROJECT=deberta_fluency


In [14]:
training_args = TrainingArguments(
    output_dir="models/deberta_gug",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.03,
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=100,
    save_steps=100,
    load_best_model_at_end=True,
    fp16=False,
    report_to=["wandb"],
    warmup_ratio=warmup_ratio,
    metric_for_best_model="accuracy",
    overwrite_output_dir=True
)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
100,No log,0.914641,0.654618,0.596194
200,No log,0.858059,0.720214,0.701483
300,No log,0.997004,0.593039,0.567279
400,No log,1.022137,0.698795,0.669794
500,0.589300,0.935092,0.720214,0.705735
600,0.589300,1.319575,0.697456,0.681754
700,0.589300,1.506701,0.717537,0.690316
800,0.589300,1.482288,0.720214,0.704843


NameError: name 'wandb' is not defined

In [19]:
trainer.predict(test_dataset=tokenized_datasets["test"])

PredictionOutput(predictions=array([[-3.135937  , -1.3244065 ,  2.5553534 ,  1.1783088 ],
       [-0.24588235,  2.886966  ,  0.3460716 , -3.1959226 ],
       [-2.2057528 ,  0.9438204 ,  2.5106316 , -1.953611  ],
       ...,
       [-4.269916  , -2.211344  ,  2.3848379 ,  3.6043663 ],
       [-1.9142662 ,  1.498152  ,  2.3716965 , -2.7523026 ],
       [-2.167883  ,  0.7436087 ,  2.6391587 , -1.4938335 ]],
      dtype=float32), label_ids=array([1, 0, 2, 3, 1, 3, 1, 2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 1, 1, 1, 0, 2,
       3, 1, 3, 3, 1, 2, 3, 2, 2, 2, 2, 1, 2, 1, 3, 1, 3, 3, 2, 3, 0, 3,
       2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 3, 2, 1, 3, 2, 2, 2, 3, 2, 2, 2, 1,
       1, 1, 2, 1, 2, 2, 2, 3, 2, 2, 2, 2, 2, 3, 3, 3, 2, 2, 2, 2, 2, 2,
       2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 1, 3, 2, 1, 2, 2, 3, 3, 3, 1, 2, 1,
       3, 2, 3, 3, 2, 3, 2, 2, 2, 2, 3, 2, 1, 0, 2, 3, 2, 2, 3, 3, 2, 2,
       2, 1, 2, 2, 2, 3, 1, 1, 2, 2, 1, 2, 2, 3, 2, 3, 3, 2, 1, 1, 3, 0,
       1, 2, 1, 2, 2, 3, 3, 2, 3, 3, 2, 1, 2, 1, 