In [1]:
! pip install transformers
! pip install datasets

Collecting transformers
  Downloading transformers-4.10.0-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 15.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 34.1 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 39.0 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 9.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 54.8 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: P

In [2]:
cd /content/drive/MyDrive/CAMemBERT2

/content/drive/MyDrive/CAMemBERT2


In [3]:
import numpy as np
from torch import nn
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification,Trainer
import copy
from trainer_utils import TrainingArguments,get_dataset,freeze_layers
from sklearn.metrics import cohen_kappa_score
from scipy.stats import spearmanr
import torch
import pdb
import pandas as pd

In [7]:
pretrained_model_name = 'distilroberta-base'
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name,num_labels=1)
for name,params in model.roberta.named_parameters():
    if 'classifier' not in name:
        params.requires_grad = False

loading configuration file https://huggingface.co/distilroberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/42d6b7c87cbac84fcdf35aa69504a5ccfca878fcee2a1a9b9ff7a3d1297f9094.aa95727ac70adfa1aaf5c88bea30a4f5e50869c68e68bce96ef1ec41b5facf46
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.10.0",
  "type_vocab_size": 1,
  "use_cache": true,


In [13]:
class R2Trainer(Trainer):

    def compute_loss(self,model,inputs,return_outputs=False):
        # get model output
        ouput = model(**inputs)
        # ModelForClassification automatically uses MSELoss when 1 label is provided
        loss_m = ouput.loss
        predictions = ouput.logits
        # caclulate the softmax for predictions 
        sm_pred_scores = F.softmax(predictions.flatten(),dim=0)
        # caclulate the softmax for labels 
        sm_gold_scores = F.softmax(inputs['labels'].float(),dim=0) 
        # compare the two distributions by cross entropy loss
        loss_r = torch.sum((-sm_gold_scores*torch.log(sm_pred_scores)))
        # weight the two losses in accordance with the number of epochs 
        loss = self.calc_dynamic_loss(loss_m,loss_r)
        return (loss,(loss,predictions)) if return_outputs else loss

    # helper function for calculating dynamic loss 
    def calc_dynamic_loss(self,loss_m,loss_r):
        if self.state.epoch==0:
            self.set_gamma()
            self.update_te()
            self.prev_epoch = 0
        elif self.state.epoch != self.prev_epoch:
            self.prev_epoch = copy.copy(self.state.epoch)
            self.update_te()
        return self.te*loss_m + (1-self.te)*loss_r

    def set_gamma(self):
        self.gamma = np.log((1/1e-6)-1)/((self.args.num_train_epochs/2)-1)

    def update_te(self):
        self.te = 1/(1+np.exp(self.gamma*((self.args.num_train_epochs/2)-self.state.epoch)))

# compute metrics
def compute_metrics(p):
    preds, labels = p
    preds,labels = np.rint(preds.flatten()*40),np.rint(labels*40)
    metrics_dic = {
            "rmse": np.sqrt(np.mean((preds-labels)**2)),
            "pearson": np.corrcoef(preds,labels)[0,1],
            "spearman" : spearmanr(preds, labels)[0],
            "kappa":cohen_kappa_score(np.rint(preds),np.rint(labels),weights='quadratic')
            }
    return metrics_dic

def normalise_scores(example):
    score = example['labels']/40
    return {'labels':score}

args = TrainingArguments(
    output_dir = '/',
    evaluation_strategy = "epoch",
    learning_rate = 4e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    remove_unused_columns=False,
    num_train_epochs =15,
    weight_decay = 0.01,
    )

# load aes dataset
dataset_obj,dataset_dict = get_dataset(['aes'],pretrained_model=pretrained_model_name,max_length=512)
# prepare for model
dataset_dict = dataset_dict.remove_columns(['dataset','labels']).rename_column('scores','labels')
# normalise scores
dataset_dict = dataset_dict.map(normalise_scores)
trainer = R2Trainer(
        model,
        args,
        train_dataset=dataset_dict['train'],
        eval_dataset=dataset_dict['dev'],
        compute_metrics = compute_metrics,
    )
trainer.train()
trainer.evaluate(dataset_dict['test'])

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/distilroberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/42d6b7c87cbac84fcdf35aa69504a5ccfca878fcee2a1a9b9ff7a3d1297f9094.aa95727ac70adfa1aaf5c88bea30a4f5e50869c68e68bce96ef1ec41b5facf46
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "i

  0%|          | 0/1057 [00:00<?, ?ex/s]

  0%|          | 0/97 [00:00<?, ?ex/s]

  0%|          | 0/79 [00:00<?, ?ex/s]

  0%|          | 0/1057 [00:00<?, ?ex/s]

  0%|          | 0/97 [00:00<?, ?ex/s]

  0%|          | 0/79 [00:00<?, ?ex/s]

Reusing dataset conll2003 (/root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)


  0%|          | 0/14041 [00:00<?, ?ex/s]

  0%|          | 0/3250 [00:00<?, ?ex/s]

  0%|          | 0/3453 [00:00<?, ?ex/s]

  0%|          | 0/1057 [00:00<?, ?ex/s]

  0%|          | 0/266 [00:00<?, ?ex/s]

  0%|          | 0/243 [00:00<?, ?ex/s]

  0%|          | 0/1057 [00:00<?, ?ex/s]

  0%|          | 0/266 [00:00<?, ?ex/s]

  0%|          | 0/243 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1057 [00:00<?, ?ex/s]

  0%|          | 0/97 [00:00<?, ?ex/s]

  0%|          | 0/79 [00:00<?, ?ex/s]

***** Running training *****
  Num examples = 1057
  Num Epochs = 15
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1005


Epoch,Training Loss,Validation Loss,Rmse,Pearson,Spearman,Kappa
1,No log,0.019249,5.550687,0.701647,0.690299,0.497106
2,No log,0.017438,5.298674,0.701999,0.692122,0.523908
3,No log,0.018982,5.52898,0.704476,0.695161,0.502958
4,No log,0.019102,5.52898,0.70958,0.701925,0.503719
5,No log,0.016762,5.208319,0.699527,0.699766,0.531031
6,No log,0.016883,5.182737,0.712353,0.704905,0.529059
7,No log,0.016886,5.196152,0.702337,0.702125,0.534577
8,2.619000,0.015408,4.996201,0.701427,0.695301,0.556895
9,2.619000,0.016704,5.139817,0.710781,0.709461,0.539454
10,2.619000,0.016329,5.100261,0.702954,0.706006,0.547166


***** Running Evaluation *****
  Num examples = 79
  Batch size = 16
***** Running Evaluation *****
  Num examples = 79
  Batch size = 16
***** Running Evaluation *****
  Num examples = 79
  Batch size = 16
***** Running Evaluation *****
  Num examples = 79
  Batch size = 16
***** Running Evaluation *****
  Num examples = 79
  Batch size = 16
***** Running Evaluation *****
  Num examples = 79
  Batch size = 16
***** Running Evaluation *****
  Num examples = 79
  Batch size = 16
Saving model checkpoint to /checkpoint-500
Configuration saved in /checkpoint-500/config.json
Model weights saved in /checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 79
  Batch size = 16
***** Running Evaluation *****
  Num examples = 79
  Batch size = 16
***** Running Evaluation *****
  Num examples = 79
  Batch size = 16
***** Running Evaluation *****
  Num examples = 79
  Batch size = 16
***** Running Evaluation *****
  Num examples = 79
  Batch size = 16
***** Running Evaluat

{'epoch': 15.0,
 'eval_kappa': 0.6131725138505717,
 'eval_loss': 0.013914179988205433,
 'eval_pearson': 0.7792636430107545,
 'eval_rmse': 4.734170436859131,
 'eval_runtime': 0.5751,
 'eval_samples_per_second': 168.663,
 'eval_spearman': 0.7827034917272796,
 'eval_steps_per_second': 12.172}

In [14]:
pd.DataFrame(trainer.state.log_history).to_csv('results/mse_bertbase_lr4e_5_bs16_norm_no_freeze.csv')