In [1]:
# Load Dataset
import os
import sys
sys.path.append(os.path.abspath('../'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json


In [2]:
# Load Dataset
with open('/content/NoveltyDetectionResearch/.data/dlnd/TAP-DLND-1.0_LREC2018_modified/dlnd.jsonl','r') as f:
    data = f.readlines()
    dataset = [json.loads(line) for line in data]

In [3]:
from sklearn.model_selection import train_test_split

texts=[(i["source"],i["target_text"]) for i in dataset]
labels=[1 if i["DLA"]=='Novel' else 0 for i in dataset]

train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=.2)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)


In [4]:
from transformers import DistilBertTokenizerFast,BertTokenizerFast,LongformerTokenizerFast, RobertaTokenizerFast
# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
# tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

In [5]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [6]:
import torch

class DLNDDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [7]:
train_dataset = DLNDDataset(train_encodings, train_labels)
val_dataset = DLNDDataset(val_encodings, val_labels)
test_dataset = DLNDDataset(test_encodings, test_labels)

In [11]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import BertForSequenceClassification
from transformers import LongformerForSequenceClassification
from transformers import RobertaForSequenceClassification

import numpy as np

from sklearn.metrics import precision_recall_fscore_support,accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=None)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1_novel': f1[0],
        'P_novel': precision[0],
        'R_novel': recall[0],
        'f1_non_novel': f1[1],
        'P_non_novel': precision[1],
        'R_non_novel': recall[1],
    }

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=15,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,               # number of steps between logging
    evaluation_strategy="steps",     # evaluation strategy
    eval_steps=200,                  # number of steps between evaluations
    gradient_accumulation_steps=2,   # number of steps for gradient accumulation
)

# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
# model = LongformerForSequenceClassification.from_pretrained("allenai/longformer-base-4096")
model = RobertaForSequenceClassification.from_pretrained("roberta-base")


trainer = Trainer(
    model=model,                         # the instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,     # function to compute metrics
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_head

Step,Training Loss,Validation Loss,Accuracy,F1 Novel,P Novel,R Novel,F1 Non Novel,P Non Novel,R Non Novel
200,0.6274,0.562248,0.712644,0.641834,0.854962,0.513761,0.760077,0.651316,0.912442
400,0.501,0.399783,0.82069,0.81295,0.851759,0.777523,0.827815,0.794492,0.864055
600,0.3552,0.366205,0.847126,0.856526,0.808554,0.91055,0.836408,0.897098,0.78341
800,0.3004,0.353447,0.858621,0.852341,0.894207,0.81422,0.864388,0.828753,0.903226
1000,0.1995,0.400545,0.87931,0.87574,0.904645,0.848624,0.882682,0.856833,0.910138
1200,0.1746,0.541795,0.881609,0.8812,0.886311,0.876147,0.882016,0.876993,0.887097
1400,0.1347,0.498447,0.895402,0.897175,0.884187,0.91055,0.893567,0.907363,0.880184
1600,0.0815,0.494135,0.889655,0.890909,0.882883,0.899083,0.888372,0.896714,0.880184


***** Running Evaluation *****
  Num examples = 870
  Batch size = 16
***** Running Evaluation *****
  Num examples = 870
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 870
  Batch size = 16
***** Running Evaluation *****
  Num examples = 870
  Batch size = 16
***** Running Evaluation *****
  Num examples = 870
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 870
  Batch size = 16
***** Running Evaluation *****
  Num examples = 870
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/chec

In [9]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 870
  Batch size = 16


{'eval_loss': 0.4461168348789215,
 'eval_accuracy': 0.9057471264367816,
 'eval_f1_novel': 0.9072398190045249,
 'eval_P_novel': 0.8950892857142857,
 'eval_R_novel': 0.9197247706422018,
 'eval_f1_non_novel': 0.9042056074766356,
 'eval_P_non_novel': 0.9170616113744076,
 'eval_R_non_novel': 0.8917050691244239,
 'eval_runtime': 15.8479,
 'eval_samples_per_second': 54.897,
 'eval_steps_per_second': 3.47,
 'epoch': 6.0}

In [10]:
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 1087
  Batch size = 16


{'eval_loss': 0.594714879989624,
 'eval_accuracy': 0.8647654093836247,
 'eval_f1_novel': 0.8771929824561403,
 'eval_P_novel': 0.8578431372549019,
 'eval_R_novel': 0.8974358974358975,
 'eval_f1_non_novel': 0.8495394063459569,
 'eval_P_non_novel': 0.8736842105263158,
 'eval_R_non_novel': 0.8266932270916335,
 'eval_runtime': 19.7652,
 'eval_samples_per_second': 54.996,
 'eval_steps_per_second': 3.44,
 'epoch': 6.0}