In [None]:
from transformers import AutoTokenizer, BertForSequenceClassification, get_scheduler
from datasets import load_dataset

from sklearn.metrics import *
from sklearn.model_selection import train_test_split

import evaluate, torch, os

from torch.optim import AdamW
from torch.utils.data import DataLoader

from tqdm.auto import tqdm

import pandas as pd
import numpy as np

In [None]:
# Name of the dataset column whose text is passed to the tokenizer
# (it is dropped from the dataset once tokenization is done).
column = "topics"

In [None]:
# Physical GPU this notebook is allowed to use.
gpu = 0

# Enumerate devices in PCI bus order, then hide every GPU except the chosen one.
# These variables must be set before the first CUDA call in this process.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

# Once CUDA_VISIBLE_DEVICES masks the other GPUs, the single visible device is
# always re-indexed as 0. Selecting `gpu` here would therefore fail with an
# invalid device ordinal for any gpu != 0. The guard keeps the notebook usable
# on CPU-only machines, matching the CPU fallback used when the model is moved
# to its device.
if torch.cuda.is_available():
    torch.cuda.set_device(0)

# Per-epoch metrics collected by the training loop.
training_stats = []

In [None]:
# --- SciBERT fine-tuning hyper-parameters ---
num_epochs = 5    # number of passes over the training set
l_r = 5e-5        # learning rate given to the optimizer
wu = 50           # warm-up steps of the learning-rate scheduler
len_batch = 8     # batch size shared by all DataLoaders

# --- Relation class name -> integer id ---
label2id = dict(supertopic=0, subtopic=1, same_as=2, other=3)
# Class ids in ascending order, used when computing per-class metrics.
labels_id = sorted(label2id.values())

In [None]:
def tokenize_function(examples):
    """Tokenize the text column of a batch of examples.

    Reads ``examples[column]`` and encodes it with the module-level
    ``tokenizer``, padding to ``max_length=512`` and truncating longer inputs.
    Returns whatever the tokenizer call returns.
    """
    texts = examples[column]
    encoding_options = dict(padding="max_length", max_length=512, truncation=True)
    return tokenizer(texts, **encoding_options)

In [None]:
# Load the tokenizer of the SciBERT checkpoint.
# `output_hidden_states` is a model-config option with no effect on tokenization,
# so it is intentionally not passed to the tokenizer.
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

# Load the three CSV splits; the result is indexed by split name.
dataset = load_dataset(
    'csv',
    data_files={
        'train': '../dataset/LM_dataset/training_set.csv',
        'validation': '../dataset/LM_dataset/validation_set.csv',
        'test': '../dataset/LM_dataset/testing_set.csv',
    },
)

In [None]:
# Tokenize every split, then keep only what the model consumes: drop the raw
# text and the subject/object columns, and rename the target column to
# "labels" so it can be passed to the model as a keyword argument.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns([column, 'subject', 'object'])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

# Return torch tensors when the datasets are indexed.
tokenized_datasets.set_format("torch")

train_dataset = tokenized_datasets["train"]
validation_dataset = tokenized_datasets["validation"]
test_dataset = tokenized_datasets["test"]

# Training batches are reshuffled every epoch so the optimizer does not see the
# examples in CSV file order each time. A seeded generator keeps the shuffling
# reproducible across runs.
shuffle_seed = 42
shuffle_generator = torch.Generator().manual_seed(shuffle_seed)
train_dataloader = DataLoader(train_dataset, shuffle=True, generator=shuffle_generator, batch_size=len_batch)

# Evaluation order does not influence the metrics, so these stay unshuffled.
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=len_batch)
eval_dataloader = DataLoader(validation_dataset, shuffle=False, batch_size=len_batch)

In [None]:
# Start from an empty log so that re-running this cell does not append
# duplicate epoch rows to the statistics.
training_stats = []

# SciBERT checkpoint loaded as a sequence classifier with one output per entry of label2id.
model = BertForSequenceClassification.from_pretrained(
    'allenai/scibert_scivocab_uncased',
    num_labels=len(label2id),
    label2id=label2id,
)
# The run name encodes the hyper-parameters; it is used for the checkpoint and stats file names.
model_name = f"ep{num_epochs}_optAdam_{l_r}_wu{wu}_batch{len_batch}"

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=l_r)

# The scheduler is stepped once per batch, so its horizon is the total number of batches.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=wu, num_training_steps=num_training_steps)

for epoch in range(num_epochs):
    print("Start training epoch (" + str(epoch + 1) + "/" + str(num_epochs) + ")")

    # One bar per epoch covering both the training and the validation batches.
    progress_bar = tqdm(total=len(train_dataloader) + len(eval_dataloader))

    # ---- Training ----
    model.train()
    training_loss = 0.0   # summed over the whole epoch
    running_loss = 0.0    # summed over the current 100-batch window

    for step, batch in enumerate(train_dataloader, start=1):
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}

        # The batch contains "labels", so the model output carries the loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()

        batch_loss = loss.item()
        training_loss += batch_loss
        running_loss += batch_loss

        progress_bar.update(1)

        if step % 100 == 0:  # print every 100 batches
            print(f'[{epoch + 1}, {step:5d}] training loss: {running_loss / 100:.3f}')
            running_loss = 0.0

    # ---- Validation ----
    model.eval()
    validation_loss = 0.0
    running_loss = 0.0
    correct_val = 0   # correctly classified validation examples
    seen_val = 0      # validation examples evaluated so far

    # No gradients are needed for validation; skipping them avoids building
    # autograd graphs and saves memory.
    with torch.no_grad():
        for step, batch in enumerate(eval_dataloader, start=1):
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch)
            batch_loss = outputs.loss.item()

            validation_loss += batch_loss
            running_loss += batch_loss

            progress_bar.update(1)

            # Count hits per example rather than averaging per-batch accuracies,
            # so a smaller final batch is not over-weighted.
            predicted = torch.argmax(outputs.logits, dim=-1)
            correct_val += (predicted == batch["labels"]).sum().item()
            seen_val += batch["labels"].size(0)

            if step % 50 == 0:  # print every 50 batches
                print(f'[{epoch + 1}, {step:5d}] evaluation loss: {running_loss / 50:.3f}')
                running_loss = 0.0

    progress_bar.close()

    # ---- Epoch summary ----
    # Losses are averaged over batches; accuracy is computed over examples.
    training_loss_avg = training_loss / len(train_dataloader)
    validation_loss_avg = validation_loss / len(eval_dataloader)
    avg_val_accuracy = correct_val / seen_val

    print("Epoch: {}".format(epoch+1))
    print("Training loss: {}".format(training_loss_avg))
    print("Validation loss: {}".format(validation_loss_avg))
    print("Validation accuracy: {}".format(avg_val_accuracy))

    training_stats.append(
        {
            'epoch': epoch + 1,
            'Training Loss': training_loss_avg,
            'Valid. Loss': validation_loss_avg,
            'Valid. Accur.': avg_val_accuracy
        }
    )

# Save the final weights and optimizer state in a single checkpoint file.
torch.save({
    'epoch': num_epochs,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict()
}, model_name + '.bin')



In [None]:
# Turn the per-epoch metric records into a table indexed by epoch number
# and write it next to the notebook as a CSV named after the run.
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')
stats_path = f"Training_stats_{model_name}.csv"
df_stats.to_csv(stats_path)

In [None]:

# Evaluate the fine-tuned model on the test split: overall accuracy plus
# per-class precision and recall.
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

# Put the model in inference mode explicitly so this cell gives the same
# result even when it is run without the training cell's final validation pass.
model.eval()

for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class = index of the largest logit.
    predictions = torch.argmax(outputs.logits, dim=-1)
    for metric in (accuracy, precision, recall):
        metric.add_batch(predictions=predictions, references=batch["labels"])

acc_ = accuracy.compute()
# The label list is passed to compute() so that the per-class arrays always
# have one entry per class, in labels_id order, even if a class never occurs
# in the test split.
prec_ = precision.compute(labels=labels_id, average=None)
rec_ = recall.compute(labels=labels_id, average=None)

# Layout: [accuracy, precision per class..., recall per class...], rounded to 4 decimals.
eval_results = (
    [round(acc_['accuracy'], 4)]
    + [round(value, 4) for value in prec_['precision']]
    + [round(value, 4) for value in rec_['recall']]
)

print(eval_results)