In [None]:
import numpy as np
import pandas as pd
import torch.optim as optim
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


# Model training and evaluation

In [None]:
def train(model, tokenzier, X_train, X_val, y_train, y_val, output_path_prefix, batch_size = 32, learning_rate = 5e-5, total_epoch = 10, max_length = 128):
    """
    Finetune the pretrained model on the given dataset.

    After every epoch the model is scored on the validation set; whenever the validation accuracy improves, the whole model and its state dictionary are written to disk.

    Args:
        model(object): initialised pretrained model with self defined classification head. It is called as ``model(input_ids, attention_mask)`` and must return a 3-tuple whose first element holds the class logits.
        tokenzier(AutoTokenizer): tokenizer used by the selected pretrained model to tokenize the input text. (The misspelt parameter name is kept so that existing keyword callers keep working.)
        X_train(List[Str]): list of training texts.
        X_val(List[Str]): list of validation texts.
        y_train(Array[Int]): encoded labels of training texts as NumPy array.
        y_val(Array[Int]): encoded labels of validation texts as NumPy array.
        output_path_prefix(str): prefix of the path to save the trained model, comprised of the path to the output folder and the experiment name. This will be appended with '_best.pt' and '_best_state_dict.pt' to save the best model during training and its state dictionary.
        batch_size(int): batch size for training. Default is 32.
        learning_rate(float): learning rate for training. Default is 5e-5.
        total_epoch(int): total number of epochs for training. Default is 10.
        max_length(int): maximum length for the input text, forwarded to the tokenizer together with ``pad_to_max_length=True``. Default is 128.

    Returns:
        paths(Tuple(Str)): file paths where the best model and its state dictionary are saved.
    """
    # The body used to read a global named `tokenizer`, silently ignoring the
    # argument (or raising NameError on a fresh kernel). Bind the argument here.
    tokenizer = tokenzier

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Inputs are moved to `device` below, so the model has to live there too.
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, eps = 1e-8)

    def encode_batch(docs):
        # Tokenize each document exactly once and reuse the result for both
        # the input ids and the attention mask.
        encodings = [
            tokenizer.encode_plus(
                doc,
                add_special_tokens = True,
                max_length = max_length,
                pad_to_max_length = True,
                return_attention_mask = True,
                return_tensors = 'pt',
            )
            for doc in docs
        ]
        input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0).to(device)
        attention_mask = torch.cat([enc['attention_mask'] for enc in encodings], dim=0).to(device)
        return input_ids, attention_mask

    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint; otherwise the returned paths could point to missing or stale files.
    best = -1.0
    # Training the model
    for epoch in range(total_epoch):
        model.train()
        train_loss = 0
        for batch_start_index in range(0, len(X_train), batch_size):
            batch_end_index = batch_start_index + batch_size
            batch_tensor, batch_mask = encode_batch(X_train[batch_start_index:batch_end_index])
            # CrossEntropyLoss expects class indices as int64.
            batch_labels = torch.as_tensor(y_train[batch_start_index:batch_end_index], dtype=torch.long).to(device)

            optimizer.zero_grad()
            outputs,_,_ = model(batch_tensor, batch_mask)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()

            # Sum (not mean) of the per-batch losses, as reported in the epoch log line.
            train_loss += loss.item()

        # validation
        model.eval()
        val_predictions = []
        # No gradients are needed for scoring; skipping them saves memory.
        with torch.no_grad():
            for batch_start_index in range(0, len(X_val), batch_size):
                batch_tensor, batch_mask = encode_batch(X_val[batch_start_index:(batch_start_index + batch_size)])
                outputs,_,_ = model(batch_tensor, batch_mask)
                val_predictions.append(torch.argmax(outputs, 1))

        predictions_tensor = torch.hstack(val_predictions)
        current_val_acc = accuracy_score(y_val, predictions_tensor.cpu().numpy())

        # save best
        if current_val_acc > best:
            torch.save(model, f'{output_path_prefix}_best.pt')
            torch.save(model.state_dict(),f'{output_path_prefix}_best_state_dict.pt')
            best = current_val_acc
        print('Epoch: %d, train loss: %.5f, val acc: %.5f, best acc: %.5f'%(epoch + 1, train_loss, current_val_acc, best))

    print('Finished Training')
    return f'{output_path_prefix}_best.pt', f'{output_path_prefix}_best_state_dict.pt'


In [None]:
def evaluate(model_path, tokenzier, raw_texts_test, label_encoded_test, batch_size=32, max_length=128):
    """
    Run inference of the finetuned model on the test texts and print scikit-learn's classification report for the predictions.

    Args:
        model_path(str): file path where the best finetuned model is saved.
        tokenzier(AutoTokenizer): tokenizer used by the selected pretrained model to tokenize the input text. (The misspelt parameter name is kept so that existing keyword callers keep working.)
        raw_texts_test(List[Str]): list of testing texts.
        label_encoded_test(Array[Int]): encoded labels of testing texts as NumPy array.
        batch_size(int): batch size for inference. Default is 32.
        max_length(int): maximum length for the input text, forwarded to the tokenizer together with ``pad_to_max_length=True``. Default is 128.

    Returns:
        None
    """
    # The body used to read a global named `tokenizer`, silently ignoring the
    # argument (or raising NameError on a fresh kernel). Bind the argument here.
    tokenizer = tokenzier

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # map_location puts the weights on the same device as the inputs, which also
    # lets a checkpoint saved on a GPU be evaluated on a CPU-only machine.
    # SECURITY: torch.load unpickles the whole model object - only load trusted files.
    # NOTE(review): recent PyTorch releases default to weights_only=True, which
    # rejects fully pickled models; pass weights_only=False there if loading fails.
    model = torch.load(model_path, map_location=device)

    # evaluation on the test set
    model.eval()
    predictions = []
    # No gradients are needed for inference; skipping them saves memory.
    with torch.no_grad():
        for batch_start_index in range(0, len(raw_texts_test), batch_size):
            batch_docs = raw_texts_test[batch_start_index:(batch_start_index + batch_size)]
            # Tokenize each document once and reuse it for ids and mask.
            encodings = [
                tokenizer.encode_plus(
                    doc,
                    add_special_tokens = True,
                    max_length = max_length,
                    pad_to_max_length = True,
                    return_attention_mask = True,
                    return_tensors = 'pt',
                )
                for doc in batch_docs
            ]
            batch_tensor = torch.cat([enc['input_ids'] for enc in encodings], dim=0).to(device)
            batch_mask = torch.cat([enc['attention_mask'] for enc in encodings], dim=0).to(device)
            outputs,_,_ = model(batch_tensor, batch_mask)
            predictions.append(torch.argmax(outputs, 1))

    predictions_tensor = torch.hstack(predictions)
    print(classification_report(label_encoded_test, predictions_tensor.cpu().numpy(),digits=4))

# Embedding extraction and storage

In [None]:
def get_CLS_row_values(model_path, raw_docs, labels,max_length=128, tokenizer=None):
    """
    Extract the CLS embeddings of the given texts.

    Each text is encoded and passed through the model on its own; the third element of the model's output tuple is taken as that text's embedding.

    Args:
        model_path(str): file path where the best finetuned model is saved.
        raw_docs(List[Str]): list of texts.
        labels(Array[Int]): encoded labels of texts as NumPy array.
        max_length(int): maximum length for the input text, forwarded to the tokenizer together with ``pad_to_max_length=True``. Default is 128.
        tokenizer(AutoTokenizer): tokenizer used by the selected pretrained model. Default is None, in which case a module-level variable named ``tokenizer`` is used, as older callers relied on.

    Returns:
        row_values (List[List[float]]): CLS embeddings + encoded label of the given texts.

    Raises:
        NameError: if no tokenizer is passed and no global ``tokenizer`` exists.
    """
    if tokenizer is None:
        # Backward compatibility: this function used to read an implicit global.
        tokenizer = globals().get("tokenizer")
        if tokenizer is None:
            raise NameError("name 'tokenizer' is not defined; pass tokenizer=... to get_CLS_row_values")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # map_location puts the weights on the same device as the inputs, which also
    # lets a checkpoint saved on a GPU be used on a CPU-only machine.
    # SECURITY: torch.load unpickles the whole model object - only load trusted files.
    # NOTE(review): recent PyTorch releases default to weights_only=True, which
    # rejects fully pickled models; pass weights_only=False there if loading fails.
    model = torch.load(model_path, map_location=device)

    row_values = []
    with torch.no_grad():
        model.eval()
        for doc_index, doc in enumerate(raw_docs):
            # Tokenize once and reuse the encoding for both ids and mask.
            encoding = tokenizer.encode_plus(
                doc,
                add_special_tokens = True,
                max_length = max_length,
                pad_to_max_length = True,
                return_attention_mask = True,
                return_tensors = 'pt',
            )
            batch_tensor = encoding['input_ids'].to(device)
            batch_mask = encoding['attention_mask'].to(device)
            _,_,cls_rep = model(batch_tensor,batch_mask)
            # Drop the size-1 batch axis so the row is a flat list of floats.
            one_row = cls_rep.cpu().squeeze().tolist()
            one_row.append(labels[doc_index])
            row_values.append(one_row)
    return row_values

In [None]:
def save_CLS_embs(model_path, raw_docs, labels, output_path, dim_size=768, max_length=128, save_csv=True):
    """
    Extract the CLS embeddings of the given texts, arrange them as a dataframe and optionally write that dataframe to a CSV file.

    Args:
        model_path(str): file path where the best finetuned model is saved.
        raw_docs(List[Str]): list of texts.
        labels(Array[Int]): encoded labels of texts as NumPy array.
        output_path(str): path to save the CSV file.
        dim_size(int): dimension size of the embeddings. Default is 768.
        max_length(int): maximum length for the input text, passed on to the embedding extraction. Default is 128.
        save_csv(bool): whether to save the CSV file or not. Default is True.

    Returns:
        df (DataFrame): the embeddings converted to a dataframe, where each dim of the embedding is a column named f'vec_val_{dim_index}', and the encoded label is stored as the last column named 'Class_label'.
    """
    # One column per embedding dimension, followed by the label column.
    header = [f"vec_val_{dim_index}" for dim_index in range(dim_size)] + ["Class_label"]

    rows = get_CLS_row_values(model_path, raw_docs, labels, max_length)
    embeddings_df = pd.DataFrame(rows, columns=header)

    if not save_csv:
        return embeddings_df

    embeddings_df.to_csv(output_path, sep=',', index=False)
    return embeddings_df

