# Shows how to fine-tune the pre-trained seBERT model for commit intent classification

This notebook is for demonstration purposes only; for the actual experiments, we distributed the fine-tuning on our SLURM HPC system.
The HPC code is provided in the /ft/ folder of this replication kit.
However, this notebook shows the principle and should be sufficient if you only have a small amount of data and a small number of executions.

In [None]:
import gc
import random
import torch
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, matthews_corrcoef
from sklearn.model_selection import train_test_split, cross_validate, KFold
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import make_scorer
from sklearn.preprocessing import MultiLabelBinarizer

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer

In [None]:
def compute_metrics_multi_label(p):
    """Compute classification metrics for one evaluation pass.

    Despite the name, every sample is reduced to a single predicted class
    (argmax over axis 1) before scoring, i.e. the input is scored as
    single-label, multi-class output.

    Args:
        p: Pair ``(scores, labels)``. ``scores`` holds one row of per-class
            scores per sample; ``labels`` holds the true class ids.

    Returns:
        dict with the keys ``accuracy`` and ``mcc`` followed by
        ``precision_*``, ``recall_*`` and ``f1_*`` for ``micro`` and then
        ``macro`` averaging.
    """
    scores, y_true = p
    y_pred = np.argmax(scores, axis=1)

    results = {
        'accuracy': accuracy_score(y_true=y_true, y_pred=y_pred),
        'mcc': matthews_corrcoef(y_true=y_true, y_pred=y_pred),
    }

    # The same three scores are computed for both averaging modes; iterating
    # micro first keeps the key order of the returned dict stable.
    scorers = (('precision', precision_score), ('recall', recall_score), ('f1', f1_score))
    for averaging in ('micro', 'macro'):
        for metric_name, scorer in scorers:
            results[f'{metric_name}_{averaging}'] = scorer(y_true=y_true, y_pred=y_pred, average=averaging)

    return results


class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with class labels.

    Args:
        encodings: Mapping from field name to a per-sample indexable sequence;
            it must contain an ``'input_ids'`` entry, which defines the length.
        labels: Per-sample indexable sequence of integer class ids.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples equals the number of encoded input sequences.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including a ``'labels'`` entry."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        # The label is stored as a long (int64) tensor.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [None]:
# Load the pre-trained seBERT model and its tokenizer from the same local folder.
# The pre-trained model archive has to be extracted into that folder first:
# https://smartshark2.informatik.uni-goettingen.de/sebert/seBERT_pre_trained.tar.gz
sebert_dir = '../ft/models/seBERT/'
# The sequence classification model is created with three output labels.
model = BertForSequenceClassification.from_pretrained(sebert_dir, num_labels=3)
tokenizer = BertTokenizer.from_pretrained(sebert_dir)

In [None]:
# Load the data set (ground truth only) from the gzip-compressed CSV into a DataFrame.
# The cells below read its 'message', 'internal_quality' and 'external_quality' columns.
df = pd.read_csv('../data/all_changes_gt.csv.gz')

def unify_label_num(row):
    """Collapse the two quality flags of a row into a single class id.

    Args:
        row: Mapping-like object with ``'internal_quality'`` and
            ``'external_quality'`` entries that are evaluated for truthiness.

    Returns:
        2 if ``external_quality`` is truthy (it takes precedence when both
        flags are set), 1 if only ``internal_quality`` is truthy, otherwise 0.
    """
    # Both flags are always read, internal first.
    is_internal = bool(row['internal_quality'])
    is_external = bool(row['external_quality'])

    if is_external:
        return 2
    return 1 if is_internal else 0

# Flatten multi-line commit messages onto a single line.
# regex=False states explicitly that '\n' is a literal: the default of `regex`
# differs between pandas versions, while a literal replacement of this
# single character yields the same result on all of them.
df['message_no_newlines'] = df['message'].str.replace('\n', ' ', regex=False)
# Derive one integer class id per row from the two quality flag columns.
df['label'] = df.apply(unify_label_num, axis=1)

In [None]:
# Inputs are the single-line commit messages, targets the integer class ids.
X, y = df['message_no_newlines'].values, df['label'].values


# Hold out 10% of the data as test data. This simulates one fold of a 10-fold
# cross-validation; for a real evaluation of the fine-tuning this would be a KFold split.
# IMPORTANT: the model needs to start from scratch for every evaluation run!
# If this is executed in a loop, the code that loads the model and the tokenizer
# has to be inside that loop.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

# Split another 20% off the remaining training data; it is used as evaluation
# data to select the best performing epoch.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

# Tokenize every split with identical settings (padding, truncation, max_length=128).
X_train_tokens, X_test_tokens, X_val_tokens = (
    tokenizer(messages.tolist(), padding=True, truncation=True, max_length=128)
    for messages in (X_train, X_test, X_val)
)

# Wrap the encodings together with their labels as torch datasets.
train_dataset, test_dataset, eval_dataset = (
    Dataset(tokens, targets)
    for tokens, targets in (
        (X_train_tokens, y_train),
        (X_test_tokens, y_test),
        (X_val_tokens, y_val),
    )
)


# Training configuration: three epochs, batches of 4 samples per device for
# both training and evaluation, checkpoints written below ../ft/checkpoints.
training_args = TrainingArguments(
    output_dir="../ft/checkpoints",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Gradients are accumulated over 2 steps before each optimizer update.
    gradient_accumulation_steps=2,
    eval_accumulation_steps=5,
    # Evaluate once per epoch and restore the best model when training ends.
    # NOTE(review): newer transformers releases may require the save strategy to
    # match the evaluation strategy when load_best_model_at_end is set - confirm
    # against the installed version.
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
)

# The validation split is the Trainer's evaluation data; the metrics of every
# evaluation come from compute_metrics_multi_label.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics_multi_label,
)

In [None]:
# Fine-tune the model and print the training summary returned by the Trainer.
print(trainer.train())
# Metrics on the validation split (the Trainer's eval_dataset). This split is
# also used to select the best epoch, so these numbers are optimistic.
print(trainer.evaluate())
# Metrics on the held-out test split, which takes no part in training or in
# epoch selection and is otherwise never used.
print(trainer.evaluate(eval_dataset=test_dataset))

In [None]:
# To generate the final fine-tuned model, the model is simply trained on all available ground truth data and then saved