# Install packages

In [4]:
!pip install pandas numpy scikit-learn transformers[torch]

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.48.0-py3-none-any.whl (9.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tokenizers<0.22,>=0.21
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tqdm>=4.27
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 KB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting safetensors>=0.4.1
  Downloading safetensors-0.5.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (461 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.0/462.0 KB[0m [31m38.3 MB/s[0m eta [36m0

# Import packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support

  from .autonotebook import tqdm as notebook_tqdm
2025-01-10 20:14:18.112803: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736540058.134396   65635 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736540058.140994   65635 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Read and load data

In [2]:
df = pd.read_csv("ESWA20_manualclassification.csv", encoding="cp1252")

In [3]:
df

Unnamed: 0,Commit Message,Category
0,\tNIO Reads writes are completed in the caller...,functional
1,* temporary fix for non-shared JArray object...,bugfix
2,* Java:\n * Added RCON functionality for So...,functional
3,- Merge changes from Pull-Request #6 \t - Fix...,code smell
4,Bug 233643 - API builder performance bad for...,external
...,...,...
1735,working on #339: introducing the getSpecialFea...,functional
1736,working on #339: next step. i\n\nmove all the ...,functional
1737,working on #369: generalized the adjustMapForP...,functional
1738,working up test coverage; minor tweaks; removi...,code smell


# Dataset Split and load into torch Dataset 

In [4]:
train_val_df, test_df = train_test_split(
    df, 
    test_size=0.2, 
    random_state=42, 
    stratify=df["Category"]
)

# From the remaining 80%, create train (60%) and validation (20%)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.25,
    random_state=42,
    stratify=train_val_df["Category"]
)


train_df = train_df.reset_index()
val_df = val_df.reset_index()
test_df = test_df.reset_index()

print("Train shape:", train_df.shape)
print("Validation shape:", val_df.shape)
print("Test shape:", test_df.shape)

Train shape: (1044, 3)
Validation shape: (348, 3)
Test shape: (348, 3)


In [5]:
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Models

In [6]:
def exec(model_name):

    label2id = {"bugfix": 0, "code smell": 1, "external": 2, "functional": 3, "internal": 4}
    id2label = {v: k for k, v in label2id.items()}
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def encode_texts(texts, max_length=128):
        # Encode the textual data into IDs BERT can understand
        encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=tokenizer.model_max_length,
        )
        return encodings
    
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=1)

        # average=None => returns arrays of per-class metrics
        precision, recall, f1, support = precision_recall_fscore_support(
            labels, preds, average=None, labels=list(id2label.keys())
        )
        
        # Convert to dict for each class
        metrics_dict = {}
        for class_id, (p, r, f, s) in enumerate(zip(precision, recall, f1, support)):
            class_name = id2label[class_id]
            metrics_dict[f"precision_{class_name}"] = p
            metrics_dict[f"recall_{class_name}"] = r
            metrics_dict[f"f1_{class_name}"] = f
            metrics_dict[f"support_{class_name}"] = s
            
        # Optionally, you might also want overall (macro, weighted, etc.)
        # Here’s an example for macro averages:
        precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
            labels, preds, average="macro"
        )
        metrics_dict["precision_macro"] = precision_macro
        metrics_dict["recall_macro"] = recall_macro
        metrics_dict["f1_macro"] = f1_macro

        return metrics_dict
    
    train_encodings = encode_texts(train_df["Commit Message"].tolist())
    val_encodings   = encode_texts(val_df["Commit Message"].tolist())
    test_encodings  = encode_texts(test_df["Commit Message"].tolist())

    train_labels = [label2id[label] for label in train_df["Category"]]
    val_labels   = [label2id[label] for label in val_df["Category"]]
    test_labels  = [label2id[label] for label in test_df["Category"]]

    train_dataset = TextDataset(train_encodings, train_labels)
    val_dataset   = TextDataset(val_encodings, val_labels)
    test_dataset  = TextDataset(test_encodings, test_labels)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5, id2label=id2label, label2id=label2id)

    training_args = TrainingArguments(output_dir="./results", num_train_epochs=5, per_device_train_batch_size=16, per_device_eval_batch_size=16, eval_strategy="epoch")

    trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset, compute_metrics=compute_metrics)
    trainer.train()
    
    trainer.evaluate()
    predictions = trainer.predict(test_dataset)
    model.cpu()
    del model
    return predictions.metrics
    

## distilbert-base-uncased

In [7]:
metrics = {}
metrics["distilbert-base-uncased"] = exec("distilbert/distilbert-base-uncased")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision Bugfix,Recall Bugfix,F1 Bugfix,Support Bugfix,Precision Code smell,Recall Code smell,F1 Code smell,Support Code smell,Precision External,Recall External,F1 External,Support External,Precision Functional,Recall Functional,F1 Functional,Support Functional,Precision Internal,Recall Internal,F1 Internal,Support Internal,Precision Macro,Recall Macro,F1 Macro
1,No log,0.716412,0.658824,0.811594,0.727273,69,0.722222,0.928571,0.8125,70,0.868852,0.768116,0.815385,69,0.822581,0.728571,0.772727,70,0.96,0.685714,0.8,70,0.806496,0.784513,0.785577
2,No log,0.429618,0.761364,0.971014,0.853503,69,0.881579,0.957143,0.917808,70,0.895522,0.869565,0.882353,69,0.949153,0.8,0.868217,70,0.965517,0.8,0.875,70,0.890627,0.879545,0.879376
3,No log,0.410492,0.888889,0.811594,0.848485,69,0.951613,0.842857,0.893939,70,0.826667,0.898551,0.861111,69,0.85,0.971429,0.906667,70,0.911765,0.885714,0.898551,70,0.885787,0.882029,0.881751
4,No log,0.364059,0.890625,0.826087,0.857143,69,0.928571,0.928571,0.928571,70,0.911765,0.898551,0.905109,69,0.85,0.971429,0.906667,70,0.939394,0.885714,0.911765,70,0.904071,0.90207,0.901851
5,No log,0.330612,0.875,0.913043,0.893617,69,0.942029,0.928571,0.935252,70,0.911765,0.898551,0.905109,69,0.916667,0.942857,0.929577,70,0.925373,0.885714,0.905109,70,0.914167,0.913747,0.913733


## bert-base-uncased

In [8]:
metrics["bert-base-uncased"] = exec("google-bert/bert-base-uncased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision Bugfix,Recall Bugfix,F1 Bugfix,Support Bugfix,Precision Code smell,Recall Code smell,F1 Code smell,Support Code smell,Precision External,Recall External,F1 External,Support External,Precision Functional,Recall Functional,F1 Functional,Support Functional,Precision Internal,Recall Internal,F1 Internal,Support Internal,Precision Macro,Recall Macro,F1 Macro
1,No log,0.966963,0.417808,0.884058,0.567442,69,0.734177,0.828571,0.778523,70,0.808511,0.550725,0.655172,69,0.75,0.042857,0.081081,70,0.736111,0.757143,0.746479,70,0.689321,0.612671,0.56574
2,No log,0.432281,0.779221,0.869565,0.821918,69,0.938462,0.871429,0.903704,70,0.867647,0.855072,0.861314,69,0.884058,0.871429,0.877698,70,0.913043,0.9,0.906475,70,0.876486,0.873499,0.874222
3,No log,0.551765,0.891304,0.594203,0.713043,69,0.96875,0.885714,0.925373,70,0.9,0.782609,0.837209,69,0.653846,0.971429,0.781609,70,0.851351,0.9,0.875,70,0.85305,0.826791,0.826447
4,No log,0.403643,0.852941,0.84058,0.846715,69,0.927536,0.914286,0.920863,70,0.921875,0.855072,0.887218,69,0.835443,0.942857,0.885906,70,0.911765,0.885714,0.898551,70,0.889912,0.887702,0.887851
5,No log,0.40985,0.846154,0.797101,0.820896,69,0.914286,0.914286,0.914286,70,0.885714,0.898551,0.892086,69,0.853333,0.914286,0.882759,70,0.911765,0.885714,0.898551,70,0.88225,0.881988,0.881715


## bert-large-uncased

In [9]:
metrics["bert-large-uncased"] = exec("google-bert/bert-large-uncased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision Bugfix,Recall Bugfix,F1 Bugfix,Support Bugfix,Precision Code smell,Recall Code smell,F1 Code smell,Support Code smell,Precision External,Recall External,F1 External,Support External,Precision Functional,Recall Functional,F1 Functional,Support Functional,Precision Internal,Recall Internal,F1 Internal,Support Internal,Precision Macro,Recall Macro,F1 Macro
1,No log,0.591512,0.708333,0.985507,0.824242,69,0.844156,0.928571,0.884354,70,0.960784,0.710145,0.816667,69,0.859155,0.871429,0.865248,70,0.90566,0.685714,0.780488,70,0.855618,0.836273,0.8342
2,No log,0.381084,0.772727,0.985507,0.866242,69,0.953125,0.871429,0.910448,70,0.935484,0.84058,0.885496,69,0.911765,0.885714,0.898551,70,0.909091,0.857143,0.882353,70,0.896438,0.888075,0.888618
3,No log,0.546366,0.953488,0.594203,0.732143,69,0.8375,0.957143,0.893333,70,0.828571,0.84058,0.834532,69,0.77907,0.957143,0.858974,70,0.884058,0.871429,0.877698,70,0.856538,0.844099,0.839336
4,No log,0.449347,0.909091,0.869565,0.888889,69,0.971014,0.957143,0.964029,70,0.952381,0.869565,0.909091,69,0.809524,0.971429,0.883117,70,0.924242,0.871429,0.897059,70,0.913251,0.907826,0.908437
5,No log,0.369825,0.888889,0.927536,0.907801,69,0.955882,0.928571,0.942029,70,0.923077,0.869565,0.895522,69,0.868421,0.942857,0.90411,70,0.910448,0.871429,0.890511,70,0.909343,0.907992,0.907995


## roberta-base

In [10]:
metrics["roberta-base"] = exec("FacebookAI/roberta-base")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision Bugfix,Recall Bugfix,F1 Bugfix,Support Bugfix,Precision Code smell,Recall Code smell,F1 Code smell,Support Code smell,Precision External,Recall External,F1 External,Support External,Precision Functional,Recall Functional,F1 Functional,Support Functional,Precision Internal,Recall Internal,F1 Internal,Support Internal,Precision Macro,Recall Macro,F1 Macro
1,No log,0.492422,0.5,1.0,0.666667,69,0.868421,0.942857,0.90411,70,0.930556,0.971014,0.950355,69,1.0,0.028571,0.055556,70,0.933333,0.8,0.861538,70,0.846462,0.748489,0.687645
2,No log,0.270848,0.894737,0.985507,0.937931,69,0.984127,0.885714,0.932331,70,0.928571,0.942029,0.935252,69,0.971014,0.957143,0.964029,70,0.914286,0.914286,0.914286,70,0.938547,0.936936,0.936766
3,No log,0.257771,0.931507,0.985507,0.957746,69,0.941176,0.914286,0.927536,70,0.917808,0.971014,0.943662,69,0.984848,0.928571,0.955882,70,0.911765,0.885714,0.898551,70,0.937421,0.937019,0.936676
4,No log,0.256817,0.893333,0.971014,0.930556,69,0.984375,0.9,0.940299,70,0.954545,0.913043,0.933333,69,0.970149,0.928571,0.948905,70,0.868421,0.942857,0.90411,70,0.934165,0.931097,0.93144
5,No log,0.216613,0.942857,0.956522,0.94964,69,1.0,0.914286,0.955224,70,0.931507,0.985507,0.957746,69,0.957746,0.971429,0.964539,70,0.942857,0.942857,0.942857,70,0.954994,0.95412,0.954001


## distilroberta-base

In [11]:
metrics["distilroberta-base"] = exec("distilbert/distilroberta-base")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision Bugfix,Recall Bugfix,F1 Bugfix,Support Bugfix,Precision Code smell,Recall Code smell,F1 Code smell,Support Code smell,Precision External,Recall External,F1 External,Support External,Precision Functional,Recall Functional,F1 Functional,Support Functional,Precision Internal,Recall Internal,F1 Internal,Support Internal,Precision Macro,Recall Macro,F1 Macro
1,No log,0.502455,0.758242,1.0,0.8625,69,0.797297,0.842857,0.819444,70,0.893939,0.855072,0.874074,69,1.0,0.742857,0.852459,70,0.784615,0.728571,0.755556,70,0.846819,0.833872,0.832807
2,No log,0.263032,0.931507,0.985507,0.957746,69,0.88,0.942857,0.910345,70,0.930556,0.971014,0.950355,69,0.985075,0.942857,0.963504,70,0.934426,0.814286,0.870229,70,0.932313,0.931304,0.930436
3,No log,0.226422,0.943662,0.971014,0.957143,69,0.96875,0.885714,0.925373,70,0.918919,0.985507,0.951049,69,0.971014,0.957143,0.964029,70,0.9,0.9,0.9,70,0.940469,0.939876,0.939519
4,No log,0.236627,0.955882,0.942029,0.948905,69,1.0,0.885714,0.939394,70,0.943662,0.971014,0.957143,69,0.944444,0.971429,0.957746,70,0.893333,0.957143,0.924138,70,0.947464,0.945466,0.945465
5,No log,0.210248,0.957143,0.971014,0.964029,69,1.0,0.885714,0.939394,70,0.943662,0.971014,0.957143,69,0.971429,0.971429,0.971429,70,0.893333,0.957143,0.924138,70,0.953113,0.951263,0.951226


# Evaluation

In [14]:
print(metrics)

{'distilbert-base-uncased': {'test_loss': 0.3680214285850525, 'test_precision_bugfix': 0.9032258064516129, 'test_recall_bugfix': 0.8, 'test_f1_bugfix': 0.8484848484848486, 'test_support_bugfix': 70, 'test_precision_code smell': 0.9705882352941176, 'test_recall_code smell': 0.9565217391304348, 'test_f1_code smell': 0.9635036496350365, 'test_support_code smell': 69, 'test_precision_external': 0.8695652173913043, 'test_recall_external': 0.8571428571428571, 'test_f1_external': 0.8633093525179856, 'test_support_external': 70, 'test_precision_functional': 0.825, 'test_recall_functional': 0.9428571428571428, 'test_f1_functional': 0.88, 'test_support_functional': 70, 'test_precision_internal': 0.9565217391304348, 'test_recall_internal': 0.9565217391304348, 'test_f1_internal': 0.9565217391304348, 'test_support_internal': 69, 'test_precision_macro': 0.9049801996534939, 'test_recall_macro': 0.9026086956521739, 'test_f1_macro': 0.9023639179536611, 'test_runtime': 1.6696, 'test_samples_per_second':

In [20]:
categories = [
    ("bugfix", "Bug Fix"),
    ("code smell", "Code Smell"),
    ("external", "External QA"),
    ("functional", "Functional"),
    ("internal", "Internal QA"),
]

def build_results_df(model_metrics, categories):
    rows = []
    for cat_key, cat_name in categories:
        precision = round(model_metrics[f"test_precision_{cat_key}"], 2)
        recall    = round(model_metrics[f"test_recall_{cat_key}"], 2)
        f1        = round(model_metrics[f"test_f1_{cat_key}"], 2)
        rows.append([cat_name, precision, recall, f1])
    
    # Optionally add an average row
    p_macro = model_metrics["test_precision_macro"]
    r_macro = model_metrics["test_recall_macro"]
    f_macro = model_metrics["test_f1_macro"]
    rows.append(["Average", round(p_macro, 2), round(r_macro, 2), round(f_macro, 2)])
    
    # Convert to DataFrame
    df = pd.DataFrame(rows, columns=["Category", "Precision", "Recall", "F1"])
    return df

for model_name in metrics.keys():
    print(f"=== {model_name} ===")
    df_results = build_results_df(metrics[model_name], categories)
    display(df_results)   # in Jupyter, shows a table
    print()  # blank line


=== distilbert-base-uncased ===


Unnamed: 0,Category,Precision,Recall,F1
0,Bug Fix,0.9,0.8,0.85
1,Code Smell,0.97,0.96,0.96
2,External QA,0.87,0.86,0.86
3,Functional,0.82,0.94,0.88
4,Internal QA,0.96,0.96,0.96
5,Average,0.9,0.9,0.9



=== bert-base-uncased ===


Unnamed: 0,Category,Precision,Recall,F1
0,Bug Fix,0.89,0.67,0.76
1,Code Smell,0.94,0.91,0.93
2,External QA,0.81,0.84,0.83
3,Functional,0.77,0.93,0.84
4,Internal QA,0.94,0.97,0.96
5,Average,0.87,0.87,0.86



=== bert-large-uncased ===


Unnamed: 0,Category,Precision,Recall,F1
0,Bug Fix,0.92,0.83,0.87
1,Code Smell,0.97,0.96,0.96
2,External QA,0.86,0.87,0.87
3,Functional,0.81,0.9,0.85
4,Internal QA,0.96,0.94,0.95
5,Average,0.9,0.9,0.9



=== roberta-base ===


Unnamed: 0,Category,Precision,Recall,F1
0,Bug Fix,0.97,0.9,0.93
1,Code Smell,1.0,0.96,0.98
2,External QA,0.94,0.96,0.95
3,Functional,0.89,0.94,0.92
4,Internal QA,0.93,0.97,0.95
5,Average,0.95,0.95,0.95



=== distilroberta-base ===


Unnamed: 0,Category,Precision,Recall,F1
0,Bug Fix,0.96,0.99,0.97
1,Code Smell,0.98,0.94,0.96
2,External QA,0.96,0.96,0.96
3,Functional,0.98,0.93,0.96
4,Internal QA,0.92,0.99,0.95
5,Average,0.96,0.96,0.96



