### Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score
from scipy.special import softmax
from sklearn.svm import SVC
import torch
import random
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    EarlyStoppingCallback,
    TrainingArguments,
    Trainer
)
from datasets import Dataset
import pickle

After the data has been cleaned, `clean_train.xlsx` and `clean_test.xlsx` should be saved in the `Model and Dataset` folder.
Then we can start testing different models to find the one that works best with this dataset.

### Softmax Regression (Final Model)

First, we will start with **Softmax Regression**, which is a **Multinomial Logistic Regression**. This is a natural fit because the dataset has three labels: `delete`, `insert` and `replace`. Hence, we tested Softmax Regression, using CountVectorizer as the Feature Extraction method and SelectKBest with the Chi-Squared Test as the Feature Selection method. Out of the available solvers, we chose `solver='lbfgs'` because it approximates the inverse Hessian matrix instead of computing it exactly, which makes it both computationally efficient and memory-efficient.

In [2]:
class SoftmaxClassifier:
    """Softmax (multinomial logistic regression) classifier over n-gram counts.

    Wraps a scikit-learn pipeline that turns raw text into unigram + bigram
    counts (``CountVectorizer``), keeps the 1010 features ranked best by the
    chi-squared test (``SelectKBest``) and fits a ``LogisticRegression`` with
    the ``lbfgs`` solver.
    """

    def __init__(self):
        self.pipeline = Pipeline([
            ('features', FeatureUnion([
                ('text', Pipeline([
                    ('vectorizer', CountVectorizer(
                        ngram_range=(1, 2)
                    )),
                    ('selector', SelectKBest(chi2, k=1010))
                ]))
            ])),
            ('classifier', LogisticRegression(
                # `multi_class='multinomial'` is deliberately not passed: the
                # argument is deprecated since scikit-learn 1.5 and slated for
                # removal, and with `lbfgs` on three or more classes the
                # default already fits a multinomial (softmax) model.
                solver='lbfgs',  # saga is better for large datasets but we are smaller dataset so use lbfgs
                max_iter=300000,
                C=1,  # Moderate regularization
                class_weight='balanced',
                random_state=42
            ))
        ])

    def fit(self, X, y):
        """Fit the whole pipeline on texts ``X`` and labels ``y``; returns ``self``."""
        self.pipeline.fit(X, y)
        return self

    def predict(self, X):
        """Return the pipeline's predicted label for each text in ``X``."""
        return self.pipeline.predict(X)

    def predict_proba(self, X):
        """Return the pipeline's per-class probabilities for each text in ``X``."""
        return self.pipeline.predict_proba(X)

def main():
    """Train the softmax classifier, print a test-set report and pickle the model."""
    text_col = 'Review Comment'
    label_col = 'Expected Operation by Developer'

    # Read the cleaned train / test spreadsheets
    train_df = pd.read_excel('../Model and Dataset/clean_train.xlsx', engine='openpyxl')
    test_df = pd.read_excel('../Model and Dataset/clean_test.xlsx', engine='openpyxl')

    print("Dataset sizes:")
    print(f"Train: {len(train_df)}")
    print(f"Test: {len(test_df)}")

    print("\nClass distribution in training data:")
    print(train_df[label_col].value_counts(normalize=True))

    # fit() returns the classifier itself, so construction and training chain
    classifier = SoftmaxClassifier().fit(train_df[text_col], train_df[label_col])

    # Predict labels for the held-out comments
    y_pred = classifier.predict(test_df[text_col])

    # The report is only possible when the test file carries ground-truth labels
    if label_col in test_df.columns:
        print("\nClassification Report:")
        print(classification_report(test_df[label_col], y_pred))

    # Persist the fitted classifier
    with open('final_model.bin', 'wb') as model_file:
        pickle.dump(classifier, model_file)

# Run the softmax experiment defined by main() above when this cell is executed.
if __name__ == "__main__":
    main()

Dataset sizes:
Train: 875
Test: 221

Class distribution in training data:
Expected Operation by Developer
insert     0.336000
delete     0.332571
replace    0.331429
Name: proportion, dtype: float64

Classification Report:
              precision    recall  f1-score   support

      delete       0.74      0.76      0.75        74
      insert       0.75      0.82      0.79        74
     replace       0.77      0.67      0.72        73

    accuracy                           0.75       221
   macro avg       0.75      0.75      0.75       221
weighted avg       0.75      0.75      0.75       221



### Support Vector Machine (SVM)

Then, we tested an SVM, using the same feature extraction and feature selection methods, in order to compare its performance against the Softmax Regression model. We chose `kernel='rbf'` because it can handle higher-dimensional data that is not linearly separable. We also use a small `gamma=0.01`, so that each training example influences a larger region of the feature space, which helps to avoid overfitting.

Citation: https://www.geeksforgeeks.org/gamma-parameter-in-svm/

In [3]:
class SVMClassifier:
    """RBF-kernel SVM classifier over chi-squared-selected n-gram counts."""

    def __init__(self):
        # Text branch: unigram + bigram counts, then the 1010 best features by chi2
        text_branch = Pipeline([
            ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
            ('selector', SelectKBest(chi2, k=1010)),
        ])
        feature_union = FeatureUnion([('text', text_branch)])

        # rbf can work for higher dimensional data and data that are not linearly separable;
        # C is the regularization parameter
        svm = SVC(kernel="rbf", C=20, gamma=0.01)

        self.pipeline = Pipeline([
            ('features', feature_union),
            ('classifier', svm),
        ])

    def fit(self, X, y):
        """Fit the pipeline on texts ``X`` and labels ``y``; returns ``self``."""
        self.pipeline.fit(X, y)
        return self

    def predict(self, X):
        """Return the pipeline's predicted label for each text in ``X``."""
        predicted = self.pipeline.predict(X)
        return predicted
    

def main():
    """Train the SVM classifier and print a classification report for the test set."""
    text_col = 'Review Comment'
    label_col = 'Expected Operation by Developer'

    # Read the cleaned train / test spreadsheets
    train_df = pd.read_excel('../Model and Dataset/clean_train.xlsx', engine='openpyxl')
    test_df = pd.read_excel('../Model and Dataset/clean_test.xlsx', engine='openpyxl')

    print("Dataset sizes:")
    print(f"Train: {len(train_df)}")
    print(f"Test: {len(test_df)}")

    print("\nClass distribution in training data:")
    print(train_df[label_col].value_counts(normalize=True))

    # fit() returns the classifier itself, so construction and training chain
    classifier = SVMClassifier().fit(train_df[text_col], train_df[label_col])

    # Predict labels for the held-out comments
    y_pred = classifier.predict(test_df[text_col])

    # The report is only possible when the test file carries ground-truth labels
    if label_col in test_df.columns:
        print("\nClassification Report:")
        print(classification_report(test_df[label_col], y_pred))

# Run the SVM experiment defined by main() above when this cell is executed.
if __name__ == "__main__":
    main()

Dataset sizes:
Train: 875
Test: 221

Class distribution in training data:
Expected Operation by Developer
insert     0.336000
delete     0.332571
replace    0.331429
Name: proportion, dtype: float64

Classification Report:
              precision    recall  f1-score   support

      delete       0.69      0.69      0.69        74
      insert       0.66      0.70      0.68        74
     replace       0.69      0.64      0.67        73

    accuracy                           0.68       221
   macro avg       0.68      0.68      0.68       221
weighted avg       0.68      0.68      0.68       221



### Function required when codeBERT is involved

The `set_seed` function is created for reproducibility.

In [4]:
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every generator (default 42).
    """
    # CPU-side generators, seeded in the same order every time
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # GPU generators only exist when CUDA is usable
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Ask cuDNN for deterministic kernels and disable its autotuner
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

### Fine-Tuning CodeBERT

First, we set the seed to 42 so that the result is reproducible every time we run this code.

In the `train` function, we set `num_train_epochs=10` to give the model more time and opportunity to optimise; the downside is a much longer training time. To address this, we implemented `EarlyStoppingCallback(early_stopping_patience=2)`, which stops training when the selected evaluation metric fails to improve for **two consecutive epochs**. The metric is `metric_for_best_model='accuracy'`, chosen to stay consistent with the performance metric used throughout the project. We also set a common `learning_rate=2e-5` and `weight_decay=0.01`, which acts as a penalty on the weights to prevent overfitting.


In [5]:
# Seed every RNG before main() below builds and fine-tunes the model.
set_seed(42)

class CodeBERTClassifier:
    """Sequence classifier fine-tuned from a pretrained checkpoint with ``Trainer``.

    The constructor loads the tokenizer and the sequence-classification model
    named by ``model_name`` and places the model on CUDA when available,
    otherwise on the CPU.
    """

    def __init__(self, model_name='microsoft/codebert-base', max_length=256, num_labels=3):
        self.model_name = model_name
        self.max_length = max_length
        self.num_labels = num_labels
        device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device_name)

        # Tokenizer first, then the classification model moved onto the device
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        pretrained = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=self.num_labels
        )
        self.model = pretrained.to(self.device)

        # Every layer takes part in fine-tuning
        self.unfreeze_model_layers()

        self.is_trained = False

    def unfreeze_model_layers(self):
        """Mark every parameter of ``self.model`` as trainable."""
        for weight in self.model.parameters():
            weight.requires_grad = True

    def _tokenize_function(self, examples):
        """Tokenize ``examples['text']``, padded/truncated to ``self.max_length``."""
        tokenizer_options = {
            'padding': 'max_length',
            'truncation': True,
            'max_length': self.max_length,
        }
        return self.tokenizer(examples['text'], **tokenizer_options)

    def _compute_metrics(self, eval_pred):
        """Return accuracy and macro precision/recall/F1 for a ``(logits, labels)`` pair."""
        logits, labels = eval_pred
        predicted = np.argmax(logits, axis=-1)

        # 'macro' averaging because the classes are balanced
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, predicted, average='macro'
        )
        return {
            'accuracy': accuracy_score(labels, predicted),
            'f1': f1,
            'precision': precision,
            'recall': recall,
        }

    def train(self, train_dataset, val_dataset, output_dir='./results'):
        """Fine-tune the model and return the ``Trainer`` that ran the training.

        Evaluates and checkpoints once per epoch, reloads the checkpoint with
        the best accuracy at the end, and stops early after two evaluations
        without improvement.
        """
        argument_values = dict(
            output_dir=output_dir,
            num_train_epochs=10,             # upper bound; early stopping may end sooner
            per_device_train_batch_size=8,   # lower this if memory runs out
            per_device_eval_batch_size=8,
            learning_rate=2e-5,
            weight_decay=0.01,
            eval_strategy='epoch',
            save_strategy='epoch',
            load_best_model_at_end=True,
            metric_for_best_model='accuracy',
            greater_is_better=True,
            save_total_limit=1,              # cap on the number of checkpoints kept
            logging_dir='./logs',
            logging_steps=10,
            seed=42,
        )
        training_args = TrainingArguments(**argument_values)

        early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=self.tokenizer,
            compute_metrics=self._compute_metrics,
            callbacks=[early_stopping]
        )

        self.trainer.train()
        self.is_trained = True

        return self.trainer

    def predict(self, X):
        """Return the arg-max class index for each text in ``X``.

        Raises:
            RuntimeError: if ``train`` has not completed on this instance.
        """
        if not self.is_trained:
            raise RuntimeError("The model must be fitted before calling predict().")

        # Wrap the raw texts in a tokenized, torch-formatted dataset
        text_frame = pd.DataFrame({'text': X})
        inference_set = Dataset.from_pandas(text_frame).map(self._tokenize_function, batched=True)
        inference_set.set_format(type='torch', columns=['input_ids', 'attention_mask'])

        # Highest logit wins; the result holds numerical class ids
        output = self.trainer.predict(inference_set)
        return np.argmax(output.predictions, axis=-1)

    def evaluate(self, trainer, test_dataset):
        """Predict on ``test_dataset`` with ``trainer``; return ``(y_true, y_pred)``."""
        output = trainer.predict(test_dataset)
        predicted_ids = np.argmax(output.predictions, axis=-1)
        true_ids = test_dataset['label']
        return true_ids, predicted_ids

def main():
    """Fine-tune CodeBERT on the cleaned training data and report test-set metrics.

    Reads the cleaned train/test spreadsheets, holds out a stratified 10% of
    the training rows for validation, fine-tunes a ``CodeBERTClassifier`` and
    prints a classification report for the test comments.
    """
    # Load cleaned data
    train_df = pd.read_excel('../Model and Dataset/clean_train.xlsx', engine= 'openpyxl')
    test_df = pd.read_excel('../Model and Dataset/clean_test.xlsx', engine= 'openpyxl')

    print("Dataset sizes:")
    print(f"Train: {len(train_df)}")
    print(f"Test: {len(test_df)}")

    print("\nClass distribution in training data:")
    print(train_df['Expected Operation by Developer'].value_counts(normalize=True))

    # Encode labels
    label_encoder = LabelEncoder()
    train_df['label'] = label_encoder.fit_transform(train_df['Expected Operation by Developer'])
    # The encoded test labels are not consumed below, but transform() rejects a
    # label that was absent from training, so a bad test file fails here
    # instead of after the expensive fine-tuning run.
    test_df['label'] = label_encoder.transform(test_df['Expected Operation by Developer'])

    # Split train_df into train and validation sets
    train_split_df, val_df = train_test_split(
        train_df,
        test_size=0.1,  # 10% for validation
        stratify=train_df['label'],  # Maintain class balance
        random_state=42
    )

    # Prepare datasets for Hugging Face Trainer
    def prepare_dataset(df):
        return Dataset.from_pandas(
            df[['Review Comment', 'label']].rename(columns={'Review Comment': 'text'})
        )

    train_dataset = prepare_dataset(train_split_df)
    val_dataset = prepare_dataset(val_df)

    # Initialize classifier
    classifier = CodeBERTClassifier(
        model_name='microsoft/codebert-base',
        max_length=256,
        num_labels=len(label_encoder.classes_)
    )

    # Tokenize the training and validation datasets. No test dataset is built
    # here: classifier.predict() tokenizes the raw test comments itself, so a
    # separately tokenized copy would never be used.
    train_dataset = train_dataset.map(classifier._tokenize_function, batched=True)
    val_dataset = val_dataset.map(classifier._tokenize_function, batched=True)

    # Set format for PyTorch
    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    # Train the model (the classifier keeps its own reference to the Trainer)
    classifier.train(train_dataset, val_dataset)

    # Evaluate the model on the test set
    y_pred = classifier.predict(test_df['Review Comment'])
    y_pred_labels = label_encoder.inverse_transform(y_pred)
    y_true_labels = test_df['Expected Operation by Developer'].values

    # Print classification report
    print("\nClassification Report on Test Set:")
    print(classification_report(y_true_labels, y_pred_labels))

# Run the CodeBERT fine-tuning experiment defined by main() above when this cell is executed.
if __name__ == "__main__":
    main()


Dataset sizes:
Train: 875
Test: 221

Class distribution in training data:
Expected Operation by Developer
insert     0.336000
delete     0.332571
replace    0.331429
Name: proportion, dtype: float64


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/787 [00:00<?, ? examples/s]

Map:   0%|          | 0/88 [00:00<?, ? examples/s]

Map:   0%|          | 0/221 [00:00<?, ? examples/s]

  0%|          | 0/990 [00:00<?, ?it/s]

{'loss': 1.1059, 'grad_norm': 6.518070220947266, 'learning_rate': 1.97979797979798e-05, 'epoch': 0.1}
{'loss': 1.1356, 'grad_norm': 4.033086776733398, 'learning_rate': 1.9595959595959596e-05, 'epoch': 0.2}
{'loss': 1.1094, 'grad_norm': 3.565566062927246, 'learning_rate': 1.9393939393939395e-05, 'epoch': 0.3}
{'loss': 1.0827, 'grad_norm': 4.797770023345947, 'learning_rate': 1.9191919191919194e-05, 'epoch': 0.4}
{'loss': 1.1091, 'grad_norm': 6.9524760246276855, 'learning_rate': 1.8989898989898993e-05, 'epoch': 0.51}
{'loss': 1.08, 'grad_norm': 7.297496318817139, 'learning_rate': 1.8787878787878792e-05, 'epoch': 0.61}
{'loss': 1.0722, 'grad_norm': 5.285859107971191, 'learning_rate': 1.8585858585858588e-05, 'epoch': 0.71}
{'loss': 1.0535, 'grad_norm': 4.370762348175049, 'learning_rate': 1.8383838383838387e-05, 'epoch': 0.81}
{'loss': 1.0014, 'grad_norm': 7.5923848152160645, 'learning_rate': 1.8181818181818182e-05, 'epoch': 0.91}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.9953741431236267, 'eval_accuracy': 0.48863636363636365, 'eval_f1': 0.44975738294932355, 'eval_precision': 0.4583742833742834, 'eval_recall': 0.49272030651341, 'eval_runtime': 1.6143, 'eval_samples_per_second': 54.511, 'eval_steps_per_second': 6.814, 'epoch': 1.0}
{'loss': 1.0341, 'grad_norm': 8.023241996765137, 'learning_rate': 1.797979797979798e-05, 'epoch': 1.01}
{'loss': 1.0751, 'grad_norm': 8.4229736328125, 'learning_rate': 1.7777777777777777e-05, 'epoch': 1.11}
{'loss': 0.9435, 'grad_norm': 5.373987197875977, 'learning_rate': 1.7575757575757576e-05, 'epoch': 1.21}
{'loss': 1.0002, 'grad_norm': 6.1695451736450195, 'learning_rate': 1.7373737373737375e-05, 'epoch': 1.31}
{'loss': 0.9943, 'grad_norm': 7.081664562225342, 'learning_rate': 1.7171717171717173e-05, 'epoch': 1.41}
{'loss': 1.0396, 'grad_norm': 9.021428108215332, 'learning_rate': 1.6969696969696972e-05, 'epoch': 1.52}
{'loss': 0.9268, 'grad_norm': 16.00798797607422, 'learning_rate': 1.6767676767676768e-05, 'e

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.8730031847953796, 'eval_accuracy': 0.625, 'eval_f1': 0.62390288893517, 'eval_precision': 0.6234035759897829, 'eval_recall': 0.6264367816091955, 'eval_runtime': 1.686, 'eval_samples_per_second': 52.194, 'eval_steps_per_second': 6.524, 'epoch': 2.0}
{'loss': 0.8484, 'grad_norm': 11.812809944152832, 'learning_rate': 1.595959595959596e-05, 'epoch': 2.02}
{'loss': 0.6419, 'grad_norm': 6.024279594421387, 'learning_rate': 1.575757575757576e-05, 'epoch': 2.12}
{'loss': 0.7775, 'grad_norm': 28.596637725830078, 'learning_rate': 1.555555555555556e-05, 'epoch': 2.22}
{'loss': 0.7409, 'grad_norm': 9.7411470413208, 'learning_rate': 1.5353535353535354e-05, 'epoch': 2.32}
{'loss': 0.6389, 'grad_norm': 21.510696411132812, 'learning_rate': 1.5151515151515153e-05, 'epoch': 2.42}
{'loss': 0.6122, 'grad_norm': 14.595149993896484, 'learning_rate': 1.4949494949494952e-05, 'epoch': 2.53}
{'loss': 0.7364, 'grad_norm': 14.511611938476562, 'learning_rate': 1.4747474747474747e-05, 'epoch': 2.63}
{

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.7922959923744202, 'eval_accuracy': 0.6590909090909091, 'eval_f1': 0.6537999037999038, 'eval_precision': 0.6591174507841174, 'eval_recall': 0.6609195402298851, 'eval_runtime': 1.7025, 'eval_samples_per_second': 51.688, 'eval_steps_per_second': 6.461, 'epoch': 3.0}
{'loss': 0.8113, 'grad_norm': 7.293466567993164, 'learning_rate': 1.3939393939393942e-05, 'epoch': 3.03}
{'loss': 0.465, 'grad_norm': 15.744185447692871, 'learning_rate': 1.3737373737373739e-05, 'epoch': 3.13}
{'loss': 0.4805, 'grad_norm': 11.440260887145996, 'learning_rate': 1.3535353535353538e-05, 'epoch': 3.23}
{'loss': 0.4074, 'grad_norm': 6.900041103363037, 'learning_rate': 1.3333333333333333e-05, 'epoch': 3.33}
{'loss': 0.6194, 'grad_norm': 48.9625129699707, 'learning_rate': 1.3131313131313132e-05, 'epoch': 3.43}
{'loss': 0.6708, 'grad_norm': 6.321732044219971, 'learning_rate': 1.2929292929292931e-05, 'epoch': 3.54}
{'loss': 0.4738, 'grad_norm': 9.196503639221191, 'learning_rate': 1.2727272727272728e-05, 

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.9157833456993103, 'eval_accuracy': 0.625, 'eval_f1': 0.622768684782033, 'eval_precision': 0.6228197090266056, 'eval_recall': 0.6264367816091955, 'eval_runtime': 1.5878, 'eval_samples_per_second': 55.421, 'eval_steps_per_second': 6.928, 'epoch': 4.0}
{'loss': 0.4918, 'grad_norm': 10.699169158935547, 'learning_rate': 1.191919191919192e-05, 'epoch': 4.04}
{'loss': 0.2935, 'grad_norm': 18.206954956054688, 'learning_rate': 1.1717171717171718e-05, 'epoch': 4.14}
{'loss': 0.2681, 'grad_norm': 9.819538116455078, 'learning_rate': 1.1515151515151517e-05, 'epoch': 4.24}
{'loss': 0.4057, 'grad_norm': 21.20111656188965, 'learning_rate': 1.1313131313131314e-05, 'epoch': 4.34}
{'loss': 0.3868, 'grad_norm': 38.27432632446289, 'learning_rate': 1.1111111111111113e-05, 'epoch': 4.44}
{'loss': 0.2188, 'grad_norm': 25.2943172454834, 'learning_rate': 1.0909090909090909e-05, 'epoch': 4.55}
{'loss': 0.3105, 'grad_norm': 5.3908562660217285, 'learning_rate': 1.0707070707070708e-05, 'epoch': 4.65

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 1.1535435914993286, 'eval_accuracy': 0.6590909090909091, 'eval_f1': 0.65625, 'eval_precision': 0.6587437254103921, 'eval_recall': 0.660536398467433, 'eval_runtime': 1.5783, 'eval_samples_per_second': 55.755, 'eval_steps_per_second': 6.969, 'epoch': 5.0}
{'train_runtime': 274.1349, 'train_samples_per_second': 28.708, 'train_steps_per_second': 3.611, 'train_loss': 0.7317375255353523, 'epoch': 5.0}


  nonzero_finite_vals = torch.masked_select(


Map:   0%|          | 0/221 [00:00<?, ? examples/s]

  0%|          | 0/28 [00:00<?, ?it/s]


Classification Report on Test Set:
              precision    recall  f1-score   support

      delete       0.73      0.61      0.66        74
      insert       0.73      0.65      0.69        74
     replace       0.60      0.77      0.67        73

    accuracy                           0.67       221
   macro avg       0.69      0.67      0.67       221
weighted avg       0.69      0.67      0.67       221



### Ensemble Model (Softmax Regression + CodeBERT)

Finally, we explored an ensemble method that combines **Softmax Regression** (Classical ML Model) and **CodeBERT** (Neural Network). We currently put 60% weight on the prediction from Softmax Regression and 40% weight on the prediction from CodeBERT, because the Neural Network is expected to perform less well when the training sample is this small. However, in the future, if we are able to obtain a larger training set (which will possibly improve CodeBERT's performance), we can then leverage the strengths of the two models by adjusting the weight of each model accordingly.

The formula for calculating the final ensemble probability for each class label:

`ensemble_probs = (classical_weight * classical_probs) + (codebert_weight * codebert_probs)`

In [6]:
# CodeBERT part of the ensemble model

# Re-seed every RNG before the classes below are defined and used.
set_seed(42)

class CodeBERTTransformer(BaseEstimator, TransformerMixin):
    """scikit-learn style wrapper around a fine-tuned sequence classifier.

    ``fit`` fine-tunes the checkpoint named by ``model_name`` on the given
    texts and labels; ``transform`` returns softmax class probabilities.
    """

    def __init__(self, model_name='microsoft/codebert-base', max_length=256, num_labels=3):
        self.model_name = model_name
        self.max_length = max_length
        self.num_labels = num_labels
        device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device_name)
        # Populated by fit()
        self.tokenizer = None
        self.model = None
        self.trainer = None
        self.is_trained = False

    def fit(self, X, y=None):
        """Fine-tune the model on texts ``X`` with labels ``y``; returns ``self``.

        A stratified 10% of the rows is held out as the validation set used for
        per-epoch evaluation, best-checkpoint selection and early stopping.
        """
        labelled = pd.DataFrame({'text': X, 'label': y})
        # Stratified hold-out via scikit-learn
        train_part, val_part = train_test_split(
            labelled,
            test_size=0.1,
            stratify=labelled['label'],
            random_state=42
        )
        self.train_dataset = Dataset.from_pandas(train_part)
        self.val_dataset = Dataset.from_pandas(val_part)

        # Tokenizer and model are created here, not in __init__
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        pretrained = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=self.num_labels
        )
        self.model = pretrained.to(self.device)

        # Tokenize both splits
        self.train_dataset = self.train_dataset.map(self._tokenize_function, batched=True)
        self.val_dataset = self.val_dataset.map(self._tokenize_function, batched=True)

        # Expose the model inputs and the label as torch tensors
        for split in (self.train_dataset, self.val_dataset):
            split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

        argument_values = dict(
            output_dir='./codebert_results',
            num_train_epochs=10,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            learning_rate=2e-5,
            weight_decay=0.01,
            eval_strategy='epoch',
            save_strategy='epoch',
            load_best_model_at_end=True,
            metric_for_best_model='accuracy',
            greater_is_better=True,
            save_total_limit=1,
            logging_dir='./codebert_logs',
            logging_steps=10,
            seed=42,
        )
        training_args = TrainingArguments(**argument_values)

        early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.val_dataset,
            tokenizer=self.tokenizer,
            compute_metrics=self._compute_metrics,
            callbacks=[early_stopping]
        )

        self.trainer.train()
        self.is_trained = True
        return self

    def transform(self, X):
        """Return softmax class probabilities for each text in ``X``.

        Raises:
            RuntimeError: if ``fit`` has not completed on this instance.
        """
        if not self.is_trained:
            raise RuntimeError("The model must be fitted before calling transform().")

        # Wrap the raw texts in a tokenized, torch-formatted dataset
        text_frame = pd.DataFrame({'text': X})
        inference_set = Dataset.from_pandas(text_frame).map(self._tokenize_function, batched=True)
        inference_set.set_format(type='torch', columns=['input_ids', 'attention_mask'])

        # Logits -> probabilities along the class axis
        output = self.trainer.predict(inference_set)
        return softmax(output.predictions, axis=1)

    def _tokenize_function(self, examples):
        """Tokenize ``examples['text']``, padded/truncated to ``self.max_length``."""
        tokenizer_options = {
            'padding': 'max_length',
            'truncation': True,
            'max_length': self.max_length,
        }
        return self.tokenizer(examples['text'], **tokenizer_options)

    def _compute_metrics(self, eval_pred):
        """Return accuracy and macro precision/recall/F1 for a ``(logits, labels)`` pair."""
        logits, labels = eval_pred
        predicted = np.argmax(logits, axis=-1)
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, predicted, average='macro'
        )
        return {
            'accuracy': accuracy_score(labels, predicted),
            'f1': f1,
            'precision': precision,
            'recall': recall,
        }

# ensemble them
class EnsembleClassifier(BaseEstimator):
    """Weighted soft-voting ensemble of a classical pipeline and a CodeBERT model.

    The final probability for each class is
    ``w_classical * classical_probs + w_codebert * codebert_probs``, where the
    two weights are the constructor weights divided by their sum.

    Args:
        classical_pipeline: estimator exposing ``fit`` and ``predict_proba``.
        codebert_transformer: estimator exposing ``fit`` and a ``transform``
            that returns class probabilities.
        label_encoder: object whose ``inverse_transform`` maps predicted class
            indices back to labels.
        classical_weight: relative weight of the classical probabilities.
        codebert_weight: relative weight of the CodeBERT probabilities.
    """

    def __init__(self, classical_pipeline, codebert_transformer, label_encoder, classical_weight=0.6, codebert_weight=0.4):
        self.classical_pipeline = classical_pipeline
        self.codebert_transformer = codebert_transformer
        self.classical_weight = classical_weight
        self.codebert_weight = codebert_weight
        self.label_encoder = label_encoder

    def fit(self, X, y):
        """Fit both member models on ``X`` and ``y``; returns ``self``.

        ``y`` is passed to both models unchanged; no encoding happens here.
        """
        # NOTE(review): predict() feeds arg-max column indices to
        # label_encoder.inverse_transform, so this presumably expects `y` to be
        # already encoded by that same encoder; confirm against the caller.
        self.classical_pipeline.fit(X, y)
        self.codebert_transformer.fit(X, y)
        return self

    def _normalized_weights(self):
        """Return ``(classical, codebert)`` weights rescaled to sum to 1."""
        total_weight = self.classical_weight + self.codebert_weight
        return self.classical_weight / total_weight, self.codebert_weight / total_weight

    def predict_proba(self, X):
        """Return the weighted average of both models' class probabilities for ``X``."""
        # Get probabilities from classical pipeline
        classical_probs = self.classical_pipeline.predict_proba(X)

        # Get probabilities from CodeBERT transformer
        codebert_probs = self.codebert_transformer.transform(X)

        # Ensemble probabilities
        classical_weight, codebert_weight = self._normalized_weights()
        return (classical_weight * classical_probs) + (codebert_weight * codebert_probs)

    def predict(self, X):
        """Return the label with the highest ensemble probability for each item in ``X``."""
        # Single source of truth for the blending logic
        ensemble_probs = self.predict_proba(X)

        # Get final predictions
        y_pred = np.argmax(ensemble_probs, axis=1)
        return self.label_encoder.inverse_transform(y_pred)


def main():
    """Train the weighted ensemble on the cleaned train split and report test metrics.

    Steps: read the cleaned train/test spreadsheets, integer-encode the
    operation labels, build the unigram+bigram logistic-regression pipeline
    and the CodeBERT transformer, fit both through ``EnsembleClassifier``,
    then print a classification report for the test split.
    """
    target_col = 'Expected Operation by Developer'
    text_col = 'Review Comment'

    # Read both cleaned splits.
    train_df = pd.read_excel('../Model and Dataset/clean_train.xlsx', engine='openpyxl')
    test_df = pd.read_excel('../Model and Dataset/clean_test.xlsx', engine='openpyxl')

    # Fit the encoder on the labels of both splits together, then encode each
    # split with that single fitted encoder.
    label_encoder = LabelEncoder()
    label_encoder.fit(pd.concat([train_df, test_df], ignore_index=True)[target_col])
    train_df['label'] = label_encoder.transform(train_df[target_col])
    test_df['label'] = label_encoder.transform(test_df[target_col])

    X_train, y_train = train_df[text_col], train_df['label']
    X_test, y_test = test_df[text_col], test_df['label']

    # Classical branch: n-gram counts -> chi-squared feature selection ->
    # multinomial logistic regression.
    classical_pipeline = Pipeline([
        ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
        ('selector', SelectKBest(chi2, k=1010)),
        ('classifier', LogisticRegression(
            multi_class='multinomial',
            solver='lbfgs',
            max_iter=300000,
            C=1,
            class_weight='balanced',
            random_state=42
        )),
    ])

    # Neural branch: one output per class known to the encoder.
    codebert_transformer = CodeBERTTransformer(num_labels=len(label_encoder.classes_))

    # Blend the two branches 60/40 in favour of the classical pipeline.
    ensemble_classifier = EnsembleClassifier(
        classical_pipeline=classical_pipeline,
        codebert_transformer=codebert_transformer,
        label_encoder=label_encoder,
        classical_weight=0.6,
        codebert_weight=0.4
    )
    ensemble_classifier.fit(X_train, y_train)

    # The ensemble predicts decoded labels, so decode the ground truth as well
    # before comparing.
    y_pred = ensemble_classifier.predict(X_test)
    y_true_labels = label_encoder.inverse_transform(y_test)

    print("\nClassification Report for Ensemble Model:")
    print(classification_report(y_true_labels, y_pred))

# Entry point: run the experiment only when this code is executed as the main
# module, not when it is imported.
if __name__ == "__main__":
    main()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/787 [00:00<?, ? examples/s]

Map:   0%|          | 0/88 [00:00<?, ? examples/s]

  0%|          | 0/990 [00:00<?, ?it/s]

{'loss': 1.1059, 'grad_norm': 6.518069744110107, 'learning_rate': 1.97979797979798e-05, 'epoch': 0.1}
{'loss': 1.1356, 'grad_norm': 4.033087253570557, 'learning_rate': 1.9595959595959596e-05, 'epoch': 0.2}
{'loss': 1.1094, 'grad_norm': 3.5655674934387207, 'learning_rate': 1.9393939393939395e-05, 'epoch': 0.3}
{'loss': 1.0827, 'grad_norm': 4.797769069671631, 'learning_rate': 1.9191919191919194e-05, 'epoch': 0.4}
{'loss': 1.1091, 'grad_norm': 6.952461242675781, 'learning_rate': 1.8989898989898993e-05, 'epoch': 0.51}
{'loss': 1.08, 'grad_norm': 7.297637939453125, 'learning_rate': 1.8787878787878792e-05, 'epoch': 0.61}
{'loss': 1.0722, 'grad_norm': 5.286148548126221, 'learning_rate': 1.8585858585858588e-05, 'epoch': 0.71}
{'loss': 1.0535, 'grad_norm': 4.370704650878906, 'learning_rate': 1.8383838383838387e-05, 'epoch': 0.81}
{'loss': 1.0014, 'grad_norm': 7.5914225578308105, 'learning_rate': 1.8181818181818182e-05, 'epoch': 0.91}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.9953708052635193, 'eval_accuracy': 0.48863636363636365, 'eval_f1': 0.44975738294932355, 'eval_precision': 0.4583742833742834, 'eval_recall': 0.49272030651341, 'eval_runtime': 1.6171, 'eval_samples_per_second': 54.419, 'eval_steps_per_second': 6.802, 'epoch': 1.0}
{'loss': 1.0341, 'grad_norm': 8.022260665893555, 'learning_rate': 1.797979797979798e-05, 'epoch': 1.01}
{'loss': 1.0752, 'grad_norm': 8.426362037658691, 'learning_rate': 1.7777777777777777e-05, 'epoch': 1.11}
{'loss': 0.9436, 'grad_norm': 5.368290424346924, 'learning_rate': 1.7575757575757576e-05, 'epoch': 1.21}
{'loss': 1.0001, 'grad_norm': 6.170549392700195, 'learning_rate': 1.7373737373737375e-05, 'epoch': 1.31}
{'loss': 0.9944, 'grad_norm': 7.071694374084473, 'learning_rate': 1.7171717171717173e-05, 'epoch': 1.41}
{'loss': 1.0395, 'grad_norm': 9.011048316955566, 'learning_rate': 1.6969696969696972e-05, 'epoch': 1.52}
{'loss': 0.9269, 'grad_norm': 16.006092071533203, 'learning_rate': 1.6767676767676768e-05, 

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.8732370734214783, 'eval_accuracy': 0.625, 'eval_f1': 0.62390288893517, 'eval_precision': 0.6234035759897829, 'eval_recall': 0.6264367816091955, 'eval_runtime': 1.6018, 'eval_samples_per_second': 54.938, 'eval_steps_per_second': 6.867, 'epoch': 2.0}
{'loss': 0.8486, 'grad_norm': 11.732205390930176, 'learning_rate': 1.595959595959596e-05, 'epoch': 2.02}
{'loss': 0.6422, 'grad_norm': 6.064476490020752, 'learning_rate': 1.575757575757576e-05, 'epoch': 2.12}
{'loss': 0.7765, 'grad_norm': 28.30057144165039, 'learning_rate': 1.555555555555556e-05, 'epoch': 2.22}
{'loss': 0.7414, 'grad_norm': 9.640498161315918, 'learning_rate': 1.5353535353535354e-05, 'epoch': 2.32}
{'loss': 0.6388, 'grad_norm': 21.75180435180664, 'learning_rate': 1.5151515151515153e-05, 'epoch': 2.42}
{'loss': 0.6134, 'grad_norm': 14.440499305725098, 'learning_rate': 1.4949494949494952e-05, 'epoch': 2.53}
{'loss': 0.736, 'grad_norm': 14.513383865356445, 'learning_rate': 1.4747474747474747e-05, 'epoch': 2.63}
{

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.7925806641578674, 'eval_accuracy': 0.6590909090909091, 'eval_f1': 0.6537999037999038, 'eval_precision': 0.6591174507841174, 'eval_recall': 0.6609195402298851, 'eval_runtime': 1.6062, 'eval_samples_per_second': 54.787, 'eval_steps_per_second': 6.848, 'epoch': 3.0}
{'loss': 0.8123, 'grad_norm': 7.298730850219727, 'learning_rate': 1.3939393939393942e-05, 'epoch': 3.03}
{'loss': 0.465, 'grad_norm': 15.6372652053833, 'learning_rate': 1.3737373737373739e-05, 'epoch': 3.13}
{'loss': 0.479, 'grad_norm': 11.311602592468262, 'learning_rate': 1.3535353535353538e-05, 'epoch': 3.23}
{'loss': 0.4074, 'grad_norm': 6.858272552490234, 'learning_rate': 1.3333333333333333e-05, 'epoch': 3.33}
{'loss': 0.6172, 'grad_norm': 47.89690399169922, 'learning_rate': 1.3131313131313132e-05, 'epoch': 3.43}
{'loss': 0.6718, 'grad_norm': 6.3104376792907715, 'learning_rate': 1.2929292929292931e-05, 'epoch': 3.54}
{'loss': 0.4742, 'grad_norm': 9.289361000061035, 'learning_rate': 1.2727272727272728e-05, '

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.9170381426811218, 'eval_accuracy': 0.625, 'eval_f1': 0.622768684782033, 'eval_precision': 0.6228197090266056, 'eval_recall': 0.6264367816091955, 'eval_runtime': 1.615, 'eval_samples_per_second': 54.488, 'eval_steps_per_second': 6.811, 'epoch': 4.0}
{'loss': 0.4931, 'grad_norm': 10.604461669921875, 'learning_rate': 1.191919191919192e-05, 'epoch': 4.04}
{'loss': 0.2943, 'grad_norm': 19.22991943359375, 'learning_rate': 1.1717171717171718e-05, 'epoch': 4.14}
{'loss': 0.2684, 'grad_norm': 9.718889236450195, 'learning_rate': 1.1515151515151517e-05, 'epoch': 4.24}
{'loss': 0.4029, 'grad_norm': 21.195993423461914, 'learning_rate': 1.1313131313131314e-05, 'epoch': 4.34}
{'loss': 0.3905, 'grad_norm': 41.43944549560547, 'learning_rate': 1.1111111111111113e-05, 'epoch': 4.44}
{'loss': 0.2185, 'grad_norm': 25.20840835571289, 'learning_rate': 1.0909090909090909e-05, 'epoch': 4.55}
{'loss': 0.3105, 'grad_norm': 5.518843173980713, 'learning_rate': 1.0707070707070708e-05, 'epoch': 4.65}

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 1.1502140760421753, 'eval_accuracy': 0.6590909090909091, 'eval_f1': 0.65625, 'eval_precision': 0.6587437254103921, 'eval_recall': 0.660536398467433, 'eval_runtime': 1.6255, 'eval_samples_per_second': 54.138, 'eval_steps_per_second': 6.767, 'epoch': 5.0}
{'train_runtime': 271.7224, 'train_samples_per_second': 28.963, 'train_steps_per_second': 3.643, 'train_loss': 0.7315815477660208, 'epoch': 5.0}


Map:   0%|          | 0/221 [00:00<?, ? examples/s]

  0%|          | 0/28 [00:00<?, ?it/s]


Classification Report for Ensemble Model:
              precision    recall  f1-score   support

      delete       0.76      0.77      0.77        74
      insert       0.77      0.77      0.77        74
     replace       0.72      0.71      0.72        73

    accuracy                           0.75       221
   macro avg       0.75      0.75      0.75       221
weighted avg       0.75      0.75      0.75       221

