In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from datasets import Dataset, DatasetDict, load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset (fetched through the Hugging Face `datasets` loader).
ds = load_dataset('NLBSE/nlbse25-code-comment-classification')

# Extracting train and test splits for each language
java_train = ds['java_train']
java_test = ds['java_test']

python_train = ds['python_train']
python_test = ds['python_test']

pharo_train = ds['pharo_train']
pharo_test = ds['pharo_test']

# Extracting features (X) and labels (y) for each split.
# Every column is materialised with list(): the `+` concatenation further down
# needs real Python lists, and column access is not guaranteed to return one
# (NOTE(review): newer `datasets` releases appear to return a lazy column
# object here -- confirm against the installed version). On versions that
# already return a list this is just a shallow copy.
java_X_train = list(java_train['comment_sentence'])
java_y_train = list(java_train['labels'])

java_X_test = list(java_test['comment_sentence'])
java_y_test = list(java_test['labels'])

python_X_train = list(python_train['comment_sentence'])
python_y_train = list(python_train['labels'])

python_X_test = list(python_test['comment_sentence'])
python_y_test = list(python_test['labels'])

pharo_X_train = list(pharo_train['comment_sentence'])
pharo_y_train = list(pharo_train['labels'])

pharo_X_test = list(pharo_test['comment_sentence'])
pharo_y_test = list(pharo_test['labels'])

# Preprocess data using TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit on all training data across datasets. Only the *train* splits are used
# for fitting, so the vocabulary and IDF weights never see test sentences.
all_train_data = java_X_train + python_X_train + pharo_X_train
vectorizer.fit(all_train_data)

# Transform datasets. The sparse TF-IDF matrices are converted to dense arrays
# because the training code indexes rows and reads `.shape[1]` on them; note
# that each dense array has one column per vocabulary term.
java_X_train_tfidf = vectorizer.transform(java_X_train)
java_X_test_tfidf = vectorizer.transform(java_X_test)
java_X_train_dense = java_X_train_tfidf.toarray()
java_X_test_dense = java_X_test_tfidf.toarray()


python_X_train_tfidf = vectorizer.transform(python_X_train)
python_X_test_tfidf = vectorizer.transform(python_X_test)
python_X_train_dense = python_X_train_tfidf.toarray()
python_X_test_dense = python_X_test_tfidf.toarray()


pharo_X_train_tfidf = vectorizer.transform(pharo_X_train)
pharo_X_test_tfidf = vectorizer.transform(pharo_X_test)
pharo_X_train_dense = pharo_X_train_tfidf.toarray()
pharo_X_test_dense = pharo_X_test_tfidf.toarray()


# Define categories. The position of a name in each list is used later as the
# index into every sample's label vector (presumably matching the dataset's
# label order -- verify against the dataset card).
categories_java = ['summary', 'ownership', 'expand', 'usage', 'pointer', 'deprecation', 'rational']
categories_python = ['usage', 'parameters', 'developmentnotes', 'expand', 'summary']
categories_pharo = ['keyimplementationpoints', 'example', 'responsibilities', 'classreference', 'intent', 'keymessages', 'collaborators']

In [None]:
# Sanity check: dump element 6 of every Java test label vector (the slot that
# lines up with 'rational' in categories_java) so the raw values can be
# compared with what the training code sees later on.
for label_vector in java_y_test:
    seventh_flag = label_vector[6]
    print(seventh_flag)

0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
1
1
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [None]:
# Custom Dataset class for binary classification
class BinaryClassificationDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset exposing ONE category of a multi-label set.

    The base class is spelled out as ``torch.utils.data.Dataset`` on purpose:
    this file imports ``Dataset`` from ``torch.utils.data`` and then again from
    ``datasets``, so the bare name ``Dataset`` refers to the Hugging Face class,
    not the PyTorch one.

    Args:
        X: Indexable collection of feature rows (anything ``torch.tensor`` can
            convert, e.g. a 2-D array or a list of lists of floats).
        y: Iterable of labels, one per row of ``X``. A list/tuple label is
            treated as a label vector and reduced to ``label[category_index]``;
            any other value is kept unchanged.
        category_index: Position inside each label vector to use as the binary
            target.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of items.
    """

    def __init__(self, X, y, category_index):
        # Preprocess y to extract the correct category
        self.X = X
        self.y = [
            label[category_index] if isinstance(label, (list, tuple)) else label
            for label in y
        ]
        # Fail early instead of hitting an IndexError in the middle of an epoch.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same length, got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``{'features': float tensor, 'labels': long tensor}``.

        ``idx`` is normally a single integer (what a DataLoader passes to a
        map-style dataset). A list of integers is still accepted for backward
        compatibility and yields stacked features and labels.
        """
        if isinstance(idx, list):  # batched lookup
            features = [self.X[i] for i in idx]
            labels = [self.y[i] for i in idx]
        else:  # single sample
            features = self.X[idx]
            labels = self.y[idx]
        return {
            'features': torch.tensor(features, dtype=torch.float),
            'labels': torch.tensor(labels, dtype=torch.long),
        }


# Function to train a model for a given category
def _train_one_epoch(model, loader, optimizer, criterion, device):
    """Run one optimisation pass over ``loader`` and return the mean batch loss."""
    model.train()
    total_loss = 0.0
    for batch in loader:
        features = batch['features'].to(device)
        labels = batch['labels'].to(device).long()  # CrossEntropyLoss needs int64 targets
        optimizer.zero_grad()
        loss = criterion(model(features), labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / max(len(loader), 1)


def _evaluate_binary(model, loader, criterion, device):
    """Evaluate ``model`` on every batch of ``loader``.

    Returns a dict with the mean batch ``loss`` plus ``accuracy``,
    ``precision``, ``recall`` and ``f1`` for the positive class (label 1),
    all computed over the whole loader rather than a single batch.
    """
    model.eval()
    total_loss = 0.0
    correct = seen = 0
    tp = fp = fn = 0
    with torch.no_grad():
        for batch in loader:
            features = batch['features'].to(device)
            labels = batch['labels'].to(device).long()
            outputs = model(features)
            total_loss += criterion(outputs, labels).item()
            predictions = outputs.argmax(dim=1)
            correct += (predictions == labels).sum().item()
            seen += labels.numel()
            # Confusion counts are accumulated across batches; a true positive
            # is "predicted 1 AND actually 1" (not merely "prediction correct").
            tp += ((predictions == 1) & (labels == 1)).sum().item()
            fp += ((predictions == 1) & (labels == 0)).sum().item()
            fn += ((predictions == 0) & (labels == 1)).sum().item()

    precision = tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    return {
        'loss': total_loss / max(len(loader), 1),
        'accuracy': correct / seen if seen else 0.0,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }


def train_model(X_train, y_train, X_test, y_test, category_index, category_name, device,
                epochs=5, batch_size=32, hidden_dim=128, lr=0.001):
    """Train and evaluate a one-hidden-layer binary classifier for one category.

    Args:
        X_train, X_test: 2-D feature arrays; ``X_train.shape[1]`` sets the
            network's input width.
        y_train, y_test: Per-sample labels (label vectors or scalars), reduced
            to a single binary target by ``BinaryClassificationDataset``.
        category_index: Index of the category inside each label vector.
        category_name: Human-readable category name, used only for logging.
        device: ``torch.device`` on which the model and the batches are placed.
        epochs: Number of passes over the training data.
        batch_size: Mini-batch size for both loaders.
        hidden_dim: Width of the hidden layer.
        lr: Adam learning rate.

    The defaults reproduce the values that were previously hard-coded here.
    NOTE(review): those values were chosen while the network still applied a
    sigmoid before the loss; they may deserve re-tuning.

    Returns:
        ``(model, history)`` -- the trained module and a list with one metrics
        dict per epoch (keys: ``epoch``, ``train_loss``, ``loss``,
        ``accuracy``, ``precision``, ``recall``, ``f1``).
    """
    train_dataset = BinaryClassificationDataset(X_train, y_train, category_index)
    test_dataset = BinaryClassificationDataset(X_test, y_test, category_index)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    input_dim = X_train.shape[1]  # The datasets for each language are different sizes
    output_dim = 2  # Binary classification: one logit per class

    # The network returns raw logits: nn.CrossEntropyLoss applies log-softmax
    # itself, so squashing the outputs with a sigmoid first would confine them
    # to [0, 1] and cap how confident the model can ever become.
    model = nn.Sequential(
        nn.Linear(input_dim, hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, output_dim),
    ).to(device)  # parameters must live on the same device as the batches

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    print(f"Training for category: {category_name}")
    history = []
    for epoch in range(epochs):
        train_loss = _train_one_epoch(model, train_loader, optimizer, criterion, device)
        metrics = _evaluate_binary(model, test_loader, criterion, device)
        test_loss = metrics['loss']
        test_accuracy = metrics['accuracy']
        f1score = metrics['f1']

        print(f"Precision: {metrics['precision']}")
        print(f"Recall: {metrics['recall']}")
        print(f'F1 Score: {f1score}')
        print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}, F1: {f1score:.4f} \n \n')

        history.append({'epoch': epoch + 1, 'train_loss': train_loss, **metrics})

    return model, history

In [None]:
# Device setup: no NVIDIA GPU was available, so everything runs on the CPU
device = torch.device("cpu")

# Train Java -- one independent binary classifier per Java category
for category_index, category_name in enumerate(categories_java):
    train_model(
        X_train=java_X_train_dense,
        y_train=java_y_train,
        X_test=java_X_test_dense,
        y_test=java_y_test,
        category_index=category_index,
        category_name=category_name,
        device=device,
    )

Training for category: summary
load <torch.utils.data.dataloader.DataLoader object at 0x0000020642EA6600>
we got labels: tensor([0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 0, 1, 1, 1])
we got labels: tensor([0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1,
        1, 1, 1, 1, 1, 0, 1, 1])
we got labels: tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        1, 1, 0, 0, 0, 1, 0, 1])
we got labels: tensor([1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
        1, 1, 0, 0, 0, 0, 1, 0])
we got labels: tensor([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 1, 0, 0, 0, 0, 0, 0])
we got labels: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])
we got labels: tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]

  y_true = torch.tensor(labels)
  y_pred = torch.tensor(predictions)


we got labels: tensor([0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 0, 1, 1, 1])
we got labels: tensor([0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1,
        1, 1, 1, 1, 1, 0, 1, 1])
we got labels: tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        1, 1, 0, 0, 0, 1, 0, 1])
we got labels: tensor([1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
        1, 1, 0, 0, 0, 0, 1, 0])
we got labels: tensor([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 1, 0, 0, 0, 0, 0, 0])
we got labels: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])
we got labels: tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])
we got labels: tensor([0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
        0

In [None]:
# Train Python -- one independent binary classifier per Python category
for category_index, category_name in enumerate(categories_python):
    train_model(
        X_train=python_X_train_dense,
        y_train=python_y_train,
        X_test=python_X_test_dense,
        y_test=python_y_test,
        category_index=category_index,
        category_name=category_name,
        device=device,
    )

In [None]:
# Train Pharo -- one independent binary classifier per Pharo category
for category_index, category_name in enumerate(categories_pharo):
    train_model(
        X_train=pharo_X_train_dense,
        y_train=pharo_y_train,
        X_test=pharo_X_test_dense,
        y_test=pharo_y_test,
        category_index=category_index,
        category_name=category_name,
        device=device,
    )