In [1]:
import json
import pickle
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from datasets import Dataset
from datasets import load_dataset
import os
import math

In [2]:
output_dir = "/mimer/NOBACKUP/groups/naiss2025-5-243/diff_embeddings2"
all_diffs = []

for i in range(22):
    filename = f"diff_embeddings_chunk_{i:04d}.pkl"
    filepath = os.path.join(output_dir, filename)

    with open(filepath, "rb") as f:
        chunk = pickle.load(f)
        all_diffs.extend(chunk)

In [3]:
dataset = load_dataset(
    "NicholasOgenstad/my-runbugrun-dataset",
    data_files="runbugrun_all_pairs_with_language.json",
    split="train"
)
dataset = dataset.filter(lambda example: example["language"] != "tests")

buggy = dataset['buggy_code']
fixed = dataset['fixed_code']

bug_label = dataset['labels']
language = dataset['language']
bug_label = bug_label[:len(all_diffs)]
language = language[:len(all_diffs)]

change_count = []
for i in bug_label:
    if i == None:
        change_count.append(0)
    else: 
        change_count.append(len(i))

In [4]:
new_diffs = []
new_change_count = []
new_bug_label = []

for i in range(len(all_diffs)):
    if change_count[i] > 15 or change_count[i] == 0:
        continue
    else:
        new_diffs.append(all_diffs[i])
        new_change_count.append(change_count[i])
        new_bug_label.append(bug_label[i])

cpp_diffs = []
cpp_change = []
cpp_bug_label = []

for i in range(len(new_diffs)):
    if language[i] == 'cpp':
        cpp_diffs.append(new_diffs[i])
        cpp_change.append(new_change_count[i])
        cpp_bug_label.append(new_bug_label[i])

In [5]:
flat = [item for sublist in cpp_bug_label for item in sublist]
unique_strings = sorted(set(flat))
string_to_int = {s: i+1 for i, s in enumerate(unique_strings)}
mapped_data = [[string_to_int[s] for s in sublist] for sublist in cpp_bug_label]

In [6]:
from collections import Counter

flat_numbers = [n for sublist in mapped_data for n in sublist]
distribution = Counter(flat_numbers)
print(distribution)

Counter({31: 54125, 14: 54036, 86: 42981, 57: 38745, 82: 34255, 103: 31218, 74: 30350, 95: 28469, 10: 25696, 42: 25621, 58: 19395, 102: 13132, 129: 13119, 105: 12281, 55: 12021, 12: 11789, 2: 10567, 56: 9970, 118: 9116, 127: 8911, 43: 8607, 68: 8295, 97: 7411, 22: 7238, 20: 6254, 125: 5964, 128: 5782, 13: 5621, 49: 5545, 83: 5513, 101: 5513, 41: 5486, 84: 4109, 100: 4109, 11: 4095, 1: 3737, 104: 3277, 120: 3229, 87: 2345, 79: 2214, 107: 2028, 61: 1538, 29: 1495, 32: 1495, 35: 1381, 109: 1372, 53: 1282, 28: 1139, 33: 1139, 59: 1130, 9: 1130, 75: 1069, 122: 920, 112: 873, 93: 870, 119: 798, 126: 794, 47: 794, 30: 772, 52: 669, 66: 637, 27: 621, 73: 574, 36: 559, 60: 551, 8: 551, 26: 511, 63: 487, 7: 432, 121: 408, 99: 398, 37: 327, 91: 285, 65: 283, 34: 244, 46: 244, 76: 222, 67: 213, 45: 212, 88: 194, 96: 189, 3: 176, 70: 176, 48: 147, 62: 144, 78: 141, 77: 112, 54: 110, 21: 105, 38: 100, 106: 98, 39: 84, 116: 83, 85: 79, 115: 62, 50: 62, 16: 55, 15: 40, 44: 38, 80: 33, 64: 31, 4: 29, 6

In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import f1_score

cpp_diffs = np.array(cpp_diffs)

all_labels = sorted(set(l for sample in mapped_data for l in sample))
label_to_idx = {label: idx for idx, label in enumerate(all_labels)}
idx_to_label = {idx: label for label, idx in label_to_idx.items()}
n_labels = len(label_to_idx)


n_samples = len(mapped_data)
y_multi_hot = np.zeros((n_samples, n_labels), dtype=np.float32)
for i, labels in enumerate(mapped_data):
    for label in labels:
        y_multi_hot[i, label_to_idx[label]] = 1.0


X_temp, X_test, y_temp, y_test = train_test_split(cpp_diffs, y_multi_hot, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

class MultiLabelDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X).float()
        self.y = torch.from_numpy(y).float()
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class MultiLabelNN(nn.Module):
    def __init__(self, input_dim, hidden1, hidden2, dropout, output_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden1),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden1, hidden2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden2, output_dim),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

param_grid = {
    'lr': [1e-3, 5e-4],
    'dropout': [0.3, 0.5],
    'hidden1': [4096, 2048, 1024, 512, 256],
    'hidden2': [4096, 2048, 1024, 512, 256],
    'batch_size': [128, 256],
}
grid = list(ParameterGrid(param_grid))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_dim = X_train.shape[1]
n_epochs = 10

best_f1 = 0.0
best_params = None

for config in grid:
    print(f"\nTraining with config: {config}")

    train_ds = MultiLabelDataset(X_train, y_train)
    val_ds = MultiLabelDataset(X_val, y_val)

    train_dl = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True, num_workers=2)
    val_dl = DataLoader(val_ds, batch_size=config['batch_size'], num_workers=2)

    model = MultiLabelNN(input_dim=input_dim,
                         hidden1=config['hidden1'],
                         hidden2=config['hidden2'],
                         dropout=config['dropout'],
                         output_dim=n_labels).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])
    criterion = nn.BCELoss()

    for epoch in range(n_epochs):
        model.train()
        for xb, yb in train_dl:
            xb = xb.to(device)
            yb = yb.to(device)
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()

    model.eval()
    all_val_preds = []
    all_val_targets = []
    with torch.no_grad():
        for xb, yb in val_dl:
            xb = xb.to(device)
            yb = yb.to(device)
            preds = model(xb)
            all_val_preds.append(preds.cpu())
            all_val_targets.append(yb.cpu())

    val_preds = torch.cat(all_val_preds).numpy()
    val_targets = torch.cat(all_val_targets).numpy()
    val_preds_bin = (val_preds >= 0.5).astype(int)
    val_f1 = f1_score(val_targets, val_preds_bin, average='micro', zero_division=0)

    print(f"Val F1: {val_f1:.4f}")

    if val_f1 > best_f1:
        best_f1 = val_f1
        best_params = config

print("\nBest Config:")
print(best_params)
print(f"Best Validation F1: {best_f1:.4f}")



Training with config: {'batch_size': 128, 'dropout': 0.3, 'hidden1': 4096, 'hidden2': 4096, 'lr': 0.001}


Exception ignored in: <function _releaseLock at 0x14797a63bba0>
Traceback (most recent call last):
  File "/apps/Arch/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/logging/__init__.py", line 237, in _releaseLock
    def _releaseLock():
    
KeyboardInterrupt: 


Val F1: 0.7591

Training with config: {'batch_size': 128, 'dropout': 0.3, 'hidden1': 4096, 'hidden2': 4096, 'lr': 0.0005}
Val F1: 0.7528

Training with config: {'batch_size': 128, 'dropout': 0.3, 'hidden1': 4096, 'hidden2': 2048, 'lr': 0.001}
Val F1: 0.7520

Training with config: {'batch_size': 128, 'dropout': 0.3, 'hidden1': 4096, 'hidden2': 2048, 'lr': 0.0005}


In [8]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


cpp_diffs = np.array(cpp_diffs)

all_labels = sorted(set(l for sample in mapped_data for l in sample))
label_to_idx = {label: idx for idx, label in enumerate(all_labels)}
idx_to_label = {idx: label for label, idx in label_to_idx.items()}
n_labels = len(label_to_idx)


n_samples = len(mapped_data)
y_multi_hot = np.zeros((n_samples, n_labels), dtype=np.float32)
for i, labels in enumerate(mapped_data):
    for label in labels:
        y_multi_hot[i, label_to_idx[label]] = 1.0


X_temp, X_test, y_temp, y_test = train_test_split(
    cpp_diffs, y_multi_hot, test_size=0.2, random_state=42
)


X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.2, random_state=42
)

class MultiLabelDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X).float()
        self.y = torch.from_numpy(y).float()
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

batch_size = 256
train_ds = MultiLabelDataset(X_train, y_train)
val_ds = MultiLabelDataset(X_val, y_val)
test_ds = MultiLabelDataset(X_test, y_test)

train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=4)
val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=4)
test_dl = DataLoader(test_ds, batch_size=batch_size, num_workers=4)


class MultiLabelNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 4096),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(4096, output_dim),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultiLabelNN(1024, n_labels).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

n_epochs = 15
for epoch in range(1, n_epochs+1):
    model.train()
    train_loss = 0.0
    all_train_preds = []
    all_train_targets = []

    for xb, yb in train_dl:
        xb = xb.to(device)
        yb = yb.to(device)

        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * xb.size(0)
        all_train_preds.append(preds.detach().cpu())
        all_train_targets.append(yb.cpu())

    train_loss /= len(train_ds)
    train_preds = torch.cat(all_train_preds).numpy()
    train_targets = torch.cat(all_train_targets).numpy()

    model.eval()
    val_loss = 0.0
    all_val_preds = []
    all_val_targets = []

    with torch.no_grad():
        for xb, yb in val_dl:
            xb = xb.to(device)
            yb = yb.to(device)
            preds = model(xb)
            loss = criterion(preds, yb)
            val_loss += loss.item() * xb.size(0)
            all_val_preds.append(preds.cpu())
            all_val_targets.append(yb.cpu())

    val_loss /= len(val_ds)
    val_preds = torch.cat(all_val_preds).numpy()
    val_targets = torch.cat(all_val_targets).numpy()

    train_preds_bin = (train_preds >= 0.5).astype(int)
    val_preds_bin = (val_preds >= 0.5).astype(int)

    train_f1 = f1_score(train_targets, train_preds_bin, average='micro', zero_division=0)
    val_f1 = f1_score(val_targets, val_preds_bin, average='micro', zero_division=0)

    print(f"Epoch {epoch:02d}: "
          f"Train Loss={train_loss:.4f} F1={train_f1:.4f} "
          f"Val Loss={val_loss:.4f} F1={val_f1:.4f}")

Epoch 01: Train Loss=0.0580 F1=0.3602 Val Loss=0.0403 F1=0.5610
Epoch 02: Train Loss=0.0370 F1=0.6093 Val Loss=0.0344 F1=0.6527
Epoch 03: Train Loss=0.0320 F1=0.6693 Val Loss=0.0314 F1=0.6825
Epoch 04: Train Loss=0.0288 F1=0.7065 Val Loss=0.0298 F1=0.6959
Epoch 05: Train Loss=0.0263 F1=0.7330 Val Loss=0.0287 F1=0.7105
Epoch 06: Train Loss=0.0242 F1=0.7556 Val Loss=0.0278 F1=0.7285
Epoch 07: Train Loss=0.0223 F1=0.7754 Val Loss=0.0272 F1=0.7373
Epoch 08: Train Loss=0.0208 F1=0.7915 Val Loss=0.0268 F1=0.7481
Epoch 09: Train Loss=0.0192 F1=0.8074 Val Loss=0.0272 F1=0.7479
Epoch 10: Train Loss=0.0180 F1=0.8205 Val Loss=0.0270 F1=0.7569
Epoch 11: Train Loss=0.0168 F1=0.8333 Val Loss=0.0267 F1=0.7598
Epoch 12: Train Loss=0.0157 F1=0.8451 Val Loss=0.0273 F1=0.7620
Epoch 13: Train Loss=0.0147 F1=0.8553 Val Loss=0.0272 F1=0.7658
Epoch 14: Train Loss=0.0138 F1=0.8643 Val Loss=0.0277 F1=0.7664
Epoch 15: Train Loss=0.0129 F1=0.8733 Val Loss=0.0279 F1=0.7679


In [9]:
model.eval()
test_loss = 0.0
all_test_preds = []
all_test_targets = []

with torch.no_grad():
    for xb, yb in test_dl:
        xb = xb.to(device)
        yb = yb.to(device)
        preds = model(xb)
        loss = criterion(preds, yb)
        test_loss += loss.item() * xb.size(0)
        all_test_preds.append(preds.cpu())
        all_test_targets.append(yb.cpu())

test_loss /= len(test_ds)
test_preds = torch.cat(all_test_preds).numpy()
test_targets = torch.cat(all_test_targets).numpy()
test_preds_bin = (test_preds >= 0.5).astype(int)
test_f1 = f1_score(test_targets, test_preds_bin, average='micro', zero_division=0)

print(f"\nTest Loss={test_loss:.4f} F1 Score={test_f1:.4f}")


Test Loss=0.0281 F1 Score=0.7672
