# In [None]:
import torch
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel

# Run on the GPU when PyTorch reports one as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the CodeBERT tokenizer and encoder, then move the encoder's weights
# onto the selected device.
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base")
model.to(device)

import torch
from torch.utils.data import Dataset
from transformers import RobertaTokenizer

class CodeDataset(Dataset):
    """Dataset that reads source files from disk and tokenizes them.

    Every row of ``dataframe`` is expected to carry a ``'full_path'`` entry
    (the file to read) and a ``'buggy'`` entry (truthy for the positive
    class). Rows are accessed positionally through ``.iloc``.
    """

    def __init__(self, dataframe):
        self.data = dataframe
        self.tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{'input_ids', 'labels'}`` tensors for the row at ``idx``."""
        row = self.data.iloc[idx]

        # Read the whole file; it is tokenized as a single sequence.
        with open(row['full_path'], 'r') as source_file:
            source_text = source_file.read()

        # Truncate / pad so that every sample is exactly 512 token ids long.
        token_ids = self.tokenizer.encode(
            source_text,
            truncation=True,
            padding='max_length',
            max_length=512,
        )

        # Collapse the truthy/falsy 'buggy' flag into a 0/1 class index.
        if row['buggy']:
            label = torch.tensor(1)
        else:
            label = torch.tensor(0)

        sample = {}
        sample['input_ids'] = torch.tensor(token_ids)
        sample['labels'] = label
        return sample

# Wrap the complete `df` table in a CodeDataset. `df` is not created in this
# part of the file, so it must already exist when this statement runs.
# NOTE(review): `dataset` is not referenced again in the code visible here,
# and `CodeDataset` is redefined further down — confirm this is still needed.
dataset = CodeDataset(df)

import torch
from transformers import AutoTokenizer, PLBartForSequenceClassification
from transformers import TrainingArguments, Trainer

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from torch.utils.data import Dataset, DataLoader
# Hold out 20% of `df` as the test set, then take 25% of what remains as the
# validation set (0.25 * 0.8 = 0.2 of the full data). Both splits are
# stratified on the 'buggy' column and use a fixed seed.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['buggy'],
    random_state=42,
)
train_df, val_df = train_test_split(
    train_df,
    test_size=0.25,
    stratify=train_df['buggy'],
    random_state=42,
)

# Partition the training rows by label. The names below treat label 0 as the
# majority class and label 1 as the minority class; that is not checked here.
train_majority = train_df.loc[train_df['buggy'] == 0]
train_minority = train_df.loc[train_df['buggy'] == 1]

# Draw label-1 rows with replacement until there are as many of them as
# label-0 rows (seeded so the draw is reproducible).
train_minority_upsampled = resample(
    train_minority,
    n_samples=len(train_majority),
    replace=True,
    random_state=42,
)

# The balanced training set: all label-0 rows followed by the resampled
# label-1 rows. Validation and test sets keep their original distribution.
train_df = pd.concat([train_majority, train_minority_upsampled])

class CodeDataset(Dataset):
    """Dataset of source files labelled buggy / not buggy.

    Every row of ``dataframe`` must provide a ``'full_path'`` entry (path of
    the file to read) and a ``'buggy'`` entry (truthy for the positive class).
    Rows are accessed positionally through ``.iloc``.

    Args:
        dataframe: Table holding one row per source file.
        tokenizer: Controls tokenization.

            * a callable (e.g. an already-loaded tokenizer) is used as given,
              which lets several datasets share one tokenizer instance;
            * any other truthy value (the default ``True``) loads the
              ``"uclanlp/plbart-base"`` tokenizer;
            * a falsy value disables tokenization, and items are returned as
              ``(content, labels)`` tuples.
    """

    def __init__(self, dataframe, tokenizer=True):
        self.data = dataframe
        if callable(tokenizer):
            self.tokenizer = tokenizer
        elif tokenizer:
            self.tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")
        else:
            # Always define the attribute so __getitem__ can test it; leaving
            # it unset made the raw-text mode fail with AttributeError.
            self.tokenizer = None

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at position ``idx``.

        Returns:
            With a tokenizer: a dict with ``'input_ids'``, ``'attention_mask'``
            and ``'labels'`` tensors. Without one: ``(content, labels)`` where
            ``content`` is the file text.
        """
        item = self.data.iloc[idx]
        labels = torch.tensor(1 if item['buggy'] else 0)

        # Decode explicitly as UTF-8 instead of the locale default, and
        # replace undecodable bytes so one odd file cannot abort a run.
        with open(item['full_path'], 'r', encoding='utf-8', errors='replace') as file:
            content = file.read()

        if self.tokenizer is None:
            return content, labels

        encoded = self.tokenizer(
            content,
            truncation=True,
            padding='max_length',
            max_length=512,
        )
        return {
            'input_ids': torch.tensor(encoded['input_ids']),
            # Samples are padded to max_length, so the mask is needed to keep
            # the model from attending to the padding positions.
            'attention_mask': torch.tensor(encoded['attention_mask']),
            'labels': labels,
        }

# Build one tokenizing dataset per split, in train / validation / test order.
train_dataset, val_dataset, test_dataset = (
    CodeDataset(split_frame, tokenizer=True)
    for split_frame in (train_df, val_df, test_df)
)



# Load the PLBart sequence-classification model with a two-label head.
# Only the model is loaded here; no tokenizer is created by this statement.
# NOTE: this rebinds `model`, replacing whatever was assigned to that name
# earlier in the file.
model = PLBartForSequenceClassification.from_pretrained("uclanlp/plbart-base", num_labels=2)
# Define metrics computation
def compute_metrics(p):
    """Compute binary-classification metrics for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` and ``label_ids``. ``predictions``
            is either the 2-D logits array itself or a tuple/list whose first
            element is that array; column 1 holds the positive-class logit.

    Returns:
        Dict with ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1'`` and
        ``'auc'`` entries.
    """
    predictions = p.predictions
    # Models that return extra outputs hand over a tuple with the logits
    # first; a bare array must not be indexed with [0] (that would take only
    # its first row).
    logits = predictions[0] if isinstance(predictions, (tuple, list)) else predictions
    labels = p.label_ids

    preds = logits.argmax(-1)
    # Rank by the logit margin rather than the raw class-1 logit: the margin
    # is monotonic in the softmax probability of class 1, while the class-1
    # logit alone ignores how large the class-0 logit is.
    scores = logits[:, 1] - logits[:, 0]

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, scores)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc,
    }

# Training arguments
# NOTE(review): evaluation is step-based but no `eval_steps` is given, so the
# evaluation interval is left to the library default — confirm it lines up
# with `save_steps`, which `load_best_model_at_end=True` presumably relies on.
# NOTE(review): `save_steps=10` with no `save_total_limit` looks like it will
# keep a checkpoint for every 10 steps under `output_dir` — confirm disk
# usage is acceptable.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    save_steps=10,
    load_best_model_at_end=True,
    report_to="none",  # do not send logs to any external tracking integration
)

# Trainer
# Ties together the PLBart classifier, the training arguments, the training
# split and the validation split; `compute_metrics` is passed in so it can be
# applied to evaluation predictions.
# `val_dataset` becomes the default evaluation set for `trainer.evaluate()`
# calls that do not pass `eval_dataset` explicitly.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model (once).
trainer.train()

# Evaluate the fine-tuned model on the validation split.
val_metrics = trainer.evaluate(eval_dataset=val_dataset)
print("Validation metrics:", val_metrics)

# Evaluate the same model on the held-out test split.
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Test metrics:", test_metrics)

# Training is deliberately not restarted after the test evaluation: a second
# `trainer.train()` would continue fitting the already fine-tuned model, so
# later numbers would describe a different model than the one reported above.
# `metrics` is kept as a name for the validation results so that any later
# code reading it still works; it is not printed again because it is the
# same dict as `val_metrics`.
metrics = val_metrics