# In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from transformers import BertTokenizerFast, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score

class CodeDataset(Dataset):
    """Map-style dataset that reads and tokenizes one source file per row.

    Every row of ``dataframe`` must provide a ``full_path`` entry (path of the
    file to read) and a ``buggy`` entry (truthy when the file is labelled as
    buggy).

    Args:
        dataframe: Table accessed positionally through ``.iloc``.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping of
            tensors with a leading batch dimension of size 1.
        max_length: Length every encoded sample is truncated/padded to.
            Defaults to 512, the value that used to be hard-coded.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (files) in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded file at position ``idx`` plus its 0/1 label.

        The result holds every tensor produced by the tokenizer (with the
        batch dimension removed) and a scalar ``labels`` tensor.
        """
        item = self.data.iloc[idx]
        # Decode explicitly as UTF-8 instead of the platform default, and
        # replace undecodable bytes so one odd file cannot abort an epoch.
        with open(item['full_path'], 'r', encoding='utf-8', errors='replace') as file:
            content = file.read()
        inputs = self.tokenizer.encode_plus(
            content,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt"
        )
        # return_tensors="pt" adds a batch axis of size 1; the DataLoader
        # re-adds its own batch axis, so drop this one.
        inputs = {key: value.squeeze(0) for key, value in inputs.items()}
        inputs['labels'] = torch.tensor(1 if item['buggy'] else 0, dtype=torch.long)
        return inputs

# One tokenizer instance is shared by the three dataset splits.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
train_dataset, val_dataset, test_dataset = (
    CodeDataset(split_df, tokenizer)
    for split_df in (train_df, val_df, test_df)
)

class TransformerBlock(nn.Module):
    """Post-norm encoder block: self-attention then a position-wise MLP.

    Both sub-layers are wrapped as ``LayerNorm(residual + Dropout(sublayer))``.

    Args:
        embed_dim: Width of the input and output features.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward sub-layer.
        rate: Dropout probability applied after each sub-layer.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        # Built without batch_first, so inputs are (seq_len, batch, embed_dim).
        self.att = nn.MultiheadAttention(embed_dim, num_heads)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim)
        )
        self.layernorm1 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.layernorm2 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.dropout1 = nn.Dropout(rate)
        self.dropout2 = nn.Dropout(rate)

    def forward(self, x, key_padding_mask=None):
        """Apply the block to ``x``.

        Args:
            x: Tensor laid out as (seq_len, batch, embed_dim).
            key_padding_mask: Optional boolean tensor of shape
                (batch, seq_len); ``True`` marks positions (e.g. padding)
                that no query may attend to. ``None`` keeps the previous
                behaviour of attending to every position.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # The attention weights were always discarded, so skip computing them.
        attn_output, _ = self.att(
            x, x, x,
            key_padding_mask=key_padding_mask,
            need_weights=False,
        )
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

class TokenAndPositionEmbedding(nn.Module):
    """Add a learned position embedding to a learned token embedding.

    Args:
        maxlen: Number of distinct positions the position table can index.
        vocab_size: Number of rows in the token table.
        embed_dim: Width of both embedding tables.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.pos_emb = nn.Embedding(num_embeddings=maxlen, embedding_dim=embed_dim)

    def forward(self, x):
        """Embed integer ids ``x``; dimension 1 of ``x`` is the position axis."""
        seq_len = x.size(1)
        position_ids = torch.arange(seq_len, device=x.device)
        # A leading axis of size 1 lets the position vectors broadcast over
        # dimension 0 of the token embeddings.
        position_vectors = self.pos_emb(position_ids).unsqueeze(0)
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

class SimpleTransformerModel(nn.Module):
    """Single-block transformer encoder followed by a small MLP classifier.

    Pipeline: token + position embedding -> one TransformerBlock -> mean over
    the sequence axis -> dropout -> Linear(embed_dim, 20) + ReLU -> dropout
    -> Linear(20, num_classes).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, num_heads, ff_dim, num_classes):
        super().__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(embed_dim, 20)
        self.fc2 = nn.Linear(20, num_classes)

    def forward(self, x):
        """Return unnormalised class scores for a batch of token ids ``x``."""
        embedded = self.embedding_layer(x)
        # Swap the first two axes going in: the transformer block receives
        # the sequence axis first.
        encoded = self.transformer_block(embedded.transpose(0, 1))
        # Move the sequence axis last so the 1-D pooling averages over it,
        # then drop the pooled axis of size 1.
        channels_first = encoded.permute(1, 2, 0)
        pooled = self.global_avg_pool(channels_first).squeeze(-1)
        hidden = F.relu(self.fc1(self.dropout(pooled)))
        return self.fc2(self.dropout(hidden))

# Parameters
maxlen = 512
vocab_size = tokenizer.vocab_size
embed_dim = 128
num_heads = 8
ff_dim = 512
num_classes = 2
batch_size = 8
epochs = 10
learning_rate = 2e-5
adam_epsilon = 1e-8
warmup_steps = 0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model and optimizer
model = SimpleTransformerModel(maxlen, vocab_size, embed_dim, num_heads, ff_dim, num_classes).to(device)
# NOTE(review): transformers' AdamW is believed to be deprecated upstream in
# favour of torch.optim.AdamW; confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Scheduler
# The scheduler is stepped once per batch, and the final partial batch of an
# epoch is still a step. Size the schedule from the loader length; the old
# floor(len(train_dataset) * epochs / batch_size) under-counted whenever the
# dataset size is not a multiple of batch_size, driving the learning rate to
# zero before training finished.
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

def compute_metrics(preds, labels):
    """Compute binary classification metrics from two-column class scores.

    Args:
        preds: Array of shape (n_samples, 2) holding one score (e.g. logit)
            per class; column 1 is the positive class.
        labels: Array of shape (n_samples,) with the 0/1 ground truth.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, auc)``. ``auc`` is NaN when
        ``labels`` contains a single class, because ROC-AUC is undefined then.
    """
    scores = np.asarray(preds)
    labels = np.asarray(labels)
    hard_preds = scores.argmax(-1)

    # zero_division=0 yields the same 0.0 as the default for an undefined
    # precision/recall, but without emitting a warning on every call.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, hard_preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, hard_preds)

    # ROC-AUC is a ranking metric and needs a continuous score; feeding it
    # the argmax collapses the curve to a single operating point. The margin
    # between the two class scores is monotone in the softmax probability of
    # the positive class, so it ranks samples identically.
    positive_margin = scores[:, 1] - scores[:, 0]
    if np.unique(labels).size < 2:
        auc = float('nan')
    else:
        auc = roc_auc_score(labels, positive_margin)
    return acc, precision, recall, f1, auc

def train(model, loader, optimizer, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids)`` and returning class
            scores suitable for ``F.cross_entropy``.
        loader: Iterable of batches; each batch is a mapping with at least
            ``'input_ids'`` and ``'labels'`` tensors. It does not need to
            support ``len()``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, after the
            optimizer.

    Returns:
        Mean of the per-batch losses as a float, or 0.0 when ``loader``
        yields no batches.
    """
    model.train()
    # Follow the model's own placement instead of a module-level global so
    # the inputs can never land on a different device than the weights.
    target_device = next(model.parameters()).device
    total_loss = 0.0
    num_batches = 0
    for batch in loader:
        optimizer.zero_grad()
        # Only the ids and labels are consumed; the other tokenizer outputs
        # are not copied to the device.
        input_ids = batch['input_ids'].to(target_device)
        labels = batch['labels'].to(target_device)
        outputs = model(input_ids)
        loss = F.cross_entropy(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
        num_batches += 1
    # Counting batches avoids len(loader) and a division by zero.
    return total_loss / num_batches if num_batches else 0.0

def evaluate(model, loader):
    """Score ``model`` on every batch of ``loader`` and return its metrics.

    Args:
        model: Module called as ``model(input_ids)`` and returning one row of
            class scores per sample.
        loader: Iterable of batches; each batch is a mapping with at least
            ``'input_ids'`` and ``'labels'`` tensors.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1``
        and ``auc`` as produced by ``compute_metrics``.
    """
    model.eval()
    # Follow the model's own placement instead of a module-level global.
    target_device = next(model.parameters()).device
    preds = []
    labels = []
    with torch.no_grad():
        for batch in loader:
            output = model(batch['input_ids'].to(target_device))
            preds.extend(output.cpu().numpy())
            # Labels are only consumed on the host, so they are not sent to
            # the device and back.
            labels.extend(batch['labels'].cpu().numpy())
    acc, precision, recall, f1, auc = compute_metrics(np.array(preds), np.array(labels))
    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1, 'auc': auc}

# Alternate one training pass with one validation pass, reporting after each.
for epoch in range(epochs):
    train_loss = train(model, train_loader, optimizer, scheduler)
    val_metrics = evaluate(model, val_loader)
    print(
        f"Epoch {epoch + 1}/{epochs}",
        f"Train Loss: {train_loss}",
        f"Validation Metrics: {val_metrics}",
        sep="\n",
    )

# Final evaluation on the test split once all epochs have run.
test_metrics = evaluate(model, test_loader)
print("Test Metrics:", test_metrics)
