In [1]:
import pandas as pd
import ast

In [2]:
# Load CSV with engineered features
df = pd.read_csv("datasets/security/train_preprocessed.csv")

In [3]:
# Convert string lists back to real lists
df["input_ids"] = df["input_ids"].apply(ast.literal_eval)
df["attention_mask"] = df["attention_mask"].apply(ast.literal_eval)
df["target"] = df["target"].astype(int)

In [4]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.2, stratify=df["target"], random_state=42)

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader

class CodeTokenDataset(Dataset):
    def __init__(self, df):
        self.inputs = df["input_ids"].tolist()
        self.masks = df["attention_mask"].tolist()
        self.labels = df["target"].tolist()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return (
            torch.tensor(self.inputs[idx], dtype=torch.long),
            torch.tensor(self.masks[idx], dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.long)
        )

In [8]:
from torch.nn.utils.rnn import pad_sequence
def collate_fn(batch):
    input_ids, attention_masks, labels = zip(*batch)

    input_ids = [x.clone().detach() if isinstance(x, torch.Tensor) else torch.tensor(x, dtype=torch.long) for x in input_ids]
    attention_masks = [x.clone().detach() if isinstance(x, torch.Tensor) else torch.tensor(x, dtype=torch.long) for x in attention_masks]

    labels = torch.tensor(labels, dtype=torch.long)

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value=0)

    return input_ids, attention_masks, labels

In [9]:
train_dataset = CodeTokenDataset(train_df)
val_dataset = CodeTokenDataset(val_df)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=collate_fn)

In [19]:
import torch
import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_classes=2, dropout=0.5):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            bidirectional=True,
            batch_first=True
        )

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, input_ids, attention_mask):
        embedded = self.embedding(input_ids)  # (batch, seq_len, embed_dim)
        lstm_out, _ = self.lstm(embedded)     # (batch, seq_len, hidden_dim*2)

        pooled, _ = torch.max(lstm_out, dim=1)  # (batch, hidden_dim*2)

        output = self.dropout(pooled)
        logits = self.fc(output)
        return logits


In [22]:
from sklearn.metrics import accuracy_score
import torch.optim as optim
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Detect vocab size from max token ID in dataset
vocab_size = max(df["input_ids"].explode()) + 1

# Initialize model
model = LSTMClassifier(vocab_size=vocab_size).to(device)
classes = np.array([0, 1])
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_df['target'])
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
optimizer = optim.Adam(model.parameters(), lr=1e-4)


In [23]:
!pip install tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [24]:
from tqdm import tqdm
import matplotlib.pyplot as plt

epochs = 15
train_acc_list = []
val_acc_list = []
train_loss_list = []

best_val_acc = 0.0

for epoch in range(epochs):
    model.train()
    train_loss = 0
    correct = 0
    total = 0

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
    for input_ids, attention_mask, labels in loop:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        preds = torch.argmax(outputs, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

        # Update progress bar
        avg_loss = train_loss / len(train_loader)
        loop.set_postfix(avg_loss=avg_loss, accuracy=100 * correct / total)

    train_acc = 100 * correct / total
    avg_epoch_loss = train_loss / len(train_loader)

    # Validation after each epoch
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels in val_loader:
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1).cpu().tolist()
            all_preds.extend(preds)
            all_labels.extend(labels.tolist())

    val_acc = accuracy_score(all_labels, all_preds)

    print(f"\n Epoch {epoch+1} complete - Train Accuracy: {train_acc:.2f}% - Val Accuracy: {val_acc:.4f}\n")

    # Track metrics
    train_acc_list.append(train_acc)
    val_acc_list.append(val_acc * 100)
    train_loss_list.append(avg_epoch_loss)

    # Save best model
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_model.pt")
        print("Best model saved!")

Epoch 1/15: 100%|██████████| 547/547 [14:15<00:00,  1.56s/it, accuracy=52.7, avg_loss=0.693]



 Epoch 1 complete - Train Accuracy: 52.74% - Val Accuracy: 0.5944

Best model saved!


Epoch 2/15: 100%|██████████| 547/547 [15:34<00:00,  1.71s/it, accuracy=57, avg_loss=0.676]    



 Epoch 2 complete - Train Accuracy: 56.99% - Val Accuracy: 0.6118

Best model saved!


Epoch 3/15:  19%|█▉        | 103/547 [16:38:30<71:44:16, 581.66s/it, accuracy=59.8, avg_loss=0.124]    


KeyboardInterrupt: 

In [None]:
plt.figure(figsize=(12, 5))

# Accuracy plot
plt.subplot(1, 2, 1)
plt.plot(train_acc_list, label="Train Accuracy")
plt.plot(val_acc_list, label="Validation Accuracy")
plt.title("Accuracy over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Accuracy (%)")
plt.legend()
plt.grid(True)

# Loss plot
plt.subplot(1, 2, 2)
plt.plot(train_loss_list, label="Train Loss")
plt.title("Training Loss over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()


In [16]:
import pandas as pd
import ast

df_test = pd.read_csv("datasets/security/test_preprocessed.csv")
df_test["input_ids"] = df_test["input_ids"].apply(ast.literal_eval)
df_test["attention_mask"] = df_test["attention_mask"].apply(ast.literal_eval)
df_test["target"] = df_test["target"].astype(int)


In [17]:
test_dataset = CodeTokenDataset(df_test)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)

In [18]:
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        outputs = model(input_ids, attention_mask)
        preds = torch.argmax(outputs, dim=1).cpu().tolist()
        all_preds.extend(preds)
        all_labels.extend(labels.tolist())

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(all_labels, all_preds))

              precision    recall  f1-score   support

           0       0.62      0.72      0.67      1477
           1       0.59      0.48      0.53      1255

    accuracy                           0.61      2732
   macro avg       0.61      0.60      0.60      2732
weighted avg       0.61      0.61      0.60      2732

