In [16]:
# 1. Imports and Data Loading
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import ast

In [17]:
# Run on the GPU whenever CUDA is usable; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [18]:
# Read the three preprocessed splits from disk.
train_df = pd.read_csv("datasets/security/train_preprocessed.csv")
valid_df = pd.read_csv("datasets/security/valid_preprocessed.csv")
test_df = pd.read_csv("datasets/security/test_preprocessed.csv")

# The token columns come back from CSV as strings such as "[1, 2, 3]";
# parse them into real Python lists, split by split and column by column.
for frame in (train_df, valid_df, test_df):
    for column in ("input_ids", "attention_mask"):
        frame[column] = frame[column].apply(ast.literal_eval)

In [23]:
# Sanity check: list the columns read from the training CSV so the names used
# further down ("input_ids", "attention_mask", "target") can be confirmed by eye.
print(train_df.columns)

Index(['id', 'func', 'target', 'project', 'commit_id', 'func_length',
       'func_aug', 'func_length.1', 'num_loops', 'has_eval', 'has_system',
       'num_if', 'num_return', 'uses_pointer', 'uses_buffer', 'input_ids',
       'attention_mask'],
      dtype='object')


In [25]:
class EncodedTextDataset(Dataset):
    """Dataset over pre-tokenised rows of a DataFrame.

    The ``input_ids``, ``attention_mask`` and ``target`` columns of
    ``dataframe`` are copied into Python lists once, at construction time.
    Each access converts a single row into ``torch.long`` tensors.
    """

    def __init__(self, dataframe):
        # Extract the columns in a fixed order; position i in every list
        # belongs to the same example.
        ids, masks, targets = (
            dataframe[column].tolist()
            for column in ("input_ids", "attention_mask", "target")
        )
        self.input_ids = ids
        self.attention_masks = masks
        self.labels = targets

    def __len__(self):
        # One label per example, so the label list gives the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for row ``idx`` as long tensors."""
        columns = (self.input_ids, self.attention_masks, self.labels)
        return tuple(torch.tensor(column[idx], dtype=torch.long) for column in columns)

In [26]:
def collate_fn(batch):
    """Pad a list of dataset samples into batch tensors.

    Args:
        batch: Iterable of ``(input_ids, attention_mask, label)`` tensor
            triples, as produced by ``EncodedTextDataset.__getitem__``.

    Returns:
        Tuple ``(input_ids, attention_masks, labels, lengths)``. The first two
        are right-padded with 0 to the longest sequence in the batch
        (batch-first layout), ``labels`` is the stacked label tensor, and
        ``lengths`` holds each sequence's length *before* padding.
    """
    sequences, masks, labels = zip(*batch)

    # Measure lengths on the raw sequences. Reading them off the padded tensor
    # would report the padded width for every row, so pack_padded_sequence
    # would hand the padding positions to the LSTM as if they were real tokens.
    lengths = torch.tensor([len(seq) for seq in sequences], dtype=torch.long)

    input_ids = pad_sequence(sequences, batch_first=True, padding_value=0)
    attention_masks = pad_sequence(masks, batch_first=True, padding_value=0)
    labels = torch.stack(labels)
    return input_ids, attention_masks, labels, lengths

In [27]:
# One loader per split. Only the training loader shuffles; validation and test
# batches are produced in dataset order.
train_loader = DataLoader(
    dataset=EncodedTextDataset(train_df),
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)
valid_loader = DataLoader(
    dataset=EncodedTextDataset(valid_df),
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn,
)
test_loader = DataLoader(
    dataset=EncodedTextDataset(test_df),
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn,
)

In [28]:
# 3. LSTM Model Definition
class LSTMClassifier(nn.Module):
    """Embedding -> LSTM -> linear head sequence classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of scores produced per sequence.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        # Index 0 is the padding row of the embedding table.
        # NOTE(review): assumes the tokenizer used in preprocessing never
        # emits id 0 for a real token — confirm.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            padding_idx=0,
        )
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, input_ids, lengths):
        """Score a batch of padded sequences.

        Args:
            input_ids: Batch-first tensor of token ids.
            lengths: Per-sequence lengths used to pack the batch; copied to
                the CPU before packing.

        Returns:
            The linear head applied to the last layer's final hidden state,
            one row of ``output_dim`` scores per sequence.
        """
        token_vectors = self.embedding(input_ids)
        # Packing lets the LSTM stop at each sequence's own length;
        # enforce_sorted=False accepts batches in arbitrary length order.
        packed_batch = nn.utils.rnn.pack_padded_sequence(
            token_vectors,
            lengths.cpu(),
            batch_first=True,
            enforce_sorted=False,
        )
        _, (final_hidden, _) = self.lstm(packed_batch)
        last_layer_state = final_hidden[-1]
        return self.fc(last_layer_state)

In [34]:
!pip install tqdm

Python(3729) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [35]:
from tqdm import tqdm

def train(model, train_loader, valid_loader, criterion, optimizer, epochs=5):
    """Train ``model`` for ``epochs`` passes, printing validation accuracy after each.

    Args:
        model: Module called as ``model(input_ids, lengths)``.
        train_loader: Iterable of ``(input_ids, attention_mask, labels, lengths)``
            batches; the attention mask is not used here.
        valid_loader: Loader passed to ``evaluate`` at the end of every epoch.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        epochs: Number of passes over ``train_loader``.
    """
    # Send batches to wherever the model's weights live; fall back to the
    # notebook-level `device` only for a module that has no parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    for epoch in range(epochs):
        print(f"\nStarting Epoch {epoch + 1}/{epochs}")
        model.train()
        total_loss = 0.0
        num_batches = 0

        for input_ids, _, labels, lengths in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            # `lengths` stays on the CPU on purpose: pack_padded_sequence needs
            # CPU lengths, so copying it to the device would only be undone.
            input_ids, labels = input_ids.to(target_device), labels.to(target_device)

            optimizer.zero_grad()
            outputs = model(input_ids, lengths)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1

        # Guard the average: a loader that yields nothing would divide by zero.
        avg_loss = total_loss / num_batches if num_batches else float("nan")
        print(f" Epoch {epoch+1} | Train Loss: {avg_loss:.4f}")
        evaluate(model, valid_loader, label="Validation")


In [36]:
def evaluate(model, loader, label="Test"):
    """Print the classification accuracy of ``model`` over ``loader``.

    Args:
        model: Module called as ``model(input_ids, lengths)``; switched to
            eval mode and left in that mode.
        loader: Iterable of ``(input_ids, attention_mask, labels, lengths)``
            batches; the attention mask is not used here.
        label: Prefix for the printed line.

    Prints ``"<label> Accuracy: <pct>%"``, or an explicit "n/a" line when the
    loader yields no samples.
    """
    model.eval()

    # Send batches to wherever the model's weights live; fall back to the
    # notebook-level `device` only for a module that has no parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    correct, total = 0, 0
    with torch.no_grad():
        for input_ids, _, labels, lengths in loader:
            # `lengths` stays on the CPU on purpose: pack_padded_sequence needs
            # CPU lengths, so copying it to the device would only be undone.
            input_ids, labels = input_ids.to(target_device), labels.to(target_device)
            outputs = model(input_ids, lengths)
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    # Without any samples the percentage is undefined; say so instead of
    # raising ZeroDivisionError.
    if total == 0:
        print(f"{label} Accuracy: n/a (no samples)")
        return
    print(f"{label} Accuracy: {100 * correct / total:.2f}%")

In [37]:
# Token-count summary for the training split: longest sequence and the mean.
all_lengths = list(map(len, train_df["input_ids"]))
print(f"Max length: {max(all_lengths)} | Avg length: {sum(all_lengths)/len(all_lengths):.2f}")

Max length: 512 | Avg length: 367.48


In [38]:
# 5. Training Execution
# Size the embedding table from the largest token id in *any* split. An id that
# occurs only in the validation or test data would otherwise index past the end
# of the embedding matrix as soon as that split is evaluated.
vocab_size = 1 + max(
    max(seq)
    for split_df in (train_df, valid_df, test_df)
    for seq in split_df["input_ids"]
    if seq  # max() of an empty sequence raises; empty rows contribute no ids
)
embed_dim = 100
hidden_dim = 128
# One output score per distinct label value seen in the training split.
# NOTE(review): assumes labels are encoded as 0..K-1, which CrossEntropyLoss
# needs for class-index targets — confirm.
output_dim = len(set(train_df["target"]))

model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

train(model, train_loader, valid_loader, criterion, optimizer, epochs=5)


Starting Epoch 1/5


Epoch 1:   3%|▎         | 23/683 [01:45<50:22,  4.58s/it]


KeyboardInterrupt: 

In [None]:
# 6. Final Evaluation on Test Set
# Score the model on the held-out test split and print its accuracy.
evaluate(model=model, loader=test_loader, label="Test")