In [1]:
!pip install xgboost



In [2]:
!brew install libomp

[34m==>[0m [1mAuto-updating Homebrew...[0m
Adjust how often this is run with HOMEBREW_AUTO_UPDATE_SECS or disable with
HOMEBREW_NO_AUTO_UPDATE. Hide these hints with HOMEBREW_NO_ENV_HINTS (see `man brew`).
[34m==>[0m [1mAuto-updated Homebrew![0m
Updated 2 taps (homebrew/core and homebrew/cask).
[34m==>[0m [1mNew Formulae[0m
arp-scan-rs     kbt             mermaid-cli     rnp             tiledb
go-rice         lolcrab         ovsx            sherif
[34m==>[0m [1mNew Casks[0m
accordance@13              linqpad                    macsyzones

You have [1m83[0m outdated formulae installed.

To reinstall 20.1.7, run:
  brew reinstall libomp


In [7]:
!pip install tensorflow



In [7]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Concatenate, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix,  precision_recall_curve
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import ast


In [8]:
# Load preprocessed CSVs
train_df = pd.read_csv("datasets/security/train_preprocessed.csv")
valid_df = pd.read_csv("datasets/security/valid_preprocessed.csv")
test_df = pd.read_csv("datasets/security/test_preprocessed.csv")

MAX_LEN = 300

def parse_sequence(x): return ast.literal_eval(x)[:MAX_LEN]
X_train_seq = pad_sequences(train_df['input_ids'].apply(parse_sequence), maxlen=MAX_LEN)
X_valid_seq = pad_sequences(valid_df['input_ids'].apply(parse_sequence), maxlen=MAX_LEN)
X_test_seq = pad_sequences(test_df['input_ids'].apply(parse_sequence), maxlen=MAX_LEN)

In [9]:
# Select top 10 tabular features
features = [col for col in train_df.columns if col not in ['id', 'project', 'commit_id', 'tokens', 'func', 'input_ids', 'attention_mask', 'target']]
from xgboost import XGBClassifier
xgb = XGBClassifier(n_estimators=100, max_depth=6, random_state=42)
xgb.fit(train_df[features], train_df['target'].astype(int))
top_features = pd.Series(xgb.feature_importances_, index=features).sort_values(ascending=False).head(10).index.tolist()

X_train_tab = train_df[top_features]
X_valid_tab = valid_df[top_features]
X_test_tab = test_df[top_features]

scaler = StandardScaler()
X_train_tab_scaled = scaler.fit_transform(X_train_tab)
X_valid_tab_scaled = scaler.transform(X_valid_tab)
X_test_tab_scaled = scaler.transform(X_test_tab)

y_train = train_df['target'].astype(int)
y_valid = valid_df['target'].astype(int)
y_test = test_df['target'].astype(int)

In [10]:
# Combine train and valid
X_seq_comb = np.concatenate([X_train_seq, X_valid_seq])
X_tab_comb = np.concatenate([X_train_tab_scaled, X_valid_tab_scaled])
y_comb = np.concatenate([y_train, y_valid])

In [11]:
# CNN + Tabular Model
vocab_size = max(np.max(X_seq_comb), np.max(X_test_seq)) + 1

embed_dim = 64

seq_input = Input(shape=(MAX_LEN,))
embed = Embedding(input_dim=vocab_size, output_dim=64)(seq_input)
conv1 = Conv1D(256, 3, activation='relu', padding='same')(embed)
conv2 = Conv1D(128, 5, activation='relu', padding='same')(conv1)
pool = GlobalMaxPooling1D()(conv2)
drop_seq = Dropout(0.5)(pool)

tab_input = Input(shape=(X_tab_comb.shape[1],))

concat = Concatenate()([drop_seq, tab_input])
dense = Dense(64, activation='relu')(concat)
drop = Dropout(0.4)(dense)
output = Dense(1, activation='sigmoid')(drop)

model = Model(inputs=[seq_input, tab_input], outputs=output)
model.compile(optimizer=Adam(0.0005), loss='binary_crossentropy', metrics=['accuracy'])

model.fit([X_seq_comb, X_tab_comb], y_comb,
          validation_data=([X_valid_seq, X_valid_tab_scaled], y_valid),
          epochs=15,
          batch_size=64)

Epoch 1/15
[1m385/385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 452ms/step - accuracy: 0.5310 - loss: 0.6957 - val_accuracy: 0.5835 - val_loss: 0.6582
Epoch 2/15
[1m385/385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 442ms/step - accuracy: 0.5999 - loss: 0.6509 - val_accuracy: 0.8071 - val_loss: 0.4928
Epoch 3/15
[1m385/385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 447ms/step - accuracy: 0.7768 - loss: 0.4671 - val_accuracy: 0.8946 - val_loss: 0.3036
Epoch 4/15
[1m385/385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 455ms/step - accuracy: 0.8700 - loss: 0.2912 - val_accuracy: 0.9312 - val_loss: 0.1972
Epoch 5/15
[1m385/385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 467ms/step - accuracy: 0.9126 - loss: 0.2020 - val_accuracy: 0.9462 - val_loss: 0.1444
Epoch 6/15
[1m385/385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 685ms/step - accuracy: 0.9310 - loss: 0.1518 - val_accuracy: 0.9612 - val_loss: 0.1074
Epoc

<keras.src.callbacks.history.History at 0x13b025930>

In [12]:
y_pred_proba = model.predict([X_test_seq, X_test_tab_scaled])
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
f1 = 2 * (precision * recall) / (precision + recall + 1e-8)
best_thresh = thresholds[np.argmax(f1)]

y_test_pred = (y_pred_proba > best_thresh).astype(int)

print("Best threshold:", best_thresh)
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("F1 Score:", f1_score(y_test, y_test_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))

[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step
Best threshold: 5.5487316e-08
Accuracy: 0.47291361639824303
F1 Score: 0.6341463414634146
Confusion Matrix:
 [[  44 1433]
 [   7 1248]]


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

In [2]:
# Load Preprocessed Data
train_df = pd.read_csv("datasets/security/train_preprocessed.csv")
valid_df = pd.read_csv("datasets/security/valid_preprocessed.csv")
test_df = pd.read_csv("datasets/security/test_preprocessed.csv")

In [3]:
all_ids = []

for df in [train_df, valid_df, test_df]:
    ids_list = df['input_ids'].apply(eval).tolist()
    flat_ids = [i for sublist in ids_list for i in sublist]
    all_ids.extend(flat_ids)

max_token_id = max(all_ids)
print(f"Max token ID in dataset: {max_token_id}")

Max token ID in dataset: 477363


In [4]:
# Constants
MAX_LEN = 100
VOCAB_SIZE = max_token_id + 1
BATCH_SIZE = 32
EPOCHS = 5
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Dataset Class
class CodeDataset(Dataset):
    def __init__(self, df):
        self.X = df['input_ids'].apply(eval).tolist()
        self.y = df['target'].astype(int).values

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        tokens = self.X[idx][:MAX_LEN]
        tokens += [0] * (MAX_LEN - len(tokens))  # Padding
        return torch.tensor(tokens, dtype=torch.long), torch.tensor(self.y[idx], dtype=torch.float32)

In [6]:
# Data Loaders
train_dataset = CodeDataset(train_df)
valid_dataset = CodeDataset(valid_df)
test_dataset = CodeDataset(test_df)

In [7]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [8]:
# CNN Model
class CodeCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, num_filters=128, kernel_size=5):
        super(CodeCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv = nn.Conv1d(embed_dim, num_filters, kernel_size)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(num_filters, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x).transpose(1, 2)  # (B, embed_dim, seq_len)
        x = torch.relu(self.conv(x))
        x = self.pool(x).squeeze(-1)
        x = self.fc(x)
        return self.sigmoid(x)

In [9]:
# Model Setup
model = CodeCNN(VOCAB_SIZE).to(DEVICE)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [10]:
print(len(train_dataset))  # Should be reasonable (e.g., thousands)
all_lengths = [len(eval(x)) for x in train_df['input_ids']]
print(f"Max token length: {max(all_lengths)}")  # Should be <500 ideally


21854
Max token length: 512


In [11]:
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for i, (X_batch, y_batch) in enumerate(train_loader):
        X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(X_batch).squeeze()
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        if i % 50 == 0:
            print(f"Batch {i}/{len(train_loader)}, Loss: {loss.item():.4f}")

    print(f"Epoch {epoch+1} completed, Total Loss: {total_loss/len(train_loader):.4f}")


Batch 0/683, Loss: 0.7160
Batch 50/683, Loss: 0.7959
Batch 100/683, Loss: 0.6495
Batch 150/683, Loss: 0.6825
Batch 200/683, Loss: 0.5893
Batch 250/683, Loss: 0.6934
Batch 300/683, Loss: 0.7086
Batch 350/683, Loss: 0.6218
Batch 400/683, Loss: 0.7002
Batch 450/683, Loss: 0.7199
Batch 500/683, Loss: 0.7291
Batch 550/683, Loss: 0.7679
Batch 600/683, Loss: 0.6093
Batch 650/683, Loss: 0.7211
Epoch 1 completed, Total Loss: 0.6808
Batch 0/683, Loss: 0.5981
Batch 50/683, Loss: 0.5883
Batch 100/683, Loss: 0.5894
Batch 150/683, Loss: 0.6004
Batch 200/683, Loss: 0.5615
Batch 250/683, Loss: 0.6563
Batch 300/683, Loss: 0.6006
Batch 350/683, Loss: 0.5571
Batch 400/683, Loss: 0.5728
Batch 450/683, Loss: 0.5977
Batch 500/683, Loss: 0.6736
Batch 550/683, Loss: 0.5012
Batch 600/683, Loss: 0.5444
Batch 650/683, Loss: 0.6480
Epoch 2 completed, Total Loss: 0.5794
Batch 0/683, Loss: 0.3624
Batch 50/683, Loss: 0.3812
Batch 100/683, Loss: 0.2627
Batch 150/683, Loss: 0.2916
Batch 200/683, Loss: 0.4063
Batch 250

In [12]:
# Evaluation on Test Set
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(DEVICE)
        outputs = model(X_batch).squeeze().cpu().numpy()
        preds = (outputs > 0.5).astype(int)
        y_true.extend(y_batch.numpy())
        y_pred.extend(preds)

In [13]:
# Classification Report
report = classification_report(y_true, y_pred)
print("\nTest Set Performance:\n")
print(report)


Test Set Performance:

              precision    recall  f1-score   support

         0.0       0.57      0.69      0.62      1477
         1.0       0.51      0.38      0.44      1255

    accuracy                           0.55      2732
   macro avg       0.54      0.54      0.53      2732
weighted avg       0.54      0.55      0.54      2732

