In [1]:
!pip install xgboost



In [2]:
!brew install libomp

[34m==>[0m [1mAuto-updating Homebrew...[0m
Adjust how often this is run with HOMEBREW_AUTO_UPDATE_SECS or disable with
HOMEBREW_NO_AUTO_UPDATE. Hide these hints with HOMEBREW_NO_ENV_HINTS (see `man brew`).
[34m==>[0m [1mAuto-updated Homebrew![0m
Updated 2 taps (homebrew/core and homebrew/cask).
[34m==>[0m [1mNew Formulae[0m
arp-scan-rs     kbt             mermaid-cli     rnp             tiledb
go-rice         lolcrab         ovsx            sherif
[34m==>[0m [1mNew Casks[0m
accordance@13              linqpad                    macsyzones

You have [1m83[0m outdated formulae installed.

To reinstall 20.1.7, run:
  brew reinstall libomp


In [7]:
!pip install tensorflow



In [7]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Concatenate, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix,  precision_recall_curve
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import ast


In [8]:
# Load preprocessed CSVs
train_df = pd.read_csv("datasets/security/train_preprocessed.csv")
valid_df = pd.read_csv("datasets/security/valid_preprocessed.csv")
test_df = pd.read_csv("datasets/security/test_preprocessed.csv")

MAX_LEN = 300

def parse_sequence(x): return ast.literal_eval(x)[:MAX_LEN]
X_train_seq = pad_sequences(train_df['input_ids'].apply(parse_sequence), maxlen=MAX_LEN)
X_valid_seq = pad_sequences(valid_df['input_ids'].apply(parse_sequence), maxlen=MAX_LEN)
X_test_seq = pad_sequences(test_df['input_ids'].apply(parse_sequence), maxlen=MAX_LEN)

In [9]:
# Select top 10 tabular features
features = [col for col in train_df.columns if col not in ['id', 'project', 'commit_id', 'tokens', 'func', 'input_ids', 'attention_mask', 'target']]
from xgboost import XGBClassifier
xgb = XGBClassifier(n_estimators=100, max_depth=6, random_state=42)
xgb.fit(train_df[features], train_df['target'].astype(int))
top_features = pd.Series(xgb.feature_importances_, index=features).sort_values(ascending=False).head(10).index.tolist()

X_train_tab = train_df[top_features]
X_valid_tab = valid_df[top_features]
X_test_tab = test_df[top_features]

scaler = StandardScaler()
X_train_tab_scaled = scaler.fit_transform(X_train_tab)
X_valid_tab_scaled = scaler.transform(X_valid_tab)
X_test_tab_scaled = scaler.transform(X_test_tab)

y_train = train_df['target'].astype(int)
y_valid = valid_df['target'].astype(int)
y_test = test_df['target'].astype(int)

In [10]:
# Combine train and valid
X_seq_comb = np.concatenate([X_train_seq, X_valid_seq])
X_tab_comb = np.concatenate([X_train_tab_scaled, X_valid_tab_scaled])
y_comb = np.concatenate([y_train, y_valid])

In [11]:
# CNN + Tabular Model
vocab_size = max(np.max(X_seq_comb), np.max(X_test_seq)) + 1

embed_dim = 64

seq_input = Input(shape=(MAX_LEN,))
embed = Embedding(input_dim=vocab_size, output_dim=64)(seq_input)
conv1 = Conv1D(256, 3, activation='relu', padding='same')(embed)
conv2 = Conv1D(128, 5, activation='relu', padding='same')(conv1)
pool = GlobalMaxPooling1D()(conv2)
drop_seq = Dropout(0.5)(pool)

tab_input = Input(shape=(X_tab_comb.shape[1],))

concat = Concatenate()([drop_seq, tab_input])
dense = Dense(64, activation='relu')(concat)
drop = Dropout(0.4)(dense)
output = Dense(1, activation='sigmoid')(drop)

model = Model(inputs=[seq_input, tab_input], outputs=output)
model.compile(optimizer=Adam(0.0005), loss='binary_crossentropy', metrics=['accuracy'])

model.fit([X_seq_comb, X_tab_comb], y_comb,
          validation_data=([X_valid_seq, X_valid_tab_scaled], y_valid),
          epochs=15,
          batch_size=64)

Epoch 1/15
[1m385/385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 452ms/step - accuracy: 0.5310 - loss: 0.6957 - val_accuracy: 0.5835 - val_loss: 0.6582
Epoch 2/15
[1m385/385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 442ms/step - accuracy: 0.5999 - loss: 0.6509 - val_accuracy: 0.8071 - val_loss: 0.4928
Epoch 3/15
[1m385/385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 447ms/step - accuracy: 0.7768 - loss: 0.4671 - val_accuracy: 0.8946 - val_loss: 0.3036
Epoch 4/15
[1m385/385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 455ms/step - accuracy: 0.8700 - loss: 0.2912 - val_accuracy: 0.9312 - val_loss: 0.1972
Epoch 5/15
[1m385/385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 467ms/step - accuracy: 0.9126 - loss: 0.2020 - val_accuracy: 0.9462 - val_loss: 0.1444
Epoch 6/15
[1m385/385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 685ms/step - accuracy: 0.9310 - loss: 0.1518 - val_accuracy: 0.9612 - val_loss: 0.1074
Epoc

<keras.src.callbacks.history.History at 0x13b025930>

In [12]:
y_pred_proba = model.predict([X_test_seq, X_test_tab_scaled])
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
f1 = 2 * (precision * recall) / (precision + recall + 1e-8)
best_thresh = thresholds[np.argmax(f1)]

y_test_pred = (y_pred_proba > best_thresh).astype(int)

print("Best threshold:", best_thresh)
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("F1 Score:", f1_score(y_test, y_test_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))

[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step
Best threshold: 5.5487316e-08
Accuracy: 0.47291361639824303
F1 Score: 0.6341463414634146
Confusion Matrix:
 [[  44 1433]
 [   7 1248]]
