In [1]:
!pip install xgboost



In [2]:
!brew install libomp

[34m==>[0m [1mAuto-updating Homebrew...[0m
Adjust how often this is run with HOMEBREW_AUTO_UPDATE_SECS or disable with
HOMEBREW_NO_AUTO_UPDATE. Hide these hints with HOMEBREW_NO_ENV_HINTS (see `man brew`).
[34m==>[0m [1mAuto-updated Homebrew![0m
Updated 2 taps (homebrew/core and homebrew/cask).
[34m==>[0m [1mNew Formulae[0m
arp-scan-rs     kbt             mermaid-cli     rnp             tiledb
go-rice         lolcrab         ovsx            sherif
[34m==>[0m [1mNew Casks[0m
accordance@13              linqpad                    macsyzones

You have [1m83[0m outdated formulae installed.

To reinstall 20.1.7, run:
  brew reinstall libomp


In [7]:
!pip install tensorflow



In [1]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Concatenate, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import ast


2025-06-21 17:52:01.453741: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load preprocessed CSVs
train_df = pd.read_csv("datasets/security/train_preprocessed.csv")
valid_df = pd.read_csv("datasets/security/valid_preprocessed.csv")
test_df = pd.read_csv("datasets/security/test_preprocessed.csv")

MAX_LEN = 300

def parse_sequence(x): return ast.literal_eval(x)[:MAX_LEN]
X_train_seq = pad_sequences(train_df['input_ids'].apply(parse_sequence), maxlen=MAX_LEN)
X_valid_seq = pad_sequences(valid_df['input_ids'].apply(parse_sequence), maxlen=MAX_LEN)
X_test_seq = pad_sequences(test_df['input_ids'].apply(parse_sequence), maxlen=MAX_LEN)

In [3]:
# Select top 10 tabular features
features = [col for col in train_df.columns if col not in ['id', 'project', 'commit_id', 'tokens', 'func', 'input_ids', 'attention_mask', 'target']]
from xgboost import XGBClassifier
xgb = XGBClassifier(n_estimators=100, max_depth=6, random_state=42)
xgb.fit(train_df[features], train_df['target'].astype(int))
top_features = pd.Series(xgb.feature_importances_, index=features).sort_values(ascending=False).head(10).index.tolist()

X_train_tab = train_df[top_features]
X_valid_tab = valid_df[top_features]
X_test_tab = test_df[top_features]

scaler = StandardScaler()
X_train_tab_scaled = scaler.fit_transform(X_train_tab)
X_valid_tab_scaled = scaler.transform(X_valid_tab)
X_test_tab_scaled = scaler.transform(X_test_tab)

y_train = train_df['target'].astype(int)
y_valid = valid_df['target'].astype(int)
y_test = test_df['target'].astype(int)

In [4]:
# Combine train and valid
X_seq_comb = np.concatenate([X_train_seq, X_valid_seq])
X_tab_comb = np.concatenate([X_train_tab_scaled, X_valid_tab_scaled])
y_comb = np.concatenate([y_train, y_valid])

In [5]:
# CNN + Tabular Model
vocab_size = max(np.max(X_seq_comb), np.max(X_test_seq)) + 1

embed_dim = 64

seq_input = Input(shape=(MAX_LEN,))
embed = Embedding(input_dim=vocab_size, output_dim=embed_dim)(seq_input)
conv = Conv1D(128, 5, activation='relu')(embed)
pool = GlobalMaxPooling1D()(conv)

tab_input = Input(shape=(X_tab_comb.shape[1],))
concat = Concatenate()([pool, tab_input])
dense = Dense(64, activation='relu')(concat)
drop = Dropout(0.3)(dense)
output = Dense(1, activation='sigmoid')(drop)

model = Model(inputs=[seq_input, tab_input], outputs=output)
model.compile(optimizer=Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])

model.fit([X_seq_comb, X_tab_comb], y_comb, validation_data=([X_valid_seq, X_valid_tab_scaled], y_valid), epochs=10, batch_size=64)

Epoch 1/10
[1m385/385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 344ms/step - accuracy: 0.5455 - loss: 0.6880 - val_accuracy: 0.7694 - val_loss: 0.5615
Epoch 2/10
[1m385/385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 339ms/step - accuracy: 0.7369 - loss: 0.5258 - val_accuracy: 0.9129 - val_loss: 0.3047
Epoch 3/10
[1m385/385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 335ms/step - accuracy: 0.8898 - loss: 0.2687 - val_accuracy: 0.9502 - val_loss: 0.1731
Epoch 4/10
[1m385/385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 346ms/step - accuracy: 0.9330 - loss: 0.1595 - val_accuracy: 0.9704 - val_loss: 0.0890
Epoch 5/10
[1m385/385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 343ms/step - accuracy: 0.9539 - loss: 0.1038 - val_accuracy: 0.9733 - val_loss: 0.0642
Epoch 6/10
[1m385/385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 317ms/step - accuracy: 0.9688 - loss: 0.0701 - val_accuracy: 0.9817 - val_loss: 0.0440
Epoc

<keras.src.callbacks.history.History at 0x139313be0>

In [6]:
# Evaluate
y_test_pred = (model.predict([X_test_seq, X_test_tab_scaled]) > 0.5).astype(int)
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("F1 Score:", f1_score(y_test, y_test_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))

[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
Accuracy: 0.5871156661786238
F1 Score: 0.5837638376383764
Confusion Matrix:
 [[813 664]
 [464 791]]
