In [81]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

import joblib
import tensorflow as tf
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from prepare import remove_strings_from_input_print, remove_strings_from_quotes, remove_variables, remove_functions, remove_numbers, remove_escape_chars, remove_symbols, remove_extra_space, lowercase_strings, prepare_code

In [None]:
# Show full snippet text when a frame is displayed instead of truncating it.
pd.set_option('display.max_colwidth', None)

# Both CSV files are read with ';' as the field separator.
secure_df = pd.read_csv("snippets/secure.csv", sep=";")
insecure_df = pd.read_csv("snippets/insecure.csv", sep=";")

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each file's own row labels are kept and the index contains duplicates.
df = pd.concat([secure_df, insecure_df], axis=0, ignore_index=True)

In [None]:
# Cleaning helpers from `prepare`, applied to every snippet in this order.
# NOTE(review): remove_escape_chars is imported but not part of this chain —
# confirm that is intentional.
cleaning_steps = (
    remove_strings_from_input_print,
    remove_strings_from_quotes,
    remove_variables,
    remove_functions,
    remove_numbers,
    remove_symbols,
    remove_extra_space,
    lowercase_strings,
)

cleaned_code = df["Code"]
for cleaning_step in cleaning_steps:
    cleaned_code = cleaned_code.apply(cleaning_step)

df["code-cleaned"] = cleaned_code
df.head()

In [None]:
def tokenizer(document):
    """Split a document into tokens on runs of whitespace.

    Returns the list produced by ``document.split()``; an empty or
    whitespace-only document yields an empty list.
    """
    tokens = document.split()
    return tokens

# One cleaned snippet per document, in row order.
document_arr = df["code-cleaned"].tolist()

# First pass: count every whitespace-separated token to learn the vocabulary.
vectorizer = CountVectorizer(tokenizer=tokenizer)
X = vectorizer.fit_transform(document_arr)
feature_names = vectorizer.get_feature_names_out()

# Total occurrences of each token across all documents, as a flat 1-D array.
word_counts = np.asarray(X.sum(axis=0)).ravel()

# Keep only tokens that occur more than once in the whole corpus.
filtered_words = [
    word
    for word, count in zip(feature_names, word_counts)
    if count > 1
]

# Second pass: restrict the vocabulary to the kept tokens. The token pattern
# \S+ splits on whitespace just like tokenizer() above, so every vocabulary
# entry can actually be matched; the default pattern would silently drop
# one-character tokens and tokens containing non-word characters, leaving
# all-zero columns. Using a pattern rather than the notebook-local function
# also keeps the vectorizer free of a reference to that function when pickled.
new_vectorizer = CountVectorizer(vocabulary=filtered_words, token_pattern=r"(?u)\S+")
X_new = new_vectorizer.fit_transform(document_arr)
new_feature_names = new_vectorizer.get_feature_names_out()
df_bow = pd.DataFrame(X_new.toarray(), columns=new_feature_names)
df_bow.head()

In [None]:
# Feature matrix (bag-of-words counts) and the string labels.
X = df_bow.values
y = df["Label"].values

# Hold out 30% for testing. The seed is fixed (same value as the split used
# for the neural network) so the reported accuracies are reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=20,
)

In [None]:
def numerize_y_labels(y):
    """Encode string labels as integers: "Secure" -> 1, "Insecure" -> 0.

    Parameters
    ----------
    y : array-like of str
        Labels to encode. May be empty.

    Returns
    -------
    numpy.ndarray
        Integer array with the same shape as ``y``.

    Raises
    ------
    ValueError
        If ``y`` contains a value other than "Secure" or "Insecure".
    """
    label_mapping = {"Secure": 1, "Insecure": 0}
    labels = np.asarray(y)
    flat_labels = labels.ravel().tolist()

    # Fail loudly on unexpected labels instead of silently encoding them as None.
    unknown = sorted({str(label) for label in flat_labels if label not in label_mapping})
    if unknown:
        raise ValueError(f"Unknown label(s): {unknown}")

    # Building the array explicitly (rather than np.vectorize without otypes)
    # also works for empty input.
    encoded = np.array([label_mapping[label] for label in flat_labels], dtype=int)
    return encoded.reshape(labels.shape)
    
def unnumerize_y_labels(y_labels, threshold=0.5):
    """Turn scores back into string labels.

    Each entry that is greater than or equal to ``threshold`` becomes
    "Secure"; every other entry becomes "Insecure". Returns a list with one
    label per entry of ``y_labels``.
    """
    return ["Secure" if prob >= threshold else "Insecure" for prob in y_labels]

In [None]:
# Number of neighbours that vote on each prediction.
k = 10

# k is passed through so that changing it above actually changes the model
# (it was previously defined but the value was hard-coded in the constructor).
knn_model = KNeighborsClassifier(n_neighbors=k, metric="euclidean")
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# Support vector classifier with a linear kernel; fit() returns the estimator
# itself, so construction and training are chained.
svm_model = svm.SVC(kernel='linear').fit(X_train, y_train)

y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# Gaussian naive Bayes on the bag-of-words counts.
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_pred = nb_model.predict(X_test)

# accuracy_score expects (y_true, y_pred); the arguments are given in that
# order here to match the other model cells.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# Logistic regression with scikit-learn's default settings; fit() returns the
# estimator itself, so construction and training are chained.
lr_model = LogisticRegression().fit(X_train, y_train)

y_pred = lr_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# Project the training features onto their first two principal components so
# that all four classifiers can be drawn on the same 2-D plane.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)
colors = ['red' if label == "Secure" else 'blue' for label in y_train]
xlim = (-5, 20)
ylim = (-10, 15)
fig, axes = plt.subplots(2, 2, figsize=(12, 12))


def _finish_panel(ax, title):
    """Apply the shared plotting window, hide the ticks and set the title."""
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title(title)


# Coarse grid over the plotting window. The SVM and logistic-regression
# models were trained in the full feature space, so grid points are mapped
# back to that space with pca.inverse_transform before being scored.
grid_x = np.linspace(xlim[0], xlim[1], 30)
grid_y = np.linspace(ylim[0], ylim[1], 30)
YY, XX = np.meshgrid(grid_y, grid_x)
xy = np.vstack([XX.ravel(), YY.ravel()]).T
xy_full = pca.inverse_transform(xy)

# --- K nearest neighbours: projected training points only ---
ax = axes[0, 0]
ax.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=colors, edgecolors='k', linewidths=0, alpha=0.6)
_finish_panel(ax, "K Nearest Neighbors")

# --- SVM: decision boundary (solid) and the -1/+1 margins (dashed) ---
ax = axes[0, 1]
ax.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=colors, edgecolors='k', linewidth=0, alpha=0.5)
Z = svm_model.decision_function(xy_full).reshape(XX.shape)
ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.6, linestyles=['--', '-', '--'])
support_vectors_pca = pca.transform(svm_model.support_vectors_)
# Hollow markers need a non-zero edge width, otherwise nothing is drawn.
ax.scatter(support_vectors_pca[:, 0], support_vectors_pca[:, 1], linewidth=1, facecolors='none', edgecolors='k')
_finish_panel(ax, "Support Vector Machines")

# --- Naive Bayes: a separate model fitted on the 2-D projection, drawn as
# shaded prediction regions (0 -> blue, 1 -> red) ---
ax = axes[1, 0]
nb_model_vis = GaussianNB()
nb_model_vis.fit(np.array(X_train_pca), numerize_y_labels(y_train))
xx, yy = np.meshgrid(np.arange(xlim[0], xlim[1], 0.1), np.arange(ylim[0], ylim[1], 0.1))
Z = nb_model_vis.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
cmap_light = ListedColormap(['blue', 'red'])
ax.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=0.2)
ax.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=colors, edgecolors='k', linewidth=0, alpha=0.6, facecolors='none')
_finish_panel(ax, "Naive Bayes")

# --- Logistic regression: zero level of the decision function on the grid.
# lr_model.coef_ holds one weight per bag-of-words feature, so its first two
# entries are not a line in PCA coordinates; the boundary is obtained the same
# way as for the SVM panel instead. ---
ax = axes[1, 1]
ax.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=colors, alpha=0.6, linewidth=0)
Z_lr = lr_model.decision_function(xy_full).reshape(XX.shape)
ax.contour(XX, YY, Z_lr, colors='k', levels=[0], linestyles=['--'])
_finish_panel(ax, 'Logistic Regression')

plt.tight_layout()
plt.show()

In [None]:
# Persist the fitted vectorizer, its feature names and the four classical
# models. The target directory is created first so the cell also works on a
# fresh checkout where it does not exist yet.
conven_dir = Path("models/conven")
conven_dir.mkdir(parents=True, exist_ok=True)

conven_artifacts = {
    "vectorizer": new_vectorizer,
    "feature_names": new_feature_names,
    "knn_model": knn_model,
    "svm_model": svm_model,
    "nb_model": nb_model,
    "lr_model": lr_model,
}
for artifact_name, artifact in conven_artifacts.items():
    joblib.dump(artifact, conven_dir / f"{artifact_name}.joblib")

In [None]:
def plot_loss_and_accuracy(history):
    """Plot training/validation loss and accuracy side by side.

    Parameters
    ----------
    history : dict or object with a ``history`` attribute
        Either a mapping with the keys 'loss', 'val_loss', 'accuracy' and
        'val_accuracy' (one value per epoch), or an object exposing such a
        mapping as ``.history`` (as returned by ``model.fit``).
    """
    # Callers in this notebook pass both the raw dict and the object returned
    # by fit(); unwrap the latter so both work.
    metrics = getattr(history, "history", history)

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # (metric key, y-axis label, panel title) for the left and right panels.
    panels = (
        ("loss", "Binary crossentropy", "Loss"),
        ("accuracy", "Accuracy", "Accuracy"),
    )
    for ax, (metric, ylabel, title) in zip(axes, panels):
        ax.plot(metrics[metric], label=metric)
        ax.plot(metrics[f"val_{metric}"], label=f"val_{metric}")
        ax.set_xlabel('Epoch')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.grid(True)
        ax.legend()

    plt.tight_layout()
    plt.show()


In [None]:
# Both splits hold out 20% with the same seed: first test from the full data,
# then validation from what remains for training.
split_options = {"test_size": 0.2, "random_state": 20}

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, **split_options)

In [None]:
def train_model(X_train, y_train, X_val, y_val, num_nodes, dropout_prob, lr, batch_size, epochs):
    """Build and train a two-hidden-layer binary classifier.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features (2-D, samples x features) and numeric targets.
    X_val, y_val : array-like
        Validation features and numeric targets, evaluated after each epoch.
    num_nodes : int
        Units in each of the two hidden ReLU layers.
    dropout_prob : float
        Dropout rate applied after each hidden layer.
    lr : float
        Learning rate passed to the Adam optimizer.
    batch_size : int
        Mini-batch size used by ``fit``.
    epochs : int
        Number of training epochs.

    Returns
    -------
    tuple
        ``(nn_model, history)`` — the trained model and the object returned
        by ``fit``.
    """
    # Take the input width from the data actually passed in rather than from
    # a notebook-level frame, so the function does not depend on global state.
    num_features = np.shape(X_train)[1]

    nn_model = tf.keras.Sequential([
        tf.keras.layers.Dense(num_nodes, activation="relu", input_shape=(num_features,)),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(num_nodes, activation="relu"),
        tf.keras.layers.Dropout(dropout_prob),
        # Single sigmoid unit: one score in [0, 1] per sample.
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ])

    nn_model.compile(optimizer=tf.keras.optimizers.Adam(lr), loss="binary_crossentropy", metrics=["accuracy"])
    history = nn_model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_val, y_val), verbose=0)
    return nn_model, history

In [None]:
# Hyperparameters for a single training run.
num_nodes = 4
dropout = 0.1
learning_rate = 0.001
batch_size = 16
epochs = 50

# Labels are converted to 0/1 before being handed to the network.
model, history = train_model(
    X_train=X_train,
    y_train=numerize_y_labels(y_train),
    X_val=X_val,
    y_val=numerize_y_labels(y_val),
    num_nodes=num_nodes,
    dropout_prob=dropout,
    lr=learning_rate,
    batch_size=batch_size,
    epochs=epochs,
)
plot_loss_and_accuracy(history.history)

In [None]:
# Store the trained network and its per-epoch metrics under the same folder.
neural_dir = "models/neural/"
tf.keras.models.save_model(model, neural_dir)
joblib.dump(history.history, neural_dir + "history.joblib")

In [None]:
# Reload the saved network and its training metrics (loaded once; the metrics
# file holds the plain dict that was dumped, not the fit() return object).
nn_model = tf.keras.models.load_model('models/neural/')
history = joblib.load("models/neural/history.joblib")

# The network has a single sigmoid output, so flatten the predictions to one
# score per sample before thresholding them into labels.
y_pred = nn_model.predict(X_test)
accuracy = accuracy_score(y_test, unnumerize_y_labels(y_pred.ravel()))
print("Accuracy:", accuracy)

plot_loss_and_accuracy(history)

In [None]:
# Grid search over hidden-layer width, dropout rate and learning rate; the
# configuration with the lowest validation loss is kept.

# Batch size used for every run in the grid. Kept in its own variable so the
# progress message reports the value that is actually used.
grid_batch_size = 32

# The label encoding does not change between runs, so compute it once.
y_train_numeric = numerize_y_labels(y_train)
y_val_numeric = numerize_y_labels(y_val)

least_val_loss = float("inf")
least_loss_model, least_loss_history = None, None
best_epochs, best_num_nodes, best_dropout_prob, best_lr = None, None, None, None
for epochs in [50]:
    for num_nodes in [4, 6, 8]:
        for dropout_prob in [0.1, 0.2, 0.3]:
            for lr in [0.1, 0.005, 0.001]:
                print(f"{num_nodes} nodes, {dropout_prob} dropout, {lr} lr, batch size {grid_batch_size}, {epochs} epochs")
                model, history = train_model(X_train, y_train_numeric, X_val, y_val_numeric, num_nodes, dropout_prob, lr, grid_batch_size, epochs)
                # train_model returns the fit() result; the plot helper is
                # written against its per-epoch metrics dict.
                plot_loss_and_accuracy(history.history)
                # evaluate() returns [loss, accuracy] for the compiled metrics.
                val_loss = model.evaluate(X_val, y_val_numeric)[0]
                if val_loss < least_val_loss:
                    least_val_loss = val_loss
                    least_loss_model = model
                    least_loss_history = history
                    best_epochs, best_num_nodes, best_dropout_prob, best_lr = epochs, num_nodes, dropout_prob, lr

print(f"Best model: {best_num_nodes} nodes, {best_dropout_prob} dropout, {best_lr} lr, {best_epochs} epochs")
# Nothing is selected if no run produced a finite loss (NaN never compares
# lower), so only plot when a best run exists.
if least_loss_history is not None:
    plot_loss_and_accuracy(least_loss_history.history)