# Convolutional Neural Network (CNN)

# Libraries

Load the libraries.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras.models import load_model
from sklearn.metrics import roc_curve, auc, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
from tensorflow.keras import layers
from keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
from imblearn.under_sampling import RandomUnderSampler

## Data Preprocessing

Load the data.

In [None]:
# Load the feature-extracted password data set. Malformed rows are reported
# with a warning and skipped instead of aborting the load.
# `on_bad_lines="warn"` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0 (where it raises a TypeError).
data = pd.read_csv("data/000webhost_subset_classifed_featureExtracted.csv", on_bad_lines="warn")

Select the features and target, balance the classes with random undersampling, and split the data into training, validation, and test sets.

In [None]:
# Earlier feature selections, kept for reference:
#X = data.drop(columns=["password", "strength", "length", "uppercase", "lowercase", "digits", "special", "cracking_time"])
#X = X[['entropy', 'levenshtein_distance', 'char_repetition_weight_sum', "consecutive_char_type", "most_common_char_type", "char_freq_ratio", "password_length_ratio_to_unique_val", 'bigram_freq', 'trigram_freq', 'fourgram_freq']]

# Feature matrix: every column except the ones listed here.
# Target: the "strength" column.
X = data.drop(
    columns=[
        "password",
        "strength",
        "length",
        "uppercase",
        "lowercase",
        "digits",
        "special",
        "consecutive_char_type_count",
        "cracking_time",
    ]
)
y = data["strength"]

# Randomly under-sample (seeded) before splitting, so all three splits are
# drawn from the resampled data.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Hold out 20% of the resampled rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=42,
)

# Carve 25% of the remaining rows off as the validation set
# (0.25 * 0.8 = 20% of the resampled data, leaving 60% for training).
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=42,
)

Encode the target labels as integers.

In [None]:
# Fit the label encoder on the training labels only, then reuse that single
# label -> integer mapping for the validation and test labels. Calling
# `fit_transform` on each split re-fits the encoder, so the same class could be
# assigned different integers whenever a split does not contain every class.
# `transform` raises ValueError if a split contains a label unseen in training.
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_val_encoded = encoder.transform(y_val)
y_test_encoded = encoder.transform(y_test)

In [None]:
# Append a trailing axis of length 1 to each feature matrix, giving arrays of
# shape (rows, features, 1) to match the model's Input(shape=(features, 1)).
X_train_reshaped = np.expand_dims(X_train.to_numpy(), axis=-1)
X_val_reshaped = np.expand_dims(X_val.to_numpy(), axis=-1)
X_test_reshaped = np.expand_dims(X_test.to_numpy(), axis=-1)


## Model Construction and Evaluation

Build the model.

In [None]:
# Assemble the 1-D CNN layer by layer: two convolutions (with one max-pooling
# step in between), a flatten, three ReLU dense layers and a 4-way softmax.
model = keras.Sequential()
model.add(keras.Input(shape=(X_train.shape[1], 1)))
model.add(layers.Conv1D(32, kernel_size=3, activation='relu'))
# pool_size is 2 (per the original note, changed from an earlier value of 4).
model.add(layers.MaxPooling1D(pool_size=2))
model.add(layers.Conv1D(8, kernel_size=2, activation='relu'))
model.add(layers.Flatten())
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(32, activation='relu'))
# One output unit per class; the rest of the notebook assumes 4 classes.
model.add(layers.Dense(4, activation='softmax'))
model.summary()

Compile and fit the model.

In [None]:
# Training hyper-parameters.
batch_size = 32
epochs = 50

# Take Adam from the same `tensorflow.keras` namespace the model is built with.
# Mixing an optimizer imported from the standalone `keras` package with a
# `tensorflow.keras` model is version-dependent and can fail at compile time.
optimizer = keras.optimizers.Adam(learning_rate=0.0005)

# Integer-encoded labels, hence the sparse categorical cross-entropy loss.
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
# NOTE(review): `validation_split=0.1` holds out part of the training data for
# the per-epoch validation metrics; the separate X_val split is not used here
# and is only scored after training.
training_history = model.fit(X_train_reshaped, y_train_encoded, batch_size=batch_size, epochs=epochs, validation_split=0.1)

Validate the model.

In [None]:
# Score the trained model on the validation split; the result unpacks into the
# loss and the accuracy metric it was compiled with.
val_loss, val_acc = model.evaluate(x=X_val_reshaped, y=y_val_encoded)
print(f"Val accuracy: {val_acc:.4f}")


Test the model.

In [None]:
# Score the trained model on the held-out test split; the result unpacks into
# the loss and the accuracy metric it was compiled with.
test_loss, test_acc = model.evaluate(x=X_test_reshaped, y=y_test_encoded)
print(f"Test accuracy: {test_acc:.4f}")


Save the model. 

In [None]:
# Persist the trained model to an HDF5 file in the working directory.
model.save(filepath="cnn_model.h5")


# Analysis of the Model

Load the model. 

In [None]:
# Restore the model from the HDF5 file written by the save step.
loaded_model = load_model(filepath="cnn_model.h5")


Create a diagram of the model. 

In [None]:
# Render the architecture of the loaded model to CNN.png, annotating each
# layer with its name and tensor shapes.
plot_model(
    loaded_model,
    to_file='CNN.png',
    show_layer_names=True,
    show_shapes=True,
)


Test the loaded model.

In [None]:
# Evaluate the reloaded model on the test split. Pass the explicitly reshaped
# (rows, features, 1) array, as the earlier test cell does, instead of the raw
# 2-D DataFrame, so the input matches the model's declared input shape rather
# than depending on any implicit rank adjustment by Keras.
test_loss, test_acc = loaded_model.evaluate(X_test_reshaped, y_test_encoded)
print(f"Test accuracy: {test_acc:.4f}")


Predict on the test set. 

In [None]:
# Predict class probabilities with the reloaded model (the analysis section
# works on `loaded_model`, so it also runs in a session where `model` was never
# trained), using the reshaped (rows, features, 1) test array.
y_test_pred = loaded_model.predict(X_test_reshaped)
# Most probable class per row.
y_test_pred_classes = np.argmax(y_test_pred, axis=1)

Calculate different performance metrics. 

In [None]:
# Weighted-average precision, recall and F1 over the test predictions,
# evaluated in that order.
precision, recall, f1 = (
    score_fn(y_test_encoded, y_test_pred_classes, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)


Print the metrics. 

In [None]:
# Report the three weighted scores, one per line, followed by the per-class
# classification report.
print(
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-score: {f1}",
    sep="\n",
)
print(classification_report(y_test_encoded, y_test_pred_classes))


Compute the per-class ROC curves and AUC values.

In [None]:
# One-vs-rest ROC analysis: binarize the encoded test labels into one indicator
# column per class, then compute a ROC curve and its AUC for each class from
# the matching column of predicted probabilities.
n_classes = 4
y_test_binarized = label_binarize(y_test_encoded, classes=list(range(n_classes)))
fpr, tpr, roc_auc = {}, {}, {}

for class_index in range(n_classes):
    class_fpr, class_tpr, _ = roc_curve(
        y_test_binarized[:, class_index],
        y_test_pred[:, class_index],
    )
    fpr[class_index] = class_fpr
    tpr[class_index] = class_tpr
    roc_auc[class_index] = auc(class_fpr, class_tpr)

    
    

Plot the ROC curve.

In [None]:
# Calculate the macro-average AUC (unweighted mean of the per-class AUCs)
auc_macro = sum(roc_auc.values()) / len(roc_auc)
print("Macro-average AUC:", auc_macro)

# Plot the ROC curves with AUC values
plt.figure()
for i in range(n_classes):
    # The legend label now closes the parenthesis opened before "AUC";
    # previously it rendered as "Class 0 (AUC = 0.95".
    plt.plot(fpr[i], tpr[i], label=f'Class {i} (AUC = {roc_auc[i]:.2f})')
# Diagonal reference line (equal true- and false-positive rates).
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('CNN ROC')
plt.legend(loc="lower right")
# Save before show() so the figure is written while it is still current.
plt.savefig("CNN ROC", dpi=300, bbox_inches='tight')
plt.show()


Rank the features by importance. 

In [None]:
def permutation_feature_importance(model, X_test, y_test_encoded, random_state=None):
    """Estimate feature importance as the accuracy lost when a feature is permuted.

    For each feature (indexed along axis 1 of the input), the feature's values
    are shuffled across rows, the model is re-scored, and the importance is
    recorded as ``baseline_accuracy - shuffled_accuracy``.

    Args:
        model: Object with a ``predict`` method returning per-class scores of
            shape (rows, classes). It is called with data in the same layout
            as ``X_test``.
        X_test: Feature data as a pandas DataFrame or a NumPy array. The input
            itself is never modified; each permutation works on a copy.
        y_test_encoded: Integer class labels, one per row of ``X_test``.
        random_state: Optional seed (or NumPy ``Generator``) that makes the
            permutations reproducible. ``None`` draws fresh entropy.

    Returns:
        1-D NumPy array with one importance value per feature. Values can be
        negative when shuffling happens to improve accuracy.
    """
    # Accept both DataFrames and plain arrays.
    if hasattr(X_test, "to_numpy"):
        features = X_test.to_numpy()
    else:
        features = np.asarray(X_test)
    labels = np.asarray(y_test_encoded)

    # The original called an undefined `shuffle`; a NumPy generator provides
    # the permutation without needing any extra import.
    rng = np.random.default_rng(random_state)

    def _accuracy(inputs):
        """Return the fraction of rows whose arg-max prediction matches the label."""
        predicted = np.argmax(model.predict(inputs), axis=1)
        return float(np.mean(predicted == labels))

    # Calculate the baseline performance on the untouched data
    baseline_accuracy = _accuracy(features)

    # One importance value per feature
    feature_importances = np.zeros(features.shape[1])

    for i in range(features.shape[1]):
        # Permute a single feature across rows in a copy of the data
        shuffled = features.copy()
        shuffled[:, i] = rng.permutation(shuffled[:, i])

        # Importance = accuracy lost when the feature's values are scrambled
        feature_importances[i] = baseline_accuracy - _accuracy(shuffled)

    return feature_importances

# Score every feature with the loaded model, then order the feature indices
# from the largest to the smallest importance value.
cnn_feature_importance = permutation_feature_importance(loaded_model, X_test, y_test_encoded)
cnn_feature_rank = np.flip(np.argsort(cnn_feature_importance))

print("CNN feature importance ranking:")
for rank in cnn_feature_rank:
    # `rank` is the feature's column index in X_test.
    print(f"Feature {rank}: {cnn_feature_importance[rank]:.4f}")



Plot the training and validation loss.

In [None]:
import matplotlib.pyplot as plt

def plot_loss(history, epoch_interval, model_name):
    """Plot training (and, when recorded, validation) loss per epoch.

    Args:
        history: Object whose ``history`` attribute is a dict of per-epoch
            metric lists, such as the value returned by ``model.fit``. It must
            contain ``'loss'``; ``'val_loss'`` is plotted only when present.
        epoch_interval: Plot every ``epoch_interval``-th epoch, starting with
            the first.
        model_name: Name used in the figure title and in the saved file name.

    The figure is saved as "<model_name> Training and Validation Loss" (the
    file extension is left to matplotlib's default) and then displayed.
    """
    metrics = history.history
    train_loss = metrics['loss'][::epoch_interval]
    # 1-based epoch numbers matching the sub-sampled loss values.
    epochs = range(1, len(metrics['loss']) + 1, epoch_interval)

    plt.figure()
    plt.plot(epochs, train_loss, 'bo', label='Training loss')
    # A fit without validation data records no 'val_loss'; skip that curve
    # instead of failing with a KeyError.
    if 'val_loss' in metrics:
        val_loss = metrics['val_loss'][::epoch_interval]
        plt.plot(epochs, val_loss, 'r', label='Validation loss')
    plt.title(f'{model_name} - Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    # Name the file after the model instead of a hard-coded "CNN" prefix, so
    # figures for different models do not overwrite each other.
    plt.savefig(f"{model_name} Training and Validation Loss", dpi=300, bbox_inches='tight')
    plt.show()
    
# Plot every 10th epoch of the recorded CNN training history.
plot_loss(training_history, epoch_interval=10, model_name='CNN')

Plot the confusion matrix. 

In [None]:
def plot_confusion_matrix(y_true, y_pred, class_names):
    """Plot a row-normalized confusion matrix and return the raw counts.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        class_names: Tick labels, applied positionally to the matrix rows and
            columns.

    Returns:
        The un-normalized confusion matrix produced by ``confusion_matrix``.

    The figure is saved as "CNN Confusion Matrix" (the file extension is left
    to matplotlib's default) and then displayed.
    """
    # The original drew the heatmap through `sns`, which is never imported in
    # this file. scikit-learn (already a dependency) provides an equivalent
    # display, imported locally so this function is self-contained.
    from sklearn.metrics import ConfusionMatrixDisplay

    cm = confusion_matrix(y_true, y_pred)

    # Normalize each row by its total; a row with no samples stays all zeros
    # instead of producing NaNs from a division by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_normalized = np.divide(
        cm,
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    fig, ax = plt.subplots(figsize=(8, 8))
    display = ConfusionMatrixDisplay(confusion_matrix=cm_normalized, display_labels=class_names)
    display.plot(cmap='Blues', values_format='.2f', ax=ax)
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    ax.set_title('CNN Confusion Matrix')
    fig.savefig("CNN Confusion Matrix", dpi=300, bbox_inches='tight')
    plt.show()
    return cm
    
# Plot the confusion matrix for the test predictions and keep the raw counts.
# NOTE(review): the class names are applied positionally to the encoded labels
# 0..3; LabelEncoder numbers classes in sorted order, so confirm that order
# really is Weak, Medium, Strong, Very strong for the "strength" column.
cm = plot_confusion_matrix(y_test_encoded, y_test_pred_classes, ['Weak', 'Medium', 'Strong', 'Very strong'])