In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Importing TensorFlow/Keras for Neural Network
import tensorflow as tf
from tensorflow.keras import layers, models

2024-09-20 12:12:48.797673: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-20 12:12:48.827950: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-20 12:12:48.828392: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Read the labelled string dataset from the CSV next to the notebook
DATASET_PATH = 'Final_Datasets_just_string_Balance.csv'
df = pd.read_csv(DATASET_PATH)

In [3]:
# Feature extraction: N-Grams and Entropy
def calculate_entropy(s):
    """Return the Shannon entropy (in bits per symbol) of the sequence ``s``.

    Parameters
    ----------
    s : str or other sized iterable of hashable items
        The symbols whose frequency distribution is measured.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct symbol. Always a non-negative float: ``0.0`` for an empty
        input and for inputs made of a single repeated symbol.
    """
    total = len(s)
    if total == 0:
        # No symbols means no uncertainty; return a float so the result type
        # is the same for every input.
        return 0.0

    symbol_counts = Counter(s)
    entropy = -sum(
        (count / total) * math.log2(count / total)
        for count in symbol_counts.values()
    )
    # A single repeated symbol yields -0.0 after the negation; adding 0.0
    # maps IEEE negative zero to positive zero and leaves other values as-is.
    return entropy + 0.0

In [4]:
def generate_n_grams(text, n=3):
    """Return the overlapping length-``n`` slices of ``text`` joined by spaces.

    Slices start at every index from 0 up to ``len(text) - n``; when ``text``
    is shorter than ``n`` there are no slices and the result is ``''``.
    """
    window_count = len(text) - n + 1
    pieces = []
    for start in range(window_count):
        pieces.append(text[start:start + n])
    return ' '.join(pieces)

In [5]:
# Derive both text features from the STRING column; every value is passed
# through str() first so non-string cells are handled as text too.
string_column = df['STRING']
df['N_GRAMS'] = string_column.apply(lambda value: generate_n_grams(str(value), 3))
df['ENTROPY'] = string_column.apply(lambda value: calculate_entropy(str(value)))

In [6]:
# Pull the label column and the two feature columns out of the frame
y = df['TARGET']
X_ngrams = df['N_GRAMS']
# Entropy is reshaped to a single-column 2-D array so it can be stacked
# horizontally with the other feature columns later on.
X_entropy = df['ENTROPY'].to_numpy().reshape(-1, 1)

In [7]:
# Turn the space-separated n-gram strings into a dense TF-IDF matrix,
# keeping at most 5000 vocabulary terms.
# NOTE(review): the vectorizer is fitted on every row here, before the
# cross-validation split performed later, so vocabulary/IDF statistics
# include the rows that later serve as test folds — consider fitting per fold.
# NOTE(review): only max_features is set, so the vectorizer's default
# lowercasing and tokenisation apply to the n-grams — confirm that is intended
# for n-grams containing capitals or punctuation.
vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(X_ngrams)
X_ngrams_tfidf = vectorizer.transform(X_ngrams).toarray()

In [8]:
# Build the final feature matrix: TF-IDF columns followed by the entropy column
X = np.concatenate([X_ngrams_tfidf, X_entropy], axis=1)

# Map the target labels to integer codes
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)

In [9]:
# 'balanced' class weights, stored in a dict keyed by each label's position
# in the sorted array of unique labels.
# NOTE(review): the training loop uses these keys as encoded class ids, which
# assumes LabelEncoder orders classes the same way np.unique does — confirm.
unique_labels = np.unique(y)
class_weights = compute_class_weight('balanced', classes=unique_labels, y=y)
class_weights_dict = {position: weight for position, weight in enumerate(class_weights)}

In [11]:
# Define models
# Estimators that draw random numbers while fitting (tree ensembles, boosting,
# and SVC's internal cross-validation used for probability=True) receive a
# fixed seed so that Restart & Run All reproduces the same scores.
RANDOM_STATE = 42

models_dict = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'k-NN': KNeighborsClassifier(),
    'Gaussian Naive Bayes': GaussianNB(),
    'Neural Network': None,  # Placeholder; replaced once the network factory is defined
    'SVC (Linear Kernel)': SVC(kernel='linear', probability=True, random_state=RANDOM_STATE),
    'SVC (RBF Kernel)': SVC(kernel='rbf', probability=True, random_state=RANDOM_STATE),
    'SVC (Poly Kernel)': SVC(kernel='poly', probability=True, random_state=RANDOM_STATE),
    'SVC (Sigmoid Kernel)': SVC(kernel='sigmoid', probability=True, random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
    'Quadratic Discriminant Analysis': QuadraticDiscriminantAnalysis(),
    'Multinomial Naive Bayes': MultinomialNB(),
}

In [12]:
# 10-fold stratified splitter; shuffling is seeded so fold membership is
# the same on every run.
skf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [13]:
def create_neural_network(input_dim, num_classes=3):
    """Build and compile a small feed-forward softmax classifier.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.
    num_classes : int, default 3
        Width of the softmax output layer. The default keeps the original
        three-class behaviour for existing callers.

    Returns
    -------
    A compiled Keras ``Sequential`` model (Adam optimizer, categorical
    cross-entropy loss, accuracy metric). Because of the loss, targets must
    be one-hot encoded with ``num_classes`` columns.
    """
    model = models.Sequential([
        # Explicit Input layer instead of the legacy `input_dim=` keyword on
        # the first Dense layer.
        layers.Input(shape=(input_dim,)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(64, activation='relu'),
        layers.Dense(num_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [14]:
# Swap the None placeholder for a compiled network sized to the feature matrix
models_dict.update({'Neural Network': create_neural_network(X.shape[1])})

# Containers filled per model by the evaluation loop:
# metric summaries and ROC figures, both keyed by model name.
results = dict()
roc_figures = dict()


In [15]:
# Cross-validate every model in models_dict with the shared stratified splitter.
# For each model this stores the mean/std of accuracy, weighted precision,
# weighted recall, weighted F1 and ROC AUC in `results`, and a figure of the
# fold-averaged micro-averaged ROC curve in `roc_figures`.
#
# NOTE(review): X was vectorised on the full dataset before this split, so the
# TF-IDF vocabulary/IDF weights have seen the test folds — consider fitting
# the vectorizer inside each fold.

n_classes = len(np.unique(y_encoded))
# Rows of the identity matrix are float one-hot vectors. Unlike
# pd.get_dummies this always yields one column per class (even if a fold is
# missing a class) and a float dtype rather than bool.
one_hot = np.eye(n_classes, dtype='float32')
# Common FPR grid on which each fold's ROC curve is interpolated.
mean_fpr = np.linspace(0, 1, 100)

for model_name, model in models_dict.items():
    accuracy_scores, precision_scores, recall_scores, f1_scores, roc_auc_scores = [], [], [], [], []
    tprs, aucs = [], []

    for train_index, test_index in skf.split(X, y_encoded):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y_encoded[train_index], y_encoded[test_index]
        y_test_onehot = one_hot[y_test]

        # Train model
        if model_name == 'Neural Network':  # Custom NN model
            # A fresh network per fold so no weights carry over between folds.
            model = create_neural_network(X_train.shape[1])
            # The test fold is deliberately NOT passed as validation_data: it
            # is only used for the final predictions below.
            model.fit(
                X_train,
                one_hot[y_train],
                epochs=10,
                batch_size=32,
                class_weight=class_weights_dict,
                verbose=0,
            )
            y_pred_prob = model.predict(X_test, verbose=0)
            y_pred = np.argmax(y_pred_prob, axis=1)
        else:
            model.fit(X_train, y_train)
            y_pred_prob = model.predict_proba(X_test)
            y_pred = model.predict(X_test)

        # Calculate metrics
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred, average='weighted'))
        recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))
        # One-hot targets vs. per-class scores: one-vs-rest AUC per class, macro-averaged.
        roc_auc_scores.append(roc_auc_score(y_test_onehot, y_pred_prob, multi_class='ovr'))

        # Micro-averaged ROC curve: flatten the one-hot targets and the scores
        # so every (sample, class) pair is one binary decision.
        fpr, tpr, _ = roc_curve(y_test_onehot.ravel(), y_pred_prob.ravel())
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        # Anchor the curve at the origin; interpolation at FPR=0 otherwise
        # picks up a non-zero TPR and inflates the averaged curve.
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)

    # Calculate mean and standard deviation for scores
    results[model_name] = {
        'accuracy': (np.mean(accuracy_scores), np.std(accuracy_scores)),
        'precision': (np.mean(precision_scores), np.std(precision_scores)),
        'recall': (np.mean(recall_scores), np.std(recall_scores)),
        'f1': (np.mean(f1_scores), np.std(f1_scores)),
        'roc_auc': (np.mean(roc_auc_scores), np.std(roc_auc_scores)),
    }

    # Generate the fold-averaged ROC curve on an explicit figure so nothing
    # depends on pyplot's implicit "current figure" state.
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0  # force the curve to end at (1, 1)
    mean_auc = auc(mean_fpr, mean_tpr)

    fig, ax = plt.subplots()
    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', alpha=.8)
    ax.plot(mean_fpr, mean_tpr, color='b', label=f'{model_name} (AUC = {mean_auc:.2f})', lw=2, alpha=.8)
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curve')
    ax.legend(loc='lower right')
    roc_figures[model_name] = fig
    # Close so the figure is not rendered inline here and pyplot does not keep
    # one open figure per model; the Figure object is kept in roc_figures.
    plt.close(fig)





In [16]:
# Print every model's cross-validated metrics as "mean ± std", one metric per
# line, followed by blank lines separating the models.
for model_name, scores in results.items():
    report_lines = [
        f"{model_name} - Accuracy: {scores['accuracy'][0]:.4f} ± {scores['accuracy'][1]:.4f}",
        f"{model_name} - Precision: {scores['precision'][0]:.4f} ± {scores['precision'][1]:.4f}",
        f"{model_name} - Recall: {scores['recall'][0]:.4f} ± {scores['recall'][1]:.4f}",
        f"{model_name} - F1-Score: {scores['f1'][0]:.4f} ± {scores['f1'][1]:.4f}",
        f"{model_name} - ROC AUC: {scores['roc_auc'][0]:.4f} ± {scores['roc_auc'][1]:.4f}",
    ]
    print("\n".join(report_lines))
    print("\n")

Logistic Regression - Accuracy: 0.8925 ± 0.0175
Logistic Regression - Precision: 0.8930 ± 0.0173
Logistic Regression - Recall: 0.8925 ± 0.0175
Logistic Regression - F1-Score: 0.8920 ± 0.0179
Logistic Regression - ROC AUC: 0.9763 ± 0.0065


Random Forest - Accuracy: 0.9332 ± 0.0123
Random Forest - Precision: 0.9332 ± 0.0124
Random Forest - Recall: 0.9332 ± 0.0123
Random Forest - F1-Score: 0.9330 ± 0.0124
Random Forest - ROC AUC: 0.9899 ± 0.0031


Gradient Boosting - Accuracy: 0.9298 ± 0.0117
Gradient Boosting - Precision: 0.9298 ± 0.0117
Gradient Boosting - Recall: 0.9298 ± 0.0117
Gradient Boosting - F1-Score: 0.9295 ± 0.0119
Gradient Boosting - ROC AUC: 0.9870 ± 0.0050


k-NN - Accuracy: 0.8937 ± 0.0123
k-NN - Precision: 0.8954 ± 0.0119
k-NN - Recall: 0.8937 ± 0.0123
k-NN - F1-Score: 0.8940 ± 0.0123
k-NN - ROC AUC: 0.9676 ± 0.0078


Gaussian Naive Bayes - Accuracy: 0.8426 ± 0.0143
Gaussian Naive Bayes - Precision: 0.8423 ± 0.0144
Gaussian Naive Bayes - Recall: 0.8426 ± 0.0143
Gaussian 

In [17]:
# Write each stored ROC figure to a PNG file named after its model
for model_name in roc_figures:
    fig = roc_figures[model_name]
    fig.savefig(f'ROC_{model_name}.png')