In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments
import shap
import torch

  from .autonotebook import tqdm as notebook_tqdm
2025-02-03 12:55:28.964579: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-03 12:55:28.974710: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-03 12:55:29.175118: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Column names for NSL-KDD dataset
c_names = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login",
    "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "labels", "difficulty_degree"
]

# Load training and testing datasets
train = pd.read_csv("data/KDDTrain+.txt", names=c_names)
test = pd.read_csv("data/KDDTest+.txt", names=c_names)

# Drop 'difficulty_degree' column as it does not add value
train = train.drop(columns=["difficulty_degree"])
test = test.drop(columns=["difficulty_degree"])

# Display the first few rows of the training dataset
print(train.head())

# Check for missing values
print(train.isnull().sum())

# Check the distribution of labels
print(train['labels'].value_counts())

   duration protocol_type   service flag  src_bytes  dst_bytes  land  \
0         0           tcp  ftp_data   SF        491          0     0   
1         0           udp     other   SF        146          0     0   
2         0           tcp   private   S0          0          0     0   
3         0           tcp      http   SF        232       8153     0   
4         0           tcp      http   SF        199        420     0   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0               0       0    0  ...                  25   
1               0       0    0  ...                   1   
2               0       0    0  ...                  26   
3               0       0    0  ...                 255   
4               0       0    0  ...                 255   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                    0.17                    0.03   
1                    0.00                    0.60   
2                    0.10                    0.05   


In [3]:
# Convert labels to binary (Normal: 0, Attack: 1)
train['labels'] = train['labels'].apply(lambda x: 0 if x == 'normal' else 1)
test['labels'] = test['labels'].apply(lambda x: 0 if x == 'normal' else 1)

# Convert the dataset into a textual format
def create_text(row):
    return ", ".join([f"{col}={row[col]}" for col in row.index])

train['text'] = train.apply(create_text, axis=1)
test['text'] = test.apply(create_text, axis=1)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)

In [4]:
# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Fix padding issue by assigning EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)


Map: 100%|██████████| 125973/125973 [01:03<00:00, 1972.10 examples/s]
Map: 100%|██████████| 22544/22544 [00:11<00:00, 1969.00 examples/s]


In [5]:
# Load GPT-2 model for sequence classification
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset
)

# Train model
trainer.train()


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
`evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
  0%|          | 0/47241 [00:00<?, ?it/s]

AssertionError: Cannot handle batch sizes > 1 if no padding token is defined.

In [16]:
# Get model predictions
predictions = trainer.predict(tokenized_test_dataset)

# Convert predictions to class labels
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = test["labels"].values
# Classification Report
print(classification_report(y_true, y_pred, target_names=["Normal", "Attack"]))


NameError: name 'trainer' is not defined

In [17]:
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve

# Ensure the visualizations directory exists
os.makedirs("visualizations", exist_ok=True)

# Function to plot confusion matrix
def plot_confusion_matrix(y_true, y_pred, labels, filename):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix')
    plt.savefig(f'visualizations/{filename}')
    plt.close()

# Function to plot ROC curve
def plot_roc_curve(y_true, y_scores, filename):
    fpr, tpr, _ = roc_curve(y_true, y_scores)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc='lower right')
    plt.savefig(f'visualizations/{filename}')
    plt.close()

# Function to plot Precision-Recall curve
def plot_precision_recall_curve(y_true, y_scores, filename):
    precision, recall, _ = precision_recall_curve(y_true, y_scores)
    plt.figure()
    plt.plot(recall, precision, color='blue', lw=2)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.savefig(f'visualizations/{filename}')
    plt.close()

# Function to plot training loss and accuracy trends
def plot_training_metrics(training_loss, validation_loss, accuracy, filename):
    epochs = range(1, len(training_loss) + 1)
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(epochs, training_loss, label='Training Loss')
    plt.plot(epochs, validation_loss, label='Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training & Validation Loss')
    plt.legend()
    
    plt.subplot(1, 2, 2)
    plt.plot(epochs, accuracy, label='Validation Accuracy', color='green')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.title('Validation Accuracy Over Epochs')
    plt.legend()
    
    plt.savefig(f'visualizations/{filename}')
    plt.close()

# Generate and save plots
plot_confusion_matrix(y_true, y_pred, labels=['Normal', 'Attack'], filename='confusion_matrix.png')
plot_roc_curve(y_true, predictions.predictions[:, 1], filename='roc_curve.png')
plot_precision_recall_curve(y_true, predictions.predictions[:, 1], filename='precision_recall_curve.png')

# Placeholder for training loss and accuracy (replace with actual metrics from training)
training_loss = np.random.rand(10)
validation_loss = np.random.rand(10)
accuracy = np.random.rand(10)

plot_training_metrics(training_loss, validation_loss, accuracy, filename='training_metrics.png')

print("Visualizations saved in 'visualizations' folder.")


NameError: name 'y_true' is not defined