In [None]:
# Install missing dependencies
!pip install datasets scikit-learn transformers

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from datasets import Dataset
from sklearn.model_selection import train_test_split
from google.colab import files
import pandas as pd

In [None]:
# Load the trained model and tokenizer from Hugging Face Hub
# `model_name` is the Hub repository id; both the tokenizer and the
# classification model are fetched from this same repository.
model_name = "ccaug/modernbert-multiclass"

In [None]:
# Fetch the tokenizer and the sequence-classification model stored under `model_name`,
# then place the model on the GPU when CUDA is available, otherwise on the CPU.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to(torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.58M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/598M [00:00<?, ?B/s]

In [None]:
# Device the tokenized inputs are moved to at prediction time:
# the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the notebook's ``tokenizer``.

    Args:
        examples: Mapping with a ``"text"`` entry, as passed by ``Dataset.map``
            when it is called with ``batched=True``.

    Returns:
        The tokenizer's output for that text, produced with
        ``padding="max_length"`` and ``truncation=True``.
    """
    encoded = tokenizer(examples["text"], padding="max_length", truncation=True)
    return encoded

In [None]:
# Upload the new dataset
# Prints a prompt, then starts the Colab file upload; whatever `files.upload()`
# returns is kept in `uploaded`.
print("Please upload the 'CICDDoSdataset.csv' file:")
uploaded = files.upload()

Please upload the 'CICDDoSdataset.csv' file:


Saving CICDDoSdataset.csv to CICDDoSdataset.csv


In [None]:
# Load the dataset
# The file is read by its fixed name from the working directory, so the uploaded
# file must have been saved under exactly this name.
# `low_memory=False` makes pandas parse the file in one pass instead of in chunks,
# which avoids mixed-type inference within a column.
df = pd.read_csv('CICDDoSdataset.csv', low_memory=False)

In [None]:
# Remove leading/trailing spaces from column names (if needed),
# so that later lookups such as df['Label'] do not depend on header padding.
df.columns = df.columns.str.strip()

In [None]:
# Show how many distinct values the 'Label' column holds, followed by the values themselves
unique_labels = df['Label'].unique()
print(f"Number of unique labels: {len(unique_labels)}", unique_labels, sep="\n")

Number of unique labels: 7
['NetBIOS' 'BENIGN' 'LDAP' 'Portmap' 'Syn' 'MSSQL' 'UDP']


In [None]:
# Keep only the 7 desired labels
desired_labels = ['NetBIOS', 'BENIGN', 'LDAP', 'Portmap', 'Syn', 'MSSQL', 'UDP']
# .copy() yields an independent frame: later cells add columns to `df`, and assigning
# into the result of a boolean-mask selection would otherwise raise SettingWithCopyWarning.
df = df[df['Label'].isin(desired_labels)].copy()

In [None]:
# Convert categorical labels to numerical labels: each label's id is its position in `desired_labels`.
# NOTE(review): these ids presumably have to match the ones the checkpoint was trained with —
# confirm against model.config.id2label.
label_mapping = dict(zip(desired_labels, range(len(desired_labels))))
df['label'] = df['Label'].map(label_mapping)

In [None]:
# Prepare text data (concatenate features as text input)
def convert_to_text(row):
    """Render every value of ``row`` with ``str`` and join them with single spaces.

    Args:
        row: Iterable of values (the notebook passes one DataFrame row at a time).

    Returns:
        One string containing the values in iteration order, space-separated.
    """
    return " ".join(str(value) for value in row)

# Build the model input: one space-separated string per row, made from every column
# except the two label columns ('Label' and its numeric counterpart 'label').
# NOTE(review): the textual form of each value presumably has to match the preprocessing
# used when the model was trained — confirm before changing how rows are rendered.
df['text'] = df.drop(columns=['Label', 'label']).apply(convert_to_text, axis=1)

In [None]:
# Convert to Hugging Face Dataset format, keeping only the model input ('text')
# and the numeric target ('label').
dataset = Dataset.from_pandas(df[['text', 'label']])

In [None]:
# Tokenization
# Reuses the `tokenize_function` defined in an earlier cell instead of redefining an
# identical copy here; a second `def` with the same name would only shadow the first.
dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/14000 [00:00<?, ? examples/s]

In [None]:
# Split the frame 50/50 into train and test parts. The split is stratified on the numeric
# label, so both parts keep the class proportions of `df`; the seed is fixed for repeatability.
train_data, test_data = train_test_split(
    df,
    test_size=0.5,
    shuffle=True,
    stratify=df['label'],
    random_state=42,
)

# Wrap the test rows as a Hugging Face Dataset and tokenize them
test_dataset = Dataset.from_pandas(test_data[['text', 'label']]).map(tokenize_function, batched=True)

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

In [None]:
# Predict function using the model directly
def predict_batch(batch):
    """Classify one batch of texts with the loaded model.

    Args:
        batch: Mapping whose ``'text'`` entry holds the input strings of the batch.

    Returns:
        ``{"predictions": ...}`` where the value is a NumPy array holding, for each
        text, the index of the highest logit.
    """
    # Tokenize the raw text of this batch and move the tensors to the selected device
    encoded = tokenizer(batch['text'], padding=True, truncation=True, return_tensors="pt")
    encoded = encoded.to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(**encoded)

    # argmax over the last dimension picks the top-scoring class; convert to NumPy on the CPU
    class_ids = outputs.logits.argmax(dim=-1).cpu().numpy()
    return {"predictions": class_ids}

In [None]:
# Apply prediction with smaller batch size
# predict_batch is called on groups of up to 6 rows (batch_size=6); the "predictions"
# values it returns become a column of the resulting dataset, next to 'label'.
predictions = test_dataset.map(predict_batch, batched=True, batch_size=6)  # Reduce batch size here

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

In [None]:
# Get the predicted labels and true labels
# Both columns are read from the same mapped dataset, so entry i of one
# corresponds to entry i of the other.
pred_labels = predictions['predictions']
true_labels = predictions['label']

In [None]:
# Calculate metrics
accuracy = accuracy_score(true_labels, pred_labels)
# Weighted averages weight each class by its number of true samples.
# zero_division=0 keeps the value sklearn already falls back to when a class has no
# predicted (or no true) samples, without emitting an UndefinedMetricWarning.
precision = precision_score(true_labels, pred_labels, average='weighted', zero_division=0)
recall = recall_score(true_labels, pred_labels, average='weighted', zero_division=0)
f1 = f1_score(true_labels, pred_labels, average='weighted', zero_division=0)
# Passing every class id explicitly guarantees one row/column per entry of `desired_labels`,
# in that order, even if a class happens to be absent from both label lists.
conf_matrix = confusion_matrix(
    true_labels,
    pred_labels,
    labels=list(range(len(desired_labels))),
)

In [None]:
# Display results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:")
# `labels` is given explicitly so that `target_names` always lines up with the class ids;
# without it sklearn infers the classes from the data and raises when a class is missing.
print(classification_report(
    true_labels,
    pred_labels,
    labels=list(range(len(desired_labels))),
    target_names=desired_labels,
    zero_division=0,
))

# Rows are true classes and columns are predicted classes, ordered by class id.
print("\nConfusion Matrix:")
print(conf_matrix)

Accuracy: 0.9710
Precision: 0.9720
Recall: 0.9710
F1 Score: 0.9654

Classification Report:
              precision    recall  f1-score   support

     NetBIOS       1.00      0.99      0.99      1999
      BENIGN       0.98      0.99      0.99       909
        LDAP       0.93      0.99      0.96       997
     Portmap       1.00      0.05      0.10        95
         Syn       0.95      1.00      0.97      1000
       MSSQL       0.97      0.93      0.95      1000
         UDP       0.97      0.99      0.98      1000

    accuracy                           0.97      7000
   macro avg       0.97      0.85      0.85      7000
weighted avg       0.97      0.97      0.97      7000


Confusion Matrix:
[[1973    0    0    0    0    1   25]
 [   0  904    1    0    1    1    2]
 [   0    0  991    0    0    6    0]
 [   0   22    4    5   55    9    0]
 [   0    0    0    0 1000    0    0]
 [   0    0   64    0    0  932    4]
 [   0    0    0    0    0    8  992]]
