In [None]:
# Install missing dependencies
!pip install datasets scikit-learn transformers

Collecting datasets
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.0-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

In [None]:
import torch
import pandas as pd
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
from google.colab import files
from sklearn.model_selection import train_test_split

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Upload the dataset
print("Please upload the 'fulldatasetsmall.csv' file:")
uploaded = files.upload()

Please upload the 'fulldatasetsmall.csv' file:


Saving fulldatasetsmall.csv to fulldatasetsmall.csv


In [None]:
# Load the dataset
df = pd.read_csv('fulldatasetsmall.csv', low_memory=False)

In [None]:
# Remove leading/trailing spaces from column names (if needed)
df.columns = df.columns.str.strip()

# Display unique labels
unique_labels = df['Label'].unique()
print(f"Number of unique labels: {len(unique_labels)}")
print(unique_labels)

Number of unique labels: 9
['UDP' 'LDAP' 'NetBIOS' 'MSSQL' 'BENIGN' 'Portmap' 'Syn' 'SSL' nan]


In [None]:
# Keep only the 7 desired labels
desired_labels = ['NetBIOS', 'BENIGN', 'LDAP', 'Portmap', 'Syn', 'MSSQL', 'UDP', 'SSL']
df = df[df['Label'].isin(desired_labels)]

In [None]:
# Convert categorical labels to numerical labels
label_mapping = {label: i for i, label in enumerate(desired_labels)}
df['label'] = df['Label'].map(label_mapping)

In [None]:
# Prepare text data (concatenate features as text input)
def convert_to_text(row):
    return " ".join(map(str, row))

df['text'] = df.drop(columns=['Label', 'label']).apply(convert_to_text, axis=1)

In [None]:
# Shuffle the dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Split dataset into train and test sets ensuring balanced classes
train_data, test_data = train_test_split(df, test_size=0.4, stratify=df['label'], random_state=42)

In [None]:
missing_labels = set(df['label'].unique()) - set(train_data['label'].unique())
if missing_labels:
    print(f"Warning: Some labels are missing in training: {missing_labels}")
else:
    print("All labels are present in the training set.")

All labels are present in the training set.


In [None]:
val_data, new_test_data = train_test_split(test_data, test_size=0.4, stratify=test_data['label'], random_state=42)

In [None]:
# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data[['text', 'label']])
test_dataset = Dataset.from_pandas(new_test_data[['text', 'label']])

In [None]:
# Tokenization
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

In [None]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/640 [00:00<?, ? examples/s]

In [None]:
# Load ModernBERT model for multi-class classification (7 labels)
model = AutoModelForSequenceClassification.from_pretrained(
    "answerdotai/ModernBERT-base", num_labels=len(desired_labels)
).to(device)

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.gradient_checkpointing_enable()

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Define the custom compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)

    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    precision = precision_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

In [None]:
# Define early stopping with patience of 1 evaluations
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=1,  # Stop if no improvement after 1 evaluations
    early_stopping_threshold=0.01  # Minimum change to qualify as an improvement
)

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",  # Evaluate every 'eval_steps'
    eval_steps=25,  # Evaluate every 100 steps
    learning_rate=5e-5,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=1,  # Set high number of epochs; early stopping will control when to stop
    weight_decay=0.01,
    push_to_hub=False,
    report_to="none",
    fp16=True,
    gradient_accumulation_steps=2,
    dataloader_num_workers=8,
    gradient_checkpointing=True,
    load_best_model_at_end=True,  # Load the best model when stopping early
    metric_for_best_model="f1",  # Define the metric to monitor (e.g., "f1")
    greater_is_better=True,  # Set to True if a higher metric is better
)



In [None]:
# Initialize Trainer with early stopping and custom metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]  # Add early stopping callback
)

  trainer = Trainer(


In [None]:
# Train the model
trainer.train()

W0315 19:59:37.537000 244 torch/_inductor/utils.py:1137] [1/0] Not enough SMs to use max_autotune_gemm mode


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
25,No log,1.546486,0.4375,0.295419,0.24881,0.4375
50,No log,0.681519,0.748437,0.714398,0.782552,0.748437
75,No log,0.532062,0.828125,0.781573,0.765133,0.828125
100,No log,0.303021,0.9125,0.900216,0.915386,0.9125
125,No log,0.158561,0.9625,0.958743,0.956098,0.9625
150,No log,0.084445,0.978125,0.974322,0.970996,0.978125


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
25,No log,1.546486,0.4375,0.295419,0.24881,0.4375
50,No log,0.681519,0.748437,0.714398,0.782552,0.748437
75,No log,0.532062,0.828125,0.781573,0.765133,0.828125
100,No log,0.303021,0.9125,0.900216,0.915386,0.9125
125,No log,0.158561,0.9625,0.958743,0.956098,0.9625
150,No log,0.084445,0.978125,0.974322,0.970996,0.978125
175,No log,0.078547,0.985938,0.982106,0.978409,0.985938


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Could not locate the best model at ./results/checkpoint-175/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=175, training_loss=1.3126681082589287, metrics={'train_runtime': 11039.6142, 'train_samples_per_second': 0.217, 'train_steps_per_second': 0.018, 'total_flos': 1.14499398795264e+16, 'train_loss': 1.3126681082589287, 'epoch': 0.875})

In [None]:
import os

# Defined in the secrets tab in Google Colab
hf_token = os.getenv('hf_token')

In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub
login(token=hf_token)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Save the model
trainer.save_model("./modernbert_ddos")
tokenizer.save_pretrained("./modernbert_ddos")

print("Fine-tuning complete. Model saved.")

Fine-tuning complete. Model saved.


In [None]:
# Push the model to the Hugging Face Hub
model_name = "ccaug/modernbert_ddos"
trainer.push_to_hub(model_name, token=hf_token)

 # Push the tokenizer as well
tokenizer.push_to_hub(model_name, token=hf_token)

print(f"Model and tokenizer pushed to the Hugging Face Hub under {model_name}")

model.safetensors:   0%|          | 0.00/598M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

Model and tokenizer pushed to the Hugging Face Hub under ccaug/modernbert_ddos


In [None]:
!zip -r modernbert_ddos.zip ./modernbert_ddos

  adding: modernbert_ddos/ (stored 0%)
  adding: modernbert_ddos/training_args.bin (deflated 52%)
  adding: modernbert_ddos/model.safetensors (deflated 7%)
  adding: modernbert_ddos/tokenizer_config.json (deflated 95%)
  adding: modernbert_ddos/config.json (deflated 60%)
  adding: modernbert_ddos/README.md (deflated 66%)
  adding: modernbert_ddos/special_tokens_map.json (deflated 79%)
  adding: modernbert_ddos/tokenizer.json (deflated 82%)


In [None]:
from google.colab import files
files.download("modernbert_ddos.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>