In [1]:
from huggingface_hub import login

login("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")

#huawei-noah/TinyBERT_General_4L_312D

In [2]:
#Global Variables
NUM_CLIENTS = 10
BATCH_SIZE = 8
FEATURES_NUM = 48

In [3]:
import pandas as pd
import torch
from torch.optim import AdamW
from tqdm import tqdm
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

  import pynvml  # type: ignore[import]


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [15]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
model = torch.load("full_model.pth")
model.to(device)

  model = torch.load("full_model.pth")


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [16]:
import torch._dynamo
torch._dynamo.config.suppress_errors = True

original_model = model

# Compile the model based on TorchDynamo for faster execution
model = torch.compile(model, backend="aot_eager")

In [8]:
from torch.utils.data import Dataset

class LoaderDataset(Dataset): # Divide data into batches
    def __init__(self, encodings, labels):
        self.encodings = encodings        # X
        self.labels = labels              # y

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert each row to tensor
        input_ids = torch.tensor(self.encodings[idx][0], dtype=torch.long)
        attention_mask = torch.tensor(self.encodings[idx][1], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        return {"input_ids": input_ids, "attention_mask": attention_mask, "label": label}

In [9]:
def load_dataset(partition_id: int):
    partition_filename = f'Partitions/client_{partition_id+1}.csv'
    dataset = pd.read_csv(partition_filename)
    return dataset

In [10]:
from datasets import Dataset

def convert_to_text(dataset):
    texts = dataset.apply(lambda row: " ".join([f"{col}={row[col]}" for col in dataset.columns if col != "Label"]), axis=1)

    # Combine texts and label attributes and convert into a huggingface dataset
    hf_dataset = Dataset.from_dict({
        "text": texts.tolist(),
        "label": dataset["Label"].tolist()
    })

    return hf_dataset

In [11]:
def tokenize(hf_dataset):
    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    tokenized_datasets = hf_dataset.map(tokenize_function, batched=True)
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])
    tokenized_datasets.set_format("torch")

    return tokenized_datasets

In [12]:
from sklearn.model_selection import train_test_split

def split_dataset(tokenized_datasets):
    tokenized_datasets = tokenized_datasets.to_pandas()

    # Split data into features (X) and labels (y)
    X = tokenized_datasets[['input_ids', 'attention_mask']]  # tokenized result
    y = tokenized_datasets['label']  # Target labels
    y = y.replace({'Benign': 0, 'Malicious': 1})
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Convert data to numpy arrays for compatibility
    X_train = X_train.values.tolist()
    y_train = y_train.values.tolist()
    X_test = X_test.values.tolist()
    y_test = y_test.values.tolist()

    return X_train, y_train, X_test, y_test

In [13]:
from torch.utils.data import DataLoader

def convert_to_dataloaders(X_train, y_train, X_test, y_test):
    # Create PyTorch datasets
    train_dataset = LoaderDataset(X_train, y_train)
    val_dataset = LoaderDataset(X_test, y_test)
    
    # Create DataLoader instances for batching
    trainloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True) #num_workers=4, pin_memory=True)
    testloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True) #num_workers=4, pin_memory=True)

    return trainloader, testloader

In [14]:
# First Step: Load the dataset partition and pre-process it for training and validation

def pre_processing(partition_id: int):

    # Step 1: Load partition data from CSV file
    dataset = load_dataset(partition_id)

    # Step 2: Preprocess the dataset (Convert Network Flows into Texts)
    hf_dataset = convert_to_text(dataset)

    # Step 3: Tokenization
    tokenized_datasets = tokenize(hf_dataset)

    # Step 4: Split Data into training and testing sets
    X_train, y_train, X_test, y_test = split_dataset(tokenized_datasets)

    # Step 5: Convert to DataLoaders
    trainloader, testloader = convert_to_dataloaders(X_train, y_train, X_test, y_test)

    return trainloader, testloader
    

In [18]:
from torch import GradScaler, autocast
from torch.optim import AdamW
from tqdm import tqdm   # For showing training bar
from transformers import Adafactor      


# Effective batch size = 32 but the GPU only fits micro_batch_size = 16
effective_batch_size = 32
micro_batch_size = BATCH_SIZE
accumulation_steps = effective_batch_size // micro_batch_size  # The weights will be updated each 4 steps instead of each batch

# Fourth Step: Model Fine Tuning
def train(model, trainloader, epochs: int):
    #optimizer = AdamW(model.parameters(), lr=5e-5)
    optimizer = Adafactor(
        model.parameters(),
        #lr=1e-4,              # You can also set lr=None for auto-scaling
        scale_parameter=True, # Let Adafactor handle lr scaling
        relative_step=True    # Recommended for large models
    )
    epochs = 3

    original_model.train()  # set model to training mode
    scaler = GradScaler()
    
    for epoch in range(epochs):
        correct, total, epoch_loss = 0, 0, 0.0
        loop = tqdm(trainloader, leave=True)
        
        for step, batch in enumerate(loop):
            input_ids, attention_mask, labels = batch['input_ids'].to(device, non_blocking=True), batch['attention_mask'].to(device, non_blocking=True), batch['label'].to(device, non_blocking=True) 

            with autocast(device_type="cuda"): #stores tensors in 16-bit floats (FP16).
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
            
            if (step + 1) % accumulation_steps == 0:
                scaler.scale(loss).backward()   # backprop
                scaler.step(optimizer)          # update weights
                scaler.update()
                torch.cuda.empty_cache()        # releases unused cached memory, which can help avoid out-of-memory (OOM) errors at the cost of ~10% slower training.

            # metrics + print progress 
            epoch_loss += loss.item()  # Use .item() to get scalar value
            total += labels.size(0)

            # Get logits from model output
            logits = outputs.logits                            
            predictions = torch.argmax(logits, dim=1)   
            correct += (predictions == labels).sum().item()
            loop.set_description(f"Epoch {epoch}")
            loop.set_postfix(loss=loss.item())
            torch.cuda.empty_cache()  # to free some space in cuda
        epoch_loss /= len(trainloader.dataset)
        epoch_acc = correct / total * 100  # Convert to percentage
        print(f"Epoch {epoch+1}: train loss {epoch_loss:.4f}, accuracy {epoch_acc:.2f}%")

In [20]:
#Fifth Step: Model Testing
def test(model, testloader):
    original_model.eval()  # set model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():
        loop = tqdm(testloader, leave=True)

        for batch in loop:
            input_ids, attention_mask, labels = batch['input_ids'].to(device, non_blocking=True), batch['attention_mask'].to(device, non_blocking=True), batch['label'].to(device, non_blocking=True) 

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predicted = torch.argmax(logits, dim=1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = correct / total * 100
    print(f"Test Accuracy: {accuracy:.4f}")

In [21]:
trainloader, testloader = pre_processing(0)

Map:   0%|          | 0/86624 [00:00<?, ? examples/s]

  y = y.replace({'Benign': 0, 'Malicious': 1})


In [22]:
from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [24]:
torch.cuda.empty_cache()

In [23]:
print_gpu_utilization()

GPU memory occupied: 1242 MB.


In [24]:
train(model, trainloader, 1)
#loss, accuracy = test(net, valloader)
#print(f"Epoch {epoch+1}: validation loss {loss}, accuracy {accuracy}")

Epoch 0:   7%|▋         | 611/8663 [01:06<14:39,  9.16it/s, loss=0.706] 


KeyboardInterrupt: 

In [19]:
torch.save(model, "full_model.pth")

In [35]:
test(model, testloader)

  0%|          | 0/2166 [00:01<?, ?it/s]


AttributeError: 'NoneType' object has no attribute 'item'

In [18]:
# Example of Inferencing

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

# Move inputs to the same device as model
inputs = {key: val.to(device) for key, val in inputs.items()}

# Forward pass (no gradient needed)
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits  # raw scores

# Convert logits to probabilities
probs = torch.softmax(logits, dim=1)

# Get predicted class
pred_class = torch.argmax(probs, dim=1).item()  # 0 or 1

# Map to labels
labels = {0: "Normal", 1: "Malicious"}
print(f"Predicted class: {labels[pred_class]}, Probabilities: {probs.cpu().numpy()}")

Predicted class: Normal, Probabilities: [[0.55743384 0.44256613]]


In [None]:
# Sixth Step: GPU Usage Optimization

# Gradient Accumulation
 