# **LoRA Netwok Data Fine-Tuning**

In [1]:
!pip install transformers datasets peft accelerate bitsandbytes pandas certifi huggingface_hub dotenv scikit-learn --quiet
!pip install torch torchvision torchaudio --quiet

In [2]:
import torch
import os
from huggingface_hub import login
import pandas as pd
import json
import sklearn
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import torch
from peft import get_peft_model


  from .autonotebook import tqdm as notebook_tqdm


Authenticate, Login to Hugging Face and Device Configuration

In [3]:
# Load environment variables from .env
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

if HF_TOKEN is None:
    print("Error: HF_TOKEN not found in .env file.  Please set it.")
    exit()


In [4]:
# Log in to Hugging Face Hub
try:
    login(token=HF_TOKEN)  # Log in using the token
    print("Successfully logged in to Hugging Face Hub.")
except Exception as e:
    print(f"Error logging in to Hugging Face Hub: {e}")
    exit()

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Successfully logged in to Hugging Face Hub.


In [5]:
# Device Configuration
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS device")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA device")
else:
    device = torch.device("cpu")
    print("No GPU found. Using CPU.")

Using MPS device


Load & Format Dataset

In [6]:
# Load your CSV file, handling potential encoding issues
try:
    df = pd.read_csv("Iot_device_network_logs.csv", encoding='utf-8')
except UnicodeDecodeError:
    print("Error: Could not decode CSV using UTF-8.  Trying latin1 encoding.")
    df = pd.read_csv("Iot_device_network_logs.csv", encoding='latin1')  # or 'ISO-8859-1'

In [7]:
nRow, nCol = df.shape
print(f'There are {nRow} rows and {nCol} columns')
df.info()

There are 477426 rows and 14 columns
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 477426 entries, 0 to 477425
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   frame.number  477426 non-null  int64  
 1   frame.time    477426 non-null  int64  
 2   frame.len     477426 non-null  int64  
 3   eth.src       477426 non-null  int64  
 4   eth.dst       477426 non-null  int64  
 5   ip.src        477426 non-null  int64  
 6   ip.dst        477426 non-null  int64  
 7   ip.proto      477426 non-null  float64
 8   ip.len        477426 non-null  float64
 9   tcp.len       477426 non-null  float64
 10  tcp.srcport   477426 non-null  float64
 11  tcp.dstport   477426 non-null  float64
 12  Value         477426 non-null  float64
 13  normality     477426 non-null  int64  
dtypes: float64(6), int64(8)
memory usage: 51.0 MB


In [8]:
# Mapping of traffic types
attacks = {
    0: "Normal traffic between IoT devices.",
    1: "Misconfiguration detected. Possibly a device sending out-of-spec data.",
    2: "DDoS attack detected. High volume of traffic to a specific target.",
    3: "Data type probing. Strange payloads (e.g., strings) sent to devices expecting specific types.",
    4: "Network scan identified. The source is likely scanning for active devices or open ports.",
    5: "Man-in-the-Middle attack detected. Suspicious interception between source and destination."
}

# Function to format the input and output using an instruction-based format
def format_log(row, dataset_type):
    instruction = "Determine the type of network traffic for the given log entry."
    input_data = (
        f"Source IP: {row['ip.src']}, "
        f"Destination IP: {row['ip.dst']}, "
        f"Protocol: {row['ip.proto']}, "
        f"Source Port: {row['tcp.srcport']}, "
        f"Destination Port: {row['tcp.dstport']}, "
        f"Frame Length: {row['frame.len']} bytes."
    )
    output = attacks.get(int(row['normality']), "Unknown traffic type.")

    formatted_entry = {
        "instruction": instruction,
        "input": input_data,
        "output": output
    }

    if dataset_type == 'train':
        train_data.append(formatted_entry)
    elif dataset_type == 'valid':
        valid_data.append(formatted_entry)
    else:  # test
        test_data.append(formatted_entry)

# Creating balanced samples by attack type
N_PER_TYPE = 100
samples = []
for attack_type in range(6):
    subset = df[df["normality"] == attack_type].sample(n=N_PER_TYPE, random_state=42)
    samples.append(subset)
balanced_df = pd.concat(samples).sample(frac=1, random_state=42)

# Splitting data into train, valid, and test
train_df, temp_df = train_test_split(balanced_df, test_size=0.3, random_state=42)  # 70% train, 30% temp
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)  # 15% valid, 15% test

# Initialize the lists
train_data = []
valid_data = []
test_data = []

# Apply formatting
for _, row in train_df.iterrows():
    format_log(row, 'train')

for _, row in valid_df.iterrows():
    format_log(row, 'valid')

for _, row in test_df.iterrows():
    format_log(row, 'test')

# Export (optional)
with open("train_data_logs.json", "w") as f:
    json.dump(train_data, f, indent=2)

with open("valid_data_logs.json", "w") as f:
    json.dump(valid_data, f, indent=2)

with open("test_data_logs.json", "w") as f:
    json.dump(test_data, f, indent=2)

# Display example
print("Training example:")
print(train_data[0])

Training example:
{'instruction': 'Determine the type of network traffic for the given log entry.', 'input': 'Source IP: 1921680121.0, Destination IP: 1921680198.0, Protocol: 1.0, Source Port: 0.0, Destination Port: 0.0, Frame Length: 98.0 bytes.', 'output': 'DDoS attack detected. High volume of traffic to a specific target.'}


Load & Tokenize Dataset

In [9]:
# Load sample data
df_json = pd.read_json("train_data_logs.json")

# Initialize tokenizer
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Calculate lengths
input_lengths = []
output_lengths = []

for i, row in df_json.iterrows():
    combined_input = row["instruction"] + " " + row["input"]
    input_tokens = tokenizer(combined_input)["input_ids"]
    output_tokens = tokenizer(row["output"])["input_ids"]
    
    input_lengths.append(len(input_tokens))
    output_lengths.append(len(output_tokens))

# Analysis of token lengths
input_series = pd.Series(input_lengths)
output_series = pd.Series(output_lengths)

print("Input token length statistics:")
print(input_series.describe())

print("\nOutput token length statistics:")
print(output_series.describe())

Input token length statistics:
count    420.000000
mean      85.714286
std       13.796658
min       68.000000
25%       68.000000
50%       90.000000
75%      101.000000
max      103.000000
dtype: float64

Output token length statistics:
count    420.000000
mean      17.257143
std        5.042594
min        8.000000
25%       16.000000
50%       18.000000
75%       21.000000
max       24.000000
dtype: float64


In [10]:
# Load the JSON data into a Pandas DataFrame
df_json = pd.read_json("train_data_logs.json")
dataset = Dataset.from_pandas(df_json)

# Load the pre-trained tokenizer
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" # Ensure TinyLlama is available
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
tokenizer.pad_token = tokenizer.eos_token

# Define a suitable max length for inputs and outputs
# Considering the data statistics, Set max_length to 150
max_input_length = 150  # This accounts for instruction + input combo
max_output_length = 150  # For the output token length, since 24 was max



def tokenize(example):
    # Combine instruction and input, and ensure consistent max_length for inputs
    prompts = [i + " " + x for i, x in zip(example["instruction"], example["input"])]
    inputs = tokenizer(prompts, truncation=True, padding="max_length", max_length=max_input_length)

    # Ensure the labels follow the max length meant for output, i.e., response generation
    labels = tokenizer(example["output"], truncation=True, padding="max_length", max_length=max_output_length)["input_ids"]

    # Convert labels, ensuring they retain the same batch length
    inputs["labels"] = labels
    return inputs

# Map the tokenization process over the entire dataset
tokenized_dataset = dataset.map(tokenize, batched=True, batch_size=16)

Map:   0%|          | 0/420 [00:00<?, ? examples/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Map: 100%|██████████| 420/420 [00:00<00:00, 10454.77 examples/s]


In [11]:
# Load the validation and test data from JSON files or directly from DataFrames
valid_data_df = pd.read_json("valid_data_logs.json")
test_data_df = pd.read_json("test_data_logs.json")

valid_dataset = Dataset.from_pandas(valid_data_df)
test_dataset = Dataset.from_pandas(test_data_df)

# Tokenize validation and test datasets
valid_tokenized_dataset = valid_dataset.map(tokenize, batched=True, batch_size=16)
test_tokenized_dataset = test_dataset.map(tokenize, batched=True, batch_size=16)

Map: 100%|██████████| 90/90 [00:00<00:00, 4809.19 examples/s]
Map: 100%|██████████| 90/90 [00:00<00:00, 9602.10 examples/s]


Load Model

In [12]:
# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16  # Use float16 for reduced memory usage
)

model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb): 

Lora Configuartion

In [13]:
from peft import LoraConfig, get_peft_model

# LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # Adjust based on the model
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


Compute Metrics

In [14]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

def compute_metrics(pred):
    labels = pred.label_ids
    predictions = pred.predictions  # Raw logits or probabilities

    # Check for correct shape of predictions (should be (batch_size, num_classes))
    if len(predictions.shape) == 1:
        print("Error: predictions should have shape (batch_size, num_classes).")
        return {}

    # Convert probabilities/logits to predicted class labels
    try:
        predictions = np.argmax(predictions, axis=-1)
    except Exception as e:
        print(f"Error converting predictions to class labels: {e}")
        return {}

    # Flatten arrays if needed
    if labels.ndim > 1:
        labels = labels.flatten()
    if predictions.ndim > 1:
        predictions = predictions.flatten()

    # Now check if shapes are compatible *before* calling accuracy_score
    if labels.shape != predictions.shape:
        print(f"Error: Labels and predictions have incompatible shapes: {labels.shape} vs {predictions.shape}")
        return {}

    try:
        acc = accuracy_score(labels, predictions)
    except Exception as e:
        print(f"Error computing accuracy: {e}")
        return {}

    try:
        precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted', zero_division=0)
    except Exception as e:
        print(f"Error computing precision/recall/f1: {e}")
        return {}

    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}

Train Model

In [15]:
training_args = TrainingArguments(
    output_dir="./lora_tinyllama_network_logs",
    per_device_train_batch_size=6,
    num_train_epochs=10,
    logging_steps=10,
    save_steps=100,
    gradient_checkpointing=False,  # Disable to avoid grad_fn error
    optim="adamw_torch",
    report_to= "none"
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=valid_tokenized_dataset, 
    compute_metrics=compute_metrics
)

trainer.train()



Step,Training Loss
10,11.6296
20,6.264
30,3.9468
40,2.4532
50,1.5398
60,1.1938
70,1.082
80,0.9578
90,0.8989
100,0.8013




TrainOutput(global_step=700, training_loss=0.7484269183022635, metrics={'train_runtime': 784.3245, 'train_samples_per_second': 5.355, 'train_steps_per_second': 0.892, 'total_flos': 3914714603520000.0, 'train_loss': 0.7484269183022635, 'epoch': 10.0})

Run Evaluation

In [16]:
# Optional: Evaluate on the validation dataset after training
trainer.evaluate(eval_dataset=valid_tokenized_dataset)

# Test evaluation (specific if you want a separate test evaluation)
test_results = trainer.evaluate(eval_dataset=test_tokenized_dataset)
print(test_results)





{'eval_loss': 0.2837545573711395, 'eval_accuracy': 0.8759259259259259, 'eval_precision': 0.847046226188803, 'eval_recall': 0.8759259259259259, 'eval_f1': 0.8612413793466028, 'eval_runtime': 8.3416, 'eval_samples_per_second': 10.789, 'eval_steps_per_second': 1.439, 'epoch': 10.0}


Evaluate the Finetuned model

In [17]:
# Evaluating on the Validation and Test set
results_finetuned_valid = trainer.evaluate(eval_dataset=valid_tokenized_dataset)
results_finetuned_test = trainer.evaluate(eval_dataset=test_tokenized_dataset)
print("Test Results:", results_finetuned_test)
print("Validation Results:", results_finetuned_valid)



Test Results: {'eval_loss': 0.2837545573711395, 'eval_accuracy': 0.8759259259259259, 'eval_precision': 0.847046226188803, 'eval_recall': 0.8759259259259259, 'eval_f1': 0.8612413793466028, 'eval_runtime': 8.295, 'eval_samples_per_second': 10.85, 'eval_steps_per_second': 1.447, 'epoch': 10.0}
Validation Results: {'eval_loss': 0.28205257654190063, 'eval_accuracy': 0.8774074074074074, 'eval_precision': 0.8523760658864911, 'eval_recall': 0.8774074074074074, 'eval_f1': 0.8647063362624825, 'eval_runtime': 8.1708, 'eval_samples_per_second': 11.015, 'eval_steps_per_second': 1.469, 'epoch': 10.0}


Evaluate the original model

In [18]:
# Load the pre-trained tokenizer and model
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 
original_tokenizer = AutoTokenizer.from_pretrained(model_id)
original_tokenizer.pad_token = original_tokenizer.eos_token
original_model = AutoModelForCausalLM.from_pretrained(model_id)

# Define a Trainer 
trainer_original = Trainer(
    model=original_model,
    processing_class=original_tokenizer,
    compute_metrics=compute_metrics, 
)

# Evaluate the original model
results_original_valid = trainer_original.evaluate(eval_dataset=valid_tokenized_dataset)
results_original_test = trainer_original.evaluate(eval_dataset=test_tokenized_dataset)


print("Validation Results (Original Model):", results_original_valid)
print("Test Results (Original Model):", results_original_test)





Validation Results (Original Model): {'eval_loss': 13.783700942993164, 'eval_model_preparation_time': 0.0015, 'eval_accuracy': 0.0022962962962962963, 'eval_precision': 0.1576343692351355, 'eval_recall': 0.0022962962962962963, 'eval_f1': 0.002731853187789008, 'eval_runtime': 9.7694, 'eval_samples_per_second': 9.212, 'eval_steps_per_second': 1.228}
Test Results (Original Model): {'eval_loss': 13.77518081665039, 'eval_model_preparation_time': 0.0015, 'eval_accuracy': 0.001851851851851852, 'eval_precision': 0.09812861903201135, 'eval_recall': 0.001851851851851852, 'eval_f1': 0.0018478689628232616, 'eval_runtime': 36.3382, 'eval_samples_per_second': 2.477, 'eval_steps_per_second': 0.33}


In [19]:
# ACCURACY COMPARISON BOX

data = {
    'Dataset': ['Validation', 'Test'],
    'Original Model': [results_original_valid['eval_accuracy'], results_original_test['eval_accuracy']],
    'Fine-tuned Model': [results_finetuned_valid['eval_accuracy'], results_finetuned_test['eval_accuracy']]
}

comparison_df = pd.DataFrame(data)
print("\nAccuracy Comparison:\n")
print(comparison_df.to_string(index=False)) 


Accuracy Comparison:

   Dataset  Original Model  Fine-tuned Model
Validation        0.002296          0.877407
      Test        0.001852          0.875926


Original and Finetuned Trainable parameters and model size

In [20]:
# Assuming you have your model and LoRA configuration (lora_config)
# model = get_peft_model(model, lora_config) # if you have not get_peft_model, then you should
model.print_trainable_parameters()

trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023


In [21]:
def get_model_size_in_mb(model):
   """Calculates the model size in megabytes."""
   param_size = 0
   for param in model.parameters():
       param_size += param.numel() * param.element_size()

   buffer_size = 0
   for buffer in model.buffers():
       buffer_size += buffer.numel() * buffer.element_size()

   size_all_mb = (param_size + buffer_size) / 1024**2
   return size_all_mb

def get_trainable_parameters(model):
    """Counts the number of trainable parameters in a PyTorch model."""
    total_params = 0
    trainable_params = 0
    for param in model.parameters():
        total_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    return total_params, trainable_params


# Load the original full-precision model
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model_fp16  = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16  
)
model_fp16_size_mb = get_model_size_in_mb(model_fp16)


# # Load model with quantization
# model_int8 = AutoModelForCausalLM.from_pretrained(model_name, load_in_8bit=True, device_map='auto')

# # Configure LoRA
# lora_config = LoraConfig(
#     r=8,
#     lora_alpha=32,
#     target_modules=["q_proj", "v_proj"],
#     lora_dropout=0.1,
#     bias="none",
#     task_type="CAUSAL_LM"
# )

# Apply LoRA *after* quantization
model_lora = get_peft_model(model_fp16, lora_config)

#Use  get_trainable_parameters function
total_params, trainable_params = get_trainable_parameters(model_lora)
trainable_params_size_mb = sum(p.numel() * p.element_size() for p in model_lora.parameters() if p.requires_grad) / 1024**2 

# Calculate Overall Reduction
reduction = (model_fp16_size_mb - trainable_params_size_mb) / model_fp16_size_mb * 100

print(f"Original Model Size (FP16): {model_fp16_size_mb:.2f} MB")
print(f"Trainable Parameters Size (LoRA): {trainable_params_size_mb:.2f} MB")
print(f"Overall Parameter Reduction: {reduction:.2f}%")

Original Model Size (FP16): 2098.18 MB
Trainable Parameters Size (LoRA): 4.30 MB
Overall Parameter Reduction: 99.80%
