# LLM Distillation Training for LOTL Detection

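This notebook fine-tunes a lightweight LLM to distill Claude-Sonnet-4.5's reasoning for Living-off-the-Land (LOTL) attack detection.
Run it on Google Colab for GPU acceleration. Before running the import cell, upload `data.jsonl` together with the project modules `data_loader.py` and `llm_distiller.py` (and, optionally, `augmentation.py`).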


In [1]:
# Install dependencies
!pip install torch transformers sentencepiece accelerate
!pip install sentence-transformers scikit-learn numpy pandas

# Upload data.jsonl to Colab
# Use the file uploader or mount Google Drive




In [3]:
import json
import torch
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, 
    TrainingArguments, Trainer, DataCollatorForLanguageModeling
)
from torch.utils.data import Dataset
from pathlib import Path
import numpy as np

# Import our modules (upload these files to Colab)
from data_loader import load_dataset, filter_label_agreement, get_labels
from llm_distiller import LLMDistiller


ModuleNotFoundError: No module named 'data_loader'

## Load and Prepare Data


In [None]:
# Read all events from the JSONL file
events = load_dataset('data.jsonl')
print(f"Loaded {len(events)} events")

# Retain only the events where Claude's label and the ground truth agree;
# the second return value is discarded.
filtered_events, _ = filter_label_agreement(events)
print(f"Kept {len(filtered_events)} events with agreement")

# Claude's labels are the distillation target
labels = get_labels(filtered_events, use_claude_label=True)

print(f"Base training set: {len(filtered_events)} events")

# Start from the un-augmented data; it is replaced below only when the
# optional augmentation step completes without error.
distillation_events, distillation_labels = filtered_events, labels

# Optional: apply augmentation to increase data diversity.
# Any failure (module missing or augmentation raising) is reported as a
# warning and training proceeds on the un-augmented set.
try:
    from augmentation import augment_data

    augmented_events, augmented_labels = augment_data(
        filtered_events, labels, augmentation_factor=1, random_seed=42
    )
    print(f"After augmentation: {len(augmented_events)} events")
except Exception as e:
    print(f"Warning: augmentation not available or failed: {e}")
else:
    distillation_events, distillation_labels = augmented_events, augmented_labels

print(f"Using {len(distillation_events)} events for distillation training")


## Prepare Training Data


In [None]:
# Initialize distiller
distiller = LLMDistiller(model_name="microsoft/DialoGPT-small")

# Prepare training pairs (prompt, response).
# Use `distillation_events` -- the set the previous cell selected (augmented
# when augmentation succeeded, otherwise identical to `filtered_events`) and
# reported as the training set. Passing `filtered_events` here silently
# discarded the augmentation.
# NOTE(review): assumes augmented events carry the same fields that
# prepare_training_data reads -- confirm against augmentation.py.
training_pairs = distiller.prepare_training_data(distillation_events)
print(f"Prepared {len(training_pairs)} training pairs")

# Fail with a clear message instead of an IndexError on the preview below
# (and an empty dataset later) when nothing was produced.
if len(training_pairs) == 0:
    raise ValueError(
        "prepare_training_data returned no training pairs; "
        "check data.jsonl and the label-agreement filter"
    )

# Show example (first 200 characters of each side)
print("\nExample training pair:")
print(f"Prompt: {training_pairs[0][0][:200]}...")
print(f"Response: {training_pairs[0][1][:200]}...")


## Create Dataset Class and Load Tokenizer/Model


In [None]:
class LOTLDataset(Dataset):
    """Map-style dataset of prompt/response texts, tokenized on access.

    Each (prompt, response) pair is joined into a single string of the form
    ``"{prompt} <|endoftext|> {response} <|endoftext|>"``. Tokenization is
    deferred to ``__getitem__``.

    Args:
        training_pairs: Iterable of ``(prompt, response)`` 2-tuples.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=...,
            max_length=..., padding=..., return_tensors=...)`` whose result
            is indexable by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every example is truncated/padded to.
    """

    def __init__(self, training_pairs, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

        # One string per pair; prompt and response are each followed by the
        # literal end-of-text marker.
        self.texts = [
            f"{prompt} <|endoftext|> {response} <|endoftext|>"
            for prompt, response in training_pairs
        ]

    def __len__(self):
        """Return the number of training texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return flattened tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'``, each the
            tokenizer's output for that key passed through ``.flatten()``.
        """
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
        )
        # The tokenizer output is flattened so each item is one-dimensional.
        return {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }

# Load tokenizer and model
# Both are loaded from the same checkpoint name so their vocabularies match.
model_name = "microsoft/DialoGPT-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# LOTLDataset tokenizes with padding='max_length', which needs a pad token.
# When the tokenizer defines none, the EOS token is reused for padding.
# NOTE(review): pad and EOS then share one token, so anything that masks
# positions by pad-token id will also mask genuine EOS tokens.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

# Create dataset
# Wraps the (prompt, response) pairs built in the earlier cell; max_length
# is left at LOTLDataset's default of 512.
dataset = LOTLDataset(training_pairs, tokenizer)
print(f"Dataset size: {len(dataset)}")


In [None]:
# Training arguments
# No evaluation strategy is passed: this run has no eval dataset and the
# default is already "no". The explicit keyword `evaluation_strategy` was
# renamed to `eval_strategy` in newer transformers releases, so naming either
# spelling fails on some installed version; relying on the default works on all.
training_args = TrainingArguments(
    output_dir='./llm_distillation_output',
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 examples per optimizer step per device
    warmup_steps=100,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    fp16=torch.cuda.is_available(),  # Use FP16 if GPU available
)


def causal_lm_collator(features):
    """Stack dataset items into a batch and build causal-LM labels.

    Args:
        features: List of dicts, each holding 1-D ``'input_ids'`` and
            ``'attention_mask'`` tensors of equal length (as produced by
            LOTLDataset, which pads every example to ``max_length``).

    Returns:
        dict with batched ``'input_ids'``, ``'attention_mask'`` and
        ``'labels'``. Labels copy ``input_ids`` with padded positions
        (``attention_mask == 0``) set to -100 so the loss ignores them.
    """
    batch = {
        key: torch.stack([example[key] for example in features])
        for key in ('input_ids', 'attention_mask')
    }
    labels = batch['input_ids'].clone()
    labels[batch['attention_mask'] == 0] = -100
    batch['labels'] = labels
    return batch


# Data collator
# DataCollatorForLanguageModeling(mlm=False) masks every label whose id equals
# pad_token_id. This notebook aliases the pad token to EOS when the tokenizer
# has no pad token, so that collator would also drop the real <|endoftext|>
# terminators from the loss and the model would never be trained to stop.
# Masking by attention_mask excludes only the padding.
data_collator = causal_lm_collator

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Train
print("Starting training...")
trainer.train()
print("Training complete!")


In [None]:
# Write the fine-tuned model and its tokenizer into the same directory so the
# folder can be loaded as a unit later.
export_dir = './llm_distillation_model'
for artifact in (model, tokenizer):
    artifact.save_pretrained(export_dir)

print("Model saved to ./llm_distillation_model")

# Download the model
# Imported only at this point, after the save above has completed.
from google.colab import files
import shutil

# Zip the saved directory (creates llm_distillation_model.zip in the working
# directory) and hand the archive to the Colab download helper.
shutil.make_archive('llm_distillation_model', 'zip', export_dir)
files.download('llm_distillation_model.zip')

print("Model downloaded! Extract and place in lotl_detector/models/llm_distillation/")
