In [1]:
# Install required packages
!pip install datasets huggingface_hub transformers

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting transformers
  Downloading transformers-4.55.0-py3-none-any.whl.metadata (39 kB)
Collecting filelock (from datasets)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting numpy>=1.17 (from datasets)
  Downloading numpy-2.3.2-cp312-cp312-macosx_14_0_x86_64.whl.metadata (62 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp312-cp312-macosx_12_0_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Using cached pandas-2.3.1-cp312-cp312-macosx_10_13_x86_64.whl.metadata (91 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.4-py3-none-any.whl.metadata (4.9 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading 

In [2]:
from huggingface_hub import HfApi, dataset_info
import requests

# Check the dataset repository structure
api = HfApi()
try:
    info = api.dataset_info("Trendyol/Trendyol-Cybersecurity-Instruction-Tuning-Dataset")
    print("Dataset exists!")
    print(f"Dataset ID: {info.id}")
    print(f"Tags: {info.tags}")
    print(f"Number of files: {len(info.siblings) if info.siblings else 0}")
    
    if info.siblings:
        print("\nFiles in repository:")
        for file in info.siblings:
            print(f"  - {file.rfilename}")
except Exception as e:
    print(f"Error accessing dataset info: {e}")

  from .autonotebook import tqdm as notebook_tqdm


Dataset exists!
Dataset ID: Trendyol/Trendyol-Cybersecurity-Instruction-Tuning-Dataset
Tags: ['task_categories:text-generation', 'task_categories:question-answering', 'language:en', 'license:apache-2.0', 'size_categories:10K<n<100K', 'format:json', 'modality:text', 'library:datasets', 'library:pandas', 'library:mlcroissant', 'library:polars', 'region:us', 'cybersecurity', 'defensive-security', 'instruction-tuning', 'threat-intelligence', 'incident-response', 'security-operations']
Number of files: 3

Files in repository:
  - .gitattributes
  - CyberSec-Dataset_escaped.jsonl
  - README.md


In [3]:
from datasets import load_dataset

# Method 1: Try loading with explicit data_files parameter
print("Method 1: Loading with explicit data_files...")
try:
    ds1 = load_dataset(
        "Trendyol/Trendyol-Cybersecurity-Instruction-Tuning-Dataset",
        data_files="CyberSec-Dataset_escaped.jsonl"
    )
    print("✅ Method 1 SUCCESS!")
    print(f"Dataset loaded with {len(ds1['train'])} examples")
    print(f"Features: {ds1['train'].features}")
except Exception as e:
    print(f"❌ Method 1 FAILED: {e}")
    print()

Method 1: Loading with explicit data_files...


Generating train split: 53201 examples [00:01, 38475.06 examples/s]


✅ Method 1 SUCCESS!
Dataset loaded with 53201 examples
Features: {'system': Value('string'), 'user': Value('string'), 'assistant': Value('string')}


In [4]:
# Method 2: Try specifying the file format explicitly
print("Method 2: Loading with explicit format...")
try:
    ds2 = load_dataset(
        "json",
        data_files="hf://datasets/Trendyol/Trendyol-Cybersecurity-Instruction-Tuning-Dataset/CyberSec-Dataset_escaped.jsonl"
    )
    print("✅ Method 2 SUCCESS!")
    print(f"Dataset loaded with {len(ds2['train'])} examples")
except Exception as e:
    print(f"❌ Method 2 FAILED: {e}")
    print()

# Method 3: Try with streaming=True to avoid local caching issues
print("Method 3: Loading with streaming...")
try:
    ds3 = load_dataset(
        "Trendyol/Trendyol-Cybersecurity-Instruction-Tuning-Dataset",
        data_files="CyberSec-Dataset_escaped.jsonl",
        streaming=True
    )
    print("✅ Method 3 SUCCESS!")
    print("Streaming dataset created successfully")
    # Convert to regular dataset for consistency
    ds3_regular = ds3['train'].take(100)  # Take first 100 examples as test
    print(f"Streaming sample taken: {len(list(ds3_regular))} examples")
except Exception as e:
    print(f"❌ Method 3 FAILED: {e}")
    print()

Method 2: Loading with explicit format...


Generating train split: 53201 examples [00:02, 19401.64 examples/s]


✅ Method 2 SUCCESS!
Dataset loaded with 53201 examples
Method 3: Loading with streaming...
✅ Method 3 SUCCESS!
Streaming dataset created successfully
Streaming sample taken: 100 examples


In [6]:
# Display sample data from the successfully loaded dataset
print("=== DATASET SAMPLE ===")
print(f"Total examples: {len(ds1['train'])}")
print(f"Features: {list(ds1['train'].features.keys())}")
print()

# Show first example
sample = ds1['train'][0]
print("Example 1:")
print(f"System: {sample['system'][:200]}..." if len(sample['system']) > 200 else f"System: {sample['system']}")
print(f"User: {sample['user'][:200]}..." if len(sample['user']) > 200 else f"User: {sample['user']}")
print(f"Assistant: {sample['assistant'][:200]}..." if len(sample['assistant']) > 200 else f"Assistant: {sample['assistant']}")
print()

# Show data distribution
print("=== BASIC STATISTICS ===")
sample_data = ds1['train'].select(range(min(1000, len(ds1['train']))))  # Sample first 1000 or total length
system_lengths = [len(ex) for ex in sample_data['system']]
user_lengths = [len(ex) for ex in sample_data['user']]
assistant_lengths = [len(ex) for ex in sample_data['assistant']]

print(f"Average system prompt length: {sum(system_lengths)/len(system_lengths):.1f} chars")
print(f"Average user message length: {sum(user_lengths)/len(user_lengths):.1f} chars")
print(f"Average assistant response length: {sum(assistant_lengths)/len(assistant_lengths):.1f} chars")

=== DATASET SAMPLE ===
Total examples: 53201
Features: ['system', 'user', 'assistant']

Example 1:
System: You are a highly specialized AI assistant for advanced cyber-defense whose mission is to deliver accurate, in-depth, actionable guidance on information-security principles—confidentiality, integrity, ...
User: Analyze encrypted C2 channels using TLS. Discuss traffic analysis techniques to fingerprint malicious sessions.
Assistant: Encrypted Command and Control (C2) channels utilizing Transport Layer Security (TLS) present significant challenges for network defenders, as traditional packet inspection methods cannot directly anal...

=== BASIC STATISTICS ===
Average system prompt length: 1085.0 chars
Average user message length: 145.7 chars
Average assistant response length: 2084.9 chars


In [9]:
# First, let's fix the quantization issue by updating the model loading approach
!pip install -q bitsandbytes accelerate

# Also ensure we have compatible versions
!pip install -q transformers>=4.36.0 peft>=0.6.0

zsh:1: 4.36.0 not found


In [10]:
# Fix the model loading with proper BitsAndBytesConfig
from transformers import BitsAndBytesConfig
import torch

class CybersecurityFineTunerFixed:
    """Fixed fine-tuning class for cybersecurity-specific models."""
    
    def __init__(self, config: dict):
        """Initialize the fine-tuner with configuration."""
        self.config = config
        self.model_name = config.get('model_name', 'microsoft/DialoGPT-medium')  # Using a smaller model for testing
        self.output_dir = config.get('output_dir', './cybersecurity-lora')
        self.max_length = config.get('max_length', 512)
        
        # Check for CUDA availability
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        
        # Initialize components
        self.tokenizer = None
        self.model = None
        
    def load_tokenizer(self):
        """Load and configure the tokenizer."""
        print(f"Loading tokenizer from {self.model_name}")
        
        from transformers import AutoTokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        
        # Set pad token if not present
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            print("Set pad_token to eos_token")
        
        print("Tokenizer loaded successfully")
    
    def load_model(self):
        """Load the base model with proper quantization configuration."""
        print(f"Loading model from {self.model_name}")
        
        from transformers import AutoModelForCausalLM
        
        # Proper quantization configuration
        if self.device == "cuda":
            # Create proper BitsAndBytesConfig
            quantization_config = BitsAndBytesConfig(
                load_in_8bit=True,
                bnb_8bit_compute_dtype=torch.float16,
                bnb_8bit_use_double_quant=True,
            )
            
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                quantization_config=quantization_config,
                device_map="auto",
                torch_dtype=torch.float16,
                trust_remote_code=True,
            )
            print("Model loaded with 8-bit quantization")
        else:
            # No quantization for CPU
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16,
                trust_remote_code=True,
            )
            print("Model loaded without quantization (CPU)")
        
        print(f"Model loaded on device: {next(self.model.parameters()).device}")
        
    def configure_lora(self):
        """Configure and apply LoRA to the model."""
        print("Configuring LoRA")
        
        from peft import LoraConfig, get_peft_model, TaskType
        
        # Get target modules based on the model architecture
        target_modules = []
        if "gpt" in self.model_name.lower():
            target_modules = ["c_attn", "c_proj", "c_fc"]
        elif "llama" in self.model_name.lower():
            target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
        else:
            # Generic targets that work for most transformer models
            target_modules = ["q_proj", "v_proj"]
        
        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=self.config.get('lora_r', 16),
            lora_alpha=self.config.get('lora_alpha', 32),
            lora_dropout=self.config.get('lora_dropout', 0.1),
            target_modules=target_modules,
        )
        
        self.model = get_peft_model(self.model, lora_config)
        
        # Print trainable parameters
        self.model.print_trainable_parameters()
        print("LoRA configuration applied")
    
    def prepare_streaming_dataset(self, streaming_dataset):
        """Convert streaming dataset to regular dataset for training."""
        print("Converting streaming dataset to regular dataset...")
        
        # Take a subset for training (you can adjust this number)
        max_samples = self.config.get('max_samples', 1000)
        
        # Convert iterable dataset to list
        train_data = []
        count = 0
        for example in streaming_dataset['train']:
            if count >= max_samples:
                break
            train_data.append(example)
            count += 1
            
            if count % 100 == 0:
                print(f"Processed {count} examples...")
        
        # Convert to Hugging Face Dataset
        from datasets import Dataset
        dataset = Dataset.from_list(train_data)
        
        print(f"Converted {len(dataset)} examples to regular dataset")
        return dataset
    
    def format_dataset(self, dataset):
        """Format the dataset for instruction following."""
        print("Formatting dataset")
        
        def format_prompt(example):
            """Format examples for instruction following."""
            # Handle the cybersecurity dataset format (system, user, assistant)
            if "system" in example and "user" in example and "assistant" in example:
                # Create a chat-like format
                prompt = f"System: {example['system']}\n\nUser: {example['user']}\n\nAssistant: {example['assistant']}"
            elif "instruction" in example and "response" in example:
                prompt = f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['response']}"
            elif "question" in example and "answer" in example:
                prompt = f"### Question:\n{example['question']}\n\n### Answer:\n{example['answer']}"
            else:
                # Fallback: convert to string
                prompt = str(example)
            
            return {"text": prompt}
        
        # Apply formatting
        formatted_dataset = dataset.map(format_prompt)
        print("Dataset formatting completed")
        return formatted_dataset
    
    def tokenize_dataset(self, dataset):
        """Tokenize the dataset for training."""
        print("Tokenizing dataset")
        
        def tokenize_function(examples):
            """Tokenize the dataset for training."""
            return self.tokenizer(
                examples["text"],
                truncation=True,
                padding=False,
                max_length=self.max_length,
                return_overflowing_tokens=False,
            )
        
        # Tokenize the dataset
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=dataset.column_names,
            desc="Tokenizing dataset"
        )
        
        # Create train/validation split
        split_ratio = self.config.get('validation_split', 0.1)
        if split_ratio > 0:
            split_dataset = tokenized_dataset.train_test_split(test_size=split_ratio)
            train_dataset = split_dataset["train"]
            eval_dataset = split_dataset["test"]
        else:
            train_dataset = tokenized_dataset
            eval_dataset = None
        
        print(f"Training samples: {len(train_dataset)}")
        if eval_dataset:
            print(f"Validation samples: {len(eval_dataset)}")
        
        return train_dataset, eval_dataset
    
    def train_streaming(self, streaming_dataset):
        """Execute the complete training pipeline with streaming dataset."""
        print("Starting training pipeline with streaming dataset")
        
        # Load components
        self.load_tokenizer()
        self.load_model()
        self.configure_lora()
        
        # Prepare dataset
        dataset = self.prepare_streaming_dataset(streaming_dataset)
        formatted_dataset = self.format_dataset(dataset)
        train_dataset, eval_dataset = self.tokenize_dataset(formatted_dataset)
        
        # Create trainer
        from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
        
        # Training arguments
        training_args = TrainingArguments(
            output_dir=self.output_dir,
            per_device_train_batch_size=self.config.get('batch_size', 1),
            per_device_eval_batch_size=self.config.get('eval_batch_size', 1),
            gradient_accumulation_steps=self.config.get('gradient_accumulation_steps', 4),
            num_train_epochs=self.config.get('epochs', 1),
            learning_rate=self.config.get('learning_rate', 2e-4),
            fp16=self.device == "cuda",
            logging_steps=self.config.get('logging_steps', 10),
            eval_steps=self.config.get('eval_steps', 100),
            save_steps=self.config.get('save_steps', 500),
            evaluation_strategy="steps" if eval_dataset else "no",
            save_strategy="steps",
            load_best_model_at_end=eval_dataset is not None,
            warmup_steps=self.config.get('warmup_steps', 50),
            lr_scheduler_type="cosine",
            report_to=None,
            remove_unused_columns=False,
            dataloader_pin_memory=False,
        )
        
        # Data collator
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )
        
        # Create trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
            tokenizer=self.tokenizer,
        )
        
        print("Starting training...")
        trainer.train()
        
        # Save model
        print("Saving model...")
        trainer.save_model()
        self.tokenizer.save_pretrained(self.output_dir)
        
        print("Training completed successfully!")

print("Fixed CybersecurityFineTuner class created!")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/tilmann/Documents/GitHub/finetune-gpt-oss-cybersecurity/.venv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/tilmann/Documents/GitHub/finetune-gpt-oss-cybersecurity/.venv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/tilmann/Documents/GitHub/finetune-gpt-oss

Fixed CybersecurityFineTuner class created!


In [11]:
# Create a configuration for the fixed fine-tuner
config_fixed = {
    "model_name": "microsoft/DialoGPT-medium",  # Using a smaller, more compatible model
    "output_dir": "./cybersecurity-lora-fixed",
    "max_length": 512,
    "max_samples": 500,  # Limit samples for testing
    "validation_split": 0.1,
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.1,
    "batch_size": 1,
    "eval_batch_size": 1,
    "gradient_accumulation_steps": 4,
    "epochs": 1,  # Just 1 epoch for testing
    "learning_rate": 2e-4,
    "logging_steps": 10,
    "eval_steps": 50,
    "save_steps": 100,
    "warmup_steps": 20,
}

print("Configuration created for fixed fine-tuner")
print(f"Will use model: {config_fixed['model_name']}")
print(f"Max samples for training: {config_fixed['max_samples']}")
print(f"Device: {'CUDA' if torch.cuda.is_available() else 'CPU'}")

Configuration created for fixed fine-tuner
Will use model: microsoft/DialoGPT-medium
Max samples for training: 500
Device: CPU


In [12]:
# Test the fixed fine-tuner with your streaming dataset
print("Testing the fixed fine-tuner...")
print(f"Using streaming dataset: {type(ds)}")
print(f"Dataset splits: {list(ds.keys())}")

# Create the fixed fine-tuner
fine_tuner_fixed = CybersecurityFineTunerFixed(config_fixed)

# Let's first test the dataset preparation without training
print("\n=== Testing dataset preparation ===")
test_dataset = fine_tuner_fixed.prepare_streaming_dataset(ds)
print(f"Dataset conversion successful! Got {len(test_dataset)} examples")

# Show a sample
if len(test_dataset) > 0:
    sample = test_dataset[0]
    print(f"\nSample data fields: {list(sample.keys())}")
    print(f"Sample system (first 100 chars): {sample['system'][:100]}...")
    print(f"Sample user (first 100 chars): {sample['user'][:100]}...")
    print(f"Sample assistant (first 100 chars): {sample['assistant'][:100]}...")

Testing the fixed fine-tuner...
Using streaming dataset: <class 'datasets.dataset_dict.IterableDatasetDict'>
Dataset splits: ['train']
Using device: cpu

=== Testing dataset preparation ===
Converting streaming dataset to regular dataset...
Processed 100 examples...
Processed 200 examples...
Processed 300 examples...
Processed 400 examples...
Processed 500 examples...
Converted 500 examples to regular dataset
Dataset conversion successful! Got 500 examples

Sample data fields: ['system', 'user', 'assistant']
Sample system (first 100 chars): You are a highly specialized AI assistant for advanced cyber-defense whose mission is to deliver acc...
Sample user (first 100 chars): Analyze encrypted C2 channels using TLS. Discuss traffic analysis techniques to fingerprint maliciou...
Sample assistant (first 100 chars): Encrypted Command and Control (C2) channels utilizing Transport Layer Security (TLS) present signifi...


In [13]:
# Now let's run the actual training with a very small setup for testing
print("=== Starting actual training ===")

# Create a smaller config for quick testing
config_small = config_fixed.copy()
config_small.update({
    "max_samples": 50,  # Very small for quick test
    "batch_size": 1,
    "gradient_accumulation_steps": 2,
    "epochs": 1,
    "logging_steps": 5,
    "eval_steps": 20,
    "save_steps": 50,
    "warmup_steps": 5,
})

print("Creating fine-tuner with small config for testing...")
fine_tuner_small = CybersecurityFineTunerFixed(config_small)

print("Starting training (this may take a few minutes even with small dataset)...")
try:
    fine_tuner_small.train_streaming(ds)
    print("✅ Training completed successfully!")
except Exception as e:
    print(f"❌ Training failed with error: {e}")
    import traceback
    traceback.print_exc()

=== Starting actual training ===
Creating fine-tuner with small config for testing...
Using device: cpu
Starting training (this may take a few minutes even with small dataset)...
Starting training pipeline with streaming dataset
Loading tokenizer from microsoft/DialoGPT-medium
Set pad_token to eos_token
Tokenizer loaded successfully
Loading model from microsoft/DialoGPT-medium
❌ Training failed with error: Due to a serious vulnerability issue in `torch.load`, even with `weights_only=True`, we now require users to upgrade torch to at least v2.6 in order to use the function. This version restriction does not apply when loading files with safetensors.
See the vulnerability report here https://nvd.nist.gov/vuln/detail/CVE-2025-32434


Traceback (most recent call last):
  File "/var/folders/r7/wfm9wlx552s4w8c7qp7n015m0000gn/T/ipykernel_24090/293461785.py", line 22, in <module>
    fine_tuner_small.train_streaming(ds)
  File "/var/folders/r7/wfm9wlx552s4w8c7qp7n015m0000gn/T/ipykernel_24090/2784349576.py", line 197, in train_streaming
    self.load_model()
  File "/var/folders/r7/wfm9wlx552s4w8c7qp7n015m0000gn/T/ipykernel_24090/2784349576.py", line 62, in load_model
    self.model = AutoModelForCausalLM.from_pretrained(
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tilmann/Documents/GitHub/finetune-gpt-oss-cybersecurity/.venv/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py", line 600, in from_pretrained
    return model_class.from_pretrained(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tilmann/Documents/GitHub/finetune-gpt-oss-cybersecurity/.venv/lib/python3.12/site-packages/transformers/modeling_utils.py", line 316, in _wrapper
    return func(*args, **kwargs)
 

In [14]:
# SOLUTION FOR GOOGLE COLAB
print("=== SOLUTIONS FOR GOOGLE COLAB ===")
print()

print("1. TORCH VERSION ISSUE:")
print("In Google Colab, run this first:")
print("!pip install torch>=2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118")
print("Then restart runtime!")
print()

print("2. QUANTIZATION ISSUE FIX:")
print("For Google Colab with CUDA, use this updated code:")
print()

# Provide the corrected Colab-specific code
colab_code = '''
# FOR GOOGLE COLAB - Run this in your Colab notebook:

# 1. First install/upgrade packages
!pip install torch>=2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q transformers>=4.40.0 peft>=0.7.0 bitsandbytes accelerate datasets

# 2. Restart runtime, then use this code:

from transformers import BitsAndBytesConfig
import torch

# Fixed quantization config for newer transformers
def create_quantization_config():
    return BitsAndBytesConfig(
        load_in_8bit=True,
        bnb_8bit_compute_dtype=torch.float16,
        bnb_8bit_quant_type="nf8",
        bnb_8bit_use_double_quant=True,
    )

# Updated model loading for Colab
def load_model_colab(model_name):
    from transformers import AutoModelForCausalLM
    
    if torch.cuda.is_available():
        quantization_config = create_quantization_config()
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True,
            use_safetensors=True,  # This helps with the security issue
        )
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            trust_remote_code=True,
            use_safetensors=True,
        )
    return model

# 3. For your cybersecurity dataset, use this simple approach:
def simple_train_with_streaming_dataset(ds, model_name="microsoft/DialoGPT-medium"):
    from datasets import Dataset
    from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
    from transformers import DataCollatorForLanguageModeling
    from peft import LoraConfig, get_peft_model, TaskType
    
    # Convert streaming to regular dataset (first 1000 examples)
    train_data = []
    for i, example in enumerate(ds['train']):
        if i >= 1000:  # Limit for memory
            break
        # Format the data
        text = f"System: {example['system']}\\n\\nUser: {example['user']}\\n\\nAssistant: {example['assistant']}"
        train_data.append({"text": text})
    
    dataset = Dataset.from_list(train_data)
    print(f"Prepared {len(dataset)} examples")
    
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    model = load_model_colab(model_name)
    
    # Apply LoRA
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=16,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["c_attn", "c_proj"] if "gpt" in model_name.lower() else ["q_proj", "v_proj"],
    )
    model = get_peft_model(model, lora_config)
    
    # Tokenize dataset
    def tokenize_function(examples):
        return tokenizer(examples["text"], truncation=True, padding=False, max_length=512)
    
    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    
    # Training
    training_args = TrainingArguments(
        output_dir="./cybersecurity-lora",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=1,
        learning_rate=2e-4,
        fp16=True if torch.cuda.is_available() else False,
        logging_steps=10,
        save_steps=500,
        report_to=None,
    )
    
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )
    
    trainer.train()
    trainer.save_model()
    return model, tokenizer

# Usage in Colab:
# model, tokenizer = simple_train_with_streaming_dataset(ds)
'''

print("COPY THIS CODE TO GOOGLE COLAB:")
print("=" * 50)
print(colab_code)
print("=" * 50)

=== SOLUTIONS FOR GOOGLE COLAB ===

1. TORCH VERSION ISSUE:
In Google Colab, run this first:
!pip install torch>=2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
Then restart runtime!

2. QUANTIZATION ISSUE FIX:
For Google Colab with CUDA, use this updated code:

COPY THIS CODE TO GOOGLE COLAB:

# FOR GOOGLE COLAB - Run this in your Colab notebook:

# 1. First install/upgrade packages
!pip install torch>=2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q transformers>=4.40.0 peft>=0.7.0 bitsandbytes accelerate datasets

# 2. Restart runtime, then use this code:

from transformers import BitsAndBytesConfig
import torch

# Fixed quantization config for newer transformers
def create_quantization_config():
    return BitsAndBytesConfig(
        load_in_8bit=True,
        bnb_8bit_compute_dtype=torch.float16,
        bnb_8bit_quant_type="nf8",
        bnb_8bit_use_double_quant=True,
    )

# Updated model loading fo

In [1]:
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q git+https://github.com/huggingface/transformers triton==3.4 git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels
!pip uninstall -q torchvision torchaudio -y

In [2]:
!pip install -q peft

In [3]:
import argparse
import json
import logging
import os
import sys
from pathlib import Path
from typing import Dict, List, Optional, Tuple


In [4]:
import torch
from datasets import Dataset, load_dataset
from peft import LoraConfig, PeftModel, TaskType, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
import yaml


2025-08-09 19:58:15.997567: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [6]:
class CybersecurityFineTuner:
    """Fine-tuning class for cybersecurity-specific GPT-OSS20B model."""
    
    def __init__(self, config: Dict):
        """Initialize the fine-tuner with configuration."""
        self.config = config
        self.model_name = config.get('model_name', 'openai/gpt-oss-20b')
        self.output_dir = config.get('output_dir', './gpt-oss-cybersecurity-lora')
        self.max_length = config.get('max_length', 512)
        
        # Check for CUDA availability
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        if self.device == "cpu":
            logger.warning("CUDA not available. Training will be very slow on CPU.")
        
        logger.info(f"Using device: {self.device}")
        
        # Initialize components
        self.tokenizer = None
        self.model = None
        self.dataset = None
        
    def load_tokenizer(self) -> None:
        """Load and configure the tokenizer."""
        logger.info(f"Loading tokenizer from {self.model_name}")
        
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        
        # Set pad token if not present
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            logger.info("Set pad_token to eos_token")
        
        logger.info("Tokenizer loaded successfully")
    
    def load_model(self) -> None:
        """Load the base model with quantization."""
        logger.info(f"Loading model from {self.model_name}")
        
        # Model loading configuration
        model_kwargs = {
            "torch_dtype": torch.float16,
            "device_map": "auto",
            "trust_remote_code": True,
        }
        
        # Add quantization if CUDA is available
        if self.device == "cuda":
            model_kwargs["load_in_8bit"] = True
            logger.info("Using 8-bit quantization")
        
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            **model_kwargs
        )
        
        logger.info(f"Model loaded on device: {self.model.device}")
        
    def configure_lora(self) -> None:
        """Configure and apply LoRA to the model."""
        logger.info("Configuring LoRA")
        
        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=self.config.get('lora_r', 16),
            lora_alpha=self.config.get('lora_alpha', 32),
            lora_dropout=self.config.get('lora_dropout', 0.1),
            target_modules=[
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj"
            ]
        )
        
        self.model = get_peft_model(self.model, lora_config)
        
        # Print trainable parameters
        self.model.print_trainable_parameters()
        logger.info("LoRA configuration applied")
    
    def load_cybersecurity_dataset(self, dataset_path: str) -> None:
        """Load and prepare the cybersecurity dataset."""
        logger.info(f"Loading dataset from {dataset_path}")
        
        if dataset_path.endswith('.jsonl'):
            # Load JSONL file
            data = []
            with open(dataset_path, 'r', encoding='utf-8') as f:
                for line in f:
                    data.append(json.loads(line.strip()))
            
            self.dataset = Dataset.from_list(data)
            
        elif dataset_path.endswith('.json'):
            # Load JSON file
            with open(dataset_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            # Handle different JSON structures
            if isinstance(data, list):
                self.dataset = Dataset.from_list(data)
            elif isinstance(data, dict) and 'data' in data:
                self.dataset = Dataset.from_list(data['data'])
            else:
                raise ValueError("Unsupported JSON structure")
                
        else:
            # Try loading from Hugging Face datasets
            self.dataset = load_dataset(dataset_path, split='train')
        
        logger.info(f"Dataset loaded with {len(self.dataset)} examples")
    
    def format_dataset(self) -> None:
        """Format the dataset for instruction following."""
        logger.info("Formatting dataset")
        
        def format_prompt(example):
            """Format examples for instruction following."""
            # Try different common field combinations
            if "instruction" in example and "response" in example:
                prompt = f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['response']}"
            elif "question" in example and "answer" in example:
                prompt = f"### Question:\n{example['question']}\n\n### Answer:\n{example['answer']}"
            elif "input" in example and "output" in example:
                prompt = f"### Input:\n{example['input']}\n\n### Output:\n{example['output']}"
            elif "prompt" in example and "completion" in example:
                prompt = f"### Instruction:\n{example['prompt']}\n\n### Response:\n{example['completion']}"
            elif "text" in example:
                # Use as-is if already formatted
                prompt = example['text']
            else:
                # Fallback: use first two string fields found
                string_fields = [k for k, v in example.items() if isinstance(v, str)]
                if len(string_fields) >= 2:
                    prompt = f"### Input:\n{example[string_fields[0]]}\n\n### Output:\n{example[string_fields[1]]}"
                else:
                    logger.warning(f"Could not format example: {example.keys()}")
                    prompt = str(example)
            
            return {"text": prompt}
        
        # Apply formatting
        self.dataset = self.dataset.map(format_prompt)
        logger.info("Dataset formatting completed")
    
    def tokenize_dataset(self) -> Tuple[Dataset, Dataset]:
        """Tokenize the dataset and create train/validation splits."""
        logger.info("Tokenizing dataset")
        
        def tokenize_function(examples):
            """Tokenize the dataset for training."""
            return self.tokenizer(
                examples["text"],
                truncation=True,
                padding=False,
                max_length=self.max_length,
                return_overflowing_tokens=False,
            )
        
        # Tokenize the dataset
        tokenized_dataset = self.dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=self.dataset.column_names,
            desc="Tokenizing dataset"
        )
        
        # Create train/validation split
        split_ratio = self.config.get('validation_split', 0.1)
        if split_ratio > 0:
            split_dataset = tokenized_dataset.train_test_split(test_size=split_ratio)
            train_dataset = split_dataset["train"]
            eval_dataset = split_dataset["test"]
        else:
            train_dataset = tokenized_dataset
            eval_dataset = None
        
        logger.info(f"Training samples: {len(train_dataset)}")
        if eval_dataset:
            logger.info(f"Validation samples: {len(eval_dataset)}")
        
        return train_dataset, eval_dataset
    
    def create_trainer(self, train_dataset: Dataset, eval_dataset: Optional[Dataset] = None) -> Trainer:
        """Create and configure the trainer."""
        logger.info("Creating trainer")
        
        # Training arguments
        training_args = TrainingArguments(
            output_dir=self.output_dir,
            per_device_train_batch_size=self.config.get('batch_size', 2),
            per_device_eval_batch_size=self.config.get('eval_batch_size', 2),
            gradient_accumulation_steps=self.config.get('gradient_accumulation_steps', 8),
            num_train_epochs=self.config.get('epochs', 3),
            learning_rate=self.config.get('learning_rate', 2e-4),
            fp16=self.device == "cuda",
            logging_steps=self.config.get('logging_steps', 10),
            eval_steps=self.config.get('eval_steps', 100),
            save_steps=self.config.get('save_steps', 500),
            evaluation_strategy="steps" if eval_dataset else "no",
            save_strategy="steps",
            load_best_model_at_end=eval_dataset is not None,
            metric_for_best_model="eval_loss" if eval_dataset else None,
            greater_is_better=False,
            warmup_steps=self.config.get('warmup_steps', 100),
            lr_scheduler_type="cosine",
            report_to=None,
            remove_unused_columns=False,
            dataloader_pin_memory=False,
        )
        
        # Data collator
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )
        
        # Create trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
            tokenizer=self.tokenizer,
        )
        
        return trainer
    
    def train(self, dataset_path: str) -> None:
        """Execute the complete training pipeline."""
        logger.info("Starting training pipeline")
        
        # Load components
        self.load_tokenizer()
        self.load_model()
        self.configure_lora()
        
        # Prepare dataset
        self.load_cybersecurity_dataset(dataset_path)
        self.format_dataset()
        train_dataset, eval_dataset = self.tokenize_dataset()
        
        # Create and run trainer
        trainer = self.create_trainer(train_dataset, eval_dataset)
        
        logger.info("Starting training...")
        trainer.train()
        
        # Save model
        logger.info("Saving model...")
        trainer.save_model()
        self.tokenizer.save_pretrained(self.output_dir)
        
        logger.info("Training completed successfully!")
    
    def generate_response(self, prompt: str, max_length: int = 256) -> str:
        """Generate response using the fine-tuned model."""
        if not self.model or not self.tokenizer:
            raise ValueError("Model and tokenizer must be loaded first")
        
        # Format the prompt
        formatted_prompt = f"### Instruction:\n{prompt}\n\n### Response:\n"
        
        # Tokenize input
        inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.model.device)
        
        # Generate response
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_length=len(inputs['input_ids'][0]) + max_length,
                temperature=0.7,
                do_sample=True,
                top_p=0.9,
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
            )
        
        # Decode and return response
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Extract just the generated part
        generated_text = response[len(formatted_prompt):]
        
        return generated_text.strip()


In [7]:
def load_fine_tuned_model(base_model_path: str, lora_adapter_path: str) -> Tuple[PeftModel, AutoTokenizer]:
    """Load a fine-tuned model for inference."""
    logger.info(f"Loading fine-tuned model from {lora_adapter_path}")
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(lora_adapter_path)
    
    # Load base model
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        torch_dtype=torch.float16,
        device_map="auto",
        load_in_8bit=True if torch.cuda.is_available() else False
    )
    
    # Load LoRA adapter
    model = PeftModel.from_pretrained(base_model, lora_adapter_path)
    
    return model, tokenizer


def create_sample_config() -> Dict:
    """Create a sample configuration."""
    return {
        "model_name": "openai/gpt-oss-20b",
        "output_dir": "./gpt-oss-cybersecurity-lora",
        "max_length": 512,
        "validation_split": 0.1,
        "lora_r": 16,
        "lora_alpha": 32,
        "lora_dropout": 0.1,
        "batch_size": 2,
        "eval_batch_size": 2,
        "gradient_accumulation_steps": 8,
        "epochs": 3,
        "learning_rate": 2e-4,
        "logging_steps": 10,
        "eval_steps": 100,
        "save_steps": 500,
        "warmup_steps": 100,
    }


def create_sample_dataset(output_path: str) -> None:
    """Create a sample cybersecurity dataset for testing."""
    sample_data = [
        {
            "instruction": "What are the key indicators of a potential SQL injection attack?",
            "response": "Key indicators of SQL injection attacks include: 1) Unusual database queries in logs, 2) Error messages revealing database structure, 3) Unexpected application behavior when special characters are entered, 4) Slow database performance, 5) Authentication bypasses, and 6) Data exfiltration patterns in network traffic."
        },
        {
            "instruction": "How should an organization respond to a ransomware attack?",
            "response": "Ransomware response steps: 1) Immediately isolate affected systems, 2) Activate incident response team, 3) Assess scope of infection, 4) Preserve evidence, 5) Restore from clean backups if available, 6) Report to law enforcement and relevant authorities, 7) Communicate with stakeholders, 8) Conduct post-incident analysis, and 9) Improve security measures based on lessons learned."
        },
        {
            "instruction": "What is the principle of least privilege in cybersecurity?",
            "response": "The principle of least privilege is a security concept where users, applications, and systems are granted only the minimum access rights necessary to perform their functions. This reduces the attack surface by limiting potential damage if credentials are compromised and helps prevent lateral movement in case of a breach."
        }
    ]
    
    with open(output_path, 'w', encoding='utf-8') as f:
        for item in sample_data:
            f.write(json.dumps(item) + '\n')
    
    logger.info(f"Sample dataset created at {output_path}")


In [8]:
def main():
    """Main function to handle command line arguments and execute training."""
    parser = argparse.ArgumentParser(description='Fine-tune GPT-OSS20B for cybersecurity')
    parser.add_argument('--config', type=str, help='Path to configuration YAML file')
    parser.add_argument('--dataset', type=str, required=True, help='Path to dataset file')
    parser.add_argument('--create-sample-config', action='store_true', 
                       help='Create sample configuration file')
    parser.add_argument('--create-sample-dataset', type=str,
                       help='Create sample dataset at specified path')
    parser.add_argument('--inference', type=str, help='Path to fine-tuned model for inference')
    parser.add_argument('--prompt', type=str, help='Prompt for inference')
    
    args = parser.parse_args()
    
    # Create sample configuration
    if args.create_sample_config:
        config = create_sample_config()
        with open('config.yaml', 'w') as f:
            yaml.dump(config, f, default_flow_style=False)
        logger.info("Sample configuration created as 'config.yaml'")
        return
    
    # Create sample dataset
    if args.create_sample_dataset:
        create_sample_dataset(args.create_sample_dataset)
        return
    
    # Inference mode
    if args.inference:
        if not args.prompt:
            logger.error("--prompt is required for inference")
            sys.exit(1)
        
        model, tokenizer = load_fine_tuned_model("openai/gpt-oss-20b", args.inference)
        
        # Create temporary fine-tuner for inference
        config = {"model_name": "openai/gpt-oss-20b"}
        fine_tuner = CybersecurityFineTuner(config)
        fine_tuner.model = model
        fine_tuner.tokenizer = tokenizer
        
        response = fine_tuner.generate_response(args.prompt)
        print(f"Query: {args.prompt}")
        print(f"Response: {response}")
        return
    
    # Training mode
    if not args.dataset:
        logger.error("--dataset is required for training")
        sys.exit(1)
    
    # Load configuration
    if args.config:
        with open(args.config, 'r') as f:
            config = yaml.safe_load(f)
    else:
        config = create_sample_config()
        logger.info("Using default configuration")
    
    # Create and run fine-tuner
    fine_tuner = CybersecurityFineTuner(config)
    fine_tuner.train(args.dataset)


if __name__ == "__main__":
    main()

usage: ipykernel_launcher.py [-h] [--config CONFIG] --dataset DATASET
                             [--create-sample-config]
                             [--create-sample-dataset CREATE_SAMPLE_DATASET]
                             [--inference INFERENCE] [--prompt PROMPT]
ipykernel_launcher.py: error: the following arguments are required: --dataset


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [16]:
# config = create_sample_config()
# with open('config.yaml', 'w') as f:
#     yaml.dump(config, f, default_flow_style=False)
# logger.info("Sample configuration created as 'config.yaml'")

In [17]:
# config = yaml.safe_load("config.yaml")

In [7]:
from datasets import load_dataset

ds = load_dataset(
        "Trendyol/Trendyol-Cybersecurity-Instruction-Tuning-Dataset",
        data_files="CyberSec-Dataset_escaped.jsonl",
        streaming=True
)

In [8]:
# Create and run fine-tuner
fine_tuner = CybersecurityFineTuner(config)
fine_tuner.train(ds)

NameError: name 'CybersecurityFineTuner' is not defined

In [18]:
config = create_sample_config()
logger.info("Using default configuration")
    
# Create and run fine-tuner
fine_tuner = CybersecurityFineTuner(config)
# fine_tuner.train(args.dataset)

2025-08-09 20:10:16,025 - __main__ - INFO - Using default configuration
2025-08-09 20:10:16,029 - __main__ - INFO - Using device: cpu


In [19]:
fine_tuner.train()

TypeError: CybersecurityFineTuner.train() missing 1 required positional argument: 'dataset_path'

In [None]:
dataset_path = 