In [55]:
# Install required packages
!pip install datasets huggingface_hub transformers



In [56]:
from huggingface_hub import HfApi, dataset_info
import requests

# Check the dataset repository structure
api = HfApi()
try:
    info = api.dataset_info("Trendyol/Trendyol-Cybersecurity-Instruction-Tuning-Dataset")
    print("Dataset exists!")
    print(f"Dataset ID: {info.id}")
    print(f"Tags: {info.tags}")
    print(f"Number of files: {len(info.siblings) if info.siblings else 0}")

    if info.siblings:
        print("\nFiles in repository:")
        for file in info.siblings:
            print(f"  - {file.rfilename}")
except Exception as e:
    print(f"Error accessing dataset info: {e}")

Dataset exists!
Dataset ID: Trendyol/Trendyol-Cybersecurity-Instruction-Tuning-Dataset
Tags: ['task_categories:text-generation', 'task_categories:question-answering', 'language:en', 'license:apache-2.0', 'size_categories:10K<n<100K', 'format:json', 'modality:text', 'library:datasets', 'library:pandas', 'library:mlcroissant', 'library:polars', 'region:us', 'cybersecurity', 'defensive-security', 'instruction-tuning', 'threat-intelligence', 'incident-response', 'security-operations']
Number of files: 3

Files in repository:
  - .gitattributes
  - CyberSec-Dataset_escaped.jsonl
  - README.md


In [57]:
from datasets import load_dataset

# Method 1: Try loading with explicit data_files parameter
print("Method 1: Loading with explicit data_files...")
try:
    ds1 = load_dataset(
        "Trendyol/Trendyol-Cybersecurity-Instruction-Tuning-Dataset",
        data_files="CyberSec-Dataset_escaped.jsonl"
    )
    print("✅ Method 1 SUCCESS!")
    print(f"Dataset loaded with {len(ds1['train'])} examples")
    print(f"Features: {ds1['train'].features}")
except Exception as e:
    print(f"❌ Method 1 FAILED: {e}")
    print()

Method 1: Loading with explicit data_files...
✅ Method 1 SUCCESS!
Dataset loaded with 53201 examples
Features: {'system': Value('string'), 'user': Value('string'), 'assistant': Value('string')}


In [58]:
# Method 2: Try specifying the file format explicitly
print("Method 2: Loading with explicit format...")
try:
    ds2 = load_dataset(
        "json",
        data_files="hf://datasets/Trendyol/Trendyol-Cybersecurity-Instruction-Tuning-Dataset/CyberSec-Dataset_escaped.jsonl"
    )
    print("✅ Method 2 SUCCESS!")
    print(f"Dataset loaded with {len(ds2['train'])} examples")
except Exception as e:
    print(f"❌ Method 2 FAILED: {e}")
    print()

# Method 3: Try with streaming=True to avoid local caching issues
print("Method 3: Loading with streaming...")
try:
    ds3 = load_dataset(
        "Trendyol/Trendyol-Cybersecurity-Instruction-Tuning-Dataset",
        data_files="CyberSec-Dataset_escaped.jsonl",
        streaming=True
    )
    print("✅ Method 3 SUCCESS!")
    print("Streaming dataset created successfully")
    # Convert to regular dataset for consistency
    ds3_regular = ds3['train'].take(100)  # Take first 100 examples as test
    print(f"Streaming sample taken: {len(list(ds3_regular))} examples")
except Exception as e:
    print(f"❌ Method 3 FAILED: {e}")
    print()

Method 2: Loading with explicit format...
✅ Method 2 SUCCESS!
Dataset loaded with 53201 examples
Method 3: Loading with streaming...
✅ Method 3 SUCCESS!
Streaming dataset created successfully
Streaming sample taken: 100 examples


In [59]:
# Display sample data from the successfully loaded dataset
print("=== DATASET SAMPLE ===")
print(f"Total examples: {len(ds1['train'])}")
print(f"Features: {list(ds1['train'].features.keys())}")
print()

# Show first example
sample = ds1['train'][0]
print("Example 1:")
print(f"System: {sample['system'][:200]}..." if len(sample['system']) > 200 else f"System: {sample['system']}")
print(f"User: {sample['user'][:200]}..." if len(sample['user']) > 200 else f"User: {sample['user']}")
print(f"Assistant: {sample['assistant'][:200]}..." if len(sample['assistant']) > 200 else f"Assistant: {sample['assistant']}")
print()

# Show data distribution
print("=== BASIC STATISTICS ===")
sample_data = ds1['train'].select(range(min(1000, len(ds1['train']))))  # Sample first 1000 or total length
system_lengths = [len(ex) for ex in sample_data['system']]
user_lengths = [len(ex) for ex in sample_data['user']]
assistant_lengths = [len(ex) for ex in sample_data['assistant']]

print(f"Average system prompt length: {sum(system_lengths)/len(system_lengths):.1f} chars")
print(f"Average user message length: {sum(user_lengths)/len(user_lengths):.1f} chars")
print(f"Average assistant response length: {sum(assistant_lengths)/len(assistant_lengths):.1f} chars")

=== DATASET SAMPLE ===
Total examples: 53201
Features: ['system', 'user', 'assistant']

Example 1:
System: You are a highly specialized AI assistant for advanced cyber-defense whose mission is to deliver accurate, in-depth, actionable guidance on information-security principles—confidentiality, integrity, ...
User: Analyze encrypted C2 channels using TLS. Discuss traffic analysis techniques to fingerprint malicious sessions.
Assistant: Encrypted Command and Control (C2) channels utilizing Transport Layer Security (TLS) present significant challenges for network defenders, as traditional packet inspection methods cannot directly anal...

=== BASIC STATISTICS ===
Average system prompt length: 1085.0 chars
Average user message length: 145.7 chars
Average assistant response length: 2084.9 chars


In [60]:
# First, let's fix the quantization issue by updating the model loading approach
!pip install -q bitsandbytes accelerate

# Also ensure we have compatible versions
!pip install -q transformers>=4.36.0 peft>=0.6.0

In [62]:
# set up training

from transformers import BitsAndBytesConfig
import torch

class CybersecurityFineTunerFixed:
    """Fixed fine-tuning class for cybersecurity-specific models."""

    def __init__(self, config: dict):
        """Initialize the fine-tuner with configuration."""
        self.config = config
        self.model_name = config.get('model_name', 'microsoft/DialoGPT-medium')  # Using a smaller model for testing
        self.output_dir = config.get('output_dir', './cybersecurity-lora')
        self.max_length = config.get('max_length', 512)

        # Check for CUDA availability
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        # Initialize components
        self.tokenizer = None
        self.model = None

    def load_tokenizer(self):
        """Load and configure the tokenizer."""
        print(f"Loading tokenizer from {self.model_name}")

        from transformers import AutoTokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # Set pad token if not present
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            print("Set pad_token to eos_token")

        print("Tokenizer loaded successfully")

    def load_model(self):
        """Load the base model with proper quantization configuration."""
        print(f"Loading model from {self.model_name}")

        from transformers import AutoModelForCausalLM

        # Proper quantization configuration
        if self.device == "cuda":
            # Create proper BitsAndBytesConfig
            quantization_config = BitsAndBytesConfig(
                load_in_8bit=True,
                bnb_8bit_compute_dtype=torch.float16,
                bnb_8bit_use_double_quant=True,
            )

            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                quantization_config=quantization_config,
                device_map="auto",
                torch_dtype=torch.float16,
                trust_remote_code=True,
            )
            print("Model loaded with 8-bit quantization")
        else:
            # No quantization for CPU
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16,
                trust_remote_code=True,
            )
            print("Model loaded without quantization (CPU)")

        print(f"Model loaded on device: {next(self.model.parameters()).device}")

    def configure_lora(self):
        """Configure and apply LoRA to the model."""
        print("Configuring LoRA")

        from peft import LoraConfig, get_peft_model, TaskType

        # Get target modules based on the model architecture
        target_modules = []
        if "gpt" in self.model_name.lower():
            target_modules = ["c_attn", "c_proj", "c_fc"]
        elif "llama" in self.model_name.lower():
            target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
        else:
            # Generic targets that work for most transformer models
            target_modules = ["q_proj", "v_proj"]

        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=self.config.get('lora_r', 16),
            lora_alpha=self.config.get('lora_alpha', 32),
            lora_dropout=self.config.get('lora_dropout', 0.1),
            target_modules=target_modules,
        )

        self.model = get_peft_model(self.model, lora_config)

        # Print trainable parameters
        self.model.print_trainable_parameters()
        print("LoRA configuration applied")

    def prepare_streaming_dataset(self, streaming_dataset):
        """Convert streaming dataset to regular dataset for training."""
        print("Converting streaming dataset to regular dataset...")

        # Take a subset for training (you can adjust this number)
        max_samples = self.config.get('max_samples', 1000)

        # Convert iterable dataset to list
        train_data = []
        count = 0
        for example in streaming_dataset['train']:
            if count >= max_samples:
                break
            train_data.append(example)
            count += 1

            if count % 100 == 0:
                print(f"Processed {count} examples...")

        # Convert to Hugging Face Dataset
        from datasets import Dataset
        dataset = Dataset.from_list(train_data)

        print(f"Converted {len(dataset)} examples to regular dataset")
        return dataset

    def format_dataset(self, dataset):
        """Format the dataset for instruction following."""
        print("Formatting dataset")

        def format_prompt(example):
            """Format examples for instruction following."""
            # Handle the cybersecurity dataset format (system, user, assistant)
            if "system" in example and "user" in example and "assistant" in example:
                # Create a chat-like format
                prompt = f"System: {example['system']}\n\nUser: {example['user']}\n\nAssistant: {example['assistant']}"
            elif "instruction" in example and "response" in example:
                prompt = f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['response']}"
            elif "question" in example and "answer" in example:
                prompt = f"### Question:\n{example['question']}\n\n### Answer:\n{example['answer']}"
            else:
                # Fallback: convert to string
                prompt = str(example)

            return {"text": prompt}

        # Apply formatting
        formatted_dataset = dataset.map(format_prompt)
        print("Dataset formatting completed")
        return formatted_dataset

    def tokenize_dataset(self, dataset):
        """Tokenize the dataset for training."""
        print("Tokenizing dataset")

        def tokenize_function(examples):
            """Tokenize the dataset for training."""
            return self.tokenizer(
                examples["text"],
                truncation=True,
                padding=False,
                max_length=self.max_length,
                return_overflowing_tokens=False,
            )

        # Tokenize the dataset
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=dataset.column_names,
            desc="Tokenizing dataset"
        )

        # Create train/validation split
        split_ratio = self.config.get('validation_split', 0.1)
        if split_ratio > 0:
            split_dataset = tokenized_dataset.train_test_split(test_size=split_ratio)
            train_dataset = split_dataset["train"]
            eval_dataset = split_dataset["test"]
        else:
            train_dataset = tokenized_dataset
            eval_dataset = None

        print(f"Training samples: {len(train_dataset)}")
        if eval_dataset:
            print(f"Validation samples: {len(eval_dataset)}")

        return train_dataset, eval_dataset

    def train_streaming(self, streaming_dataset):
        """Execute the complete training pipeline with streaming dataset."""
        print("Starting training pipeline with streaming dataset")

        # Load components
        self.load_tokenizer()
        self.load_model()
        self.configure_lora()

        # Prepare dataset
        dataset = self.prepare_streaming_dataset(streaming_dataset)
        formatted_dataset = self.format_dataset(dataset)
        train_dataset, eval_dataset = self.tokenize_dataset(formatted_dataset)

        # Create trainer
        from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

        # Training arguments
        # Removed evaluation_strategy as it is causing TypeError in this environment
        training_args = TrainingArguments(
            output_dir=self.output_dir,
            per_device_train_batch_size=self.config.get('batch_size', 1),
            per_device_eval_batch_size=self.config.get('eval_batch_size', 1),
            gradient_accumulation_steps=self.config.get('gradient_accumulation_steps', 4),
            num_train_epochs=self.config.get('epochs', 1),
            learning_rate=self.config.get('learning_rate', 2e-4),
            fp16=self.device == "cuda",
            logging_steps=self.config.get('logging_steps', 10),
            eval_steps=self.config.get('eval_steps', 100),
            save_steps=self.config.get('save_steps', 500),
            save_strategy="steps",
            load_best_model_at_end=False, # Set load_best_model_at_end to False to avoid strategy mismatch error
            warmup_steps=self.config.get('warmup_steps', 50),
            lr_scheduler_type="cosine",
            report_to=None,
            remove_unused_columns=False,
            dataloader_pin_memory=False,
        )

        # Data collator
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )

        # Create trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
            tokenizer=self.tokenizer,
        )

        print("Starting training...")
        trainer.train()

        # Save model
        print("Saving model...")
        trainer.save_model()
        self.tokenizer.save_pretrained(self.output_dir)

        print("Training completed successfully!")

print("Fixed CybersecurityFineTuner class created!")

Fixed CybersecurityFineTuner class created!


In [63]:
# Create a configuration for the fixed fine-tuner
config_fixed = {
    "model_name": "microsoft/DialoGPT-medium",  # Using a smaller, more compatible model
    "output_dir": "./cybersecurity-lora-fixed",
    "max_length": 512,
    "max_samples": 500,  # Limit samples for testing
    "validation_split": 0.1,
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.1,
    "batch_size": 1,
    "eval_batch_size": 1,
    "gradient_accumulation_steps": 4,
    "epochs": 1,  # Just 1 epoch for testing
    "learning_rate": 2e-4,
    "logging_steps": 10,
    "eval_steps": 50,
    "save_steps": 100,
    "warmup_steps": 20,
}

print("Configuration created for fixed fine-tuner")
print(f"Will use model: {config_fixed['model_name']}")
print(f"Max samples for training: {config_fixed['max_samples']}")
print(f"Device: {'CUDA' if torch.cuda.is_available() else 'CPU'}")

Configuration created for fixed fine-tuner
Will use model: microsoft/DialoGPT-medium
Max samples for training: 500
Device: CUDA


In [64]:
# Test the fixed fine-tuner with your streaming dataset
print("Testing the fixed fine-tuner...")
print(f"Using streaming dataset: {type(ds3)}")
print(f"Dataset splits: {list(ds3.keys())}")

# Create the fixed fine-tuner
fine_tuner_fixed = CybersecurityFineTunerFixed(config_fixed)

# Let's first test the dataset preparation without training
print("\n=== Testing dataset preparation ===")
test_dataset = fine_tuner_fixed.prepare_streaming_dataset(ds3)
print(f"Dataset conversion successful! Got {len(test_dataset)} examples")

# Show a sample
if len(test_dataset) > 0:
    sample = test_dataset[0]
    print(f"\nSample data fields: {list(sample.keys())}")
    print(f"Sample system (first 100 chars): {sample['system'][:100]}...")
    print(f"Sample user (first 100 chars): {sample['user'][:100]}...")
    print(f"Sample assistant (first 100 chars): {sample['assistant'][:100]}...")

Testing the fixed fine-tuner...
Using streaming dataset: <class 'datasets.dataset_dict.IterableDatasetDict'>
Dataset splits: ['train']
Using device: cuda

=== Testing dataset preparation ===
Converting streaming dataset to regular dataset...
Processed 100 examples...
Processed 200 examples...
Processed 300 examples...
Processed 400 examples...
Processed 500 examples...
Converted 500 examples to regular dataset
Dataset conversion successful! Got 500 examples

Sample data fields: ['system', 'user', 'assistant']
Sample system (first 100 chars): You are a highly specialized AI assistant for advanced cyber-defense whose mission is to deliver acc...
Sample user (first 100 chars): Analyze encrypted C2 channels using TLS. Discuss traffic analysis techniques to fingerprint maliciou...
Sample assistant (first 100 chars): Encrypted Command and Control (C2) channels utilizing Transport Layer Security (TLS) present signifi...


In [65]:
# Now let's run the actual training with a very small setup for testing
print("=== Starting actual training ===")

# Create a smaller config for quick testing
config_small = config_fixed.copy()
config_small.update({
    "max_samples": 50,  # Very small for quick test
    "batch_size": 1,
    "gradient_accumulation_steps": 2,
    "epochs": 1,
    "logging_steps": 5,
    "eval_steps": 20,
    "save_steps": 50,
    "warmup_steps": 5,
})

print("Creating fine-tuner with small config for testing...")
fine_tuner_small = CybersecurityFineTunerFixed(config_small)

print("Starting training (this may take a few minutes even with small dataset)...")
try:
    fine_tuner_small.train_streaming(ds3)
    print("✅ Training completed successfully!")
except Exception as e:
    print(f"❌ Training failed with error: {e}")
    import traceback
    traceback.print_exc()

=== Starting actual training ===
Creating fine-tuner with small config for testing...
Using device: cuda
Starting training (this may take a few minutes even with small dataset)...
Starting training pipeline with streaming dataset
Loading tokenizer from microsoft/DialoGPT-medium
Set pad_token to eos_token
Tokenizer loaded successfully
Loading model from microsoft/DialoGPT-medium
Model loaded with 8-bit quantization
Model loaded on device: cuda:0
Configuring LoRA
trainable params: 6,291,456 || all params: 361,114,624 || trainable%: 1.7422
LoRA configuration applied
Converting streaming dataset to regular dataset...
Converted 50 examples to regular dataset
Formatting dataset


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Dataset formatting completed
Tokenizing dataset


Tokenizing dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

Training samples: 45
Validation samples: 5
Starting training...


  trainer = Trainer(


Step,Training Loss
5,9.7889
10,8.5423
15,7.6826
20,7.5065


Saving model...
Training completed successfully!
✅ Training completed successfully!


In [None]:
# Now let's run the actual training with the full dataset
print("=== Starting actual training (full dataset) ===")

# Create a configuration for the training fine-tuner
config_train = config_fixed.copy()
config_train.update({
    "model_name": "microsoft/DialoGPT-medium",  # Using a smaller, more compatible model
    "output_dir": "./cybersecurity-lora-fixed",
    "max_length": 512,
    # "max_samples": 500,  # Limit samples for testing
    "validation_split": 0.1,
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.1,
    "batch_size": 1,
    "eval_batch_size": 1,
    "gradient_accumulation_steps": 4,
    "epochs": 3,  # Just 1 epoch for testing
    "learning_rate": 2e-4,
    "logging_steps": 10,
    "eval_steps": 50,
    "save_steps": 100,
    "warmup_steps": 20,
})

print("Configuration created for training fine-tuner")
print(f"Will use model: {config_fixed['model_name']}")
# print(f"Max samples for training: {config_fixed['max_samples']}")
print(f"Device: {'CUDA' if torch.cuda.is_available() else 'CPU'}")

print("Creating fine-tuner with regular config for training...")
fine_tuner = CybersecurityFineTunerFixed(config_train)

print("Starting training (this may take a few minutes even with small dataset)...")
try:
    fine_tuner.train_streaming(ds3)
    print("✅ Training completed successfully!")
except Exception as e:
    print(f"❌ Training failed with error: {e}")
    import traceback
    traceback.print_exc()

=== Starting actual training (full dataset) ===
Configuration created for training fine-tuner
Will use model: microsoft/DialoGPT-medium
Device: CUDA
Creating fine-tuner with regular config for training...
Using device: cuda
Starting training (this may take a few minutes even with small dataset)...
Starting training pipeline with streaming dataset
Loading tokenizer from microsoft/DialoGPT-medium
Set pad_token to eos_token
Tokenizer loaded successfully
Loading model from microsoft/DialoGPT-medium
Model loaded with 8-bit quantization
Model loaded on device: cuda:0
Configuring LoRA
trainable params: 6,291,456 || all params: 361,114,624 || trainable%: 1.7422
LoRA configuration applied
Converting streaming dataset to regular dataset...
Processed 100 examples...
Processed 200 examples...
Processed 300 examples...
Processed 400 examples...
Processed 500 examples...
Converted 500 examples to regular dataset
Formatting dataset


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset formatting completed
Tokenizing dataset


Tokenizing dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Training samples: 450
Validation samples: 50
Starting training...


  trainer = Trainer(


Step,Training Loss
10,9.786
20,8.2343
30,6.9671
40,6.0203
50,5.2857
60,4.5051
70,3.6769
80,3.1565
90,2.9443
100,2.8218




In [None]:
# # SOLUTION FOR GOOGLE COLAB
# print("=== SOLUTIONS FOR GOOGLE COLAB ===")
# print()

# print("1. TORCH VERSION ISSUE:")
# print("In Google Colab, run this first:")
# print("!pip install torch>=2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118")
# print("Then restart runtime!")
# print()

# print("2. QUANTIZATION ISSUE FIX:")
# print("For Google Colab with CUDA, use this updated code:")
# print()

# # Provide the corrected Colab-specific code
# colab_code = '''
# # FOR GOOGLE COLAB - Run this in your Colab notebook:

# # 1. First install/upgrade packages
# !pip install torch>=2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# !pip install -q transformers>=4.40.0 peft>=0.7.0 bitsandbytes accelerate datasets

# # 2. Restart runtime, then use this code:

# from transformers import BitsAndBytesConfig
# import torch

# # Fixed quantization config for newer transformers
# def create_quantization_config():
#     return BitsAndBytesConfig(
#         load_in_8bit=True,
#         bnb_8bit_compute_dtype=torch.float16,
#         bnb_8bit_quant_type="nf8",
#         bnb_8bit_use_double_quant=True,
#     )

# # Updated model loading for Colab
# def load_model_colab(model_name):
#     from transformers import AutoModelForCausalLM

#     if torch.cuda.is_available():
#         quantization_config = create_quantization_config()
#         model = AutoModelForCausalLM.from_pretrained(
#             model_name,
#             quantization_config=quantization_config,
#             device_map="auto",
#             torch_dtype=torch.float16,
#             trust_remote_code=True,
#             use_safetensors=True,  # This helps with the security issue
#         )
#     else:
#         model = AutoModelForCausalLM.from_pretrained(
#             model_name,
#             torch_dtype=torch.float16,
#             trust_remote_code=True,
#             use_safetensors=True,
#         )
#     return model

# # 3. For your cybersecurity dataset, use this simple approach:
# def simple_train_with_streaming_dataset(ds, model_name="microsoft/DialoGPT-medium"):
#     from datasets import Dataset
#     from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
#     from transformers import DataCollatorForLanguageModeling
#     from peft import LoraConfig, get_peft_model, TaskType

#     # Convert streaming to regular dataset (first 1000 examples)
#     train_data = []
#     for i, example in enumerate(ds['train']):
#         if i >= 1000:  # Limit for memory
#             break
#         # Format the data
#         text = f"System: {example['system']}\\n\\nUser: {example['user']}\\n\\nAssistant: {example['assistant']}"
#         train_data.append({"text": text})

#     dataset = Dataset.from_list(train_data)
#     print(f"Prepared {len(dataset)} examples")

#     # Load tokenizer and model
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     if tokenizer.pad_token is None:
#         tokenizer.pad_token = tokenizer.eos_token

#     model = load_model_colab(model_name)

#     # Apply LoRA
#     lora_config = LoraConfig(
#         task_type=TaskType.CAUSAL_LM,
#         r=16,
#         lora_alpha=32,
#         lora_dropout=0.1,
#         target_modules=["c_attn", "c_proj"] if "gpt" in model_name.lower() else ["q_proj", "v_proj"],
#     )
#     model = get_peft_model(model, lora_config)

#     # Tokenize dataset
#     def tokenize_function(examples):
#         return tokenizer(examples["text"], truncation=True, padding=False, max_length=512)

#     tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

#     # Training
#     training_args = TrainingArguments(
#         output_dir="./cybersecurity-lora",
#         per_device_train_batch_size=1,
#         gradient_accumulation_steps=4,
#         num_train_epochs=1,
#         learning_rate=2e-4,
#         fp16=True if torch.cuda.is_available() else False,
#         logging_steps=10,
#         save_steps=500,
#         report_to=None,
#     )

#     data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=tokenized_dataset,
#         data_collator=data_collator,
#     )

#     trainer.train()
#     trainer.save_model()
#     return model, tokenizer

# # Usage in Colab:
# # model, tokenizer = simple_train_with_streaming_dataset(ds)
# '''

# print("COPY THIS CODE TO GOOGLE COLAB:")
# print("=" * 50)
# print(colab_code)
# print("=" * 50)