In [None]:
%pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
%pip install -q transformers>=4.43.0
%pip install -q accelerate>=0.21.0
%pip install -q peft>=0.4.0
%pip install -q bitsandbytes>=0.41.0
%pip install -q trl>=0.7.0
%pip install -q datasets
%pip install -q scipy
%pip install -q tensorboard
%pip install -q wandb
%pip install -q sentencepiece
%pip install -q protobuf


In [None]:

import os
import json
from pathlib import Path
from datasets import Dataset, DatasetDict
import torch
import json
import pandas as pd
from datasets import Dataset, DatasetDict

import os
import warnings
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Turn off tokenizers parallelism. This runs ahead of the `transformers` import
# that follows (torch and datasets are already imported above).
# NOTE(review): presumably set to avoid the tokenizers fork/parallelism warning
# when DataLoader workers are used - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

tokenizers_parallelism = os.environ.get('TOKENIZERS_PARALLELISM')
print("Environment configured:")
print(f"✅ TOKENIZERS_PARALLELISM = {tokenizers_parallelism}")
print("✅ Warnings suppressed")

from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import warnings
warnings.filterwarnings("ignore")

# Select CUDA when it is available (CPU otherwise) and report what was found.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")
print(f"Using device: {device}")
if has_cuda:
    # Name and total memory (in GB, 1e9 bytes) of GPU index 0.
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("GPU Name: No GPU")
    print("No GPU")


In [None]:
# Hugging Face Hub repository id of the base model to fine-tune.
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"

# Access token for the Hub. It is read from the environment so that no credential
# is ever stored in the notebook. When neither variable is set the value is None,
# and the `token=` arguments below fall back to any cached `huggingface-cli login`.
# NOTE(review): a token was previously committed on this line - revoke it on the Hub.
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None
if HF_TOKEN is None:
    print("⚠️  HF_TOKEN is not set; relying on cached Hugging Face credentials (if any)")

# Quantization settings passed to the model loader: 4-bit NF4 weights with
# double quantization, computing in bfloat16.
qlora_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# LoRA adapter settings for causal-LM fine-tuning. Adapters are attached to the
# four attention projections and the three MLP projections listed below.
# NOTE(review): rank 64 with alpha 16 - under the usual alpha / r scaling this is
# 0.25; confirm that is the intended adapter scale.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=[
        # attention
        "q_proj", "k_proj", "v_proj", "o_proj",
        # MLP
        "gate_proj", "up_proj", "down_proj",
    ],
)

# Keyword arguments that are later expanded into TrainingArguments(**training_config).
training_config = dict(
    # --- output location and schedule ---
    output_dir="./llama-3.1-8b-corporate-assistant",
    num_train_epochs=5,
    # --- batching (see the effective batch size printed below) ---
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    # --- optimizer ---
    optim="paged_adamw_32bit",
    logging_steps=10,
    learning_rate=3e-4,
    weight_decay=0.001,
    # --- precision: bf16 on, fp16 off ---
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    # --- evaluation / checkpointing, both per epoch ---
    # NOTE(review): save_steps / eval_steps are given alongside "epoch" strategies;
    # presumably they have no effect in that mode - confirm against the docs.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_steps=100,
    eval_steps=100,
    save_total_limit=1,
    # Left disabled by the original author to work around a "'method' object" error.
    load_best_model_at_end=False,
    # Lowered from 8 by the original author.
    dataloader_num_workers=2,
    remove_unused_columns=False,
    push_to_hub=False,
    # --- reproducibility ---
    seed=42,
    data_seed=42,
    ddp_find_unused_parameters=False,
)

# Per-device batch size times accumulation steps (device count is not included).
effective_batch_size = (
    training_config["per_device_train_batch_size"]
    * training_config["gradient_accumulation_steps"]
)

print("✅ Training configuration loaded with fixes applied")
print(f"📊 Total epochs: {training_config['num_train_epochs']}")
print(f"📊 Effective batch size: {effective_batch_size}")
print(f"📊 Learning rate: {training_config['learning_rate']}")
print(f"⚠️  load_best_model_at_end: {training_config['load_best_model_at_end']} (disabled to prevent errors)")
print(f"⚠️  dataloader_num_workers: {training_config['dataloader_num_workers']} (reduced to prevent conflicts)")

In [None]:
def check_file_and_load_dataset(file_path):
    """Validate a JSONL file on disk and load its usable records.

    Each non-blank line is parsed as JSON. A record is kept only when it is a
    dict containing both an 'instruction' and an 'output' key; records without
    a 'system' key receive a default system message. Problems are reported via
    print() rather than raised.

    Args:
        file_path: Path (str or path-like) to the JSONL file.

    Returns:
        A list of record dicts (possibly empty), or None when the file is
        missing, is empty, or could not be read.
    """
    print(f"🔍 Checking file: {file_path}")

    path = Path(file_path)
    if not path.exists():
        print(f"❌ File not found: {file_path}")
        print("📝 Available files in directory:")
        parent_dir = path.parent
        if parent_dir.exists():
            # Help the user spot a typo by listing JSON/JSONL files next to the path.
            for candidate in parent_dir.iterdir():
                if candidate.suffix in ['.jsonl', '.json']:
                    print(f"   - {candidate}")
        return None

    file_size = path.stat().st_size
    if file_size == 0:
        print(f"❌ File is empty: {file_path}")
        return None

    print(f"✅ File found, size: {file_size} bytes")

    data = []
    try:
        # 'utf-8-sig' transparently drops a leading byte-order mark. With plain
        # 'utf-8' the BOM stays glued to the first line, json.loads rejects it,
        # and the first record would be skipped as "invalid JSON".
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    print(f"⚠️  Invalid JSON at line {line_num}")
                    continue

                if isinstance(entry, dict) and 'instruction' in entry and 'output' in entry:
                    # Ensure a system message exists
                    if 'system' not in entry:
                        entry['system'] = "You are a helpful corporate AI assistant that helps employees with internal processes, applications, and workplace questions."
                    data.append(entry)
                else:
                    print(f"⚠️  Skipping invalid entry at line {line_num}")

        print(f"✅ Loaded {len(data)} valid entries")
        return data

    except Exception as e:
        # Best-effort loader: any read/decode failure is reported and mapped to None.
        print(f"❌ Error loading file: {e}")
        return None

In [None]:
# def load_jsonl_data(file_path):
#     """Load data from JSONL file"""
#     data = []
#     with open(file_path, 'r', encoding='utf-8') as f:
#         for line in f:
#             data.append(json.loads(line.strip()))
#     return data

In [None]:
def convert_to_conversational_format(data_entry):
    """Render one record as a single system/user/assistant prompt string.

    Reads the 'system', 'instruction' and 'output' keys of ``data_entry``
    (falling back to a generic system message and empty strings when absent)
    and wraps each in header / end-of-turn marker tokens.

    Returns:
        The formatted conversation as one string.
    """
    def _turn(role, content):
        # One turn: role header, blank line, content, end-of-turn marker.
        return f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>"

    turns = (
        ("system", data_entry.get('system', 'You are a helpful assistant.')),
        ("user", data_entry.get('instruction', '')),
        ("assistant", data_entry.get('output', '')),
    )

    # NOTE(review): the text starts with an explicit <|begin_of_text|>; if the
    # tokenizer also prepends its own BOS this may yield two - confirm downstream.
    return "<|begin_of_text|>" + "".join(_turn(role, content) for role, content in turns)

In [None]:
def preprocess_dataset_safe(file_path, test_size=0.1):
    """Load a JSONL file, format every record, and split into train/test.

    Records are loaded with check_file_and_load_dataset(), rendered with
    convert_to_conversational_format() into ``{"text": ...}`` rows, and split
    with a fixed seed of 42. When only one row exists it is used as both the
    train and the test set. Failures are printed, not raised.

    Args:
        file_path: Path to the JSONL dataset file.
        test_size: Fraction passed to ``train_test_split`` (default 0.1).

    Returns:
        A dataset mapping with 'train' and 'test' splits, or None on failure.
    """
    divider = "=" * 50
    print(divider)
    print("DATASET LOADING AND PREPROCESSING")
    print(divider)

    # Step 1: read and validate the raw records.
    raw_data = check_file_and_load_dataset(file_path)
    if not raw_data:
        print("❌ No valid data found. Please check your dataset file.")
        return None

    # Step 2: render each record; a failing record is reported and skipped.
    print("🔄 Converting to conversational format...")
    conversations = []
    for index, record in enumerate(raw_data):
        try:
            conversations.append({"text": convert_to_conversational_format(record)})
        except Exception as e:
            print(f"⚠️  Error processing entry {index}: {e}")

    if not conversations:
        print("❌ No conversations could be created")
        return None

    print(f"✅ Created {len(conversations)} conversations")

    # Step 3: build the dataset object and split it.
    try:
        full_dataset = Dataset.from_list(conversations)

        if len(conversations) == 1:
            # A single row cannot be split: reuse it for both train and test.
            dataset = DatasetDict({
                'train': full_dataset,
                'test': full_dataset.select([0])
            })
        else:
            dataset = full_dataset.train_test_split(test_size=test_size, seed=42)

        print(f"📊 Train examples: {len(dataset['train'])}")
        print(f"📊 Validation examples: {len(dataset['test'])}")

        # Preview the first training row, truncated to 300 characters.
        print("\n=== SAMPLE CONVERSATION ===")
        sample_text = dataset['train'][0]['text']
        if len(sample_text) > 300:
            print(sample_text[:300] + "...")
        else:
            print(sample_text)

        return dataset

    except Exception as e:
        print(f"❌ Error creating dataset: {e}")
        return None

# Location of the instruction JSONL file - edit this for your environment.
DATASET_PATH = "/home/azureuser/cloudfiles/code/Users/746582/llama-8b-ft-11th-june/azure_instruction_dataset.jsonl"

# Run the guarded load / format / split pipeline.
dataset = preprocess_dataset_safe(DATASET_PATH)

# Abort here when nothing usable was loaded, so later cells never see None.
if dataset is None:
    print("\n".join([
        "\n" + "=" * 50,
        "DATASET LOADING FAILED!",
        "=" * 50,
        "Please check:",
        "1. File path is correct",
        "2. File exists and is not empty",
        "3. File contains valid JSONL format",
        "4. Each line has 'instruction' and 'output' fields",
    ]))
    raise ValueError("Dataset loading failed. Cannot proceed with training.")

print("\n".join([
    "\n" + "=" * 50,
    "DATASET LOADED SUCCESSFULLY!",
    "=" * 50,
    "✅ Ready to proceed with model loading",
]))

In [None]:
# def preprocess_dataset(file_path, test_size=0.1):
#     """Load and preprocess the dataset"""
#     print("Loading dataset...")
#     raw_data = load_jsonl_data(file_path)
#     print(f"Loaded {len(raw_data)} examples")
    
#     # Convert to conversational format
#     print("Converting to conversational format...")
#     conversations = []
#     for entry in raw_data:
#         conv = convert_to_conversational_format(entry)
#         conversations.append({"text": conv})
    
#     # Create dataset
#     dataset = Dataset.from_list(conversations)
    
#     # Split into train/validation
#     dataset = dataset.train_test_split(test_size=test_size, seed=42)
    
#     print(f"Train examples: {len(dataset['train'])}")
#     print(f"Validation examples: {len(dataset['test'])}")
    
#     return dataset

In [None]:
# Load your dataset (replace with your file path)
DATASET_PATH = "/home/azureuser/cloudfiles/code/Users/746582/llama-8b-ft-11th-june/azure_instruction_dataset.jsonl"  # Update this path
dataset = preprocess_dataset_safe(DATASET_PATH)

# preprocess_dataset_safe() returns None on any failure; fail with a clear message
# instead of an opaque "'NoneType' object is not subscriptable" on the lines below.
if dataset is None:
    raise ValueError("Dataset loading failed. Cannot proceed with training.")

# Display the first 500 characters of the first training example
print("\n=== Sample Conversation ===")
print(dataset['train'][0]['text'][:500] + "...")


In [None]:
print("Loading tokenizer...")
# trust_remote_code is intentionally not enabled: the configured Llama checkpoint
# is supported natively by transformers, so there is no reason to allow code
# from the Hub repository to execute locally.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    token=HF_TOKEN,
)

# Set padding token.
# Prefer a dedicated pad token over reusing EOS: collators that mask padding
# out of the loss do so by token id, so pad == EOS would also mask every real
# end-of-turn token and the model would never be trained to stop.
if tokenizer.pad_token is None:
    dedicated_pad_token = "<|finetune_right_pad_id|>"  # reserved pad token in the Llama 3.1 vocabulary
    if dedicated_pad_token in tokenizer.get_vocab():
        tokenizer.pad_token = dedicated_pad_token
    else:
        # Fallback for tokenizers without a dedicated pad token.
        tokenizer.pad_token = tokenizer.eos_token
print(f"Padding token: {tokenizer.pad_token!r}")

print("Loading model with QLoRA...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=qlora_config,
    device_map="auto",
    token=HF_TOKEN,
    torch_dtype=torch.bfloat16
)

# Keep the model config in agreement with the tokenizer's padding choice.
model.config.pad_token_id = tokenizer.pad_token_id

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

# Apply LoRA (the model is a PEFT-wrapped model from here on)
model = get_peft_model(model, lora_config)

In [None]:
def print_trainable_parameters(model):
    """Print the trainable and total parameter counts of ``model``.

    Element counts come from ``numel()`` of every entry in
    ``model.named_parameters()``; a parameter counts as trainable when its
    ``requires_grad`` flag is set. Also prints the trainable share as a
    percentage with two decimals.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, is_trainable in sizes if is_trainable)
    print(f"Trainable params: {trainable_params:,} || All params: {all_param:,} || Trainable%: {100 * trainable_params / all_param:.2f}")

# Print the trainable / total parameter counts for the loaded `model`.
print_trainable_parameters(model)


In [None]:
print("🏋️ Creating trainer with error handling...")

import inspect  # stdlib; used to adapt to the installed TRL version

# Create training arguments
training_args = TrainingArguments(**training_config)

# Verify dataset is loaded
if 'dataset' not in locals() or dataset is None:
    raise ValueError("Dataset not loaded! Please run the dataset loading cell first.")

print(f"✅ Dataset verified: {len(dataset['train'])} train, {len(dataset['test'])} test examples")

# Hand the already-configured tokenizer (padding token included) to SFTTrainer so
# it does not load and configure a second one on its own. TRL renamed this
# argument from `tokenizer` to `processing_class`; use whichever the installed
# version accepts.
sft_init_params = inspect.signature(SFTTrainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in sft_init_params else "tokenizer"

# Arguments shared by both SFTTrainer attempts.
# `peft_config` is deliberately NOT passed: `model` was already wrapped with the
# LoRA adapters via get_peft_model() in the model-loading cell. Supplying the
# config again is, depending on the TRL version, either ignored or rejected.
sft_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": dataset["train"],
    "eval_dataset": dataset["test"],
    tokenizer_kwarg: tokenizer,
}

# Create trainer with multiple fallback options
trainer = None
creation_method = ""

# Option 1: Try with formatting function
try:
    trainer = SFTTrainer(
        **sft_kwargs,
        formatting_func=lambda x: x["text"],
    )
    creation_method = "SFTTrainer with formatting_func"
    print("✅ Trainer created successfully with formatting_func")

except Exception as e:
    print(f"❌ SFTTrainer with formatting_func failed: {str(e)[:100]}...")

    # Option 2: Try without formatting function
    try:
        trainer = SFTTrainer(**sft_kwargs)
        creation_method = "SFTTrainer without formatting_func"
        print("✅ Trainer created successfully without formatting_func")

    except Exception as e2:
        print(f"❌ SFTTrainer without formatting_func failed: {str(e2)[:100]}...")

        # Option 3: Manual tokenization with standard Trainer
        print("🔄 Falling back to standard Trainer with manual tokenization...")

        from transformers import Trainer, DataCollatorForLanguageModeling

        def tokenize_function(examples):
            """Tokenize a batch of 'text' rows, truncated to 1536 tokens, unpadded."""
            return tokenizer(
                examples["text"],
                truncation=True,
                padding=False,  # padding is left to the data collator
                max_length=1536,
                return_tensors=None
            )

        # Tokenize datasets, dropping the raw text column afterwards
        tokenized_train = dataset["train"].map(
            tokenize_function,
            batched=True,
            remove_columns=dataset["train"].column_names,
        )

        tokenized_eval = dataset["test"].map(
            tokenize_function,
            batched=True,
            remove_columns=dataset["test"].column_names,
        )

        # Data collator for causal language modelling (mlm=False)
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False,
            return_tensors="pt",
        )

        # Create standard trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_eval,
            data_collator=data_collator,
        )
        creation_method = "Standard Trainer with manual tokenization"
        print("✅ Standard Trainer created successfully")

if trainer is None:
    raise RuntimeError("Failed to create trainer with any method!")

print(f"\n🎯 Trainer created using: {creation_method}")
print("🚀 Ready for training!")

# Verify trainer is working by pulling one batch from the training dataloader.
# A failure here is only reported; training is still attempted afterwards.
try:
    print("🔍 Testing trainer setup...")
    sample_batch = next(iter(trainer.get_train_dataloader()))
    print(f"✅ Sample batch shape: {sample_batch['input_ids'].shape}")
    print("✅ Trainer setup verified successfully!")
except Exception as e:
    print(f"⚠️  Warning: Trainer test failed: {e}")
    print("Will attempt training anyway...")

In [None]:
print("🚀 Starting training process...")
print("=" * 60)

# Release cached GPU memory and report what is still allocated before training.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    allocated_gb = torch.cuda.memory_allocated()/1e9
    print(f"🔋 GPU memory before training: {allocated_gb:.2f}GB allocated")

# Outcome flags filled in by the training attempt below.
training_completed = False
training_error = None

try:
    print("🏃‍♂️ Beginning model training...")
    trainer.train()
    training_completed = True
    print("🎉 Training completed successfully!")

except Exception as e:
    # Best effort: record the failure, then try to keep whatever weights exist.
    training_error = str(e)
    print(f"❌ Training failed with error: {training_error}")

    print("💾 Attempting to save current model state...")
    try:
        output_dir = "./llama-3.1-8b-partial-save"
        trainer.model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
        print(f"✅ Partial model saved to {output_dir}")
    except Exception as save_error:
        print(f"❌ Could not save partial model: {save_error}")

# Final status report
print("\n" + "=" * 60)
if not training_completed:
    print("❌ TRAINING STATUS: FAILED")
    print(f"Error: {training_error}")
    print("Check the error above and try adjusting batch size or other parameters")
else:
    print("✅ TRAINING STATUS: COMPLETED SUCCESSFULLY")

    # Print the numeric fields of the most recent log entry, when one exists.
    if hasattr(trainer, 'state') and trainer.state.log_history:
        print("\n📊 Final Training Metrics:")
        last_log = trainer.state.log_history[-1]
        numeric_items = (
            (metric, reading)
            for metric, reading in last_log.items()
            if isinstance(reading, (int, float))
        )
        for metric, reading in numeric_items:
            print(f"   {metric}: {reading:.4f}")

print("=" * 60)

In [None]:
print("Saving model...")

# `training_completed` is set by the training cell. If it is absent (training was
# run some other way) assume success so this cell behaves as it always did.
training_ok = globals().get("training_completed", True)
if not training_ok:
    print("⚠️  Training did not complete successfully - the weights saved below are from an unfinished run")

model_saved = False
try:
    output_dir = "./llama-3.1-8b-corporate-assistant-final"

    # Save the model state (PEFT adapters) together with the tokenizer
    trainer.model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    model_saved = True

    print(f"✅ Model saved to {output_dir}")

    # Save training metrics if available
    if hasattr(trainer, 'state') and trainer.state.log_history:
        metrics_path = os.path.join(output_dir, "training_metrics.json")
        # Explicit encoding so the file does not depend on the platform default.
        with open(metrics_path, "w", encoding="utf-8") as f:
            json.dump(trainer.state.log_history, f, indent=2)
        print("✅ Training metrics saved!")

except Exception as e:
    print(f"❌ Error saving model: {e}")

# Only announce completion when training finished AND the model was written;
# previously the success banner was printed even after a failure.
print("\n" + "="*50)
if training_ok and model_saved:
    print("TRAINING PROCESS COMPLETED!")
else:
    print("TRAINING PROCESS FINISHED WITH ERRORS - see messages above")
print("="*50)