In [None]:
!pip install transformers datasets peft bitsandbytes accelerate huggingface-hub accelerate flash-attn

import torch
from torch.utils.data import Dataset
import torch.distributed as dist
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from datasets import load_dataset
from huggingface_hub import login, notebook_login
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
import json
from typing import Dict, List, Optional
from accelerate import Accelerator
import os
import gc

# Environment Variable Configuration
#
# NOTE(review): both variables are assigned after `import torch` / `transformers`
# above; presumably they still take effect because no tokenization or CUDA work
# has happened yet at this point -- confirm.
#
# TOKENIZERS_PARALLELISM:
# - Current setting: "false"
# - Purpose: Controls parallel tokenization operations in the transformers library
# - Effects:
#   - When "false": Disables parallel tokenization to prevent deadlocks in multiprocessing
#   - When "true": Enables parallel tokenization, which can speed up processing but may
#                  cause issues in certain environments (especially notebooks)
# - Change impacts:
#   - Setting to "true" might improve tokenization speed but could cause stability issues
#   - Recommended to keep "false" for notebook environments or when using DataLoader
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# PYTORCH_CUDA_ALLOC_CONF:
# - Current setting: 'max_split_size_mb:50'
# - Purpose: Controls CUDA memory allocation behavior
# - Parameters:
#   - max_split_size_mb: Largest cached block size (in MB) the allocator is allowed to split
# - Effects:
#   - Helps prevent memory fragmentation
#   - Reduces the likelihood of out-of-memory errors during training
# - Change impacts:
#   - Increasing the value (e.g., to 100) lets larger blocks be split, which may
#     increase fragmentation
#   - Decreasing the value reduces fragmentation but may impact performance;
#     values that are too small might cause allocation failures
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:50'


# Log in to the Hugging Face Hub (needed if you're using a gated model)
notebook_login()

In [None]:
# Add these helper functions at the top
def clear_memory():
    """Run Python garbage collection and release cached CUDA memory.

    The CUDA cache is emptied once unconditionally and then, when CUDA is
    available, once more inside the device context of every visible GPU.
    """
    gc.collect()
    torch.cuda.empty_cache()

    if not torch.cuda.is_available():
        return

    for device_index in range(torch.cuda.device_count()):
        with torch.cuda.device(f'cuda:{device_index}'):
            torch.cuda.empty_cache()

# Function to move model between devices
def move_model(model, device):
    """Free cached memory, then return ``model.to(device)``.

    Args:
        model: Object exposing a ``.to(device)`` method.
        device: Target device passed straight through to ``.to``.

    Returns:
        Whatever ``model.to(device)`` returns.
    """
    # Release cached memory first so the move has as much headroom as possible.
    clear_memory()
    relocated = model.to(device)
    return relocated

def format_data(example):
    """Turn a two-turn conversation into a plain prompt/target pair.

    Args:
        example: Mapping whose ``'messages'`` entry is a sequence with the
            user turn at index 0 and the assistant turn at index 1, each
            carrying a ``'content'`` field.

    Returns:
        ``{"full_prompt": "<user>\\n<assistant>", "ground_truth": "<assistant>"}``,
        or ``None`` (after printing the error) when the example cannot be read.
    """
    try:
        turns = example['messages']
        prompt_text, reply_text = turns[0]['content'], turns[1]['content']

        # Plain newline-joined text; no chat-template special tokens are added.
        return {
            "full_prompt": f"{prompt_text}\n{reply_text}",
            "ground_truth": reply_text,
        }
    except Exception as e:
        print(f"Error formatting example: {e}")
        return None

def collate_fn(examples):
    """Pad a list of tokenized examples into one batch of tensors.

    Each example must provide ``'input_ids'``, ``'attention_mask'`` and
    ``'labels'`` sequences. Sequences are padded to the longest one in the
    batch: ids with the module-level tokenizer's pad token id, the attention
    mask with 0, and labels with -100.

    Returns:
        Dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``
        tensors, each batch-first.
    """
    pad = torch.nn.utils.rnn.pad_sequence

    # Field name -> value used to fill the padded positions.
    fill_values = {
        'input_ids': tokenizer.pad_token_id,
        'attention_mask': 0,
        'labels': -100,
    }

    batch = {}
    for field, fill in fill_values.items():
        rows = [torch.tensor(example[field]) for example in examples]
        batch[field] = pad(rows, batch_first=True, padding_value=fill)
    return batch

def process_dataset(dataset, tokenizer, max_length=5000):
    """Format, filter and tokenize a conversation dataset for causal-LM training.

    Rows are first checked for a usable user/assistant pair, then formatted
    with ``format_data`` and tokenized. Labels are a copy of ``input_ids`` in
    which the prompt part (user message plus the joining newline) is replaced
    by -100 so that only the assistant response contributes to the loss.

    Args:
        dataset: Hugging Face ``Dataset`` with a ``'messages'`` column.
        tokenizer: Tokenizer used for both the full text and the prompt prefix.
        max_length: Maximum number of tokens kept per example (truncation).

    Returns:
        Dataset with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``
        columns; ``labels`` always has the same length as ``input_ids``.
    """
    def has_two_turns(example):
        """Return True when the row has user and assistant turns with content."""
        messages = example.get('messages')
        try:
            return (
                messages[0]['content'] is not None
                and messages[1]['content'] is not None
            )
        except (KeyError, IndexError, TypeError):
            return False

    def prompt_prefix(full_prompt, ground_truth):
        """Return the part of ``full_prompt`` that precedes the assistant response."""
        if isinstance(ground_truth, str) and full_prompt.endswith(ground_truth):
            # The prompt is everything before the response, which may itself
            # span several lines.
            return full_prompt[:len(full_prompt) - len(ground_truth)]
        # Fallback for text that does not end with the response: first line only.
        return full_prompt.split('\n')[0]

    def tokenize_function(examples):
        """Tokenize a batch and mask the prompt tokens in the labels."""
        tokenized = tokenizer(
            examples['full_prompt'],
            truncation=True,
            max_length=max_length,
            padding=False,
            return_tensors=None
        )

        prefixes = [
            prompt_prefix(text, answer)
            for text, answer in zip(examples['full_prompt'], examples['ground_truth'])
        ]
        prefix_tokenized = tokenizer(
            prefixes,
            truncation=True,
            max_length=max_length,
            padding=False,
            return_tensors=None
        )

        labels = []
        for ids, prefix_ids in zip(tokenized['input_ids'], prefix_tokenized['input_ids']):
            # Mask only the leading tokens shared by the prefix and the full
            # sequence. This ignores any end-of-sequence token appended to the
            # stand-alone prefix and can never exceed len(ids), so labels and
            # input_ids stay aligned even after truncation.
            masked = 0
            for full_id, prefix_id in zip(ids, prefix_ids):
                if full_id != prefix_id:
                    break
                masked += 1
            labels.append([-100] * masked + list(ids[masked:]))

        return {
            'input_ids': tokenized['input_ids'],
            'attention_mask': tokenized['attention_mask'],
            'labels': labels
        }

    # Dataset.map cannot drop rows (a None return does not remove the example),
    # so malformed rows are filtered out before formatting.
    valid_dataset = dataset.filter(has_two_turns)
    skipped = len(dataset) - len(valid_dataset)
    if skipped:
        print(f"Skipped {skipped} malformed example(s) without user/assistant messages")

    # Format dataset
    formatted_dataset = valid_dataset.map(
        format_data,
        remove_columns=valid_dataset.column_names
    )

    # Tokenize dataset
    tokenized_dataset = formatted_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=formatted_dataset.column_names
    )

    return tokenized_dataset

# Report how many CUDA devices are visible to this process
num_gpus = torch.cuda.device_count()
print(f"Number of available GPUs: {num_gpus}")

# Initialize the tokenizer for the base checkpoint
# NOTE(review): collate_fn pads batches itself with pad_sequence, so
# padding_side presumably has no effect on training batches -- confirm.
base_model_id = "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="left",
    add_eos_token=True
)

# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Load the base model in bfloat16 with FlashAttention-2; device_map="auto"
# leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)

# LoRA Configuration
lora_config = LoraConfig(
    # r: Controls the rank of LoRA adapters
    # - Increasing r (e.g., 32, 64) = More model capacity but higher VRAM usage and training time
    # - Decreasing r (e.g., 4, 8) = Less VRAM but may underfit on complex tasks
    # - Rule of thumb: Double r if seeing underfitting, halve if running out of memory
    r=512,

    lora_alpha=256,

    # Target modules for adaptation
    # - Phi-3 fuses the attention projections into a single `qkv_proj` layer and
    #   the MLP gate/up projections into `gate_up_proj`; there are no separate
    #   q_proj / k_proj / v_proj / gate_proj / up_proj modules to match.
    # - Removing layers = Faster training but might miss important adaptations
    # - Start with attention layers (qkv, o) if VRAM limited
    # - Add MLP layers ("gate_up_proj", "down_proj") if more capacity is needed,
    #   or use target_modules="all-linear" to adapt every linear layer
    target_modules=[
        "qkv_proj",
        "o_proj",
    ],

    # use_rslora: Rank-Stabilized scaling
    # - True = Better stability for higher ranks but slightly more compute
    # - False = Original LoRA scaling, might be unstable with high ranks
    use_rslora=True,

    # modules_to_save: Full-rank modules to train
    # - Adding more = Better adaptation but much higher memory cost
    # - Removing all = Pure LoRA training, minimum memory usage
    # - lm_head commonly included for vocabulary adaptation
    # - The input embedding layer of this model is named `embed_tokens`
    modules_to_save=["lm_head", "embed_tokens"],

    # bias: Which bias terms to train
    # - "none" = No bias training, minimum memory
    # - "all" = Train all biases, better for distribution shifts
    # - "lora_only" = Middle ground, only LoRA biases
    bias="lora_only",

    # lora_dropout: Regularization strength
    # - Higher (e.g., 0.2, 0.3) = More regularization, good for small datasets
    # - Lower (e.g., 0.05, 0.0) = Less regularization, better for large datasets
    # - Zero = No dropout, maximum learning but might overfit
    lora_dropout=0.1,

    # task_type: Configures model behavior
    # - "CAUSAL_LM" = Standard autoregressive training
    # - "SEQ_2_SEQ_LM" = For encoder-decoder models
    # - "SEQ_CLS" = For sequence classification
    # - Changing affects loss computation and forward pass behavior
    task_type="CAUSAL_LM"
)


# prepare_model_for_kbit_training is meant for 4-bit/8-bit quantized models.
# On a regular bf16 model it would upcast every weight to float32, doubling
# memory use, so it is only applied when the model really is quantized.
if getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_loaded_in_8bit", False):
    model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
print(model)


# Training arguments: output location, schedule, precision, logging/eval
# cadence and memory-saving options for the Trainer.
training_args = TrainingArguments(
    output_dir="./phi-v3",
    num_train_epochs=2,
    per_device_train_batch_size=1,  # Actual batch size per GPU
    gradient_accumulation_steps=1,   # Micro-batches accumulated before each optimizer step
    
    # Learning rate ranges:
    # - 1e-5: Very conservative, slower learning
    # - 1e-4: Standard for many tasks
    # - 2e-4: Aggressive, good with larger batches
    # - 5e-4+: Very aggressive, risk of divergence
    learning_rate=1e-4,
    
    # Precision options
    bf16=True,    # BFloat16: Better numerical stability than Float16
    fp16=False,   # Disabled when using BFloat16
    
    # Optimizer settings
    optim="adamw_torch",  # default
    
    # Logging and evaluation
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    
    # Warmup steps ranges:
    # - 0-50: Minimal warmup, good for small datasets
    # - 100-500: Standard range for most tasks
    # - 1000+: Very gradual warmup, safer for large learning rates
    warmup_steps=25,
    
    # Gradient clipping ranges:
    # - 0.1-0.3: Very conservative, stable but slow
    # - 0.5-1.0: Standard range
    # - 1.0-3.0: Aggressive, faster but risk of instability
    max_grad_norm=.5,
    
    # Learning rate schedule
    lr_scheduler_type="cosine",  # Cosine: Good default, smooth decay
    
    # Memory optimization flags
    gradient_checkpointing=True,  # True saves memory but ~20% slower,
    gradient_checkpointing_kwargs = {"use_reentrant": False}, # use NON-reentrant checkpointing

)

# Load and process datasets
train_file = "data/input/training_data.jsonl"
eval_file = "data/input/validation_data.jsonl"

train_dataset = load_dataset('json', data_files=train_file, split='train')
print(f"Training dataset size: {len(train_dataset)}")
eval_dataset = load_dataset('json', data_files=eval_file, split='train')
print(f"Eval dataset size: {len(eval_dataset)}")

# Work on a small subset (first 100 train / 20 eval rows). The range is capped
# at the dataset size so files with fewer rows do not raise an IndexError.
train_dataset = train_dataset.select(range(min(100, len(train_dataset))))
eval_dataset = eval_dataset.select(range(min(20, len(eval_dataset))))

# Process datasets
train_processed = process_dataset(train_dataset, tokenizer)
eval_processed = process_dataset(eval_dataset, tokenizer)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_processed,
    eval_dataset=eval_processed,
    data_collator=collate_fn,
)

# Enable gradient checkpointing with the same options configured in
# training_args, and disable the KV cache, which is not used during training.
model.gradient_checkpointing_enable(
    gradient_checkpointing_kwargs=training_args.gradient_checkpointing_kwargs
)
model.config.use_cache = False

trainer.train()
print("Training completed. Preparing to save...")
clear_memory()
# Save the trained adapter and the tokenizer side by side in output_dir
model.save_pretrained(training_args.output_dir, safe_serialization=True)
tokenizer.save_pretrained(training_args.output_dir)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel


document_text = r"""
Summarize each event. Include the date, names of persons involved, and the details of the event. 

Below is the document you will review:

H\nTermination recommended; Resigned\nTerminated prior to IA findings for failure.\nACTION\nOfficer Ornelas resigned on 12/27/2016\ncollateral duty position.\nAppeal in process.\nTermination recommended; Resigned\nOn 3/10/15, demotion to police officer;\nprobation.\nMosqueda resigned on 1/31/19 prior to\nsuspension from hostage negotiator SWAT\nNotice to Terminate served. Resigned\n160-hour suspension, PDSA served\nTerminated 3/18/15 for failure to pass\nremoval from training officer position;\nNotice to Terminate served 7/19/18;\nResigned 10/14/16 prior to the findings.\n9/15/14\n2/24/16:\n5/1/16.\n10/10/2016.\nto pass probation.\nTermination recommended; Officer\nprior to the completion of this case.\n\u20b2\nMar 20 2015\nMar 20 2015\nFinding Dt\nApr 04.2018\nFeb 5 2019\nSep 10 2014\nDec 02.2015\nOct 05 2016\nF\nFinding\nSustained\nSustained\nSustained\nSustained\nSustained Jun 15 2016\nSustained\nSustained |Oct 20 2016\nSustained.\nSustained |Jan 25 2017\nSustained Feb:02-2015\nSustained\nFalsification of Work-Related\nAllegation\nDishonesty\nDishonesty; False Statements\nDocuments; False Statements\nDishonesty; Falsification of Work-\nOn-Duty Sexual Relations\nDocuments\nRelated Documents\nDestruction of Evidence\nFalse Statements\nFalsification of Work-Related\"\nDishonesty\n|On-Duty Sexual Relations\nD\nOfficer Marc Aguilar [1145]\nOfficer Kevin Schindler [1260]\nOfficer Hillary Bjorneboe [1226]\nOfficer Travis Brewer [1132]\nOfficer Jeremy Salcido [1273]\nOfficer Doug Mansker [843]\nDetective Damacio Diaz [854]\nOfficer Manuel Ornelas [989]\nDetective Justin Lewis [1015]\nOfficer Enrique Mosqueda (1242) |Sexual Solicitation\nSr. 
Officer Kyle Ursery [969):\nC\nOct 06 2017\nOct 16 2014\nOct 16 2014\nFeb 25 2015\nJun 06 2016\nAug 02 2016\nJan 09 2015\nOct 13 2018\nMay 31 2016\nOct 05 2016\nInc Received Dt Involved Officer\nJun 24 2014\nB\nInternal\nInternal\nInternal\nInternal\nInternal\nInternal\nInternal\nInternal\nInternal\nInternal\nType\nInternal\nA\nIA2015-006

"""


def load_models(base_model_id, adapter_path=None):
    """Load the base model (as a pipeline) and, optionally, the fine-tuned model.

    Args:
        base_model_id: Model id or path used for the tokenizer and base weights.
        adapter_path: Location of the trained PEFT adapter; when falsy, no
            fine-tuned model is loaded.

    Returns:
        ``(base_pipe, (fine_tuned_model, tokenizer))``. Without an adapter the
        second element is ``(None, None)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(base_model_id)

    # Base model, wrapped in a text-generation pipeline
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        torch_dtype=torch.bfloat16,
        device_map="cuda"
    )
    base_pipe = pipeline(
        "text-generation",
        model=base_model,
        tokenizer=tokenizer,
    )

    if not adapter_path:
        return base_pipe, (None, None)

    # Fine-tuned model: a second copy of the base weights with the adapter on
    # top. It is returned as a raw model (no pipeline) together with the tokenizer.
    backbone = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        torch_dtype=torch.bfloat16,
        device_map="cuda"
    )
    fine_tuned_model = PeftModel.from_pretrained(backbone, adapter_path)

    return base_pipe, (fine_tuned_model, tokenizer)

def generate_with_model(model, tokenizer, prompt, max_new_tokens=35000):
    """Generate a completion for ``prompt`` directly with a model and tokenizer.

    Args:
        model: Causal LM exposing ``generate`` (and normally a ``device`` attribute).
        tokenizer: Tokenizer matching ``model``.
        prompt: Raw prompt text.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded text of the newly generated tokens only (prompt excluded).
    """
    # Put the inputs on the model's own device; fall back to "cuda" when the
    # model does not expose one.
    device = getattr(model, "device", "cuda")
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_k=10,
            top_p=0.95,
            pad_token_id=tokenizer.pad_token_id,
            use_cache=True,
        )

    # Drop the prompt at token level. Slicing the decoded string by
    # len(prompt) is unreliable because decoding does not always reproduce
    # the prompt text character for character.
    completion_ids = outputs[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)

def generate_response(pipe_or_tuple, prompt):
    """Generate text with either a pipeline or a ``(model, tokenizer)`` tuple.

    Args:
        pipe_or_tuple: A ``(model, tokenizer)`` tuple (fine-tuned model) or a
            callable text-generation pipeline (base model).
        prompt: Prompt text, or an already-built list of chat messages.

    Returns:
        The generated text.
    """
    # Fine-tuned model: call the model directly.
    if isinstance(pipe_or_tuple, tuple):
        model, tokenizer = pipe_or_tuple
        return generate_with_model(model, tokenizer, prompt)

    # Base model: the pipeline takes chat messages, so wrap plain text in a
    # single user turn.
    messages = prompt if isinstance(prompt, list) else [{"role": "user", "content": prompt}]

    results = pipe_or_tuple(
        messages,
        max_new_tokens=4096,
        return_full_text=False,
        temperature=0.7,
        do_sample=True,
        top_k=10,
        top_p=0.95,
    )
    return results[0]['generated_text']

def compare_models(test_prompt, base_model_id, adapter_path):
    """Generate and print responses from the base and fine-tuned models.

    Args:
        test_prompt: Prompt sent to both models.
        base_model_id: Model id or path of the base model.
        adapter_path: Location of the trained adapter. When it is falsy no
            fine-tuned model is loaded and only the base model is run.

    Returns:
        ``(base_response, fine_tuned_response)``; ``fine_tuned_response`` is
        ``None`` when no fine-tuned model was loaded.
    """
    print("Loading models...")
    base_pipe, fine_tuned = load_models(base_model_id, adapter_path)

    print("\nGenerating base model response...")
    base_response = generate_response(base_pipe, test_prompt)

    # load_models returns (None, None) when there is no adapter; skip the
    # fine-tuned generation in that case instead of failing on a None model.
    fine_tuned_model, _ = fine_tuned
    if fine_tuned_model is None:
        print("\nNo fine-tuned model loaded; skipping fine-tuned generation.")
        fine_tuned_response = None
    else:
        print("\nGenerating fine-tuned model response...")
        fine_tuned_response = generate_response(fine_tuned, test_prompt)

    print("\n=== Base Model Response ===")
    print(base_response)
    print("\n=== Base Model End ===")
    print("\n=== Fine-tuned Model Response ===")
    print(fine_tuned_response)
    print("\n=== Fine-tuned Model Response End ===")
    return base_response, fine_tuned_response
    
# Compare the base model with the adapter produced by the training cell.
base_model_id = "microsoft/Phi-3.5-mini-instruct"
# The training cell saves the adapter to its output_dir ("./phi-v3"); point
# this at a specific checkpoint directory instead if you want to test one.
adapter_path = "./phi-v3"
base_response, fine_tuned_response = compare_models(document_text, base_model_id, adapter_path)