In [None]:
# Ensure GPU runtime
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
!pip install datasets
print("=== Installation Complete ===")

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-yxc9htmq/unsloth_1793bd66c669488ab87e51ccf51bdb8e
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-yxc9htmq/unsloth_1793bd66c669488ab87e51ccf51bdb8e
  Resolved https://github.com/unslothai/unsloth.git to commit c9b9a366e7a6110f9d58d5ed8db6bd27bc97fb71
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.3.17 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.3.17-py3-none-any.whl.metadata (8.0 kB)
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.g

Collecting xformers
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting trl<0.9.0
  Downloading trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl (43.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.4/43.4 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xformers, trl
  Attempting uninstall: trl
    Found existing installation: trl 0.15.2
    Uninstalling trl-0.15.2:
      Successfully uninstalled trl-0.15.2
Successfully installed trl-0.8.6 xformers-0.0.29.post3
=== Installation Complete ===


In [None]:
import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
from peft import PeftModel # <<< Needed to load adapters manually
import os
import gc
import time
# from huggingface_hub import login # Optional login

print("=== Imports Complete ===")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.6.0+cu124 with CUDA 1204 (you have 2.5.1+cu124)
    Python  3.11.11 (you have 3.11.11)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!
=== Imports Complete ===


In [None]:
# ---------------------------------------------------------------------------
# Shared settings: base checkpoint and tokenisation limits
# ---------------------------------------------------------------------------
model_name = "unsloth/tinyllama-bnb-4bit"
max_seq_length = 1024
dtype = None
load_in_4bit = True

# ---------------------------------------------------------------------------
# LoRA hyperparameters -- declared once and reused by both training phases
# so the adapter structure is identical when weights are reloaded.
# ---------------------------------------------------------------------------
lora_r = 16
lora_alpha = 32
lora_dropout = 0

# ---------------------------------------------------------------------------
# Phase 1: short initial fine-tune
# ---------------------------------------------------------------------------
output_dir_phase1 = "tinyllama_dolly_phase1_checkpoint"
dataset_subset_size_phase1 = 500
training_max_steps_phase1 = 40

# ---------------------------------------------------------------------------
# Phase 2: continued fine-tune starting from the Phase 1 adapters
# (40 + 60 = 100 optimizer steps overall, with a lower learning rate)
# ---------------------------------------------------------------------------
output_dir_phase2 = "tinyllama_dolly_phase2_continued"
dataset_subset_size_phase2 = 500
training_max_steps_phase2 = 60
learning_rate_phase2 = 1e-4

# Echo the effective configuration in a single write.
_config_summary = [
    "--- Configuration ---",
    f"Model: {model_name}",
    f"Max Seq Length: {max_seq_length}",
    f"LoRA Config: r={lora_r}, alpha={lora_alpha}, dropout={lora_dropout}",
    "",
    "Phase 1 (Initial):",
    f"  Output Dir: {output_dir_phase1}",
    f"  Dataset Size: {dataset_subset_size_phase1}",
    f"  Max Steps: {training_max_steps_phase1}",
    "",
    "Phase 2 (Continued):",
    f"  Output Dir: {output_dir_phase2}",
    f"  Dataset Size: {dataset_subset_size_phase2}",
    f"  Max Steps: {training_max_steps_phase2}",
    f"  Learning Rate: {learning_rate_phase2}",
    "=== Configuration Set ===",
]
print("\n".join(_config_summary))

--- Configuration ---
Model: unsloth/tinyllama-bnb-4bit
Max Seq Length: 1024
LoRA Config: r=16, alpha=32, dropout=0

Phase 1 (Initial):
  Output Dir: tinyllama_dolly_phase1_checkpoint
  Dataset Size: 500
  Max Steps: 40

Phase 2 (Continued):
  Output Dir: tinyllama_dolly_phase2_continued
  Dataset Size: 500
  Max Steps: 60
  Learning Rate: 0.0001
=== Configuration Set ===


In [None]:
print("--- Loading Model and Tokenizer (Base for Phase 1) ---")
start_time = time.time()
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
end_time = time.time()
print(f"Model loaded in {end_time - start_time:.2f}s.")

# ChatML-style Jinja template, used only when the tokenizer ships without one.
# Built from adjacent literals; the resulting string is identical to a single
# literal containing the same text.
chatml_template = (
    "{% for message in messages %}"
    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
    "{% endfor %}"
    "{% if add_generation_prompt %}{{'<|im_start|>assistant\n'}}{% endif %}"
)

if tokenizer.chat_template is not None:
    print("Tokenizer chat template already set.")
else:
    tokenizer.chat_template = chatml_template
    print("Manually set tokenizer.chat_template to ChatML format.")

# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("Set pad_token=eos_token.")

print("=== Model and Tokenizer Loaded (Phase 1) ===")

--- Loading Model and Tokenizer (Base for Phase 1) ---
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/762M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/948 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Model loaded in 20.00s.
Manually set tokenizer.chat_template to ChatML format.
=== Model and Tokenizer Loaded (Phase 1) ===


In [None]:
print("--- Configuring LoRA (Phase 1) ---")
# Wrap the base model with LoRA adapters. r / alpha / dropout come from the
# configuration cell so that Phase 2 can rebuild the exact same structure.
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_r,
    lora_alpha = lora_alpha,
    lora_dropout = lora_dropout,
    bias = "none",
    use_gradient_checkpointing = True,
    random_state = 3407,
    # Adapters are attached to the attention and MLP projection layers.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
)
print("LoRA configured for Phase 1:")
print(f"  r={lora_r}, alpha={lora_alpha}, dropout={lora_dropout}")
# print_trainable_parameters() writes its summary itself and returns None;
# wrapping it in print() only added a stray "None" line to the output.
model.print_trainable_parameters()
print("=== LoRA Configured (Phase 1) ===")

--- Configuring LoRA (Phase 1) ---


Unsloth 2025.3.19 patched 22 layers with 22 QKV layers, 22 O layers and 22 MLP layers.


LoRA configured for Phase 1:
  r=16, alpha=32, dropout=0
trainable params: 12,615,680 || all params: 1,112,664,064 || trainable%: 1.1338
None
=== LoRA Configured (Phase 1) ===


In [None]:
print("--- Loading and Preparing Dataset (Phase 1) ---")
try:
    # Fixed-seed shuffle, then keep the first N rows as the Phase 1 subset.
    dataset_phase1 = (
        load_dataset("databricks/databricks-dolly-15k", split="train")
        .shuffle(seed=42)
        .select(range(dataset_subset_size_phase1))
    )
    print(f"Loaded {len(dataset_phase1)} Dolly examples for Phase 1.")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# Formatting function shared by both phases (define once, reuse in Phase 2).
def format_chat_prompt(examples):
    """Render a batch of Dolly rows as chat-formatted training strings.

    Args:
        examples: Batched mapping with parallel lists under the keys
            "instruction", "context" and "response".

    Returns:
        dict: {"text": [...]} with one string per input row, produced by the
        notebook-level `tokenizer.apply_chat_template`. A row that cannot be
        formatted yields "" (best effort) and a RuntimeWarning is emitted so
        the failure is visible instead of being silently swallowed.
    """
    import warnings  # local import keeps this cell self-contained

    formatted_texts = []
    rows = zip(examples["instruction"], examples["context"], examples["response"])
    for instruction, context, response in rows:
        user_content = instruction
        # Only prepend the context when it contains non-whitespace text.
        if context and context.strip():
            user_content = f"Context: {context.strip()}\n\nInstruction: {instruction.strip()}"
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": response},
        ]
        try:
            formatted = tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=False
            )
        except Exception as exc:
            # Keep the row count stable, but report what went wrong.
            warnings.warn(
                f"format_chat_prompt: could not format an example ({exc!r}); using empty text.",
                RuntimeWarning,
            )
            formatted = ""
        formatted_texts.append(formatted)
    return {"text": formatted_texts}  # output column name is 'text'

print("Applying formatting for Phase 1...")
try:
    # Drop every original column so only the generated 'text' column remains.
    source_columns = list(dataset_phase1.features)
    dataset_phase1 = dataset_phase1.map(
        format_chat_prompt,
        batched=True,
        num_proc=2,
        remove_columns=source_columns,
    )
    print("Formatting applied for Phase 1.")
    print("Dataset features:", dataset_phase1.features)
except Exception as e:
    print(f"Error mapping dataset: {e}")
    raise

print("=== Dataset Ready (Phase 1) ===")

--- Loading and Preparing Dataset (Phase 1) ---


README.md:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

databricks-dolly-15k.jsonl:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

Loaded 500 Dolly examples for Phase 1.
Applying formatting for Phase 1...


Map (num_proc=2):   0%|          | 0/500 [00:00<?, ? examples/s]

Formatting applied for Phase 1.
Dataset features: {'text': Value(dtype='string', id=None)}
=== Dataset Ready (Phase 1) ===


In [None]:
print("--- Configuring Trainer (Phase 1) ---")
try:
    # Use bf16 when the GPU supports it, otherwise fall back to fp16.
    bf16_available = torch.cuda.is_bf16_supported()

    training_args_phase1 = TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,          # 2 x 8 = effective batch of 16
        warmup_steps=5,
        max_steps=training_max_steps_phase1,
        learning_rate=2e-4,
        fp16=not bf16_available,
        bf16=bf16_available,
        logging_steps=10,
        optim="adamw_8bit",
        seed=3407,
        output_dir=output_dir_phase1,           # logs and intermediate checkpoints
        save_strategy="steps",
        save_steps=20,
        report_to="none",                       # no external experiment tracking
    )

    trainer_phase1 = SFTTrainer(
        model=model,                            # LoRA-wrapped model from the previous cell
        tokenizer=tokenizer,
        train_dataset=dataset_phase1,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=True,                           # pack short samples into full-length sequences
        args=training_args_phase1,
    )
    print("Trainer configured for Phase 1.")
except Exception as e:
    print(f"Error configuring trainer: {e}")
    raise
print("=== Trainer Configured (Phase 1) ===")

--- Configuring Trainer (Phase 1) ---


Generating train split: 0 examples [00:00, ? examples/s]

Trainer configured for Phase 1.
=== Trainer Configured (Phase 1) ===


In [None]:
print(f"--- Starting Training (Phase 1: {training_max_steps_phase1} steps) ---")
# Release unreferenced Python objects and cached CUDA blocks before training.
gc.collect()
torch.cuda.empty_cache()

start_train_time = time.time()
try:
    trainer_phase1.train()
    end_train_time = time.time()
    elapsed_minutes = (end_train_time - start_train_time) / 60
    print(f"Phase 1 training finished in {elapsed_minutes:.2f} minutes.")
except Exception as e:
    print(f"Error during training: {e}")
    raise
print("=== Training Complete (Phase 1) ===")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 136 | Num Epochs = 5 | Total steps = 40
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 12,615,680/4,000,000,000 (0.32% trained)


--- Starting Training (Phase 1: 40 steps) ---
Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,7.1418
20,5.4013
30,2.7447
40,1.0689


Phase 1 training finished in 2.43 minutes.
=== Training Complete (Phase 1) ===


In [None]:
# Define the path where the initial adapters will be saved
adapter_checkpoint_path = os.path.join(output_dir_phase1, "phase1_final_adapters")
os.makedirs(adapter_checkpoint_path, exist_ok=True) # Ensure directory exists

print(f"\n--- Saving Phase 1 LoRA adapters to: {adapter_checkpoint_path} ---")
try:
    # Save the adapters trained in Phase 1
    trainer_phase1.model.save_pretrained(adapter_checkpoint_path) # Save the PEFT model
    tokenizer.save_pretrained(adapter_checkpoint_path) # Save tokenizer too
    print("Adapters and tokenizer saved.")
    # Verify files exist
    print("Verifying saved files:")
    !ls -lh {adapter_checkpoint_path}
except Exception as e: print(f"Error saving adapters: {e}"); raise
print("=== Adapters Saved (Phase 1) ===")


--- Saving Phase 1 LoRA adapters to: tinyllama_dolly_phase1_checkpoint/phase1_final_adapters ---
Adapters and tokenizer saved.
Verifying saved files:
total 53M
-rw-r--r-- 1 root root  797 Apr  8 09:03 adapter_config.json
-rw-r--r-- 1 root root  49M Apr  8 09:03 adapter_model.safetensors
-rw-r--r-- 1 root root 5.0K Apr  8 09:03 README.md
-rw-r--r-- 1 root root  552 Apr  8 09:03 special_tokens_map.json
-rw-r--r-- 1 root root 1.2K Apr  8 09:03 tokenizer_config.json
-rw-r--r-- 1 root root 3.5M Apr  8 09:03 tokenizer.json
-rw-r--r-- 1 root root 489K Apr  8 09:03 tokenizer.model
=== Adapters Saved (Phase 1) ===


In [None]:
def release_names(names, namespace):
    """Delete each name in `names` from `namespace` if it is present.

    Args:
        names: Iterable of variable names to remove.
        namespace: Mutable mapping to remove them from (e.g. ``globals()``).

    Returns:
        list: The names that were actually present and removed, in order.
    """
    removed = []
    for name in names:
        if name in namespace:
            del namespace[name]
            removed.append(name)
    return removed


print("\n--- Simulating Restart: Clearing Model and Trainer Objects ---")
# Drop the Phase 1 model and trainer so Phase 2 has to reload everything.
# Each name is released independently: a missing `model` no longer prevents
# `trainer_phase1` from being deleted or the caches from being flushed.
# The dataset variables are kept on purpose in case Phase 2 reuses them.
_names_to_clear = ("model", "trainer_phase1")
_names_cleared = release_names(_names_to_clear, globals())

# Always run the cleanup, whichever names were still defined.
gc.collect()
torch.cuda.empty_cache()

if len(_names_cleared) == len(_names_to_clear):
    print("Cleared model, trainer, and CUDA cache.")
else:
    print("Model or trainer object already deleted or not defined.")
print("=== State Cleared ===")


--- Simulating Restart: Clearing Model and Trainer Objects ---
Cleared model, trainer, and CUDA cache.
=== State Cleared ===


In [None]:
print("\n--- Reloading Base Model and Tokenizer (for Phase 2) ---")
# Important: reload the ORIGINAL base checkpoint here, not the Phase 1 output
# directory -- the Phase 1 adapters are attached in a later cell.
start_time = time.time()
try:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    end_time = time.time()
    print(f"Base model reloaded in {end_time - start_time:.2f}s.")

    # The freshly loaded tokenizer needs the same template / padding fix-ups
    # that were applied before Phase 1.
    if tokenizer.chat_template is not None:
        print("Tokenizer chat template already set.")
    else:
        tokenizer.chat_template = chatml_template
        print("Manually set tokenizer.chat_template.")

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        print("Set pad_token=eos_token.")
except Exception as e:
    print(f"Error reloading base model: {e}")
    raise
print("=== Base Model Reloaded (Phase 2) ===")


--- Reloading Base Model and Tokenizer (for Phase 2) ---
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Base model reloaded in 7.58s.
Manually set tokenizer.chat_template.
=== Base Model Reloaded (Phase 2) ===


In [None]:
print("\n--- Re-applying LoRA Configuration (Must Match Phase 1) ---")
# Rebuild the LoRA structure with exactly the hyperparameters used in Phase 1
# (they are read from the shared configuration cell).
try:
    model = FastLanguageModel.get_peft_model(
        model,
        r = lora_r,
        lora_alpha = lora_alpha,
        lora_dropout = lora_dropout,
        bias = "none",
        use_gradient_checkpointing = True,
        random_state = 3407,
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                          "gate_proj", "up_proj", "down_proj",],
    )
    print("LoRA configuration re-applied (structure ready for weights):")
    print(f"  r={lora_r}, alpha={lora_alpha}, dropout={lora_dropout}")
    # At this point the adapters are freshly initialised -- they do NOT yet
    # hold the Phase 1 weights; those are loaded in a separate step.
    # print_trainable_parameters() prints by itself and returns None, so it is
    # called directly rather than wrapped in print() (which emitted "None").
    model.print_trainable_parameters()
except Exception as e:
    print(f"Error re-applying LoRA config: {e}")
    raise
print("=== LoRA Re-applied ===")


--- Re-applying LoRA Configuration (Must Match Phase 1) ---
LoRA configuration re-applied (structure ready for weights):
  r=16, alpha=32, dropout=0
trainable params: 12,615,680 || all params: 1,112,664,064 || trainable%: 1.1338
None
=== LoRA Re-applied ===


In [None]:
print(f"\n--- Loading Saved Adapter Weights from Phase 1 ({adapter_checkpoint_path}) ---")
# Load the trained Phase 1 LoRA weights so that Phase 2 continues from them.
#
# Calling PeftModel.from_pretrained() on a model that is ALREADY a PeftModel
# nests a second PEFT wrapper, and its default is_trainable=False freezes the
# loaded adapter. The recorded Phase 2 run shows the symptom: 0 trainable
# parameters and a loss that never moves. Instead, the saved weights are loaded
# into the existing adapter and kept trainable.
try:
    if not os.path.exists(adapter_checkpoint_path):
        raise FileNotFoundError(f"Adapter checkpoint path not found: {adapter_checkpoint_path}. Ensure Phase 1 saving was successful.")

    if isinstance(model, PeftModel):
        # LoRA structure already exists: overwrite the active adapter's weights
        # in place with the saved Phase 1 weights.
        _active = model.active_adapter
        _adapter_name = _active if isinstance(_active, str) else _active[0]
        model.load_adapter(
            adapter_checkpoint_path,
            adapter_name=_adapter_name,
            is_trainable=True,
        )
    else:
        # Plain base model (LoRA re-apply step was skipped): let PEFT build the
        # adapter from the saved config and keep it trainable.
        model = PeftModel.from_pretrained(model, adapter_checkpoint_path, is_trainable=True)

    # Fail fast if nothing is trainable -- continuing would waste a full run.
    trainable_param_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
    if trainable_param_count == 0:
        raise RuntimeError(
            "No trainable parameters after loading the Phase 1 adapters; "
            "continued fine-tuning would not update any weights."
        )

    print("Phase 1 adapter weights loaded onto the model.")
    print(f"  Trainable parameters after loading: {trainable_param_count:,}")
except Exception as e:
    print(f"Error loading Phase 1 adapter weights: {e}")
    raise
print("=== Phase 1 Weights Loaded ===")


--- Loading Saved Adapter Weights from Phase 1 (tinyllama_dolly_phase1_checkpoint/phase1_final_adapters) ---
Phase 1 adapter weights loaded onto the model.
=== Phase 1 Weights Loaded ===




In [None]:
print("\n--- Loading and Preparing Dataset (Phase 2) ---")
# Phase 2 draws its own subset (different shuffle seed) rather than reusing the
# Phase 1 split. To reuse the Phase 1 data instead, assign
# `dataset_phase2 = dataset_phase1` and skip the load/format steps below.
try:
    dataset_phase2 = (
        load_dataset("databricks/databricks-dolly-15k", split="train")
        .shuffle(seed=1234)
        .select(range(dataset_subset_size_phase2))
    )
    print(f"Loaded {len(dataset_phase2)} Dolly examples for Phase 2.")
except Exception as e:
    print(f"Error loading Phase 2 dataset: {e}")
    raise

print("Applying formatting for Phase 2...")
try:
    # Same formatter as Phase 1; only the generated 'text' column is kept.
    phase2_source_columns = list(dataset_phase2.features)
    dataset_phase2 = dataset_phase2.map(
        format_chat_prompt,
        batched=True,
        num_proc=2,
        remove_columns=phase2_source_columns,
    )
    print("Formatting applied for Phase 2.")
    print("Dataset features:", dataset_phase2.features)
except NameError:
    print("ERROR: format_chat_prompt function not defined. Run Cell 6 first.")
    raise
except Exception as e:
    print(f"Error mapping Phase 2 dataset: {e}")
    raise

print("=== Dataset Ready (Phase 2) ===")


--- Loading and Preparing Dataset (Phase 2) ---
Loaded 500 Dolly examples for Phase 2.
Applying formatting for Phase 2...


Map (num_proc=2):   0%|          | 0/500 [00:00<?, ? examples/s]

Formatting applied for Phase 2.
Dataset features: {'text': Value(dtype='string', id=None)}
=== Dataset Ready (Phase 2) ===


In [None]:
print("\n--- Configuring Trainer (Phase 2 - Continued) ---")
try:
    # Use bf16 when the GPU supports it, otherwise fall back to fp16.
    bf16_available = torch.cuda.is_bf16_supported()

    training_args_phase2 = TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        warmup_steps=5,                          # warmup restarts for the continued run
        max_steps=training_max_steps_phase2,
        learning_rate=learning_rate_phase2,      # Phase 2 learning rate from the config cell
        fp16=not bf16_available,
        bf16=bf16_available,
        logging_steps=10,
        optim="adamw_8bit",
        seed=3407,
        output_dir=output_dir_phase2,            # separate directory from Phase 1
        save_strategy="steps",
        save_steps=30,
        report_to="none",
    )

    trainer_phase2 = SFTTrainer(
        model=model,                             # model carrying the Phase 1 adapters
        tokenizer=tokenizer,
        train_dataset=dataset_phase2,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=True,
        args=training_args_phase2,
    )
    print("Trainer configured for Phase 2 (Continued).")
    print(f"  Learning Rate: {learning_rate_phase2}")
except Exception as e:
    print(f"Error configuring Phase 2 trainer: {e}")
    raise
print("=== Trainer Configured (Phase 2) ===")


--- Configuring Trainer (Phase 2 - Continued) ---


Generating train split: 0 examples [00:00, ? examples/s]

Trainer configured for Phase 2 (Continued).
  Learning Rate: 0.0001
=== Trainer Configured (Phase 2) ===


In [None]:
print(f"--- Starting Training (Phase 2 - Continued: {training_max_steps_phase2} steps) ---")
# Continue optimising the LoRA weights that were loaded from Phase 1.
# Watch the logged loss: it should keep decreasing or level off.
gc.collect()
torch.cuda.empty_cache()

start_train_time = time.time()
try:
    trainer_phase2.train()
    end_train_time = time.time()
    elapsed_minutes = (end_train_time - start_train_time) / 60
    print(f"Phase 2 (Continued) training finished in {elapsed_minutes:.2f} minutes.")
except Exception as e:
    print(f"Error during Phase 2 training: {e}")
    raise
print("=== Training Complete (Phase 2) ===")

--- Starting Training (Phase 2 - Continued: 60 steps) ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 118 | Num Epochs = 9 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 0/4,000,000,000 (0.00% trained)


Step,Training Loss
10,7.8371
20,7.8146
30,7.8285
40,7.8079
50,7.8326
60,7.8234


Phase 2 (Continued) training finished in 3.25 minutes.
=== Training Complete (Phase 2) ===


In [None]:
# Define the path for the final, updated adapters from the entire process
final_continued_adapter_path = os.path.join(output_dir_phase2, "final_continued_adapters")
os.makedirs(final_continued_adapter_path, exist_ok=True) # Ensure directory exists

print(f"\n--- Saving Final Updated LoRA adapters to: {final_continued_adapter_path} ---")
try:
    # Save the adapters that have been trained through both Phase 1 and Phase 2
    trainer_phase2.model.save_pretrained(final_continued_adapter_path)
    tokenizer.save_pretrained(final_continued_adapter_path)
    print("Final adapters and tokenizer saved.")
    print("Verifying saved files:")
    !ls -lh {final_continued_adapter_path}
except Exception as e: print(f"Error saving final adapters: {e}"); raise
print("=== Final Updated Adapters Saved ===")


--- Saving Final Updated LoRA adapters to: tinyllama_dolly_phase2_continued/final_continued_adapters ---
Final adapters and tokenizer saved.
Verifying saved files:
total 53M
-rw-r--r-- 1 root root  797 Apr  8 09:07 adapter_config.json
-rw-r--r-- 1 root root  49M Apr  8 09:07 adapter_model.safetensors
-rw-r--r-- 1 root root 5.0K Apr  8 09:07 README.md
-rw-r--r-- 1 root root  552 Apr  8 09:07 special_tokens_map.json
-rw-r--r-- 1 root root 1.2K Apr  8 09:07 tokenizer_config.json
-rw-r--r-- 1 root root 3.5M Apr  8 09:07 tokenizer.json
-rw-r--r-- 1 root root 489K Apr  8 09:07 tokenizer.model
=== Final Updated Adapters Saved ===


In [None]:
print("\n--- Running Inference Test (with Final Phase 2 Adapters) ---")
import warnings  # used only to silence generation-time warnings below

# Pick the model to test. An explicit existence check replaces the previous
# `except NameError`, which could also hide unrelated NameErrors raised while
# preparing the model.
if "trainer_phase2" in globals():
    final_model = trainer_phase2.model
    FastLanguageModel.for_inference(final_model)
    final_model.eval()
    print("Using final model from trainer_phase2 for inference.")
else:
    print("Trainer object not found. Reloading model and adapters...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    if tokenizer.chat_template is None:
        tokenizer.chat_template = chatml_template
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Attach the saved adapters directly to the base model. PEFT rebuilds the
    # LoRA layers from the saved adapter config, so get_peft_model() must NOT
    # be called first -- doing so nests a second PEFT wrapper around the model.
    final_model = PeftModel.from_pretrained(model, final_continued_adapter_path)
    FastLanguageModel.for_inference(final_model)
    final_model.eval()
    print("Model reloaded with final continued adapters.")

# Build the test prompt with the same chat template used for training.
test_instruction = "What is full fine-tuning?"
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": test_instruction},
]
device = "cuda" if torch.cuda.is_available() else "cpu"
inputs = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
)
# Some tokenizer versions return an encoding object instead of a bare tensor.
if not isinstance(inputs, torch.Tensor) and hasattr(inputs, "input_ids"):
    inputs = inputs.input_ids
inputs = inputs.to(device)
# Single unpadded prompt: every position is a real token. Passing the mask
# explicitly avoids ambiguity when pad_token_id equals eos_token_id.
attention_mask = torch.ones_like(inputs)

generation_params = {
    "max_new_tokens": 150,
    "use_cache": True,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.9,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.eos_token_id,
}

print("\nGenerating response with final adapters...")
response = "[Generation Error]"
try:
    # Silence warnings for the generation call only, not for the whole kernel.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        with torch.no_grad():
            outputs = final_model.generate(
                inputs, attention_mask=attention_mask, **generation_params
            )
    input_len = inputs.shape[-1]
    output_len = outputs.shape[-1]
    if output_len > input_len:
        # Decode only the newly generated tokens, not the echoed prompt.
        response = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True).strip()
    else:
        response = "[No new tokens]"
except Exception as e:
    # Best effort: report the problem and fall through with the placeholder.
    print(f"Generation/Decoding Error: {e}")

print(f"\nUser: {test_instruction}")
print(f"\nAssistant (Final Adapters):\n{response}")

# Clean up: drop large objects (if defined) and release cached GPU memory.
for _name in ("final_model", "model", "trainer_phase2", "inputs", "attention_mask", "outputs"):
    globals().pop(_name, None)
gc.collect()
torch.cuda.empty_cache()
print("\n=== Inference Test Complete (Final Adapters) ===")


--- Running Inference Test (with Final Phase 2 Adapters) ---
Trainer object not found. Reloading model and adapters...
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model reloaded with final continued adapters.

Generating response with final adapters...

User: What is full fine-tuning?

Assistant (Final Adapters):
to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to 