In [1]:
# Ensure GPU runtime is selected (T4, L4, A100 recommended)
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
!pip install datasets # For loading the OSCAR dataset
print("=== Installation Complete ===")

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-zw0zvu1f/unsloth_079fea4d5c7b4b70a12c00301b4d53a0
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-zw0zvu1f/unsloth_079fea4d5c7b4b70a12c00301b4d53a0
  Resolved https://github.com/unslothai/unsloth.git to commit c9b9a366e7a6110f9d58d5ed8db6bd27bc97fb71
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.3.17 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.3.17-py3-none-any.whl.metadata (8.0 kB)
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.g

Collecting xformers
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting trl<0.9.0
  Downloading trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl (43.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.4/43.4 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xformers, trl
  Attempting uninstall: trl
    Found existing installation: trl 0.15.2
    Uninstalling trl-0.15.2:
      Successfully uninstalled trl-0.15.2
Successfully installed trl-0.8.6 xformers-0.0.29.post3
=== Installation Complete ===


In [1]:
import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset, Dataset # Import Dataset class for potential subset creation
# from peft import LoraConfig # Not strictly needed for basic config
import os
import gc # Garbage collector

# Hugging Face Login (Required for Llama 3 models)
from huggingface_hub import login
# IMPORTANT: Replace "hf_YOUR_HUGGINGFACE_TOKEN" with your actual HF token
try:
    login("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", add_to_git_credential=False)
    print("Hugging Face login successful.")
except Exception as e:
    print(f"Hugging Face login failed: {e}")
    print("Please ensure you have provided a valid Hugging Face token.")

print("=== Imports and Login Complete ===")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Hugging Face login successful.
=== Imports and Login Complete ===


In [10]:
# --- Major Parameters ---
TARGET_LANGUAGE_CODE = "hi" # Hindi language code (use 'sw' for Swahili, 'fr' for French etc.)
TARGET_LANGUAGE_NAME = "Hindi" # For print statements

max_seq_length = 2048 # Adjust based on VRAM. Longer helps learn context.
dtype = None # Auto-detect
load_in_4bit = True # Use 4-bit quantization

# --- Use the BASE MODEL for pretraining ---
model_name = "unsloth/Qwen2-0.5B-bnb-4bit" # Use Qwen2 0.5B BASE model (4-bit)

print(f"Configuration:")
print(f"  Model Name: {model_name} (BASE model)")
print(f"  Target Language: {TARGET_LANGUAGE_NAME} ({TARGET_LANGUAGE_CODE})")
print(f"  Max Sequence Length: {max_seq_length}")
print(f"  Load in 4-bit: {load_in_4bit}")
print("=== Configuration Set ===")

Configuration:
  Model Name: unsloth/Qwen2-0.5B-bnb-4bit (BASE model)
  Target Language: Hindi (hi)
  Max Sequence Length: 2048
  Load in 4-bit: True
=== Configuration Set ===


In [11]:
import time
start_time = time.time()
print(f"Loading BASE model ({model_name}) and tokenizer...")

# Load the base model specifically
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # Optional token if login failed
)

end_time = time.time()
print(f"Base model and tokenizer loaded in {end_time - start_time:.2f} seconds.")
# Note: We are NOT applying PEFT/LoRA *yet*. We load the base model first.
# We will add LoRA adapters before training, as is standard practice with Unsloth PEFT.
print("=== Base Model and Tokenizer Loaded ===")

Loading BASE model (unsloth/Qwen2-0.5B-bnb-4bit) and tokenizer...
==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.50.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/457M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/107 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Base model and tokenizer loaded in 16.59 seconds.
=== Base Model and Tokenizer Loaded ===


In [13]:
# === Cell 5: Load Raw Text Dataset (mc4 - Alternative) ===

# mc4 is another large multilingual dataset derived from Common Crawl.
# We need to specify the language subset.
dataset_name = "mc4"
subset_name = TARGET_LANGUAGE_CODE # Use the language code defined in Cell 3 (e.g., "hi" for Hindi)
subset_size = 20000 # Number of examples for assignment (adjust as needed)
min_length = 50     # Minimum character length filter

print(f"Loading alternative dataset: {dataset_name}, subset: {subset_name}")
print(f"Streaming dataset and taking the first {subset_size} examples...")

try:
    # Load mc4 using streaming, specifying the language code as the subset name
    streamed_dataset = load_dataset(
        dataset_name,
        subset_name, # Pass the language code here for mc4
        split="train",
        streaming=True,
        # trust_remote_code=True # May sometimes be needed for mc4 loading scripts
    )
    print("Dataset stream opened.")

    # Take a subset and filter (same logic as before)
    print(f"Filtering for examples with minimum length {min_length}...")
    # mc4 might have different column names, check features below. Assume 'text' for now.
    text_column = "text" # Default assumption for mc4
    filtered_iterable = (
        example for example in streamed_dataset.take(subset_size * 2) # Take more initially
        if len(example.get(text_column, "")) >= min_length
    )

    # Convert the filtered iterable subset to a standard Hugging Face Dataset
    print(f"Converting filtered stream to Dataset object (limit: {subset_size})...")
    dataset_list = [example for _, example in zip(range(subset_size), filtered_iterable)]

    # Create the dataset object from the list of dictionaries
    if not dataset_list:
         raise ValueError(f"No data found for language '{subset_name}' in mc4 in the first {subset_size*2} streamed examples with min length {min_length}. Try increasing subset_size or check language code.")

    dataset = Dataset.from_list(dataset_list)
    print(f"Successfully created dataset subset with {len(dataset)} examples.")

    # *** IMPORTANT: Verify the actual text column name ***
    print("\nDataset features:", dataset.features)
    if text_column not in dataset.features:
        print(f"*** WARNING: Expected text column '{text_column}' not found. Please check features and update the 'text_column' variable and potentially 'dataset_text_field' in Cell 7. ***")
        # You might need to look at dataset.features and choose the correct column containing the text.

    print(f"\nFirst example (using column '{text_column}'):")
    if text_column in dataset.features:
        print(dataset[0][text_column][:500]) # Print first 500 chars
    else:
        print("Cannot display example, text column name incorrect or missing.")


except Exception as e:
    print(f"Error loading or processing dataset '{dataset_name}' subset '{subset_name}': {e}")
    print(f"Please check the language code ('{subset_name}') and dataset availability. You might need to install 'tensorflow_datasets' if prompted by the error message for mc4.")
    # Sometimes mc4 loading depends on tfds: !pip install tensorflow_datasets
    raise

print("=== Raw Text Dataset (mc4) Loaded ===")

Loading alternative dataset: mc4, subset: hi
Streaming dataset and taking the first 20000 examples...
Dataset stream opened.
Filtering for examples with minimum length 50...
Converting filtered stream to Dataset object (limit: 20000)...
Successfully created dataset subset with 20000 examples.

Dataset features: {'text': Value(dtype='string', id=None), 'timestamp': Value(dtype='string', id=None), 'url': Value(dtype='string', id=None)}

First example (using column 'text'):
6 साल की बच्ची अपनी मां के लिए बनी मां | UPUKLive
6 साल की बच्ची अपनी मां के लिए बनी मां
जो प्यार, करुणा और देखभाल का स्वभाव ईश्वर ने बेटियों को दिया है, वह बेटों को हासिल नहीं है। मां को ब्रेन हैमरेज हो जाने के बाद छह साल की मासूम ने जिस तरह से मां की देखभाल की, उसे देखकर लगता है कि मां असल में बेटी है और बेटी मां है। काई चेंगचेंग जब महज छह साल की थी, तो उसकी मां चेन ली को ब्रेन हैमरेज हो गया था। इसकी वजह से उनकी याददाश्त खराब हो गई।
बीते चार साल से अपनी मां को पढ़ना, लिखना और बोलना सिखाना ही क
=== Raw Text Dataset (m

In [14]:
# Even for continued pretraining, PEFT/LoRA is often used with Unsloth
# to make training feasible on limited hardware and manage checkpoints.
# The LoRA adapters will learn the new language patterns.
print("Configuring LoRA adapters for pretraining...")

model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # Rank can be higher for pretraining (e.g., 32, 64) as we want to learn broader patterns
    lora_alpha = 64, # Adjust alpha accordingly (often 2*r)
    lora_dropout = 0, # Set to 0 for Unsloth fast patching
    bias = "none",
    use_gradient_checkpointing = True,
    random_state = 3407,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
)

print("LoRA configured for pretraining:")
print(model.print_trainable_parameters())
print("=== LoRA Configuration Complete ===")

Configuring LoRA adapters for pretraining...


Unsloth 2025.3.19 patched 24 layers with 24 QKV layers, 24 O layers and 24 MLP layers.


LoRA configured for pretraining:
trainable params: 17,596,416 || all params: 511,629,184 || trainable%: 3.4393
None
=== LoRA Configuration Complete ===


In [17]:
from trl import SFTTrainer
from transformers import TrainingArguments

output_directory = f"llama3_base_pretrain_{TARGET_LANGUAGE_CODE}_run1"

print(f"Configuring SFTTrainer for Continued Pretraining. Output directory: {output_directory}")

# Key difference: Use packing=True for efficient pretraining on raw text
# No custom formatting function is needed; SFTTrainer handles text packing.

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,        # The raw text dataset
    dataset_text_field = "text",    # The column containing the raw text
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = True,                 # <<< IMPORTANT: Enable packing for pretraining efficiency

    args = TrainingArguments(
        per_device_train_batch_size = 2,  # Keep batch size low due to sequence length
        gradient_accumulation_steps = 8,  # Increase accumulation (effective batch size 16)
        warmup_steps = 20,                # Slightly more warmup might be beneficial
        max_steps = 200,                # Set a max step count for the assignment (adjust as needed)
        num_train_epochs = 1,             # Or train for 1 epoch on the subset
        learning_rate = 1e-4,             # Learning rate can sometimes be slightly higher or lower for CPT (e.g., 5e-5 to 2e-4)
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = output_directory,
        save_strategy = "steps",
        save_steps = 50,                 # Save checkpoints regularly
        report_to="tensorboard",
    ),
)

print("Trainer configured for continued pretraining.")
if torch.cuda.is_available():
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
    print(f"Pre-Train GPU: {gpu_stats.name}. Max memory reserved: {start_gpu_memory} GB.")
print("=== Trainer Configuration Complete ===")

Configuring SFTTrainer for Continued Pretraining. Output directory: llama3_base_pretrain_hi_run1


Generating train split: 0 examples [00:00, ? examples/s]

Trainer configured for continued pretraining.
Pre-Train GPU: NVIDIA L4. Max memory reserved: 7.834 GB.
=== Trainer Configuration Complete ===


In [18]:
import time

print(f"Starting continued pretraining on {TARGET_LANGUAGE_NAME} text...")
start_train_time = time.time()

# Clear cache before training
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("Cleared CUDA cache.")

# Start training
trainer_stats = trainer.train()

end_train_time = time.time()
print(f"Continued Pretraining finished in {(end_train_time - start_train_time)/60:.2f} minutes.")

# Analyze memory usage
if torch.cuda.is_available():
    used_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
    used_memory_for_lora = round(used_gpu_memory - start_gpu_memory, 3) if 'start_gpu_memory' in locals() else used_gpu_memory
    print(f"\nPost-Train Peak reserved memory: {used_gpu_memory} GB.")
    if 'start_gpu_memory' in locals():
         print(f"Approx. memory used for training artifacts: {used_memory_for_lora} GB.")

print("\nTraining stats:", trainer_stats)
# Expect the loss to decrease as the model learns patterns in the new language.
print("=== Continued Pretraining Complete ===")

Starting continued pretraining on Hindi text...
Cleared CUDA cache.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 25,508 | Num Epochs = 1 | Total steps = 200
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 17,596,416/5,000,000,000 (0.35% trained)


Step,Training Loss
10,1.9814
20,1.884
30,1.7722
40,1.7361
50,1.8341
60,1.7594
70,1.7219
80,1.7825
90,1.8099
100,1.7192


Continued Pretraining finished in 12.46 minutes.

Post-Train Peak reserved memory: 7.834 GB.
Approx. memory used for training artifacts: 0.0 GB.

Training stats: TrainOutput(global_step=200, training_loss=1.7633136367797853, metrics={'train_runtime': 744.9632, 'train_samples_per_second': 4.296, 'train_steps_per_second': 0.268, 'total_flos': 1.47650456322048e+16, 'train_loss': 1.7633136367797853, 'epoch': 0.1254508389524855})
=== Continued Pretraining Complete ===


In [19]:
final_adapter_dir = f"{output_directory}/final_adapters"
print(f"\nSaving final LoRA adapters from pretraining to: {final_adapter_dir}")

# Save the trained LoRA adapters (capturing the learned language patterns)
model.save_pretrained(final_adapter_dir)

# Save the tokenizer as well
tokenizer.save_pretrained(final_adapter_dir)

print(f"Adapters and tokenizer saved to {final_adapter_dir}.")
print("=== Pretraining Adapters Saved ===")


Saving final LoRA adapters from pretraining to: llama3_base_pretrain_hi_run1/final_adapters
Adapters and tokenizer saved to llama3_base_pretrain_hi_run1/final_adapters.
=== Pretraining Adapters Saved ===


In [20]:
import warnings
warnings.filterwarnings("ignore")

print("\nRunning Inference Test (Text Generation)...")

# Prepare model for inference
FastLanguageModel.for_inference(model)
model.eval()

# --- Create a prompt IN THE TARGET LANGUAGE ---
# Example prompt in Hindi. Replace with a relevant start if you chose another language.
prompt_hindi = "भारत एक विशाल देश है जहाँ" # Translation: "India is a vast country where"
# prompt_swahili = "Habari za asubuhi! Leo ni siku" # Example for Swahili
# prompt_french = "Bonjour le monde! Aujourd'hui, il fait" # Example for French

print(f"Using prompt in {TARGET_LANGUAGE_NAME}: '{prompt_hindi}'")

# --- Tokenize the raw text prompt ---
# NO chat template here, just raw text for the base model
inputs = tokenizer([prompt_hindi], return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")

# --- Set generation parameters ---
generation_params = {
    "max_new_tokens": 50,      # Generate a short continuation
    "use_cache": True,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.9,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.eos_token_id,
}

# --- Generate the response ---
print("\nGenerating continuation...")
with torch.no_grad():
    outputs = model.generate(**inputs, **generation_params) # Pass inputs directly

# Decode the ENTIRE output (prompt + generation)
full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Decode only the NEWLY generated part
# generated_tokens = outputs[0][len(inputs['input_ids'][0]):]
# generated_response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

print("\n--- Prompt ---")
print(prompt_hindi)

print(f"\n--- Generated Continuation (Full Text) ---")
print(full_response)

# print(f"\n--- Generated Continuation (New Tokens Only) ---")
# print(generated_response)
# Note: The quality of generation heavily depends on the amount of pretraining data and steps.
# With a small subset and short training, it might be basic or repetitive.

# Clean up memory
del inputs, outputs
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("\n=== Pretraining Inference Test Complete ===")


Running Inference Test (Text Generation)...
Using prompt in Hindi: 'भारत एक विशाल देश है जहाँ'

Generating continuation...

--- Prompt ---
भारत एक विशाल देश है जहाँ

--- Generated Continuation (Full Text) ---
भारत एक विशाल देश है जहाँ यहां विचार देखा जाता है कि आप अपने अनुकूल विश्व को

=== Pretraining Inference Test Complete ===
