# Continued Pre-Training with SmolLM2 on Turkish Language

In [1]:
%%capture
# Install Unsloth (Colab extra) straight from its GitHub repository.
# The %%capture cell magic above suppresses the pip output of this whole cell.
!pip install -q "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Pick the xformers requirement from the installed torch version:
# torch older than 2.4.0 gets the pinned 0.0.27 release, otherwise the latest release.
from torch import __version__ as torch_version
from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(torch_version) < V("2.4.0") else "xformers"

# {xformers} is interpolated from the Python variable defined just above.
# NOTE(review): apart from the conditional xformers pin, these packages are unpinned,
# so a fresh run may pull different releases than the ones this notebook was run with.
!pip install -q --no-deps {xformers} trl peft accelerate bitsandbytes datasets wandb huggingface_hub sentencepiece

In [2]:
import os, random, numpy as np, torch
from datasets import Dataset
from unsloth import FastLanguageModel, is_bfloat16_supported
from transformers import TrainingArguments
from trl import SFTTrainer

# Reproducibility: seed every RNG this notebook touches with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    # Seed all visible CUDA devices as well.
    torch.cuda.manual_seed_all(SEED)

# Later cells move the model and the tokenized inputs to this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Device: cuda


# Hugging Face & W&B auth

In [3]:
from getpass import getpass
from huggingface_hub import login
import wandb

# --- Hugging Face authentication (optional) ---
# getpass keeps the token out of the notebook source and the rendered output.
# `hf_logged_in` is read again when the base model is loaded.
hf_token = getpass("🔑 Enter your Hugging Face token (or press Enter to skip): ").strip()
if hf_token:
    login(hf_token)
    hf_logged_in = True
    print("✅ HF login successful.")
else:
    hf_logged_in = False
    print("ℹ️ Skipping HF login.")

# --- Weights & Biases logging (optional) ---
# `report_to` is handed to TrainingArguments in the training-arguments cell;
# WANDB_DISABLED is set to match so both agree on whether W&B is active.
wb_token = getpass("🔑 Enter your Weights & Biases token (or press Enter to skip): ").strip()
if wb_token:
    wandb.login(key=wb_token)
    run = wandb.init(project="CPT_SmolLM2_NewLanguage_TR", job_type="training", anonymous="allow")
    os.environ["WANDB_DISABLED"] = "false"
    report_to = "wandb"
    print("✅ W&B logging enabled.")
else:
    os.environ["WANDB_DISABLED"] = "true"
    report_to = "none"
    print("ℹ️ W&B disabled.")

🔑 Enter your Hugging Face token (or press Enter to skip): ··········
✅ HF login successful.
🔑 Enter your Weights & Biases token (or press Enter to skip): ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maditya_rajpurohit[0m ([33maditya_rajpurohit-san-jose-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


✅ W&B logging enabled.


# Load base Language Model for Continued Pre-Training

In [4]:
# Base checkpoint and sequence budget for continued pre-training (CPT).
model_name = "unsloth/smollm2-135m"
max_seq_length = 512
# dtype=None: presumably lets Unsloth choose the dtype for the GPU — confirm in the Unsloth docs.
dtype = None

# Full-parameter fine-tuning of the unquantized model (no 4-bit loading).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name       = model_name,
    max_seq_length   = max_seq_length,
    dtype            = dtype,
    load_in_4bit     = False,
    full_finetuning  = True,
    # Only forward the token when the user actually logged in earlier.
    token            = hf_token if hf_logged_in else None,
)

# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Enable gradient checkpointing when the model object exposes the hook.
if hasattr(model, "gradient_checkpointing_enable"):
    model.gradient_checkpointing_enable()

FastLanguageModel.for_training(model)
model.to(device)
print("✅ Model ready for CPT:", model_name)

==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Float16 full finetuning uses more memory since we upcast weights to float32.


model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/742 [00:00<?, ?B/s]

✅ Model ready for CPT: unsloth/smollm2-135m


# Tiny Turkish monolingual corpus

In [5]:
# Ten hand-written Turkish sentences used as a toy monolingual corpus.
# The literals must be real UTF-8 Turkish text (ö, ğ, ı, ş, ç, ü); mis-encoded
# bytes here would make the model train on garbage characters.
base_sentences = [
    "Bu, dili öğrenmek için yazılmış kısa bir Türkçe cümledir.",
    "Yapay zeka, günlük hayatımızdaki pek çok sorunu çözmemize yardımcı oluyor.",
    "Her gün biraz pratik yaparsanız, dil becerileriniz yavaş yavaş gelişir.",
    "Veri bilimi ve makine öğrenimi günümüz dünyasında önemli beceriler haline geldi.",
    "Sınavdan önce sakin kalmak ve iyi notları tekrar etmek çok önemlidir.",
    "Yeni bir dil öğrenirken düzenli tekrar ve sabır çok değerlidir.",
    "Hata yapmaktan korkmayın; hatalar çoğu zaman en iyi öğretmendir.",
    "Basit cümlelerle başlayıp zamanla daha karmaşık yapılara geçin.",
    "Günlük yazı yazmak ve kısa metinler okumak akıcılığı artırır.",
    "Dinleme, konuşma, okuma ve yazma birlikte geliştiğinde güçlü olur.",
]

# Repeat the ten sentences 200 times to get 2,000 rows for the demo run.
# NOTE(review): with this much duplication, expect memorization rather than generalization.
texts = base_sentences * 200

# Single-column dataset; the trainer cell reads the "text" field.
cpt_dataset = Dataset.from_dict({"text": texts})
print("Samples:", len(cpt_dataset))
print("Preview:", cpt_dataset[0]["text"])

Samples: 2000
Preview: Bu, dili öğrenmek için yazılmış kısa bir Türkçe cümledir.


# Prepare the text dataset (tokenization is left to SFTTrainer)

In [6]:
def passthrough(examples):
    """Return a batch containing only the "text" column, values unchanged.

    Args:
        examples: Mapping with a "text" entry (a batch when used with
            ``batched=True``).

    Returns:
        A new dict with the single key "text" holding the same object.
    """
    text_column = examples["text"]
    return {"text": text_column}

# Run the pass-through over the dataset in batches; the "text" values are not modified.
cpt_text_ds = cpt_dataset.map(
    function=passthrough,
    batched=True,
)
print("Columns:", cpt_text_ds.column_names)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Columns: ['text']


# Training Arguments

In [7]:
# Decide mixed precision once, using the same Unsloth helper as the inference cell.
# torch.cuda.is_bf16_supported() can report True through emulation on pre-Ampere
# GPUs (e.g. T4) unless a library has patched it, so the hardware-oriented helper
# imported from unsloth is used here instead.
use_bf16 = torch.cuda.is_available() and is_bfloat16_supported()
use_fp16 = torch.cuda.is_available() and not use_bf16

training_args = TrainingArguments(
    output_dir                  = "smollm2-135m-cpt-tr",
    # max_steps takes precedence over num_train_epochs when it is positive,
    # so this run stops after 50 optimizer steps regardless of the epoch count.
    num_train_epochs            = 1,
    max_steps                   = 50,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 1,
    learning_rate               = 2e-4,
    # NOTE(review): 30 warmup steps is 60% of the 50-step run — confirm this is intended.
    warmup_steps                = 30,
    logging_steps               = 10,
    # No checkpoints are written during training; the save cell persists the final model.
    save_strategy               = "no",
    fp16                        = use_fp16,
    bf16                        = use_bf16,
    # "wandb" or "none", chosen in the authentication cell.
    report_to                   = report_to,
)
print("✅ Training args ready.")

✅ Training args ready.


# SFTTrainer

In [8]:
# Build the trainer on the raw-text dataset; the trainer tokenizes the "text"
# field itself (see the "Tokenizing" progress line it prints).
cpt_trainer = SFTTrainer(
    model              = model,
    tokenizer          = tokenizer,
    train_dataset      = cpt_text_ds,
    dataset_text_field = "text",
    max_seq_length     = max_seq_length,
    # packing=True: see the TRL documentation for how short examples are packed
    # together up to max_seq_length.
    packing            = True,
    args               = training_args,
)
print("✅ Trainer ready.")

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/2000 [00:00<?, ? examples/s]

✅ Trainer ready.


# Train the Model

In [9]:
# Report the GPU in use (device 0) before training starts.
if torch.cuda.is_available():
    gpu = torch.cuda.get_device_properties(0)
    print(f"GPU: {gpu.name} | VRAM: {round(gpu.total_memory/1e9, 2)} GB")

train_out = cpt_trainer.train()

# Summarize wall-clock time (minutes) and the peak memory reserved by the CUDA allocator.
# "train_runtime" is read with a default of 0 in case the metric is missing.
mins = round(train_out.metrics.get("train_runtime", 0)/60, 2)
peak = f"{round(torch.cuda.max_memory_reserved()/1e9,3)} GB" if torch.cuda.is_available() else "CPU"
print(f"⏱ Runtime: {mins} min | 💾 Peak reserved GPU: {peak}")

The model is already on multiple devices. Skipping the move to device specified in `args`.


GPU: Tesla T4 | VRAM: 15.83 GB


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,000 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 1 x 1) = 4
 "-____-"     Trainable parameters = 134,515,584 of 134,515,584 (100.00% trained)


Step,Training Loss
10,3.1649
20,1.2609
30,0.4189
40,0.1757
50,0.0702


⏱ Runtime: 1.09 min | 💾 Peak reserved GPU: 2.376 GB


# Inference in Turkish

In [10]:
# Switch the Unsloth model into inference mode.
FastLanguageModel.for_inference(model)

# Pick the inference dtype: bfloat16 where supported, otherwise float16 on CUDA,
# otherwise float32 on CPU.
if is_bfloat16_supported():
    inference_dtype = torch.bfloat16
elif torch.cuda.is_available():
    inference_dtype = torch.float16
else:
    inference_dtype = torch.float32

# Move and cast the model in a single call.
model = model.to(device=device, dtype=inference_dtype)

def gen_tr(prompt, max_new_tokens=100, temperature=0.8, top_p=0.9):
    """Sample a continuation of ``prompt`` and print the decoded text.

    Uses the notebook-level ``model``, ``tokenizer`` and ``device``.

    Args:
        prompt: Text to continue.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to ``generate``.
        top_p: Nucleus-sampling threshold passed to ``generate``.

    Returns:
        None. The decoded sequence (prompt plus continuation, special tokens
        skipped) is printed.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    sampling_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        use_cache=True,
    )
    # No autograd bookkeeping is needed while generating.
    with torch.inference_mode():
        generated = model.generate(**encoded, **sampling_kwargs)
    # Only the first (and only) sequence of the batch is decoded.
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(decoded)

# Test prompts in Turkish (must be real UTF-8 text, not mis-encoded bytes).
# 1) "Summarize the benefits of AI in education in two sentences."
gen_tr("Yapay zekanın eğitim alanındaki faydalarını iki cümlede özetleyin.")
# 2) "Write three important points students should watch for when studying for exams."
gen_tr("Sınavlara çalışırken öğrencilerin dikkat etmesi gereken üç önemli noktayı yazın.")

Yapay zekanın eğitim alanındaki faydalarını iki cümlede özetleyin.

İlkrünlük önce sakin kalmak ve kalmak çok sorunu çok sorunu önemlidir.

İlk önemlidir. Çoğu zaman en iyi önemlidir. Çoğu zaman en önemlidir. Çoğu zaman en önemlidir. Çoğ
Sınavlara çalışırken öğrencilerin dikkat etmesi gereken üç önemli noktayı yazın.

İlk önemlidirken düzenli tekrar ve kalmak çoğu zamanla daha karmaşık yapılara geçin.

İlk önemlidirken düzenli tekrar ve kalmak çoğu zamanla daha karmaşık yapılara geçin.

Önemlidirken


# Save the Model

In [None]:
save_dir = "SmolLM2-135M-ContinuedPretraining-NewLanguage"
# Placeholder namespace: replace "username" with your own Hugging Face account or org.
repo_id  = "username/SmolLM2-135M-ContinuedPretraining-NewLanguage"

# Always keep a local copy of the weights and the tokenizer.
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print("✅ Saved locally to:", save_dir)

# Upload only when the user logged in to Hugging Face in the authentication cell;
# otherwise push_to_hub would fail on the missing credentials.
if hf_logged_in:
    model.push_to_hub(repo_id)
    tokenizer.push_to_hub(repo_id)
    print("✅ Uploaded to HF Hub:", repo_id)
else:
    print("ℹ️ Skipping Hub upload: no Hugging Face login in this session.")

✅ Saved locally to: SmolLM2-135M-ContinuedPretraining-NewLanguage


README.md:   0%|          | 0.00/570 [00:00<?, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...anguage/model.safetensors:   0%|          |  524kB /  269MB            

Saved model to https://huggingface.co/aditya-rajpurohit/SmolLM2-135M-ContinuedPretraining-NewLanguage


README.md:   0%|          | 0.00/576 [00:00<?, ?B/s]

✅ Uploaded to HF Hub: aditya-rajpurohit/SmolLM2-135M-ContinuedPretraining-NewLanguage
