# 🔄 What Does “Continued Fine-Tuning from Checkpoint” Mean?

Instead of beginning fine-tuning from a base model (e.g., `distilgpt2`), we resume training from a **partially fine-tuned checkpoint**, allowing us to build on prior learning.

✅ **What We’ll Implement in This Colab:**

- Fine-tune `distilgpt2` on a dataset (simulating initial training).
- Save the resulting model as a custom checkpoint.
- Reload the checkpoint and continue fine-tuning on new data (custom continuation).


In [1]:
# Install required packages
!pip install -q transformers datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m450.6/491.4 kB[0m [31m14.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source o

# 📦 Import Required Libraries for Continued Fine-Tuning

In [2]:
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Load the Base Model and Tokenizer

In [3]:
# Step 1️⃣: Load distilgpt2
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# Create Initial Dataset for First Round of Fine-Tuning

In [4]:
# Step 2️⃣: Create initial dataset for first round of fine-tuning
initial_data = Dataset.from_dict({
    "text": [
        "User: What is the capital of India?\nAssistant: The capital of India is New Delhi.",
        "User: Who is the Prime Minister of Canada?\nAssistant: The Prime Minister of Canada is Justin Trudeau.",
    ]
})

def tokenize_fn(ex):
    return tokenizer(ex["text"], padding="max_length", truncation=True, max_length=128)

tokenized_initial = initial_data.map(tokenize_fn)


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

# Train the Initial Model and Save Checkpoint

In [5]:
# Step 3️⃣: Train initial model and save checkpoint
training_args_1 = TrainingArguments(
    output_dir="./custom_checkpoint",
    per_device_train_batch_size=1,
    num_train_epochs=1,
    save_total_limit=1,
    logging_steps=1,
    report_to="none",
)

trainer_1 = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args_1,
    train_dataset=tokenized_initial,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

trainer_1.train()
model.save_pretrained("./custom_checkpoint")
tokenizer.save_pretrained("./custom_checkpoint")

  trainer_1 = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,3.1851
2,2.5777


('./custom_checkpoint/tokenizer_config.json',
 './custom_checkpoint/special_tokens_map.json',
 './custom_checkpoint/vocab.json',
 './custom_checkpoint/merges.txt',
 './custom_checkpoint/added_tokens.json',
 './custom_checkpoint/tokenizer.json')

# Reload from Custom Checkpoint and Continue Fine-Tuning

In [6]:
# Load model from the saved checkpoint
model2 = AutoModelForCausalLM.from_pretrained("./custom_checkpoint")
tokenizer2 = AutoTokenizer.from_pretrained("./custom_checkpoint")

# New data for continuation
new_data = Dataset.from_dict({
    "text": [
        "User: What is the currency of Japan?\nAssistant: The currency of Japan is the Yen.",
        "User: What language is spoken in Brazil?\nAssistant: Portuguese is the main language in Brazil.",
    ]
})

tokenized_new = new_data.map(tokenize_fn)

training_args_2 = TrainingArguments(
    output_dir="./continued_finetune",
    per_device_train_batch_size=1,
    num_train_epochs=1,
    save_total_limit=1,
    logging_steps=1,
    report_to="none",
)

trainer_2 = Trainer(
    model=model2,
    tokenizer=tokenizer2,
    args=training_args_2,
    train_dataset=tokenized_new,
    data_collator=DataCollatorForLanguageModeling(tokenizer2, mlm=False)
)

trainer_2.train()


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

  trainer_2 = Trainer(


Step,Training Loss
1,2.4908
2,3.153


TrainOutput(global_step=2, training_loss=2.82192063331604, metrics={'train_runtime': 9.0957, 'train_samples_per_second': 0.22, 'train_steps_per_second': 0.22, 'total_flos': 65324187648.0, 'train_loss': 2.82192063331604, 'epoch': 1.0})

# 🔍 Inference: Test the Model After Continued Fine-Tuning

In [7]:
prompt = "User: What is the currency of Japan?\nAssistant:"
input_ids = tokenizer2(prompt, return_tensors="pt").input_ids
output = model2.generate(input_ids, max_new_tokens=20)
print(tokenizer2.decode(output[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


User: What is the currency of Japan?
Assistant: The currency of Japan is the yen. The currency of Japan is the yen. The currency of Japan
