# *Fine-tuning* de *Large Language Models* (LLMs)

<p align="center">
  <a href="https://colab.research.google.com/github/auduvignac/llm-finetuning/blob/main/notebooks/project/finetuning-projet.ipynb" target="_blank">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Ouvrir dans Google Colab"/>
  </a>
</p>

Le but de ce projet est de réaliser le *fine-tuning* d'un LLM.


## Installation des bibliothèques/libraires requises

In [1]:
!wget -q https://raw.githubusercontent.com/auduvignac/llm-finetuning/refs/heads/main/setup_env.py -O setup_env.py
%run setup_env.py

⚡ Exécution sur Colab : vérification stricte des dépendances…
✅ accelerate 1.8.1 — OK
✅ bitsandbytes 0.46.0 — OK
✅ datasets 4.0.0 — OK
✅ matplotlib 3.10.0 — OK
✅ numpy 2.0.2 — OK
✅ peft 0.15.0 — OK
✅ tabulate 0.9.0 — OK
✅ torch 2.8.0+cu126 — OK
✅ tqdm 4.67.1 — OK
✅ transformers 4.55.2 — OK
✅ Toutes les dépendances satisfont les contraintes.


In [2]:
%matplotlib inline
%config InlineBackend.figure_formats = ['svg']

import contextlib
import math

import re

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from datasets import (
    load_dataset,
)
from huggingface_hub import (
    list_models,
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from tabulate import (
    tabulate,
)
from torch.utils.data import (
    DataLoader,
)
from tqdm import (
    tqdm,
)

# Si le notebook est exécuté dans un environnement jupyter, la librairie
# ci-dessus peut être utilisée
from tqdm.notebook import (
    tqdm,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    DistilBertConfig,
    DistilBertModel,
    DistilBertTokenizer,
    Trainer,
    TrainingArguments,
)

# Utilisation d’un GPU avec CUDA lorsque disponible sur la machine d’exécution.
# L’utilisation d’un GPU pour l’apprentissage entraîne souvent d’énormes
# accélérations lors de l’entraînement.
# Voir https://developer.nvidia.com/cuda-downloads pour installer CUDA
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda')

In [20]:
class LlamaFineTuner:
    def __init__(
        self, model_id="TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
    ):
        self.model_id = model_id

        # Config quantization 4-bit
        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype="float16",
            bnb_4bit_use_double_quant=False,
        )

        # Chargement du tokenizer et modèle
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            quantization_config=self.bnb_config,
            device_map="auto",
            trust_remote_code=True,
        )
        self.dataset = None
        self.trainer = None

    def __str__(self):
        return str(self.model)

    def generate(
        self,
        prompt: str,
        max_new_tokens: int = 200,
        temperature: float = 1.0,
        instruction_mode: bool = False,
    ):
        """
        Génèreation de texte avec le modèle.

        Si instruction_mode=True, formate le prompt en mode
        Instruction/Response.
        """

        DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

        # Formatage du prompt
        if instruction_mode:
            text = f"### Instruction:\n{prompt}\n\n### Response:\n"
        else:
            text = prompt

        inputs = self.tokenizer(text, return_tensors="pt").to(DEVICE)

        # Correction du warning: désactivation du checkpointing et
        # réactivation du cache
        with contextlib.suppress(Exception):
            self.model.gradient_checkpointing_disable()
            self.model.config.use_cache = True

        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
        )
        return self.tokenizer.decode(outputs[0])

    def prepare_dataset(self, dataset_name="tatsu-lab/alpaca"):
        def process_data(sample):
            return self.tokenizer(sample["text"])

        data = load_dataset(dataset_name)
        self.dataset = data.map(process_data, batched=True)
        return self.dataset

    def prepare_for_kbit_training(self):
        self.model = prepare_model_for_kbit_training(self.model)

    def print_trainable_parameters(self):
        trainable_params = 0
        all_param = 0
        for _, param in self.model.named_parameters():
            all_param += param.numel()
            if param.requires_grad:
                trainable_params += param.numel()
        print(
            f"trainable params: {trainable_params} || all params: {all_param} "
            f"|| trainable%: {100 * trainable_params / all_param:.2f}"
        )

    def apply_lora(self):
        config = LoraConfig(
            r=8,
            lora_alpha=32,
            target_modules=["q_proj", "k_proj", "v_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )
        self.model = get_peft_model(self.model, config)
        self.print_trainable_parameters()

    def train(self, output_dir="outputs", max_steps=100):
        self.tokenizer.pad_token = self.tokenizer.eos_token

        training_args = TrainingArguments(
            per_device_train_batch_size=16,
            gradient_accumulation_steps=1,
            max_steps=max_steps,
            learning_rate=2e-4,
            fp16=True,
            logging_steps=1,
            output_dir=output_dir,
            optim="paged_adamw_8bit",
            report_to="none",
        )

        self.trainer = Trainer(
            model=self.model,
            train_dataset=self.dataset["train"],
            args=training_args,
            data_collator=DataCollatorForLanguageModeling(
                self.tokenizer, mlm=False
            ),
        )

        self.model.config.use_cache = False
        self.trainer.train()

    def workflow(
        self,
        quick_test_prompt: str = "Paris is the capital of",
        instruction_prompt: str = "Propose an outdoor activity.",
        max_steps: int = 10,
    ):
        """
        Exécution d'un workflow complet :
          1. Génération initiale (avant entraînement)
          2. Préparation du dataset
          3. Préparation du modèle pour k-bit training
          4. Application de LoRA
          5. Entraînement (max_steps configurables)
          6. Génération finale en mode instruction

        Args:
            quick_test_prompt: Texte de génération avant entraînement
            instruction_prompt: Instruction pour la génération finale
            max_steps: Nombre d'étapes d'entraînement
        """
        print(" Génération initiale (avant fine-tuning)…")
        print(self.generate(quick_test_prompt, max_new_tokens=50))

        print("\n Préparation du dataset…")
        self.prepare_dataset()

        print("\n Préparation k-bit training…")
        self.prepare_for_kbit_training()

        print("\n Application de LoRA…")
        self.apply_lora()

        print("\n Entraînement…")
        self.train(max_steps=max_steps)

        print("\nGénération finale (après fine-tuning)…")
        print(
            self.generate(
                instruction_prompt, instruction_mode=True, max_new_tokens=100
            )
        )

In [4]:
llm_reference = LlamaFineTuner()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### Elements de précision sur le modèle de référence (`TinyLlama-1.1B-intermediate-step-1431k-3T`)

Affichons l'architecture associée au modèle de référence.

In [5]:
print(llm_reference)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear4bit(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), e

#### Présentation générale

Le modèle `TinyLlama-1.1B-intermediate-step-1431k-3T` présente une architecture constituée de 22 couches de décodeur (`22 x LlamaDecoderLayer`) et comporte environ 1,1 milliard de paramètres (la notation `1.1B` faisant référence à *billion*, soit un milliard en français).

Chaque couche applique successivement :  

1. une normalisation (RMSNorm) ;
2. un bloc *Self-Attention* (avec projections en 4-bit) ;
3. une nouvelle normalisation ;
4. un bloc *MLP (feed-forward)*.

#### Décomposition des blocs principaux

- `embed_tokens` : transformation de chaque token du vocabulaire (taille 32 000) en un vecteur de dimension 2048 ;
- `layers (0-21)` : 22 couches identiques de type `LlamaDecoderLayer` ;
- `self_attn` : bloc d'attention interne avec projections `q_proj`, `k_proj`, `v_proj`, `o_proj` en quantification 4-bit ;
- `mlp` : bloc feed-forward avec 3 projections (`gate_proj`, `up_proj`, `down_proj`) + fonction d'activation **SiLU** ;
- `input_layernorm` / `post_attention_layernorm` : normalisations RMS ;
- `rotary_emb` : embeddings rotatoires pour gérer la position des tokens (RoPE) ;
- `norm` : normalisation finale avant la couche de sortie ;
- `lm_head` : projection finale (2048 → 32 000) pour prédire le prochain token du vocabulaire.  

#### Elements clefs

- 22 couches : profondeur du modèle TinyLLaMA (environ 1.1B paramètres) ;
- **Quantization 4-bit** (`Linear4bit`) : réduit l'empreinte mémoire, rend le fine-tuning abordable.  
- **LoRA** sera appliqué principalement sur les modules d'attention (`q_proj`, `k_proj`, `v_proj`).  

In [None]:
llm_reference.workflow()

### Liste des modèles Llama disponibles

In [21]:
from huggingface_hub import list_models

accessible_models = []

models = list_models(search="llama", limit=50)
for model in models:
    try:
        current_model = LlamaFineTuner(model.modelId)  # essaie de charger
        total_params = sum(p.numel() for p in current_model.model.parameters())
        print(f"{model.modelId:<50} {total_params:,}")
        accessible_models.append((model.modelId, total_params))
    except OSError as e:
        print(f"Accès refusé : {model.modelId}")
    except Exception as e:
        print(f"Erreur avec {model.modelId}: {e}")

Accès refusé : meta-llama/Llama-3.1-8B-Instruct
Accès refusé : meta-llama/Llama-3.2-1B
Accès refusé : meta-llama/Llama-3.2-1B-Instruct
Accès refusé : meta-llama/Llama-3.2-3B-Instruct
Accès refusé : meta-llama/Meta-Llama-3-8B-Instruct


configuration_decilm.py: 0.00B [00:00, ?B/s]

(…)nsformers_4_44_2__configuration_llama.py: 0.00B [00:00, ?B/s]

(…)nsformers_4_44_2__modeling_rope_utils.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1_5:
- transformers_4_44_2__modeling_rope_utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1_5:
- transformers_4_44_2__configuration_llama.py
- transformers_4_44_2__modeling_rope_utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


block_config.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1_5:
- block_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1_5:
- configuration_decilm.py
- transformers_4_44_2__configuration_llama.py
- block_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_decilm.py: 0.00B [00:00, ?B/s]

(…)flash_attention_utils_backward_compat.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1_5:
- transformers_4_44_2__modeling_flash_attention_utils_backward_compat.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


variable_cache.py: 0.00B [00:00, ?B/s]

transformers_4_44_2__cache_utils.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1_5:
- transformers_4_44_2__cache_utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1_5:
- variable_cache.py
- transformers_4_44_2__cache_utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


(…)mers_4_44_2__modeling_attn_mask_utils.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1_5:
- transformers_4_44_2__modeling_attn_mask_utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


transformers_4_44_2__pytorch_utils.py:   0%|          | 0.00/666 [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1_5:
- transformers_4_44_2__pytorch_utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


transformers_4_44_2__activations.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1_5:
- transformers_4_44_2__activations.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


transformers_4_44_2__modeling_outputs.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1_5:
- transformers_4_44_2__modeling_outputs.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1_5:
- modeling_decilm.py
- transformers_4_44_2__modeling_flash_attention_utils_backward_compat.py
- variable_cache.py
- transformers_4_44_2__modeling_attn_mask_utils.py
- transformers_4_44_2__pytorch_utils.py
- transformers_4_44_2__activations.py
- transformers_4_44_2__modeling_outputs.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 21 files:   0%|          | 0/21 [00:00<?, ?it/s]

model-00001-of-00021.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00004-of-00021.safetensors:   0%|          | 0.00/4.91G [00:00<?, ?B/s]

model-00007-of-00021.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00003-of-00021.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00008-of-00021.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00006-of-00021.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00005-of-00021.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00002-of-00021.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [32]:
import re
from huggingface_hub import list_models

def guess_params(model_id, card_data=None):
    # 1) depuis les métadonnées HF
    if card_data and "n_parameters" in card_data:
        return card_data["n_parameters"]

    # 2) regex simple B/M
    match = re.search(r"(\d+(?:\.\d+)?)([BM])", model_id, re.IGNORECASE)
    if match:
        num, unit = match.groups()
        num = float(num)
        return f"{num}{unit.upper()}"

    # 3) cas Mixture of Experts (ex: 8X3B ou 17B-16E)
    moe_match = re.search(r"(\d+)X(\d+)([BM])", model_id, re.IGNORECASE)
    if moe_match:
        n_experts, size, unit = moe_match.groups()
        n_experts, size = int(n_experts), int(size)
        total = n_experts * size
        return f"{total}{unit.upper()} (MoE, {n_experts}x{size}{unit})"

    e_match = re.search(r"(\d+)([BM])-?(\d+)E", model_id, re.IGNORECASE)
    if e_match:
        size, unit, n_experts = e_match.groups()
        return f"{size}{unit.upper()} ({n_experts} experts)"

    return "inconnu"

def size_to_number(size_str):
    """Convertit '8B' → 8e9, '1.1B' → 1.1e9, '500M' → 500e6."""
    if not size_str or size_str == "inconnu":
        return float("inf")  # les inconnus vont à la fin
    size_str = size_str.upper().replace(" ", "")
    if size_str.endswith("B"):
        return float(size_str[:-1]) * 1e9
    if size_str.endswith("M"):
        return float(size_str[:-1]) * 1e6
    return float("inf")

models = [{"name" : model.modelId, "size": guess_params(model.modelId)} for model in list_models(search="llama", limit=50)]
print(models)
models_sorted = sorted(models, key=lambda m: size_to_number(m['size']))

for m in models_sorted:
    print(f"{m['name']:<45} {m['size']}")

[{'name': 'meta-llama/Llama-3.1-8B-Instruct', 'size': '8.0B'}, {'name': 'meta-llama/Llama-3.2-1B', 'size': '1.0B'}, {'name': 'meta-llama/Llama-3.2-1B-Instruct', 'size': '1.0B'}, {'name': 'meta-llama/Llama-3.2-3B-Instruct', 'size': '3.0B'}, {'name': 'meta-llama/Meta-Llama-3-8B-Instruct', 'size': '8.0B'}, {'name': 'nvidia/Llama-3_3-Nemotron-Super-49B-v1_5', 'size': '49.0B'}, {'name': 'meta-llama/Llama-4-Scout-17B-16E-Instruct', 'size': '17.0B'}, {'name': 'meta-llama/Llama-2-7b-chat-hf', 'size': '7.0B'}, {'name': 'fancyfeast/llama-joycaption-beta-one-hf-llava', 'size': 'inconnu'}, {'name': 'meta-llama/Llama-3.1-8B', 'size': '8.0B'}, {'name': 'DavidAU/Llama-3.2-8X3B-MOE-Dark-Champion-Instruct-uncensored-abliterated-18.4B-GGUF', 'size': '3.0B'}, {'name': 'concedo/llama-joycaption-beta-one-hf-llava-mmproj-gguf', 'size': 'inconnu'}, {'name': 'Aleph-Alpha/llama-tfree-hat-pretrained-7b-dpo', 'size': '7.0B'}, {'name': 'meta-llama/Llama-2-7b-hf', 'size': '7.0B'}, {'name': 'meta-llama/Llama-4-Scou

`TinyLlama-1.1B-intermediate-step-1431k-3T` est un modèle conçu pour l'expérimentation rapide et le *fine-tuning* sur GPU limités.

D'autres modèles Llama sont disponibles :
- **LLaMA-7B** : 7 milliards de paramètres  
- **LLaMA-13B** : 13 milliards  
- **LLaMA-65B** : 65 milliards.