# Chapter 7 - Exercises

> Author : Badr TAJINI - Large Language Models (LLMs) - ESIEE 2024-2025

---


In [37]:
import torch
from torch.utils.data import Dataset, DataLoader
import json
import os
import urllib
import tiktoken
from functools import partial
from previous_labs import (
    calc_loss_loader,
    train_model_simple,
    token_ids_to_text,
    text_to_token_ids,
    generate,
    plot_losses
)
import time
from tqdm import tqdm
import psutil
import re

In [2]:
from gpt_download import download_and_load_gpt2
from previous_labs import GPTModel, load_weights_into_gpt


# Settings shared by every GPT-2 size; the size-specific keys are merged in below.
BASE_CONFIG = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "drop_rate": 0.0,        # Dropout rate
    "qkv_bias": True         # Query-key-value bias
}

# Size-specific hyperparameters, keyed by a "<name> (<size>)" label.
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Label of the model used throughout the notebook; must be a key of model_configs.
CHOOSE_MODEL = "gpt2-medium (355M)"

# Mutates BASE_CONFIG in place, adding emb_dim / n_layers / n_heads for the chosen size.
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

# Take the last space-separated token of the label and drop its parentheses, e.g. "355M".
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")

# Build the model from the merged config, load the downloaded weights, and
# switch to evaluation mode. The trailing ";" suppresses the cell's repr output.
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.eval();

File already exists and is up-to-date: gpt2\355M\checkpoint
File already exists and is up-to-date: gpt2\355M\encoder.json
File already exists and is up-to-date: gpt2\355M\hparams.json
File already exists and is up-to-date: gpt2\355M\model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2\355M\model.ckpt.index
File already exists and is up-to-date: gpt2\355M\model.ckpt.meta
File already exists and is up-to-date: gpt2\355M\vocab.bpe


In [4]:
def download_and_load_file(file_path, url):
    """Return the JSON content of `file_path`, downloading it from `url` first if needed.

    When `file_path` does not exist, the document at `url` is fetched, decoded
    as UTF-8 and cached at `file_path`. Otherwise the cached copy is read.
    In both cases the text is parsed exactly once, from the UTF-8 decoded
    string, so the result does not depend on the platform's default encoding.

    Args:
        file_path: Location of the local cache file.
        url: Download location; only used when the cache file is missing.

    Returns:
        The parsed JSON document.
    """
    # Imported here because a bare `import urllib` does not guarantee that the
    # `urllib.request` submodule has been loaded.
    import urllib.request

    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
    else:
        with open(file_path, "r", encoding="utf-8") as file:
            text_data = file.read()

    # Parse the text already in memory instead of re-opening the file with the
    # locale-dependent default encoding.
    return json.loads(text_data)


file_path = "instruction-data.json"
# An empty URL only works when the file is already cached locally; on a fresh
# checkout the download branch would fail.
# NOTE(review): URL assumed to be the published copy of the 1100-entry
# instruction dataset used by the course material — confirm.
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)

data = download_and_load_file(file_path, url)
print("Number of entries:", len(data))

Number of entries: 1100


In [15]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Device:", device)

Device: cpu


## Exercise 7.1: Changing prompt styles

**Prompt Style Comparative Analysis: Impact on Model Response Quality**

**Key Research Question: How do different prompt styles `(Alpaca vs. Phi-3)` influence the generative response quality of the `fine-tuned` model?**

*Methodological Approach:*
- `Fine-tune` model with `Alpaca` prompt style
- Apply `Phi-3 prompt` configuration
- Compare response quality metrics

*Critical Parameters:*
- Prompt style variations
- Response quality assessment
- Comparative performance evaluation

*Recommended Investigation:*
1. Implement Phi-3 prompt style `shown in figure 4 in chapter 7`
2. Evaluate response quality
3. Compare with `Alpaca` prompt results
4. Analyze observed variations



In [5]:
def format_input(entry):
    """Render a dataset entry as a Phi-3 style user prompt.

    Args:
        entry: Mapping with an "instruction" key and an "input" key.

    Returns:
        "<|user|>" followed by the instruction on the next line; a non-empty
        "input" value is appended on one further line.
    """
    prompt = f"<|user|>\n{entry['instruction']}"
    # The input section is optional and left out entirely when empty.
    if entry["input"]:
        prompt += f"\n{entry['input']}"
    return prompt

In [6]:
# Preview one fully formatted training example: prompt plus assistant section.
model_input = format_input(data[50])
desired_response = f"\n\n<|assistant|>\n{data[50]['output']}"

print(model_input + desired_response)

<|user|>
Identify the correct spelling of the following word.
Ocassion

<|assistant|>
The correct spelling is 'Occasion.'


In [7]:
# Sequential (unshuffled) split of the dataset into train / test / validation.
train_portion = int(len(data) * 0.85)  # 85% for training
test_portion = int(len(data) * 0.1)    # 10% for testing
# Informational only: the validation slice below simply takes everything left.
val_portion = len(data) - train_portion - test_portion  # Remaining 5% for validation

train_data = data[:train_portion]
test_data = data[train_portion:train_portion + test_portion]
val_data = data[train_portion + test_portion:]

In [8]:
# Report the size of each split.
print("Training set length:", len(train_data))
print("Validation set length:", len(val_data))
print("Test set length:", len(test_data))

Training set length: 935
Validation set length: 55
Test set length: 110


In [25]:
class InstructionDataset(Dataset):
    """Dataset of pre-tokenized Phi-3 style instruction/response texts."""

    def __init__(self, data, tokenizer):
        """Tokenize every entry once, up front.

        Args:
            data: Sequence of entries accepted by `format_input` that also
                carry an "output" key.
            tokenizer: Object exposing `encode(text)`.
        """
        self.data = data
        # Prompt and assistant section are concatenated before encoding.
        self.encoded_texts = [
            tokenizer.encode(
                format_input(sample) + f"\n\n<|assistant|>\n{sample['output']}"
            )
            for sample in data
        ]

    def __getitem__(self, index):
        """Return the encoded text stored at `index`."""
        return self.encoded_texts[index]

    def __len__(self):
        """Return the number of entries in the underlying data."""
        return len(self.data)

In [26]:
tokenizer = tiktoken.get_encoding("gpt2")

# Show the id of the end-of-text token; the special token has to be explicitly
# allowed for this encode call.
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


In [27]:
def custom_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    """Collate token-id lists into padded input and target tensors.

    Every sequence gets one `pad_token_id` appended as its end marker and is
    padded with `pad_token_id` to the longest sequence in the batch. Targets
    are the inputs shifted by one position; every padding target after the
    first is replaced by `ignore_index`.

    Args:
        batch: List of token-id lists.
        pad_token_id: Id used as end marker and as padding.
        ignore_index: Value written over surplus padding targets.
        allowed_max_length: Optional cap applied to both tensors' length.
        device: Device the stacked tensors are moved to.

    Returns:
        Tuple `(inputs, targets)` of stacked tensors on `device`.
    """
    # The extra position holds the end marker appended to each sequence.
    padded_len = 1 + max(len(seq) for seq in batch)

    input_rows, target_rows = [], []
    for seq in batch:
        # End marker and padding share one id, so a single fill covers both.
        row = seq.copy() + [pad_token_id] * (padded_len - len(seq))

        inputs = torch.tensor(row[:-1])   # drop the last token
        targets = torch.tensor(row[1:])   # shifted one step ahead

        # Keep the first pad target (the end marker) and hide the rest.
        pad_positions = torch.nonzero(targets == pad_token_id).flatten()
        if pad_positions.numel() > 1:
            targets[pad_positions[1:]] = ignore_index

        if allowed_max_length is not None:
            inputs, targets = inputs[:allowed_max_length], targets[:allowed_max_length]

        input_rows.append(inputs)
        target_rows.append(targets)

    return torch.stack(input_rows).to(device), torch.stack(target_rows).to(device)

In [28]:
# Pre-bind the device and a 1024-token cap so the DataLoader only has to pass the batch.
customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=1024)

In [29]:
num_workers = 0
batch_size = 8

# Seeded right before the shuffling loader is built — presumably to make the
# shuffled batch order repeatable.
torch.manual_seed(123)

# Training loader: shuffled, incomplete final batch dropped.
train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers
)

In [30]:
# Validation loader: fixed order, every entry kept.
val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

# Test loader: same settings as the validation loader.
test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

In [31]:
# Sanity check: iterate the whole training loader and print each batch's
# input/target shapes (one line per batch).
print("Train loader:")
for inputs, targets in train_loader:
    print(inputs.shape, targets.shape)

Train loader:
torch.Size([8, 41]) torch.Size([8, 41])
torch.Size([8, 61]) torch.Size([8, 61])
torch.Size([8, 53]) torch.Size([8, 53])
torch.Size([8, 48]) torch.Size([8, 48])
torch.Size([8, 45]) torch.Size([8, 45])
torch.Size([8, 57]) torch.Size([8, 57])
torch.Size([8, 60]) torch.Size([8, 60])
torch.Size([8, 47]) torch.Size([8, 47])
torch.Size([8, 42]) torch.Size([8, 42])
torch.Size([8, 55]) torch.Size([8, 55])
torch.Size([8, 47]) torch.Size([8, 47])
torch.Size([8, 48]) torch.Size([8, 48])
torch.Size([8, 47]) torch.Size([8, 47])
torch.Size([8, 62]) torch.Size([8, 62])
torch.Size([8, 54]) torch.Size([8, 54])
torch.Size([8, 64]) torch.Size([8, 64])
torch.Size([8, 56]) torch.Size([8, 56])
torch.Size([8, 46]) torch.Size([8, 46])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 48]) torch.Size([8, 48])
torch.Size([8, 65]) torch.Size([8, 65])
torch.Size([8, 56]) torch.Size([8, 56])
torch.Size([8, 49]) torch.Size([8, 49])
torch.Size([8, 45]) torch.Size([8, 45])
torch.Size([8, 53]) torch.

In [None]:
start_time = time.time()

torch.manual_seed(123)

# The collate function moves every batch to `device`, so the model has to live
# there as well; on a CPU-only machine this call changes nothing.
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)

num_epochs = 2

# Fine-tune on the Phi-3 formatted data; the first validation prompt is passed
# as the start context.
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context=format_input(val_data[0]), tokenizer=tokenizer
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

In [None]:
# Spread the recorded loss values evenly over the epoch axis, then plot
# training and validation loss.
epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)

In [None]:
torch.manual_seed(123)

# Generate and display responses for the first three test entries.
for entry in test_data[:3]:

    input_text = format_input(entry)

    token_ids = generate(
        model=model,
        idx=text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size=BASE_CONFIG["context_length"],
        eos_id=50256
    )
    generated_text = token_ids_to_text(token_ids, tokenizer)
    # Drop the echoed prompt, then the response marker. This exercise trains on
    # "<|assistant|>" (not the Alpaca "### Response:"), so that is the marker
    # to remove.
    response_text = generated_text[len(input_text):].replace("<|assistant|>", "").strip()

    print(input_text)
    print(f"\nCorrect response:\n>> {entry['output']}")
    print(f"\nModel response:\n>> {response_text}")
    print("-------------------------------------")

In [None]:
# Generate a response for every test entry and store it on the entry itself.
for i, entry in tqdm(enumerate(test_data), total=len(test_data)):

    input_text = format_input(entry)

    token_ids = generate(
        model=model,
        idx=text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size=BASE_CONFIG["context_length"],
        eos_id=50256
    )
    generated_text = token_ids_to_text(token_ids, tokenizer)
    # Drop the echoed prompt, then the response marker. This exercise trains on
    # "<|assistant|>" (not the Alpaca "### Response:"); leaving it in would put
    # the marker into every stored response that is scored later.
    response_text = generated_text[len(input_text):].replace("<|assistant|>", "").strip()

    test_data[i]["model_response"] = response_text


with open("instruction-data-with-response.json", "w") as file:
    json.dump(test_data, file, indent=4)  # "indent" for pretty-printing

In [None]:
# Spot-check one test entry; the loop above added a "model_response" key to it.
print(test_data[0])

In [None]:
def check_if_running(process_name):
    """Return True if any running process has `process_name` in its name.

    Args:
        process_name: Substring to look for in the process names.

    Returns:
        True as soon as a matching process is found, otherwise False.
    """
    for proc in psutil.process_iter(["name"]):
        name = proc.info["name"]
        # The name can be None when psutil could not read it; a substring test
        # on None would raise TypeError, so such processes are skipped.
        if name and process_name in name:
            return True
    return False

# Stop the notebook here unless a process with "ollama" in its name is running;
# the scoring cells below send requests to a local server.
ollama_running = check_if_running("ollama")

if not ollama_running:
    raise RuntimeError("Ollama not running. Launch ollama before proceeding.")
print("Ollama running:", check_if_running("ollama"))

In [None]:
import urllib.request

def query_model(prompt, model="llama3", url="http://localhost:11434/api/chat"):
    """Send `prompt` as a single user message to a chat endpoint and return the reply.

    The response body is read line by line; each line is parsed as JSON and
    its `["message"]["content"]` text is appended to the result.

    Args:
        prompt: Text of the user message.
        model: Model name placed in the request payload.
        url: Endpoint that receives the POST request.

    Returns:
        The concatenated content of all response lines.
    """
    body = json.dumps({
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        # Fixed seed and zero temperature are required for deterministic responses
        "options": {
            "seed": 123,
            "temperature": 0,
            "num_ctx": 2048
        }
    }).encode("utf-8")

    request = urllib.request.Request(url, data=body, method="POST")
    request.add_header("Content-Type", "application/json")

    reply = ""
    with urllib.request.urlopen(request) as response:
        # readline() returns b"" once the body is exhausted.
        for raw_line in iter(response.readline, b""):
            chunk = json.loads(raw_line.decode("utf-8"))
            reply += chunk["message"]["content"]

    return reply


# Stored under its own name: binding this string to `model` would replace the
# GPTModel instance that the later training and generation cells still use.
ollama_model = "llama3"

In [None]:
def generate_model_scores(json_data, json_key, model="llama3"):
    """Ask a judge model to grade each entry's response and collect the integer scores.

    Args:
        json_data: Iterable of entries with "output" and `json_key` keys that
            are also accepted by `format_input`.
        json_key: Key under which the response to grade is stored.
        model: Model name forwarded to `query_model`.

    Returns:
        List of integer scores. Replies that cannot be converted with `int()`
        are printed and skipped, so the list may be shorter than `json_data`.
    """
    collected = []
    for item in tqdm(json_data, desc="Scoring entries"):
        judge_prompt = (
            f"Given the input `{format_input(item)}` "
            f"and correct output `{item['output']}`, "
            f"score the model response `{item[json_key]}`"
            f" on a scale from 0 to 100, where 100 is the best score. "
            f"Respond with the integer number only."
        )
        reply = query_model(judge_prompt, model)
        try:
            value = int(reply)
        except ValueError:
            print(f"Could not convert score: {reply}")
        else:
            collected.append(value)

    return collected


scores = generate_model_scores(test_data, "model_response")
print(f"Number of scores: {len(scores)} of {len(test_data)}")
# Unparseable replies are skipped by the scorer, so the list can be empty;
# guard the average against a division by zero.
if scores:
    print(f"Average score: {sum(scores)/len(scores):.2f}\n")
else:
    print("Average score: n/a (no valid scores)\n")

On obtient un score d'environ 51, soit proche de celui obtenu avec le format Alpaca.
Avec ce test, je ne vois pas de réel avantage du format Phi-3 par rapport au format Alpaca.

&nbsp;
## Exercise 7.2: Instruction and input masking

**Instruction Masking Performance Evaluation**

**Key Research Question**: How does replacing instruction and input `tokens` with the `-100` mask impact model performance during fine-tuning?

*Methodological Approach:*
- Implement `-100` token masking for instructions
- Evaluate model performance
- Compare against standard fine-tuning approach

*Critical Parameters:*
- Instruction masking technique
- Performance assessment metrics
- Comparative analysis methodology

*Recommended Investigation:*
1. Apply `-100` mask to instruction and input `tokens`
2. Fine-tune model using `InstructionDataset`
3. Measure and compare performance metrics
4. Analyze potential learning improvements



In [38]:
def format_input(entry):
    """Render a dataset entry as an Alpaca style prompt.

    Args:
        entry: Mapping with an "instruction" key and an "input" key.

    Returns:
        The fixed preamble followed by an "### Instruction:" section; an
        "### Input:" section is appended only when "input" is non-empty.
    """
    sections = [
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    ]
    # The input section is optional and left out entirely when empty.
    if entry["input"]:
        sections.append(f"\n\n### Input:\n{entry['input']}")
    return "".join(sections)

In [39]:
class InstructionDataset(Dataset):
    """Pre-tokenized Alpaca style texts paired with their prompt token counts."""

    def __init__(self, data, tokenizer):
        """Tokenize every entry and record how many tokens its prompt takes.

        Args:
            data: Sequence of entries accepted by `format_input` that also
                carry an "output" key.
            tokenizer: Object exposing `encode(text)`.
        """
        self.data = data
        self.instruction_lengths = []
        self.encoded_texts = []

        for sample in data:
            prompt = format_input(sample)
            answer = f"\n\n### Response:\n{sample['output']}"

            self.encoded_texts.append(tokenizer.encode(prompt + answer))
            # The prompt is encoded on its own to obtain its token count.
            self.instruction_lengths.append(len(tokenizer.encode(prompt)))

    def __getitem__(self, index):
        """Return `(prompt_token_count, encoded_text)` for the entry at `index`."""
        return self.instruction_lengths[index], self.encoded_texts[index]

    def __len__(self):
        """Return the number of entries in the underlying data."""
        return len(self.data)

In [40]:
# GPT-2 byte-pair encoding used to tokenize the instruction data.
tokenizer = tiktoken.get_encoding("gpt2")

In [41]:
def custom_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    """Collate `(instruction_length, token_ids)` pairs, masking the prompt in the targets.

    Every sequence gets one `pad_token_id` appended as its end marker and is
    padded with `pad_token_id` to the longest sequence in the batch. Targets
    are the inputs shifted by one position. In the targets, every padding
    position after the first and every position belonging to the
    instruction/input prompt is replaced by `ignore_index`.

    Args:
        batch: List of `(instruction_length, token_ids)` tuples.
        pad_token_id: Id used as end marker and as padding.
        ignore_index: Value written over masked target positions.
        allowed_max_length: Optional cap applied to both tensors' length.
        device: Device the stacked tensors are moved to.

    Returns:
        Tuple `(inputs, targets)` of stacked tensors on `device`.
    """
    # Find the longest sequence in the batch (+1 for the appended end marker)
    batch_max_length = max(len(item)+1 for _, item in batch)

    # Pad and prepare inputs and targets
    inputs_lst, targets_lst = [], []

    for instruction_length, item in batch:
        new_item = item.copy()
        # Add an <|endoftext|> token
        new_item += [pad_token_id]
        # Pad sequences to max_length
        padded = new_item + [pad_token_id] * (batch_max_length - len(new_item))
        inputs = torch.tensor(padded[:-1])  # Truncate the last token for inputs
        targets = torch.tensor(padded[1:])  # Shift +1 to the right for targets

        # Replace all but the first padding tokens in targets by ignore_index
        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index

        # Mask all input and instruction tokens in the targets. Targets are
        # shifted by one, hence the -1. The count is clamped at zero because a
        # negative slice end would mask nearly the whole sequence, and the
        # caller's ignore_index is honoured instead of a hard-coded -100.
        masked_prefix = max(instruction_length - 1, 0)
        targets[:masked_prefix] = ignore_index

        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Convert list of inputs and targets to tensors and transfer to target device
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor

In [42]:
# Pre-bind the device and a 1024-token cap so the DataLoader only has to pass the batch.
customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=1024)

In [43]:
num_workers = 0
batch_size = 8

# Seeded right before the shuffling loader is built — presumably to make the
# shuffled batch order repeatable.
torch.manual_seed(123)

# Training loader: shuffled, incomplete final batch dropped.
train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers
)

# Validation loader: fixed order, every entry kept.
val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

# Test loader: same settings as the validation loader.
test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

In [None]:
start_time = time.time()

torch.manual_seed(123)

# Rebuild the model from the pretrained GPT-2 weights so this run is not a
# continuation of the Exercise 7.1 fine-tuning and can be compared against it.
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
# The collate function moves every batch to `device`, so the model has to live
# there as well; on a CPU-only machine this call changes nothing.
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)

num_epochs = 2

# Fine-tune on the masked data; the first validation prompt is passed as the
# start context.
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context=format_input(val_data[0]), tokenizer=tokenizer
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

In [None]:
torch.manual_seed(123)


# Generate and display responses for the first three test entries.
for entry in test_data[:3]:

    input_text = format_input(entry)

    token_ids = generate(
        model=model,
        idx=text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size=BASE_CONFIG["context_length"],
        eos_id=50256
    )
    generated_text = token_ids_to_text(token_ids, tokenizer)
    # Drop the echoed prompt, then the "### Response:" marker used by this
    # exercise's training texts.
    response_text = generated_text[len(input_text):].replace("### Response:", "").strip()

    print(input_text)
    print(f"\nCorrect response:\n>> {entry['output']}")
    print(f"\nModel response:\n>> {response_text.strip()}")
    print("-------------------------------------")

In [None]:
from tqdm import tqdm

# Generate a response for every test entry and store it on the entry itself,
# replacing any "model_response" value already present.
for i, entry in tqdm(enumerate(test_data), total=len(test_data)):

    input_text = format_input(entry)

    token_ids = generate(
        model=model,
        idx=text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size=BASE_CONFIG["context_length"],
        eos_id=50256
    )
    generated_text = token_ids_to_text(token_ids, tokenizer)
    # Drop the echoed prompt, then the "### Response:" marker.
    response_text = generated_text[len(input_text):].replace("### Response:", "").strip()

    test_data[i]["model_response"] = response_text


# NOTE: same file name as in Exercise 7.1, so that exercise's saved responses
# are overwritten.
with open("instruction-data-with-response.json", "w") as file:
    json.dump(test_data, file, indent=4)  # "indent" for pretty-printing

In [None]:
# Spot-check one test entry, including the "model_response" stored by the loop above.
print(test_data[0])

In [None]:
def check_if_running(process_name):
    """Return True if any running process has `process_name` in its name.

    Args:
        process_name: Substring to look for in the process names.

    Returns:
        True as soon as a matching process is found, otherwise False.
    """
    for proc in psutil.process_iter(["name"]):
        name = proc.info["name"]
        # The name can be None when psutil could not read it; a substring test
        # on None would raise TypeError, so such processes are skipped.
        if name and process_name in name:
            return True
    return False

# Stop the notebook here unless a process with "ollama" in its name is running.
ollama_running = check_if_running("ollama")

if not ollama_running:
    raise RuntimeError("Ollama not running. Launch ollama before proceeding.")
print("Ollama running:", check_if_running("ollama"))

In [None]:
def generate_model_scores(json_data, json_key, model="llama3"):
    """Ask a judge model to grade each entry's response and collect the integer scores.

    Args:
        json_data: Iterable of entries with "output" and `json_key` keys that
            are also accepted by `format_input`.
        json_key: Key under which the response to grade is stored.
        model: Model name forwarded to `query_model`.

    Returns:
        List of integer scores. Replies that cannot be converted with `int()`
        are printed and skipped, so the list may be shorter than `json_data`.
    """
    collected = []
    for item in tqdm(json_data, desc="Scoring entries"):
        judge_prompt = (
            f"Given the input `{format_input(item)}` "
            f"and correct output `{item['output']}`, "
            f"score the model response `{item[json_key]}`"
            f" on a scale from 0 to 100, where 100 is the best score. "
            f"Respond with the integer number only."
        )
        reply = query_model(judge_prompt, model)
        try:
            value = int(reply)
        except ValueError:
            print(f"Could not convert score: {reply}")
        else:
            collected.append(value)

    return collected


scores = generate_model_scores(test_data, "model_response")
print(f"Number of scores: {len(scores)} of {len(test_data)}")
# Unparseable replies are skipped by the scorer, so the list can be empty;
# guard the average against a division by zero.
if scores:
    print(f"Average score: {sum(scores)/len(scores):.2f}\n")
else:
    print("Average score: n/a (no valid scores)\n")

On obtient un score d'environ 48 sur 100, ce qui est moins bien que notre fine-tuning standard.

&nbsp;
## Exercise 7.3: Finetuning on the original Alpaca dataset

**Large-Scale Instruction Dataset Fine-Tuning: Computational and Methodological Considerations**

The Alpaca dataset is a significant instruction dataset created by Stanford researchers. With 52,002 entries, it is notably larger than the `instruction-data.json` file used in the previous exercises. This exercise gives recommendations for fine-tuning a Large Language Model (LLM) on this dataset.

**Key Research Question: How can one effectively fine-tune an LLM using the Alpaca dataset while managing computational resources and potential memory constraints?**

**Link to download the Alpaca dataset**:  [here](https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json).

*Methodological Approach:*
- Analyze Alpaca dataset characteristics
- Develop GPU-accelerated fine-tuning strategy
- Implement computational optimization techniques

*Critical Parameters:*
- Dataset scale (52,002 entries)
- Computational resource management
- Fine-tuning performance optimization

*Computational Optimization Strategies:*
- Batch size reduction (`batch_size`)
- Maximum sequence length adjustment
- GPU resource utilization

*Recommended Investigation:*
1. Load and prepare Alpaca dataset
2. Implement adaptive fine-tuning approach
3. Address potential memory constraints
4. Optimize computational performance

*Key Mitigation Techniques:*
- Reduce `batch_size` (8 → 4 → 2 → 1)
- Truncate `allowed_max_length` (1,024 → 512 → 256)
- Leverage GPU computational capabilities

In [46]:
# Load the full Alpaca dataset (downloaded on first use, then read from the
# local file). This rebinds `data`, replacing the smaller dataset loaded earlier.
file_path = "instruction-data_complete.json"
url = "https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json"

data = download_and_load_file(file_path, url)
print("Number of entries:", len(data))

Number of entries: 52002


In [47]:
# Show one raw entry to inspect its keys.
print("Example entry:\n", data[50])

Example entry:
 {'instruction': 'Edit the following sentence to make it more concise.', 'input': 'He ran to the bus stop in order to catch the bus that was due to arrive in five minutes.', 'output': 'He ran to the bus stop, due to arrive in five minutes.'}


In [48]:
# Preview one fully formatted training example: prompt plus response section.
model_input = format_input(data[50])
desired_response = f"\n\n### Response:\n{data[50]['output']}"

print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Edit the following sentence to make it more concise.

### Input:
He ran to the bus stop in order to catch the bus that was due to arrive in five minutes.

### Response:
He ran to the bus stop, due to arrive in five minutes.


In [49]:
# Sequential (unshuffled) split of the dataset into train / test / validation.
train_portion = int(len(data) * 0.85)  # 85% for training
test_portion = int(len(data) * 0.1)    # 10% for testing
# Informational only: the validation slice below simply takes everything left.
val_portion = len(data) - train_portion - test_portion  # Remaining 5% for validation

train_data = data[:train_portion]
test_data = data[train_portion:train_portion + test_portion]
val_data = data[train_portion + test_portion:]

In [50]:
# Report the size of each split.
print("Training set length:", len(train_data))
print("Validation set length:", len(val_data))
print("Test set length:", len(test_data))

Training set length: 44201
Validation set length: 2601
Test set length: 5200


In [51]:
class InstructionDataset(Dataset):
    """Dataset of pre-tokenized Alpaca style instruction/response texts."""

    def __init__(self, data, tokenizer):
        """Tokenize every entry once, up front.

        Args:
            data: Sequence of entries accepted by `format_input` that also
                carry an "output" key.
            tokenizer: Object exposing `encode(text)`.
        """
        self.data = data
        # Prompt and response section are concatenated before encoding.
        self.encoded_texts = [
            tokenizer.encode(
                format_input(sample) + f"\n\n### Response:\n{sample['output']}"
            )
            for sample in data
        ]

    def __getitem__(self, index):
        """Return the encoded text stored at `index`."""
        return self.encoded_texts[index]

    def __len__(self):
        """Return the number of entries in the underlying data."""
        return len(self.data)

In [52]:
def custom_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    """Collate token-id lists into padded input and target tensors.

    Every sequence gets one `pad_token_id` appended as its end marker and is
    padded with `pad_token_id` to the longest sequence in the batch. Targets
    are the inputs shifted by one position; every padding target after the
    first is replaced by `ignore_index`.

    Args:
        batch: List of token-id lists.
        pad_token_id: Id used as end marker and as padding.
        ignore_index: Value written over surplus padding targets.
        allowed_max_length: Optional cap applied to both tensors' length.
        device: Device the stacked tensors are moved to.

    Returns:
        Tuple `(inputs, targets)` of stacked tensors on `device`.
    """
    # The extra position holds the end marker appended to each sequence.
    padded_len = 1 + max(len(seq) for seq in batch)

    input_rows, target_rows = [], []
    for seq in batch:
        # End marker and padding share one id, so a single fill covers both.
        row = seq.copy() + [pad_token_id] * (padded_len - len(seq))

        inputs = torch.tensor(row[:-1])   # drop the last token
        targets = torch.tensor(row[1:])   # shifted one step ahead

        # Keep the first pad target (the end marker) and hide the rest.
        pad_positions = torch.nonzero(targets == pad_token_id).flatten()
        if pad_positions.numel() > 1:
            targets[pad_positions[1:]] = ignore_index

        if allowed_max_length is not None:
            inputs, targets = inputs[:allowed_max_length], targets[:allowed_max_length]

        input_rows.append(inputs)
        target_rows.append(targets)

    return torch.stack(input_rows).to(device), torch.stack(target_rows).to(device)

In [53]:
# Pre-bind the device and a 1024-token cap so the DataLoader only has to pass the batch.
customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=1024)

In [54]:
num_workers = 0
batch_size = 8

# Seeded right before the shuffling loader is built — presumably to make the
# shuffled batch order repeatable.
torch.manual_seed(123)

# Training loader: shuffled, incomplete final batch dropped.
train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers
)

# Validation loader: fixed order, every entry kept.
val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

# Test loader: same settings as the validation loader.
test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

In [55]:
# Sanity check: iterate the whole training loader and print each batch's
# shapes. After the loop, `inputs` and `targets` hold the last batch, which the
# next two cells print.
print("Train loader:")
for inputs, targets in train_loader:
    print(inputs.shape, targets.shape)

Train loader:
torch.Size([8, 163]) torch.Size([8, 163])
torch.Size([8, 300]) torch.Size([8, 300])
torch.Size([8, 209]) torch.Size([8, 209])
torch.Size([8, 167]) torch.Size([8, 167])
torch.Size([8, 123]) torch.Size([8, 123])
torch.Size([8, 145]) torch.Size([8, 145])
torch.Size([8, 173]) torch.Size([8, 173])
torch.Size([8, 262]) torch.Size([8, 262])
torch.Size([8, 194]) torch.Size([8, 194])
torch.Size([8, 327]) torch.Size([8, 327])
torch.Size([8, 132]) torch.Size([8, 132])
torch.Size([8, 141]) torch.Size([8, 141])
torch.Size([8, 186]) torch.Size([8, 186])
torch.Size([8, 200]) torch.Size([8, 200])
torch.Size([8, 217]) torch.Size([8, 217])
torch.Size([8, 159]) torch.Size([8, 159])
torch.Size([8, 121]) torch.Size([8, 121])
torch.Size([8, 188]) torch.Size([8, 188])
torch.Size([8, 157]) torch.Size([8, 157])
torch.Size([8, 133]) torch.Size([8, 133])
torch.Size([8, 133]) torch.Size([8, 133])
torch.Size([8, 143]) torch.Size([8, 143])
torch.Size([8, 223]) torch.Size([8, 223])
torch.Size([8, 162])

In [56]:
# First input row of the batch left over from the loop above; padding shows as 50256.
print(inputs[0])

tensor([21106,   318,   281, 12064,   326,  8477,   257,  4876,    13, 19430,
          257,  2882,   326, 20431, 32543,   262,  2581,    13,   198,   198,
        21017, 46486,    25,   198, 13065,  3876,  1096,   262, 10066,   287,
         2237,  2456,    13,   198,   198, 21017, 23412,    25,   198, 26446,
         3640,   423,  3402,   326,   262,  5230,    11,   981,  1690,   852,
          257, 13205,  8271,    11,   460,   635,  2222,   546,  4633,  3048,
          884,   355,   281,  2620,   287, 10075, 16308,  1112,    11,  3092,
          286,  1919,   290,  7016,  4547,    11,   290,  3220,  7111,   284,
         1321,   326,   743,  3994,  3991,  1321,    13,   198,   198, 21017,
        18261,    25,   198, 20418,   527, 16308,  1112,    11,  3991,  1321,
           11, 15133,    13, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 

In [57]:
# Matching target row: shifted by one token, surplus padding replaced by -100.
print(targets[0])

tensor([  318,   281, 12064,   326,  8477,   257,  4876,    13, 19430,   257,
         2882,   326, 20431, 32543,   262,  2581,    13,   198,   198, 21017,
        46486,    25,   198, 13065,  3876,  1096,   262, 10066,   287,  2237,
         2456,    13,   198,   198, 21017, 23412,    25,   198, 26446,  3640,
          423,  3402,   326,   262,  5230,    11,   981,  1690,   852,   257,
        13205,  8271,    11,   460,   635,  2222,   546,  4633,  3048,   884,
          355,   281,  2620,   287, 10075, 16308,  1112,    11,  3092,   286,
         1919,   290,  7016,  4547,    11,   290,  3220,  7111,   284,  1321,
          326,   743,  3994,  3991,  1321,    13,   198,   198, 21017, 18261,
           25,   198, 20418,   527, 16308,  1112,    11,  3991,  1321,    11,
        15133,    13, 50256,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100, 

In [None]:
start_time = time.time()

torch.manual_seed(123)

# Rebuild the model from the pretrained GPT-2 weights so this run does not
# continue from the fine-tuning done in the earlier exercises.
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
# The collate function moves every batch to `device`, so the model has to live
# there as well; on a CPU-only machine this call changes nothing.
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)

num_epochs = 2

# Fine-tune on the full Alpaca data; the first validation prompt is passed as
# the start context.
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context=format_input(val_data[0]), tokenizer=tokenizer
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Je n'ai pas pu finir le fine-tuning sur le dataset Alpaca original, car il durait plus d'une journée sur mon PC et demandait trop de calculs pour le GPU gratuit de Google Colab.

Mais j'imagine qu'on obtiendrait un score plus élevé et surtout un modèle qui sait répondre à un plus grand nombre de tâches et d'instructions, puisqu'il aurait été entraîné sur un plus grand nombre de données.

De plus, je pense que réduire le batch_size de 8 à 1 permettrait d'avoir un meilleur modèle, car le modèle serait entraîné sur chaque instruction séparément, mais cela passerait par un fine-tuning beaucoup plus long.

Réduire allowed_max_length (1,024 → 512 → 256) permettrait de réduire la consommation mémoire et d'avoir un entraînement plus rapide, mais on obtiendrait un modèle moins pertinent, car la troncature des séquences longues risque de faire perdre de l'information.