# 0. Setup.

In [4]:
import os
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

In [2]:
%%html
<style>
    table {
        float: left;
        margin-right: 20px; /* Optional: Adds space between table and other content */
    }
</style>

# 1. Optuna.

## 1.1. Why use `Optuna`?
1. **Automated Hyperparameter Optimization**:
   - Simplifies the search for optimal hyperparameters using strategies like TPE and grid search.
2. **Efficient Search Algorithms**:
   - Employs smarter search strategies (e.g., Bayesian optimization) for faster convergence compared to brute-force methods.
3. **Dynamic Pruning**:
   - Automatically stops unpromising trials early, saving time and computational resources.
4. **Flexible Integration**:
   - Works seamlessly with frameworks like PyTorch, TensorFlow, and Hugging Face.
5. **Rich Visualizations**:
   - Provides easy-to-use tools for analyzing optimization history, hyperparameter importance, and parameter interactions.
6. **Scalable and Distributed**:
   - Supports parallel and distributed optimization across multiple machines.
7. **Persistent Study Management**:
   - Allows storing study results in databases (e.g., SQLite, PostgreSQL) for reproducibility and later analysis.
8. **Customization**:
   - Enables users to define custom objective functions and constraints for specific tasks.

## 1.2. Basic Usage.

In [9]:
# Data Preparation.
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch.nn as nn
import optuna

# Load.
ds       = load_dataset('imdb')
train_ds = Dataset.from_dict(ds['train'][:2500])  
test_ds  = Dataset.from_dict(ds['test'][:2500])   

# Tokenization.
checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize(ds):
    return tokenizer(ds['text'], truncation=True, padding=True)

train_tokenized = train_ds.map(tokenize, batched=True)
test_tokenized  = test_ds.map(tokenize, batched=True)

# Define objective function for Optuna.
def objective(trial):
    # Hyperparameter suggestions from Optuna.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)   # Search in continuous range (1e-5, 5e-5).
    dropout_rate  = trial.suggest_float("dropout_rate", 0.1, 0.5)                # [0.1, 0.5] for discrete values instead!

    # Configure model directly.
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    model.classifier.dropout = nn.Dropout(dropout_rate)  # Dropout rate is tuned via Optuna.

    # Training arguments.
    training_args = TrainingArguments(
        output_dir="./temp",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=1,
        learning_rate=learning_rate,     # Learning rate is tuned via Optuna.
        evaluation_strategy="epoch",
        logging_dir="./logs",
        save_strategy="no"
    )

    # Trainer.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=test_tokenized
    )

    # Train and evaluate.
    trainer.train()
    eval_results = trainer.evaluate()
    return eval_results["eval_loss"]

# Run Optuna study.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

# Best hyperparameters.
print("Best hyperparameters:", study.best_params)

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

[I 2024-12-29 00:45:38,708] A new study created in memory with name: no-name-38255e0c-618a-4e56-b4eb-714e595c9adc
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.00127


[I 2024-12-29 00:46:52,277] Trial 0 finished with value: 0.0012700868537649512 and parameters: {'learning_rate': 1.9007819166954822e-05, 'dropout_rate': 0.42922597921929884}. Best is trial 0 with value: 0.0012700868537649512.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.003179


[I 2024-12-29 00:48:06,139] Trial 1 finished with value: 0.003179125254973769 and parameters: {'learning_rate': 1.0149524372475638e-05, 'dropout_rate': 0.16525950829714547}. Best is trial 0 with value: 0.0012700868537649512.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.003055


[I 2024-12-29 00:49:20,221] Trial 2 finished with value: 0.0030551033560186625 and parameters: {'learning_rate': 1.0385910639905127e-05, 'dropout_rate': 0.3422294596561477}. Best is trial 0 with value: 0.0012700868537649512.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.000295


[I 2024-12-29 00:50:34,316] Trial 3 finished with value: 0.0002945879241451621 and parameters: {'learning_rate': 3.787297653035789e-05, 'dropout_rate': 0.23807413522898158}. Best is trial 3 with value: 0.0002945879241451621.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.001235


[I 2024-12-29 00:51:48,473] Trial 4 finished with value: 0.0012351060286164284 and parameters: {'learning_rate': 1.7754826308156474e-05, 'dropout_rate': 0.25629158428026444}. Best is trial 3 with value: 0.0002945879241451621.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.002876


[I 2024-12-29 00:53:02,535] Trial 5 finished with value: 0.002876021433621645 and parameters: {'learning_rate': 1.0755435438670248e-05, 'dropout_rate': 0.35170205857494086}. Best is trial 3 with value: 0.0002945879241451621.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.00065


[I 2024-12-29 00:54:17,048] Trial 6 finished with value: 0.0006501026218757033 and parameters: {'learning_rate': 2.554368477456048e-05, 'dropout_rate': 0.2662273820819414}. Best is trial 3 with value: 0.0002945879241451621.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.000385


[I 2024-12-29 00:55:31,326] Trial 7 finished with value: 0.0003851080546155572 and parameters: {'learning_rate': 3.3211808055202046e-05, 'dropout_rate': 0.37207385083974653}. Best is trial 3 with value: 0.0002945879241451621.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.001083


[I 2024-12-29 00:56:45,543] Trial 8 finished with value: 0.0010834855493158102 and parameters: {'learning_rate': 1.9081663752353572e-05, 'dropout_rate': 0.37805772805827087}. Best is trial 3 with value: 0.0002945879241451621.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.001375


[I 2024-12-29 00:57:59,911] Trial 9 finished with value: 0.0013752002269029617 and parameters: {'learning_rate': 1.6695368150555907e-05, 'dropout_rate': 0.23315557648412463}. Best is trial 3 with value: 0.0002945879241451621.


Best hyperparameters: {'learning_rate': 3.787297653035789e-05, 'dropout_rate': 0.23807413522898158}


## 1.3. Visualization.

In [14]:
from optuna import create_study
from optuna_dashboard import run_server
from optuna.visualization import (
    plot_param_importances,
    plot_optimization_history,
    plot_parallel_coordinate,
    plot_contour,
    plot_slice,
    plot_edf,
)
from IPython.display import IFrame

# Run the integrated dashboard.
run_server("sqlite:///example_study.db", port=8080)  # Use storage path for the dashboard

# Display the dashboard inline in Jupyter Notebook.
IFrame(src="http://127.0.0.1:8080/", width=1000, height=600)

# Plot visualizations
plot_optimization_history(study).show()      # Optimization history.
plot_param_importances(study).show()         # Hyperparameter importance.
plot_parallel_coordinate(study).show()       # Parallel coordinates plot.
plot_contour(study).show()                   # Contour plot.
plot_slice(study).show()                     # Slice plot.
plot_edf(study).show()                       # Empirical Distribution Function (EDF) plot.

Bottle v0.13.2 server starting up (using WSGIRefServer())...
Listening on http://localhost:8080/
Hit Ctrl-C to quit.

127.0.0.1 - - [29/Dec/2024 01:02:14] "GET / HTTP/1.1" 302 0
127.0.0.1 - - [29/Dec/2024 01:02:14] "GET /dashboard HTTP/1.1" 200 4145
127.0.0.1 - - [29/Dec/2024 01:02:15] "GET /static/bundle.js HTTP/1.1" 200 4158971
127.0.0.1 - - [29/Dec/2024 01:02:16] "GET /api/studies HTTP/1.1" 200 141
127.0.0.1 - - [29/Dec/2024 01:02:16] "GET /favicon.ico HTTP/1.1" 200 7670
127.0.0.1 - - [29/Dec/2024 01:02:20] "GET /api/studies/1/param_importances HTTP/1.1" 200 27
127.0.0.1 - - [29/Dec/2024 01:02:21] "GET /api/studies/1?after=0 HTTP/1.1" 200 1220
127.0.0.1 - - [29/Dec/2024 01:02:21] "GET /api/meta HTTP/1.1" 200 64
127.0.0.1 - - [29/Dec/2024 01:02:31] "GET /api/studies/1?after=1 HTTP/1.1" 200 380
127.0.0.1 - - [29/Dec/2024 01:02:42] "GET /api/studies/1?after=1 HTTP/1.1" 200 380


ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.

# 2. PEFT.
- Reduce the number of trainable parameters by using **Prompt Parameters** or **Reparametrization Methods** like LoRA.
- Good in general, default for large LLM.
- Tradeoffs of using PEFT:
  - It may not work well with small dataset for downstream,
  - Except LoRA, PEFT is not compatible with quantization,
  - In multitask training, PEFT does not generalized well and increases complexity.  
    => Consider merging adapters.

## 2.1. PeftConfig.
- Contains all important parameters of model.

In [1]:
from peft import LoraConfig, TaskType

# LoRA.
peft_config = LoraConfig(task_type       = TaskType.SEQ_2_SEQ_LM, 
                         inference_mode  = False,                   # If 'True', weights are frozen and not trained.
                         r=8, lora_alpha =32, lora_dropout=0.1)
# Prompt-based.
peft_config = PromptEncoderConfig(task_type="CAUSAL_LM", 
                                  num_virtual_tokens=20, 
                                  encoder_hidden_size=128)
# IA3.
peft_config = IA3Config(task_type="SEQ_2_SEQ_LM")

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

| Task Type                | Description                                                              |
|--------------------------|--------------------------------------------------------------------------|
| `SEQ_2_SEQ_LM`            | Sequence-to-sequence language modeling (e.g., translation, summarization)|
| `Causal_LM`               | Causal language modeling (e.g., text generation)                        |
| `Masked_LM`               | Masked language modeling (e.g., BERT-based models for masked token prediction)|
| `Token_CLS`               | Token-level classification (e.g., named entity recognition)            |
| `Seq_CLS`                 | Sequence-level classification (e.g., sentiment analysis, document classification)|
| `MultipleChoice`          | Multiple choice tasks (e.g., question answering with multiple options) |
| `QA`                      | Question answering tasks (e.g., extractive question answering)         |
| `Text_Classification`     | Text classification tasks (e.g., sentiment analysis)                    |
| `Text_2_Text_Generation`  | Text-to-text generation tasks (e.g., text summarization, translation)  |
| `Text_Summarization`      | Summarization tasks (e.g., extractive or abstractive summarization)    |
| `Next_Sentence_Prediction`| Predicting whether two sentences follow each other in a sequence      |
| `Sentence_Classification` | Classification of sentence pairs (e.g., entailment, similarity)        |
| `Token_Regression`        | Regression tasks on token-level inputs                                 |
| `Sentence_Regression`     | Regression tasks on sentence-level inputs                              |


## How to Choose PEFT.

| **Method**       | **When to Use**                                                               | **Best For**                                      |
|:------------------|:-----------------------------------------------------------------------------|:-------------------------------------------------|
| **LoRA**         | - Large pretrained models                                                    | - Task-specific fine-tuning                      |
|                  | - Low computational resources                                                | - Memory efficiency, reduced training cost       |
|                  | - Need for efficient fine-tuning on specific downstream tasks                | - Tasks like text classification, summarization, etc. |
|                  | - Compatible with 4-bit and 8-bit quantization.                              |                                                  |
|                  | - Works well with a sufficient amount of data for downstream tasks.          |                                                  |
| **Prompt-based** | - Few-shot/zero-shot learning                                                | - Tasks where model generalization is key        |
|                  | - Limited data, but need flexibility in task handling                        | - Multi-task learning without modifying model weights |
| **IA3**          | - Few-shot adaptation with task-specific attention                           | - Tasks requiring individualized attention adaptation |
|                  | - Memory efficiency with targeted model modifications                        | - Fine-tuning attention mechanisms for specific tasks |


# 2.2. PeftModel.

In [3]:
# Base Model.
from transformers import AutoModelForSeq2SeqLM
checkpoint = "bigscience/mt0-large"
model      = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Peft Model.
from peft import get_peft_model
model      = get_peft_model(model, peft_config)  

# Use multiple adapters.
model.add_adapter(peft_config, adapter_name='adapter1')
model.add_adapter(peft_config, adapter_name='adapter2')

model.set_adapter('adapter1')           # Set.
model.disable_adapters(['adapter1'])    # Disable.
model.enable_adapters(['adapter1'])     # Enable again.

config.json:   0%|          | 0.00/800 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

In [7]:
# Num of trainable parameters.
model.print_trainable_parameters()

trainable params: 2,359,296 || all params: 1,231,940,608 || trainable%: 0.1915


In [None]:
# Just use like typical model.
# Training arguments.
training_args = TrainingArguments(
    output_dir="your-name/bigscience/mt0-large-lora",
    learning_rate=1e-3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train.
trainer.train()

# Save the model.
model.save_pretrained("output_dir")

# Push to hub.
from huggingface_hub import login
hf_token = "your-huggingface-token"
login(token=hf_token)

model.push_to_hub(model_id="your-name/bigscience/mt0-large-lora")

# Load.
from peft import AutoPeftModel
model = AutoPeftModel.from_pretrained("ybelkada/opt-350m-lora")

> Note) Only save and load `Extra PEFT Parameters`, super efficient!

## 2.3. Merge.
- TIES, TrIm, Elect, and Merge.
- DARE, Drop And REscale.

> Note) When youâ€™re attempting to merge fully trained models with TIES, you should be aware of any special tokens each model may have added to the embedding layer which are not a part of the original checkpointâ€™s vocabulary. This may cause an issue because each model may have added a special token to the same embedding position. If this is the case, you should use the resize_token_embeddings method to avoid merging the special tokens at the same embedding index.
> 
> This shouldnâ€™t be an issue if youâ€™re only merging LoRA adapters trained from the same base model.

In [None]:
model = PeftModel.from_pretrained(model, "smangrul/tinyllama_lora_norobots", adapter_name="norobots")
model.load_adapter("smangrul/tinyllama_lora_sql", adapter_name="sql")
model.load_adapter("smangrul/tinyllama_lora_adcopy", adapter_name="adcopy")

adapters     = ["norobots", "adcopy", "sql"]
weights      = [2.0, 1.0, 1.0]
adapter_name = "merge"
density      = 0.2
model.add_weighted_adapter(adapters, weights, adapter_name, 
                           combination_type="ties",            # "dare_ties" for DARE.
                           density=density)

# 3. Bitsandbytes.
- Decrease the float precision, with minimum tradeoff of performance.
- Use '4-bits' as a default for large LLM.
- Consider 'INT8' if you want better performance, sacrificing some training time and memory usage.

> Note) 'LoRA' is compatible with quantization, but other PEFTs like prefix-tuning or prompt-based one may not.

## 3.1. StableEmbedding() Wrapper.

In [None]:
# Wrap the embedding layer of the pretrained model with StableEmbedding.
from bitsandbytes import StableEmbedding
model.get_input_embeddings = StableEmbedding(model.get_input_embeddings())

## 3.2. 8-bits Optimizer.
- Full List of Optimizers : https://huggingface.co/docs/bitsandbytes/reference/optim/optim_overview.

In [None]:
# 8-bits optimizer, using TrainingArguments and Trainer.
from bitsandbytes import optim as bnb
opt_adamw8 = bnb.optim.AdamW8bit(model.parameters(),      # Use AdamW8bit optimizer.     
                                lr=5e-5,
                                min_8bit_size=16384)      # All tensors with less than `min_8bit_size` elements are maintained as 32-bits float.



## 3.3. K-bits Quantized Training.

In [None]:
# 4-bits Quantization.
from transformers import BitsAndBytesConfig
config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Make a model with quantization_config.
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1", 
    quantization_config=config
)

# GlobalOptimManager : You can also specify parameters to use 32-bits manually.
mng = bnb.optim.GlobalOptimManager.get_instance()
mng.register_parameters(model.parameters())
mng.override_config(model.fc1.weight, "optim_bits", 32)   # fc1.weight now uses 32-bit floats.

# Linear8bitLt : Or using INT8 Quantization.
def replace_linear_with_8bit(model):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):  # Replace all Linear layers to INT8 layer.
            setattr(model, name, bnb.nn.Linear8bitLt(module.in_features, module.out_features))
    return model
    
model = replace_linear_with_8bit(model)          # Replace linear layers in the model.

# PEFT : LoRA.
# Prepare PEFT model for quantized training.
from peft import prepare_model_for_kbit_training
model = prepare_model_for_kbit_training(model)

# LoRA Config.
from peft import LoraConfig, get_peft_model
config = LoraConfig(
    r=16,
    lora_alpha=8,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)

# Training arguments.
training_args = TrainingArguments(
    output_dir="./results",
    optim="adamw_bnb_8bit",              # AdamW 8-bits optimizer.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
)

# Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    optimizers=[opt_adamw8]             # AdamW 8-bits optimizer.
)

# The model is now prepared for **4-bit quantization + 8-bit Linear layers + LoRA fine-tuning**.

> #### Note) Don't need to specify `BitsAndBytesConfig()`, just use `optim="adamw_bnb_8bit"`.

# 4. Practical Guidelines for Large LLMs.

## Hyperparameter Tuning
- Hyperparameter tuning is **less critical** for pretrained models compared to customized models.
- If needed, use **Optuna** for efficient and automated hyperparameter tuning.

## Quantization.
- Use **4-bit quantization** by default for efficient memory usage and faster computation.
- Replace layers with **INT8 quantization** if you need better performance, trading off some computation and memory resources.

## PEFT.
- Use **LoRA** as the default PEFT method for most fine-tuning tasks.
- In **Multitask Systems**:
  - Train **multiple adapters separately** for each task.
  - **Merge adapters** to reduce complexity and enable efficient multitask inference.
- If you have **limited data** and are considering **zero-shot or few-shot learning**, use **Prompt-based** or **IA3** methods instead.
- Be cautious with **quantization compatibility** when using Prompt-based or IA3 methods, as these may not work well with low-bit precision.
