In [1]:
pip install bitsandbytes peft transformers datasets trl

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.2-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting trl
  Downloading trl-0.9.6-py3-none-any.whl.metadata (12 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.5-py3-none-any.whl.metadata (8.2 kB)
Collecting docstring-parser>=0.16 (from tyro>=0.5.11->trl)
  Downloading docstring_parser-0.16-py3-none-any.whl.metadata (3.0 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Downloading bitsandbytes-0.43.2-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading peft-0.12.0-py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownl

In [2]:
import pandas as pd
import numpy as np
import torch
import random
import matplotlib.pyplot as plt
from datasets import Dataset, load_dataset
import warnings
warnings.filterwarnings("ignore")

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

import logging
# Configure logging
logging.basicConfig(level=logging.INFO)


In [3]:
huggingface_dataset_name = "cnn_dailymail"
dataset = load_dataset(huggingface_dataset_name, "3.0.0")
dataset

Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [4]:
sample = dataset['train'][0]
print(f"""Article (excerpt of 500 characters, total length: {len(sample["article"])}):""")
print(sample["article"][:500])
print(f'\nSummary (length: {len(sample["highlights"])}):')
print(sample["highlights"])

Article (excerpt of 500 characters, total length: 2527):
LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as s

Summary (length: 217):
Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .


In [5]:
def format_instruction(dialog: str, summary:str):
    instruction = f"""### Instruction:
    summarize the following conversation.
    
    ### input:
    {dialog.strip()}
    
    ### summary:
    {summary}
    
    """.strip()
    
    return instruction

def generate_instruction_dataset(data_point):
    return {
        "article": data_point["article"],
        "highlights": data_point["highlights"],
        "text": format_instruction(data_point["article"],data_point["highlights"])
    }

def procces_dataset(data : Dataset):
    
    return (
    data.shuffle(seed=42).map(generate_instruction_dataset).remove_columns(['id'])
    )

In [6]:
dataset['train'] = procces_dataset(dataset['train'])
dataset["test"] = procces_dataset(dataset["validation"])
dataset["validation"] = procces_dataset(dataset["validation"])

train_data = dataset['train'].shuffle(seed=42).select([i for i in range(1000)])

# Select 100 rows from the test and validation splits
test_data = dataset['test'].shuffle(seed=42).select([i for i in range(100)])
validation_data = dataset['validation'].shuffle(seed=42).select([i for i in range(100)])

train_data,test_data,validation_data


Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

(Dataset({
     features: ['article', 'highlights', 'text'],
     num_rows: 1000
 }),
 Dataset({
     features: ['article', 'highlights', 'text'],
     num_rows: 100
 }),
 Dataset({
     features: ['article', 'highlights', 'text'],
     num_rows: 100
 }))

In [7]:
model_id =  "NousResearch/Llama-2-7b-hf"

bnb_config = BitsAndBytesConfig(
    # Enabling 4-bit quantization for the model
    load_in_4bit=True,
    
    # Using double quantization, which can help improve the precision and reduce the memory footprint
    bnb_4bit_use_double_quant=True,
    
    # Setting the quantization type to "nf4" (Normalized Float 4), a specific type of 4-bit quantization
    bnb_4bit_quant_type="nf4",
    
    # Setting the data type used for computation to bfloat16 (Brain Float 16), which balances precision and performance
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Loading a pre-trained causal language model with the specified quantization configuration
model = AutoModelForCausalLM.from_pretrained(
    model_id,           # The ID or path of the pre-trained model
    quantization_config=bnb_config,  # Applying the previously defined quantization configuration
    device_map="auto"   # Automatically mapping the model to available devices (e.g., CPU, GPU)
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

## ZERO-SHOT INFERENCE WITH LLAMA-2 7B¶


In [8]:
index = 2

dialogue = test_data['article'][index]
summary = test_data['highlights'][index]

prompt = f"""
Summarize the following conversation.

### input:
{dialogue}

### summary
{summary}
"""

inputs = tokenizer(prompt, return_tensors='pt')
model_output = model.generate(inputs['input_ids'],
                             max_new_tokens=100,
                             )
output = tokenizer.decode(model_output[0], skip_speical_tokens=True)

dash_line = '-'.join('' for x in range(50))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

2024-07-30 09:15:03.599171: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-30 09:15:03.599317: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-30 09:15:03.734664: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


-------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

### input:
A federal appeals court has given new life to a Holocaust survivor's claim that the University of Oklahoma is unjustly harboring a Camille Pissarro painting that the Nazis stole from her father during World War II. The 2nd U.S. Circuit Court of Appeals in Manhattan has directed a lower-court judge to consider whether the lawsuit she threw out should be transferred to Oklahoma, saying she has authority to do so. The court's order on Thursday came as the school found itself amid a racial controversy after video of fraternity students engaged in a racist chant spread across the Internet. Dr. Leone-Noelle Meyer maintained she is entitled to Pissarro's 1886 'Shepherdess Bringing in Sheep' because it belonged to her father when it was taken by the Nazis as Germany moved across France . University President David Boren ordered a fraternity house closed and expelled two of its mem

### TRAINING STEP (FINE TUNING)¶


In [9]:
from peft import prepare_model_for_kbit_training

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():

        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )


### LoRA (Low-Rank Adaptation) :
is a technique for Parameter-Efficient Fine-Tuning (PEFT) that adds trainable low-rank matrices to the model weights.

![LoRa](https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/blog/133_trl_peft/step2.png)


In [10]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    # The scaling factor for the low-rank approximation. Higher values allow 
    # for capturing more details but increase computation.
    r=16,
    
    # The scaling factor for the learning rate. This helps control how 
    # aggressively the model adapts to the learning rate.
    lora_alpha=64,
    
    # A list of module names within the model to which the LoRA (Low-Rank Adaptation)
    # method will be applied. Here, it targets the projection layers.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    
    # The dropout rate applied to the LoRA layers to prevent overfitting.
    lora_dropout=0.1,
    
    # Specifies whether or not to apply the LoRA method to the bias terms in the model.
    # Here, "none" means no bias adaptation.
    bias="none",
    
    # Specifies the type of task the model is being used for. 
    # "CAUSAL_LM" stands for Causal Language Modeling, indicating that this configuration 
    # is for a task where the model predicts the next word in a sequence.
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

trainable params: 16777216 || all params: 3517190144 || trainable%: 0.477006226934315


In [11]:
from transformers import TrainingArguments

OUTPUT_DIR = "llama2-docsum-adapter"

training_arguments = TrainingArguments(
    # Set the batch size per device (GPU/CPU) during training
    per_device_train_batch_size=4,
    
    # Number of gradient accumulation steps. This effectively increases the batch size
    # by accumulating gradients over multiple steps before updating the model.
    gradient_accumulation_steps=4,
    
    # Specify the optimizer to use. Here, 'paged_adamw_32bit' is a variant of AdamW 
    # optimizer suitable for training with lower precision.
    optim="paged_adamw_32bit",
    
    # How often to log training information (e.g., loss). Here, it logs every step.
    logging_steps=1,
    
    # Set the learning rate for the optimizer.
    learning_rate=1e-4,
    
    # Enable 16-bit floating-point precision training to reduce memory usage and speed up training.
    fp16=True,
    
    # Maximum gradient norm for gradient clipping to prevent exploding gradients.
    max_grad_norm=0.3,
    
    # Number of epochs to train the model.
    num_train_epochs=1,
    
    # Evaluation strategy to use during training. 'steps' means evaluation is performed every few steps.
    evaluation_strategy="steps",
    
    # Number of steps between evaluations. Here, it's set to 0.2 steps (interpreted as every 20% of an epoch).
    eval_steps=0.2,
    
    # Fraction of training steps used for warming up the learning rate scheduler.
    warmup_ratio=0.05,
    
    # Strategy to use for saving the model checkpoints. 'epoch' means saving at the end of every epoch.
    save_strategy="epoch",
    
    # Whether to group samples of similar lengths together to improve training efficiency.
    group_by_length=True,
    
    # Directory to save the training outputs (model checkpoints, logs, etc.).
    output_dir=OUTPUT_DIR,
    
    # Specify the tool to use for logging and reporting. Here, it uses TensorBoard.
    report_to="tensorboard",
    
    # Save the model in safetensors format, which is more secure and efficient.
    save_safetensors=True,
    
    # Learning rate scheduler type. 'cosine' implies a cosine annealing schedule for the learning rate.
    lr_scheduler_type="cosine",
    
    # Seed for random number generators to ensure reproducibility.
    seed=42,
)

# Disable caching during training to avoid potential issues or warnings.
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!


In [12]:
from trl import SFTTrainer

trainer = SFTTrainer(
    # The model to be trained
    model=model,
    # The dataset used for training the model
    train_dataset=train_data,
    # The dataset used for evaluating the model during training
    eval_dataset=validation_data,
    # The PEFT (Parameter-Efficient Fine-Tuning) configuration, here using LoRA (Low-Rank Adaptation)
    peft_config=lora_config,
    # The name of the field in the dataset that contains the text data
    dataset_text_field="text",
    # The maximum sequence length for input data. Inputs longer than this will be truncated.
    max_seq_length=1024,
    # The tokenizer used for processing the text data into tokens that the model can understand
    tokenizer=tokenizer,
    # The training arguments that define various training parameters and configurations
    args=training_arguments,
)

# Start the training process using the specified trainer instance
trainer.train()

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss
13,1.6131,1.706264
26,1.668,1.671105
39,1.6477,1.663563
52,1.8219,1.661522


TrainOutput(global_step=62, training_loss=1.703748187711162, metrics={'train_runtime': 6276.621, 'train_samples_per_second': 0.159, 'train_steps_per_second': 0.01, 'total_flos': 3.477409070560051e+16, 'train_loss': 1.703748187711162, 'epoch': 0.992})

In [13]:
peft_model_path="./peft-dialogue-summary"

trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

('./peft-dialogue-summary/tokenizer_config.json',
 './peft-dialogue-summary/special_tokens_map.json',
 './peft-dialogue-summary/tokenizer.model',
 './peft-dialogue-summary/added_tokens.json',
 './peft-dialogue-summary/tokenizer.json')

### INFERENCE

In [14]:
from transformers import TextStreamer

model.config.use_cache = True
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
          

In [17]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

peft_model_dir = "peft-dialogue-summary"

# load base LLM model and tokenizer
trained_model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_dir,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True
)

tokenizer = AutoTokenizer.from_pretrained(peft_model_dir)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [18]:
index = 51

dialogue = train_data['article'][index][:10000]
summary = train_data['highlights'][index]

prompt = f"""
Summarize the following conversation.

### Input:
{dialogue}

### Summary:
"""

input_ids = tokenizer(prompt, return_tensors='pt',truncation=True).input_ids.cuda()
outputs = trained_model.generate(input_ids=input_ids, max_new_tokens=200, )
output= tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'TRAINED MODEL GENERATED TEXT :\n{output}')


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

### Input:
You might expect polar bears, the Artic Circle's apex predators, to be dab hands at dancing on ice. But as this specimen in Svalbard shows, even after thousands of years of evolutionary adaptation, some still suffer from two-left feet on the frozen ocean. Heinrich Eggenfellner, a 49-year-old videographer from Norway, said: 'I have encountered polar bears many times every year since I live up here and am used to them. 'This episode, however, was extraordinary.' Born slippy: A polar walks across thin sea ice in Svalbard, Norway, where it was caught on camera looking very unsteady on its feet as it made its way across the slippery surface . Spreading its weight: The polar bear does its best not to collapse through the fragile ice by spreading out . In its element: The beast finally gave up and pushed a hole in the ice to dive 