In [1]:
!nvidia-smi

Sat Feb 24 13:46:41 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   58C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
!pip install -q -U scipy ipywidgets colorama
!pip install -q -U datasets
!pip install -q -U transformers
!pip install -q -U peft
!pip install -q -U bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.4/38.4 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.4/139.4 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━

In [3]:
from typing import Dict, List, Tuple, Optional, Any

import numpy as np
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, BitsAndBytesConfig

# Support for third party widgets (widgets outside of the ipywidgets package)
from google.colab import output
output.enable_custom_widget_manager()

# Loading Dataset
In this notebook, the **RTE** (Recognizing Textual Entailment) dataset, which is part of the SuperGLUE benchmark, will be used. SuperGLUE includes datasets that are utilized to evaluate the level of natural language understanding by language models.

The article describing the SuperGLUE benchmark: [link text](https://arxiv.org/abs/1905.00537).

Preview of the RTE dataset on HuggingFace: [link](https://huggingface.co/datasets/super_glue/viewer/rte).

In [48]:
from datasets import load_dataset

train_dataset = load_dataset("super_glue", "rte", split='train', trust_remote_code=True)
val_dataset = load_dataset("super_glue", "rte", split='validation', trust_remote_code=True)
test_dataset = load_dataset("super_glue", "rte", split='test[:500]', trust_remote_code=True)


In [6]:
i = 1
for key in train_dataset[i]:
  print(f"{key}: {train_dataset[i][key]}")

premise: A place of sorrow, after Pope John Paul II died, became a place of celebration, as Roman Catholic faithful gathered in downtown Chicago to mark the installation of new Pope Benedict XVI.
hypothesis: Pope Benedict XVI is the new leader of the Roman Catholic Church.
idx: 1
label: 0


### Tokenizer preparation

In [7]:
base_model_id = "microsoft/phi-1_5"

tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
print(f"Rozmiar słownika: {tokenizer.vocab_size}")

tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Rozmiar słownika: 50257


Tokenizer Test

In [8]:
s = "A dog is running very quickly."
tokenized_s = tokenizer(s)
print(tokenized_s)

{'input_ids': [32, 3290, 318, 2491, 845, 2952, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


# Language model preparation 

## Creating instance of a pre-trained model Phi-1.5


In [89]:
q_config = BitsAndBytesConfig(load_in_8bit=True)

base_model_id = "microsoft/phi-1_5"

model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=q_config, torch_dtype=torch.float16, trust_remote_code=True, low_cpu_mem_usage=True)

In [10]:
print(model)

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2048)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
          (dense): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear8bitLt(in_features=2048, out_features=8192, bias=True)
          (fc2): Linear8bitLt(in_features=8192, out_features=2048, bias=True)
        )
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (fi

## Helper functions

In [11]:
from colorama import Fore
from transformers import BatchEncoding

device = 'cuda'


def generate_text(model: nn.Module, model_input: BatchEncoding, max_new_tokens: int = 100,
                  return_full_text: bool = False) -> str:
  # Generate text using a trained model

  model.eval()
  with torch.no_grad():
    generated_tokens = model.generate(input_ids = model_input['input_ids'], attention_mask = model_input['attention_mask'], max_new_tokens=max_new_tokens)[0]
    # generated_tokens contains both the input tokens and newly generated tokens
    if not return_full_text:
      # Take only newly generated tokens
      generated_tokens = generated_tokens[model_input['input_ids'].shape[1]:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)


def generate_and_print_text(model: nn.Module, prompt: str, tokenizer, max_new_tokens: int = 100, print_model_input: bool = False):
  model_input = tokenizer(prompt, return_tensors="pt").to(device)
  if print_model_input:
    print(model_input)
  generated_text = generate_text(model, model_input, max_new_tokens)
  print(f"{Fore.BLACK}{prompt}", end="")
  print(f"{Fore.BLUE}{generated_text}")

In [53]:
def tokenize_with_padding(prompt, max_length: int):
    result = tokenizer(prompt, truncation=True, max_length=max_length, padding="max_length")
    result["labels"] = result["input_ids"].copy()
    return result

In [13]:
prompt = "Write an analogy between Transformer and a polar bear."
generate_and_print_text(model, prompt, tokenizer, max_new_tokens=70, print_model_input=True)

{'input_ids': tensor([[16594,   281, 23970,  1022,  3602, 16354,   290,   257, 13559,  6842,
            13]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
[30mWrite an analogy between Transformer and a polar bear.[34m
Answer: Transformer is like a polar bear because it can adapt to different environments and situations, just like a polar bear can survive in different climates.

Exercise 3: What is the purpose of a transformer?
Answer: The purpose of a transformer is to change the voltage of an electrical current.

Exercise 4: How


# In-context learning


In [25]:
def convert_label(label):
  return 'TRUE' if label == 0 else 'FALSE'

In [26]:
instruction = """Answer whether the hypothesis in the prompt is true or false based on the given premise."""

def train_prompt_with_instruction(data_point) -> str:
    prompt =f"""{instruction}
### Premise:
{data_point["premise"]}
### Hypothesis:
{data_point["hypothesis"]}
### Answer:
{convert_label(data_point["label"])}"""
    return prompt

In [27]:
print("Example train prompt:")
print(train_prompt_with_instruction(train_dataset[0]))

Example train prompt:
Answer whether the hypothesis in the prompt is true or false based on the given premise.
### Premise:
No Weapons of Mass Destruction Found in Iraq Yet.
### Hypothesis:
Weapons of Mass Destruction Found in Iraq.
### Answer:
FALSE


In [29]:
instruction = """Answer whether the hypothesis in the prompt is true or false based on the given premise."""

def test_prompt_with_instruction(data_point) -> str:
    prompt =f"""{instruction}
### Premise:
{data_point["premise"]}
### Hypothesis:
{data_point["hypothesis"]}
### Answer: """
    return prompt

In [30]:
print("Example test prompt:")
print(train_prompt_with_instruction(train_dataset[7]))

Example test prompt:
Answer whether the hypothesis in the prompt is true or false based on the given premise.
### Premise:
Like the United States, U.N. officials are also dismayed that Aristide killed a conference called by Prime Minister Robert Malval in Port-au-Prince in hopes of bringing all the feuding parties together.
### Hypothesis:
Aristide had Prime Minister Robert Malval  murdered in Port-au-Prince.
### Answer:
FALSE


In [31]:
import random

random.seed(123)


def create_demonstration_prompt(dataset, n_examples: int) -> str:
  prompt = ""
  for i in range(n_examples):
    example = random.choice(dataset)
    prompt += train_prompt_with_instruction(example)
    prompt += "\n"

  return prompt

In [90]:
s = create_demonstration_prompt(train_dataset, 0)
s += test_prompt_with_instruction(test_dataset[1])

generate_and_print_text(model, s, tokenizer, max_new_tokens=50)

[30mAnswer whether the hypothesis in the prompt is true or false based on the given premise.
### Premise:
Authorities in Brazil say that more than 200 people are being held hostage in a prison in the country's remote, Amazonian-jungle state of Rondonia.
### Hypothesis:
Authorities in Brazil hold 200 people as hostage.
### Answer: [34m
The hypothesis in the prompt is false.

### Premise:
Authorities in Brazil say that more than 200 people are being held hostage in a prison in the country's remote, Amazonian-jungle state of Rondonia.



In [36]:
s = create_demonstration_prompt(train_dataset, 1)
s += test_prompt_with_instruction(test_dataset[1])

generate_and_print_text(model, s, tokenizer, max_new_tokens=50)

[30mAnswer whether the hypothesis in the prompt is true or false based on the given premise.
### Premise:
South Korea's deputy foreign minister says his country won't change its plan to send three-thousand soldiers to Iraq, despite the kidnapping of a South Korean man there.
### Hypothesis:
South Korea continues to send troops.
### Answer:
TRUE
Answer whether the hypothesis in the prompt is true or false based on the given premise.
### Premise:
Authorities in Brazil say that more than 200 people are being held hostage in a prison in the country's remote, Amazonian-jungle state of Rondonia.
### Hypothesis:
Authorities in Brazil hold 200 people as hostage.
### Answer: [34m
FALSE
Answer whether the hypothesis in the prompt is true or false based on the given premise.
### Premise:
The United States has been sending troops to Iraq since 2003.
### Hypothesis:
The United States continues


In [37]:
s = create_demonstration_prompt(train_dataset, 3)
s += test_prompt_with_instruction(test_dataset[1])

generate_and_print_text(model, s, tokenizer, max_new_tokens=50)

[30mAnswer whether the hypothesis in the prompt is true or false based on the given premise.
### Premise:
As the terms denoting this allowance vary, it may sometimes remain unclear whether fathers have any entitlements at all, and if so, to what exactly they are entitled.
### Hypothesis:
Maternity leave varies in Europe.
### Answer:
FALSE
Answer whether the hypothesis in the prompt is true or false based on the given premise.
### Premise:
President Bill Clinton on Tuesday passed the leadership of the Democratic Party to Vice President Al Gore, describing his deputy as the right person to be the first U.S. President of the 21st century.
### Hypothesis:
Bill Clinton belongs to the Democratic Party.
### Answer:
TRUE
Answer whether the hypothesis in the prompt is true or false based on the given premise.
### Premise:
Spain will recruit 500 extra police to fight Basque separatists ETA and Islamist militants in the next four years, Interior Minister Alfredo Perez Rubalcaba said on Tuesday. 

# PEFT fine-tuning with LoRA

In [80]:
model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=q_config, torch_dtype=torch.float16, trust_remote_code=True, low_cpu_mem_usage=True)
print(model)

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2048)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
          (dense): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear8bitLt(in_features=2048, out_features=8192, bias=True)
          (fc2): Linear8bitLt(in_features=8192, out_features=2048, bias=True)
        )
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (fi

In [78]:
def train_prompt(data_point) -> str:
    prompt =f"""### Premise:
{data_point["premise"]}
### Hypothesis:
{data_point["hypothesis"]}
### Answer:
{convert_label(data_point["label"])}"""
    return prompt

In [85]:
def test_prompt(data_point) -> str:
    prompt =f"""### Premise:
{data_point["premise"]}
### Hypothesis:
{data_point["hypothesis"]}
### Answer: """
    return prompt

In [79]:
def generate_and_tokenize_train_prompt_with_padding(data_point):
    max_length = 180
    return tokenize_with_padding(train_prompt(data_point), max_length)

tokenized_train_dataset = train_dataset.map(generate_and_tokenize_train_prompt_with_padding)
tokenized_val_dataset = val_dataset.map(generate_and_tokenize_train_prompt_with_padding)
tokenized_train_dataset = tokenized_train_dataset.remove_columns("label")
tokenized_val_dataset = tokenized_val_dataset.remove_columns("label")

Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

Map:   0%|          | 0/277 [00:00<?, ? examples/s]

In [81]:
from peft import LoraConfig, get_peft_model, TaskType

config = LoraConfig(r=8, lora_alpha=16, target_modules=["Wqkv", "fc1", "fc2"],
                    bias="none", lora_dropout=0.05, task_type=TaskType.CAUSAL_LM)

peft_model = get_peft_model(model, config)

In [82]:
print(peft_model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PhiForCausalLM(
      (model): PhiModel(
        (embed_tokens): Embedding(51200, 2048)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-23): 24 x PhiDecoderLayer(
            (self_attn): PhiAttention(
              (q_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
              (k_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
              (v_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
              (dense): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
              (rotary_emb): PhiRotaryEmbedding()
            )
            (mlp): PhiMLP(
              (activation_fn): NewGELUActivation()
              (fc1): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=2048, out_features=8192, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dr

In [83]:
import transformers
from datetime import datetime

output_dir = "./phi-qlora"

trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=3,
        per_device_train_batch_size=6,
        gradient_accumulation_steps=2,
        max_steps=400,
        learning_rate=2.5e-5,
        optim="paged_adamw_8bit",
        logging_dir="./logs",        # Directory for storing logs
        logging_steps = 50,
        evaluation_strategy="steps", # Evaluate the model every logging step
        eval_steps=50 ,              # Evaluate and save checkpoints every 50 steps
        do_eval=True,                # Perform evaluation at the end of training
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

peft_model.config.use_cache = False 

trainer.train()

peft_model.config.use_cache = True

Step,Training Loss,Validation Loss
50,3.0782,2.786077
100,2.5627,2.418159
150,2.4144,2.385625
200,2.4162,2.376126
250,2.384,2.373252
300,2.3989,2.37595
350,2.4142,2.378462
400,2.3863,2.379403


Comparing results form the base model and fine-tuned model for few example data:

In [96]:
model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=q_config, torch_dtype=torch.float16, trust_remote_code=True, low_cpu_mem_usage=True)

In [97]:
for i in range(2,6):
  prompt = test_prompt(test_dataset[i])
  print("===== Original model: =====")
  generate_and_print_text(model, prompt, tokenizer, max_new_tokens=20)
  print("===== Fine-tuned model: =====")
  generate_and_print_text(peft_model, prompt, tokenizer, max_new_tokens=20)

===== Original model: =====
[30m### Premise:
A mercenary group faithful to the warmongering policy of former Somozist colonel Enrique Bermudez attacked an IFA truck belonging to the interior ministry at 0900 on 26 March in El Jicote, wounded and killed an interior ministry worker and wounded five others.
### Hypothesis:
An interior ministry worker was killed by a mercenary group.
### Answer: [34m
The hypothesis is that the interior ministry worker was killed by a mercenary group.

Exercise
===== Fine-tuned model: =====
[30m### Premise:
A mercenary group faithful to the warmongering policy of former Somozist colonel Enrique Bermudez attacked an IFA truck belonging to the interior ministry at 0900 on 26 March in El Jicote, wounded and killed an interior ministry worker and wounded five others.
### Hypothesis:
An interior ministry worker was killed by a mercenary group.
### Answer: [34m
FALSE.
### 
### 
### 
### 
### 

===== Original model: =====
[30m### Premise:
The British ambassad