<a href="https://colab.research.google.com/github/Yogesh914/dpo_and_sd/blob/main/dpo_with_sd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Applying DPO To Improve Speculative Decoding 🏃💨

## Set-Up Environment

In [1]:
!pip install -q torch transformers accelerate bitsandbytes trl datasets peft sentencepiece wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m77.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.0/102.0 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━

In [2]:
!pip install --upgrade transformers accelerate bitsandbytes

Collecting transformers
  Downloading transformers-4.40.1-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.40.0
    Uninstalling transformers-4.40.0:
      Successfully uninstalled transformers-4.40.0
Successfully installed transformers-4.40.1


## Baseline Implementation

In [1]:
import os
import gc
import time
import torch
import wandb
import numpy as np
import transformers
import pandas as pd
import bitsandbytes as bnb
from trl import DPOTrainer
import torch.nn.functional as F
from google.colab import userdata
from datasets import Dataset, load_dataset
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, set_seed

access_token = userdata.get('HF_TOKEN')

In [2]:
quantization_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

In [5]:
def generate_with_time(model, inputs):
    start_time = time.time()
    outputs = model.generate(**inputs, assistant_model=None, max_new_tokens=500)
    generation_time = time.time() - start_time
    return outputs, generation_time

In [None]:
set_seed(0)
model_name = "google/gemma-7b-it"
prompt = "Tell me about gravity"

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config, token=access_token, do_sample=False)
model.config.use_cache = True
tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
model_inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
output, gen_time = generate_with_time(model, model_inputs)

print(gen_time)
print(tokenizer.decode(output[0], skip_special_tokens=True))

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

21.877572536468506
Tell me about gravity.

Gravity is a fundamental force of nature that acts between objects with mass. It is the force that pulls objects towards each other. The greater the mass of an object, the greater its gravitational pull.

**Here are some key points about gravity:**

* **Force:** Gravity is a force, like the force of friction or the force of magnetism. It is a push or pull between objects.
* **Mass:** Gravity is directly related to an object's mass. The greater the mass, the greater the gravitational pull.
* **Attraction:** Gravity attracts objects to each other. It is the force that pulls objects towards each other.
* **Inertial Motion:** Gravity opposes the motion of objects in space. It is the force that keeps objects in their current position.
* **Universal Constant:** The gravitational force between two objects is proportional to the product of their masses and inversely proportional to the square of the distance between them. This is known as the universa

## Testing Speculative Decoding

In [None]:
def assisted_generate_with_time(model, inputs, assistant_model):
    start_time = time.time()
    outputs = model.generate(**inputs, assistant_model=assistant_model, num_assistant_tokens=8, prompt_lookup_num_tokens=10, max_new_tokens=500)
    generation_time = time.time() - start_time
    return outputs, generation_time

In [None]:
assistant_model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", token=access_token).to("cuda")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
set_seed(0)
prompt = "Tell me about gravity"
model = AutoModelForCausalLM.from_pretrained("google/gemma-7b-it", quantization_config=quantization_config, token=access_token)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b-it", token=access_token)
model_inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
output, gen_time = assisted_generate_with_time(model, model_inputs, assistant_model)

print(gen_time)
print(tokenizer.decode(output[0], skip_special_tokens=True))

21.721193552017212
Tell me about gravity.

Gravity is a fundamental force of nature that acts between objects with mass. It is the force that pulls objects towards each other. The greater the mass of an object, the greater its gravitational pull.

**Here are some key points about gravity:**

* **Force:** Gravity is a force, which means it can be measured in units such as newtons (N).
* **Mass:** Gravity is proportional to mass, meaning that objects with greater mass experience a greater force of gravity.
* **Attraction:** Gravity is an attractive force, meaning that objects attract each other.
* **Direction:** Gravity pulls objects towards each other in a straight line.
* **Acceleration:** Gravity causes objects to accelerate towards the ground.
* **Universal gravitation:** Gravity is a universal force, meaning that it affects all objects with mass, regardless of their size or composition.

**Here are some examples of gravity in action:**

* The Earth's gravity pulls objects towards it

In [None]:
output.assistant_tokens

AttributeError: 'Tensor' object has no attribute 'assistant_tokens'

## Testing DPO

In [None]:
assistant_model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", token=access_token).to("cuda")
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it", token=access_token)

dataset = {
    "prompt": [
        "hello",
        "how are you",
        "What is your name?",
        "What is your name?",
        "Which is the best programming language?",
        "Which is the best programming language?",
        "Which is the best programming language?",
    ],
    "chosen": [
        "hi nice to meet you",
        "I am fine",
        "My name is Mary",
        "My name is Mary",
        "Python",
        "Python",
        "Java",
    ],
    "rejected": [
        "leave me alone",
        "I am not fine",
        "Whats it to you?",
        "I dont have a name",
        "Javascript",
        "C++",
        "C++",
    ],
}

dataset = Dataset.from_dict(dataset)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
new_model = "dpo_gemma"

# LoRA configuration
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj']
)

assistant_model.config.use_cache = False

# Training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    max_steps=50,
    save_strategy="no",
    logging_steps=1,
    output_dir=new_model,
    optim="paged_adamw_32bit",
    warmup_steps=10,
    bf16=True,
    report_to="wandb",
)

# Create DPO trainer
dpo_trainer = DPOTrainer(
    assistant_model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    peft_config=peft_config,
    beta=0.1,
    max_prompt_length=1024,
    max_length=1000,
)

# Fine-tune model with DPO
dpo_trainer.train()



Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
1,0.3466
2,0.3466
3,0.3442
4,0.3435
5,0.3317
6,0.3107
7,0.2755
8,0.2331
9,0.1804
10,0.1312


TrainOutput(global_step=50, training_loss=0.06298887740122154, metrics={'train_runtime': 27.2695, 'train_samples_per_second': 29.337, 'train_steps_per_second': 1.834, 'total_flos': 0.0, 'train_loss': 0.06298887740122154, 'epoch': 50.0})

In [None]:
dpo_trainer.model.save_pretrained("final_checkpoint")
tokenizer.save_pretrained("final_checkpoint")

# Flush memory
del dpo_trainer, assistant_model
gc.collect()
torch.cuda.empty_cache()

assistant_model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", token=access_token, return_dict=True).to("cuda")

tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b-it")

# Merge base model with the adapter
model = PeftModel.from_pretrained(assistant_model, "final_checkpoint")
model = model.merge_and_unload()

# Save model and tokenizer
model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

('dpo_gemma/tokenizer_config.json',
 'dpo_gemma/special_tokens_map.json',
 'dpo_gemma/tokenizer.model',
 'dpo_gemma/added_tokens.json',
 'dpo_gemma/tokenizer.json')

In [None]:
# Format prompt
prompt = "how are you"
tokenizer = AutoTokenizer.from_pretrained(new_model)
model_inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**model_inputs, max_new_tokens=10)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

how are you doing?

I am doing well, thank you


## Creating dataset for Applied DPO

In [6]:
dataset = load_dataset("tatsu-lab/alpaca", split="train")
dataset = dataset.select(range(100))

Downloading readme:   0%|          | 0.00/7.47k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [7]:
df = pd.DataFrame(dataset)
df

Unnamed: 0,instruction,input,output,text
0,Give three tips for staying healthy.,,1.Eat a balanced diet and make sure to include...,Below is an instruction that describes a task....
1,What are the three primary colors?,,"The three primary colors are red, blue, and ye...",Below is an instruction that describes a task....
2,Describe the structure of an atom.,,"An atom is made up of a nucleus, which contain...",Below is an instruction that describes a task....
3,How can we reduce air pollution?,,There are a number of ways to reduce air pollu...,Below is an instruction that describes a task....
4,Describe a time when you had to make a difficu...,,I had to make a difficult decision when I was ...,Below is an instruction that describes a task....
...,...,...,...,...
95,Translate the following phrase into French.,I miss you,Je te manque.,"Below is an instruction that describes a task,..."
96,Explain what an API is.,,API stands for Application Programming Interfa...,Below is an instruction that describes a task....
97,Compute the area of a rectangle with length 10...,,The area of the rectangle is 50 cm2.,Below is an instruction that describes a task....
98,Find the capital of Spain.,,The capital of Spain is Madrid.,Below is an instruction that describes a task....


In [None]:
df['instruction'] = df['instruction'] + df['input'].apply(lambda x: ' ' + x if x else '')
df = df[['instruction']]

In [None]:
df.to_csv("/content/drive/MyDrive/data/dpo/dpo100.csv")

In [9]:
df = pd.read_csv("/content/drive/MyDrive/data/dpo/dpo100.csv")
tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b-it")

# Function to generate answers using the model
def generate_answer(model, instruction):
    inputs = tokenizer(instruction, return_tensors="pt").to("cuda")
    output = model.generate(**inputs, max_length=100, num_return_sequences=1)
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    return answer

# Process the dataset with gemma-7b model
def process_dataset_7b(batch):
    instructions = batch['instruction']
    chosen_answers = []
    for instruction in instructions:
        chosen_answers.append(generate_answer(model_7b, instruction))
    return {'prompt': instructions, 'chosen': chosen_answers}

# Process the dataset with gemma-2b model
def process_dataset_2b(batch):
    instructions = batch['prompt']
    rejected_answers = []
    for instruction in instructions:
        rejected_answers.append(generate_answer(model_2b, instruction))
    return {'prompt': instructions, 'chosen': batch['chosen'], 'rejected': rejected_answers}


# Process the dataset with gemma-7b model
model_7b = AutoModelForCausalLM.from_pretrained("google/gemma-7b-it", token=access_token).to("cuda")
processed_dataset_7b = df.to_dict(orient='list')
processed_dataset_7b = process_dataset_7b(processed_dataset_7b)
del model_7b
torch.cuda.empty_cache()

# Process the dataset with gemma-2b model
model_2b = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", token=access_token).to("cuda")
processed_dataset = process_dataset_2b(processed_dataset_7b)
del model_2b
torch.cuda.empty_cache()

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
pd.DataFrame(processed_dataset)

Unnamed: 0,prompt,chosen,rejected
0,Give three tips for staying healthy.,Give three tips for staying healthy.\n\n1. Eat...,Give three tips for staying healthy.\n\n1. Eat...
1,What are the three primary colors?,"What are the three primary colors?\n\na. Red, ...",What are the three primary colors?\n\nThe thre...
2,Describe the structure of an atom.,"Describe the structure of an atom.\n\nSure, he...",Describe the structure of an atom.\n\nAn atom ...
3,How can we reduce air pollution?,How can we reduce air pollution?\n\n**1. Reduc...,How can we reduce air pollution?\n\n**1. Reduc...
4,Describe a time when you had to make a difficu...,Describe a time when you had to make a difficu...,Describe a time when you had to make a difficu...
...,...,...,...
95,Translate the following phrase into French. I ...,Translate the following phrase into French. I ...,Translate the following phrase into French. I ...
96,Explain what an API is.,Explain what an API is.\n\nAn API (Application...,Explain what an API is.\n\nAn API (Application...
97,Compute the area of a rectangle with length 10...,Compute the area of a rectangle with length 10...,Compute the area of a rectangle with length 10...
98,Find the capital of Spain.,Find the capital of Spain.\n\nThe answer is: M...,Find the capital of Spain.\n\nThe capital of S...


In [None]:
pd.DataFrame(processed_dataset).to_csv("/content/drive/MyDrive/data/dpo/dpo_fin_100.csv")

In [15]:
processed_dataset = pd.read_csv("/content/drive/MyDrive/data/dpo/dpo_fin_100.csv")
processed_dataset['chosen'][2]

'Describe the structure of an atom.\n\nSure, here is the structure of an atom:\n\n**1. Nucleus:**\n- The nucleus is the center of the atom and contains protons and neutrons.\n- Protons have a positive electric charge, while neutrons have no electric charge.\n- The number of protons in an atom determines its atomic number, which identifies the element.\n\n**2. Electrons:**\n- Electrons orbit the nucleus in shells or energy levels.\n- The'

In [23]:
processed_dataset = pd.read_csv("/content/drive/MyDrive/data/dpo/dpo_fin_100.csv")

# Function to format the dataset
def chatml_format(batch):
    formatted_examples = []
    for i in range(len(batch['prompt'])):
        # Format instruction
        message = {"role": "user", "content": batch['prompt'][i]}
        prompt = tokenizer.apply_chat_template([message], tokenize=False, add_generation_prompt=True)

        # Format chosen answer
        chosen = batch['chosen'][i] + "<|im_end|>\n"

        # Format rejected answer
        rejected = batch['rejected'][i] + "<|im_end|>\n"

        formatted_examples.append({
            "prompt": prompt,
            "chosen": chosen,
            "rejected": rejected,
        })
    return formatted_examples


# Format the dataset
formatted_dataset = chatml_format(processed_dataset)

processed_dataset["chosen"][0]

'Give three tips for staying healthy.\n\n1. Eat a balanced diet.\n2. Get regular exercise.\n3. Get enough sleep.'

# Applying DPO

In [12]:
assistant_model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", token=access_token).to("cuda")
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it", token=access_token)

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [13]:
dataset = Dataset.from_dict(pd.read_csv("/content/drive/MyDrive/data/dpo/dpo_fin_100.csv").to_dict('series'))

In [14]:
import warnings
warnings.filterwarnings('ignore')

new_model = "dpo_gemma"

# LoRA configuration
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj']
)

assistant_model.config.use_cache = False

# Training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    max_steps=50,
    save_strategy="no",
    logging_steps=1,
    output_dir=new_model,
    optim="paged_adamw_32bit",
    warmup_steps=10,
    bf16=True,
    report_to="wandb",
)

# Create DPO trainer
dpo_trainer = DPOTrainer(
    assistant_model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    peft_config=peft_config,
    beta=0.1,
    max_prompt_length=1024,
    max_length=1000,
)

# Fine-tune model with DPO
dpo_trainer.train()

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
1,0.6931
2,0.6931
3,0.6953
4,0.6926
5,0.6922
6,0.684
7,0.6741
8,0.6558
9,0.6031
10,0.5746


TrainOutput(global_step=50, training_loss=0.2354881042987108, metrics={'train_runtime': 92.9067, 'train_samples_per_second': 8.611, 'train_steps_per_second': 0.538, 'total_flos': 0.0, 'train_loss': 0.2354881042987108, 'epoch': 8.0})

In [15]:
dpo_trainer.model.save_pretrained("/content/drive/MyDrive/data/dpo/dpo100_checkpoint")
tokenizer.save_pretrained("/content/drive/MyDrive/data/dpo/dpo100_checkpoint")

# Flush memory
del dpo_trainer, assistant_model
gc.collect()
torch.cuda.empty_cache()

In [17]:
base = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", token=access_token, return_dict=True).to("cuda")

tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b-it")

# Merge base model with the adapter
assistant_model = PeftModel.from_pretrained(base, "/content/drive/MyDrive/data/dpo/dpo100_checkpoint")
assistant_model = assistant_model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [18]:
# Save model and tokenizer
assistant_model.save_pretrained(f"/content/drive/MyDrive/data/dpo/{new_model}")
tokenizer.save_pretrained(f"/content/drive/MyDrive/data/dpo/{new_model}")

('/content/drive/MyDrive/data/dpo/dpo_gemma/tokenizer_config.json',
 '/content/drive/MyDrive/data/dpo/dpo_gemma/special_tokens_map.json',
 '/content/drive/MyDrive/data/dpo/dpo_gemma/tokenizer.model',
 '/content/drive/MyDrive/data/dpo/dpo_gemma/added_tokens.json',
 '/content/drive/MyDrive/data/dpo/dpo_gemma/tokenizer.json')

In [22]:
prompt = "Give three tips for staying healthy."
model = AutoModelForCausalLM.from_pretrained("/content/drive/MyDrive/data/dpo/dpo_gemma", token=access_token).to("cuda")
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/data/dpo/dpo_gemma")
model_inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

output, gen_time = generate_with_time(model, model_inputs)

print(gen_time)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

1.5488171577453613
Give three tips for staying healthy.

Sure, here are three tips for staying healthy:

1. Eat a balanced diet rich in fruits, vegetables, and whole grains.
2. Get regular exercise, at least 30 minutes most days of the week.
3. Maintain a healthy weight and avoid smoking.


In [None]:
assistant_model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", token=access_token).to("cuda")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
prompt = "Give three tips for staying healthy."
model = AutoModelForCausalLM.from_pretrained("google/gemma-7b-it", quantization_config=quantization_config, token=access_token)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b-it", token=access_token)
model_inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

output, gen_time = assisted_generate_with_time(model, model_inputs, assistant_model)

print(gen_time)
print(tokenizer.decode(output[0], skip_special_tokens=True))

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

3.945812463760376
Give three tips for staying healthy.

1. Eat a balanced diet.
2. Get regular exercise.
3. Practice healthy habits, such as washing your hands frequently and getting enough sleep.


# New Implementation

In [4]:
# target_model.save_pretrained("/content/drive/MyDrive/models/gemma7b", from_pt=True)
# draft_model.save_pretrained("/content/drive/MyDrive/models/gemma2b", from_pt=True)

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

target_model_id = "/content/drive/MyDrive/model5/gemma7b"
draft_model_id = "/content/drive/MyDrive/model5/gemma2b"

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it", token=access_token)

target_model = AutoModelForCausalLM.from_pretrained(target_model_id)
draft_model = AutoModelForCausalLM.from_pretrained(draft_model_id).to(device)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [80]:
x_draft_ids = tokenizer.encode("Tell me about gravity", return_tensors='pt').to("cuda")

draft_model(torch.cat([x_draft_ids, torch.tensor([[235265]]).to(device)], dim=-1)).logits[:,-1].argmax(dim=1).cpu().detach().numpy()

array([109])

In [9]:
t = draft_model(x_draft_ids).logits[:,-1].softmax(dim=1).topk(dim=1, k=1)[0][0].cpu().detach().numpy()
p = []
p.append(t[0])
p

[0.69325334]

In [10]:
t = draft_model(x_draft_ids).logits[:,-1].softmax(dim=1)
x = torch.cat((t, t), dim=0)
x.topk(dim=1, k=1)

torch.return_types.topk(
values=tensor([[0.6933],
        [0.6933]], device='cuda:0', grad_fn=<TopkBackward0>),
indices=tensor([[235265],
        [235265]], device='cuda:0'))

In [37]:
x_draft_ids = tokenizer.encode("Tell me about gravity", return_tensors='pt').to("cuda")
padded_tensor = F.pad(x_draft_ids, (0, 10), 'constant', 0)

t = draft_model(padded_tensor, attention_mask=attention_mask).logits[:,-1].softmax(dim=1)
t.topk(dim=1, k=1)

torch.return_types.topk(
values=tensor([[0.9091]], device='cuda:0', grad_fn=<TopkBackward0>),
indices=tensor([[109]], device='cuda:0'))

In [42]:
padded_tensor = F.pad(x_draft_ids, (0, 10), 'constant', 0)
attention_mask = padded_tensor != 0

t = target_model(padded_tensor).logits[:,-1].softmax(dim=1)
t.topk(dim=1, k=5)

torch.return_types.topk(
values=tensor([[0.8161, 0.0522, 0.0192, 0.0149, 0.0149]], device='cuda:0',
       grad_fn=<TopkBackward0>),
indices=tensor([[109, 108, 714, 651, 110]], device='cuda:0'))

In [19]:
draft_model(padded_tensor, attention_mask=attention_mask).logits[:,-1].softmax(dim=1)
x.topk(dim=1, k=1)

tensor([[-28.5772,  -6.4410, -30.2168,  ..., -20.4414, -21.3014, -27.6076]],
       device='cuda:0', grad_fn=<SelectBackward0>)

In [11]:
x = tokenizer.encode("hello there", return_tensors='pt')
x

tensor([[    2, 17534,  1104]])

In [15]:
tokenizer.batch_decode(x)

['<bos>hello there']

In [13]:
x_draft_ids = tokenizer.encode("Tell me about gravity", return_tensors='pt').to("cuda")
len(x_draft_ids[0])

5

## Final Implementation (GPU Issues Fixed)

In [4]:
def sample(x, probs, i):
    f = torch.abs(probs[0][i] - x)
    result = torch.min(f, dim=0, keepdim=False)
    return probs[1][i][result.indices.item()]

def speculative_sampling(x_ids, draft_model, target_model, tokenizer, N, K):
    with torch.no_grad():
        n = len(x_ids[0])
        T = len(x_ids[0]) + N

        total_tokens = 0
        accepted_tokens = 0

        draft_sequences = x_ids.clone()
        x_draft_ids = x_ids

        while n < T:
            p = []

            n2 = len(x_draft_ids[0])
            # Step 1: auto-regressive decode K tokens from draft model
            for i in range(K):
                next_logit = draft_model(x_draft_ids).logits[:, -1]
                draft_probs = next_logit.softmax(dim=1)
                p.append(draft_probs.topk(dim=1, k=1)[0][0].cpu().detach().numpy()[0])

                next_token = next_logit.argmax(dim=1)

                x_draft_ids = torch.cat([x_draft_ids, next_token.unsqueeze(0)], dim=-1)
                new_sequence = x_draft_ids[:, :n2 + i + 1].clone()
                padding = draft_sequences.size(1) - new_sequence.size(1)
                new_sequence = torch.nn.functional.pad(new_sequence, (0, padding), mode='constant', value=0)
                draft_sequences = torch.cat([draft_sequences, new_sequence], dim=0)


            # Step 2: target model forward passes on x_draft
            target_logits = target_model(draft_sequences[:, :n2 + K]).logits
            next_token = target_logits[:, -1].argmax(dim=1)
            q = target_logits[:, -1].softmax(dim=1).topk(dim=1, k=1)[0].squeeze().cpu().numpy()
            target_probs = target_logits[:, -1].softmax(dim=1).topk(dim=1, k=10)

            # Step 3: append draft tokens based on rejection criterion and resample
            all_accepted = True
            for t in range(K):
                j = x_draft_ids[0][n2 + t - 1].item()
                total_tokens += 1
                rand = np.random.random()
                if rand < min(1, q[t] / p[t]):  # accepted
                    x_ids = torch.cat([x_ids, torch.tensor([[j]], device=device)], dim=-1)
                    accepted_tokens += 1
                    n += 1
                else:  # rejected
                    resampled_token = sample(abs(q[t] - p[t]), target_probs, t)
                    x_ids = torch.cat([x_ids, torch.tensor([[resampled_token]], device=device)], dim=-1)
                    n += 1
                    all_accepted = False
                    break

            # Step 4: if all draft tokens were accepted, sample a final token
            if all_accepted:
                final_token = next_token[-1]
                x_ids = torch.cat([x_ids, torch.tensor([[final_token]], device=device)], dim=-1)
                total_tokens += 1
                accepted_tokens += 1
                n += 1

        acceptance_rate = accepted_tokens / total_tokens
        return tokenizer.batch_decode(x_ids), acceptance_rate

In [5]:
x = "How can we reduce air pollution?"
N = 100
K = 5
x_ids = tokenizer.encode(x, return_tensors='pt').to(device)

In [6]:
start_time = time.time()
generated_text, acceptance_rate = speculative_sampling(x_ids, draft_model, target_model, tokenizer, N, K)
generation_time = time.time() - start_time
print("Generated Text:", generated_text)
print("Acceptance Rate:", acceptance_rate)
print("Time:", generation_time)

Generated Text: ['<bos>How can we reduce air pollution??\n\n**1.\n\n Reduce vehicle emissions:**\n\n\n\n* Use public transportation,\n\n carpool, or ride\n\n a bike.\n*\n\n Avoid driving in congested areas\n\n.\n vehicle regularly.\n Use alternative fuels, such\n\n as biodiesel or electric vehicles\n\n.\n\n**2.\n\n Improve energy efficiency:**\n* Use energy-efficient\n\n appliances and light bulbs\n\n* Turn off lights\n\n and electronics when not in\n\n use.\n* Reduce\n\n heating and cooling usage.\n\n']
Acceptance Rate: 0.96
Time: 5.779980182647705


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
draft_model_dpo_id = "/content/drive/MyDrive/data/dpo/dpo_gemma"
target_model_id = "/content/drive/MyDrive/model5/gemma7b"


dpo_tokenizer = AutoTokenizer.from_pretrained(draft_model_dpo_id)
draft_model_dpo = AutoModelForCausalLM.from_pretrained(draft_model_dpo_id).to(device)
target_model = AutoModelForCausalLM.from_pretrained(target_model_id)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
x = "How can we reduce air pollution?"
N = 100
K = 5
x_ids = dpo_tokenizer.encode(x, return_tensors='pt').to(device)

In [7]:
start_time = time.time()
generated_text, acceptance_rate = speculative_sampling(x_ids, draft_model_dpo, target_model, dpo_tokenizer, N, K)
generation_time = time.time() - start_time
print("Generated Text:", generated_text)
print("Acceptance Rate:", acceptance_rate)
print("Time:", generation_time)

Generated Text: ['<bos>How can we reduce air pollution??\n\n**Reducing Emissions\n\n:**\n\n* **Reduce\n\n energy consumption:** Use energy\n\n-efficient appliances, turn\n\n off lights when not in\n\n use, and adjust thermostat\n\n settings.\n* **\n\nSwitch to renewable energy sources\n\n:** Invest in solar panels\n\n, wind turbines, or\n\n other clean energy sources.\n\n\n* **Reduce industrial\n\n emissions:** Implement cleaner production\n\n technologies, use less fuel\n\n, and adopt cleaner burning\n\n fuels.\n* **\n\nReduce transportation emissions:** Use\n\n']
Acceptance Rate: 1.0
Time: 5.361500263214111


In [None]:
prompt = "Tell me about gravity"
model_inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

In [17]:
model_inputs = dpo_tokenizer.encode(x, return_tensors='pt').to(device)
outputs = draft_model_dpo.generate(model_inputs, max_new_tokens=200)
print(dpo_tokenizer.decode(outputs[0], skip_special_tokens=True))

How can we reduce air pollution?

**Reducing Emissions:**

* **Reduce energy consumption:** Use energy-efficient appliances, turn off lights when not in use, and adjust thermostat settings.
* **Switch to renewable energy sources:** Invest in solar panels, wind turbines, or other clean energy sources.
* **Reduce industrial emissions:** Implement cleaner production technologies, use less fuel, and adopt cleaner burning fuels.
* **Reduce transportation emissions:** Use public transportation, carpool, or bike to reduce reliance on fossil fuels.

**Improving Infrastructure:**

* **Improve public transportation:** Expand bus routes, create bike lanes, and invest in efficient transit systems.
* **Install air quality monitoring stations:** Monitor air quality in different areas and communities.
* **Plant trees and vegetation:** Trees absorb pollutants and contribute to cleaner air.
* **Reduce industrial emissions:** Invest in cleaner technologies and implement stricter emission standards.

**C

In [16]:
model_inputs = tokenizer.encode(x, return_tensors='pt').to(device)
start_time = time.time()
outputs = target_model.generate(model_inputs, max_new_tokens=100)
generation_time = time.time() - start_time

generation_time

6.67097020149231

In [47]:
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Tell me about gravity.

Gravity is a fundamental force of nature that acts between objects with mass. It is the force that pulls objects towards each other. The greater the mass of an object, the greater its gravitational pull.

**Here are some key points about gravity:**

* **Force:** Gravity is a force, which means it can be measured in units such as newtons (N).
* **Mass:** Gravity is directly related to an object's mass. The greater the mass, the greater the


In [49]:
print(generated_text[0])

<bos>Tell me about gravity.

Gravity is a force that attracts objects and mass. It is one of the four fundamental forces. nature and along with the strong nuclear force, the electromagnetic force. and the weak nuclear force. Gravity is responsible for. formation of stars, galaxies, and planets, and. is also responsible for the motion of objects in the. system.

**Key points about gravity:**

. Gravity is a force that acts between any two objects. mass.
* The force of gravity is proportional.


## Draft Implementation (GPU Issues)

In [4]:
def sample(x, probs, i):
    f = torch.abs(probs[0][i] - x)
    result = torch.min(f, dim=0, keepdim=False)
    return probs[1][i][result.indices.item()]

def speculative_sampling(x, draft_model, target_model, tokenizer, N, K):
  with torch.no_grad():
    device = next(draft_model.parameters()).device
    x_ids = tokenizer.encode(x, return_tensors='pt').to(device)

    n = len(x_ids[0])
    T = len(x_ids[0]) + N

    total_tokens = 0
    accepted_tokens = 0
    p = []
    q = []

    draft_sequences = [x_ids]
    x_draft_ids = x_ids

    while n < T:
        print(n)
        n2 = len(x_ids[0])

        # Step 1: auto-regressive decode K tokens from draft model
        for i in range(K):
            next_logit = draft_model(x_draft_ids).logits[:,-1]

            draft_probs = next_logit.softmax(dim=1)
            p.append(draft_probs.topk(dim=1, k=1)[0][0].cpu().detach().numpy()[0])

            next_token = next_logit.argmax(dim=1)
            x_draft_ids = torch.cat([x_draft_ids, next_token.unsqueeze(0)], dim=-1)
            draft_sequences.append(x_draft_ids)

        max_length = max(t.size(1) for t in draft_sequences)

        # Pad each tensor to the maximum length and collect them into a new list
        padded_tensors = [torch.nn.functional.pad(t, (0, max_length - t.size(1))) for t in draft_sequences]

        # Stack the padded tensors into a single tensor
        stacked_tensor = torch.stack(padded_tensors, dim=0)
        attention_mask = stacked_tensor.squeeze() != 0

        # Step 2: target model forward passes on x_draft
        target_logits = target_model(stacked_tensor.squeeze()).logits
        next_token = target_logits[:,-1].argmax(dim=1)
        q = (target_logits[:,-1].softmax(dim=1).topk(dim=1, k=1)[0].cpu().detach().numpy()).squeeze()
        target_probs = (target_logits[:,-1].softmax(dim=1).topk(dim=1, k=10))

        # Step 3: append draft tokens based on rejection criterion and resample
        all_accepted = True
        for t in range(K):
            j = x_draft_ids[0][n2+t]

            total_tokens += 1
            if np.random.random() < min(1, q[t] / p[t]):  # accepted
                x_ids = torch.cat([x_ids, torch.tensor([[j]]).to(device)], dim=-1)
                accepted_tokens += 1
                n += 1
            else:  # rejected
                resampled_token = sample(abs(q[t] - p[t]), target_probs, t)
                x_ids = torch.cat([x_ids, torch.tensor([[resampled_token]]).to(device)], dim=-1)
                n += 1
                all_accepted = False
                break

        # Step 4: if all draft tokens were accepted, sample a final token
        if all_accepted:
            final_token = next_token[-1]
            x_ids = torch.cat([x_ids, torch.tensor([[final_token]]).to(device)], dim=-1)
            total_tokens += 1
            accepted_tokens += 1
            n += 1

    acceptance_rate = accepted_tokens / total_tokens
    return tokenizer.batch_decode(x_ids), acceptance_rate