# Load model

We will use a quantized model (AWQ) so that it fits in the free Colab tier. vLLM is used to speed up inference.

In [None]:
# Install AutoAWQ and vLLM; -q keeps the pip output quiet.
!pip install -q autoawq
!pip install -q vllm

In [None]:
from vllm import LLM, SamplingParams
import torch
import numpy as np

# Load the AWQ-quantized Mistral-7B-Instruct checkpoint through vLLM in half precision.
# max_model_len=128 limits the context length; the prompts in this notebook are
# single short arithmetic questions (presumably chosen to save GPU memory — confirm).
llm = LLM(model="TheBloke/Mistral-7B-Instruct-v0.2-AWQ",
          quantization='awq',
          dtype='half',
          max_model_len=128)


In [None]:
# temperature=0 with a budget of 8 new tokens: the expected answer is a single number.
sampling_params = SamplingParams(
    temperature=0,
    max_tokens=8,
)

# Two smoke-test prompts: a trivial sum and a larger product.
prompts = [
    "[INST] Return only result witn no explanation: 2 + 2[/INST] = ",
    "[INST] Return only result witn no explanation: 234 * 231 [/INST] = ",
]

outputs = llm.generate(prompts, sampling_params)

# Show each prompt next to the text generated for it.
for output in outputs:
    prompt, generated_text = output.prompt, output.outputs[0].text
    print(f"\nPrompt: {prompt!r}, \nGenerated text: {generated_text!r}")

In [None]:
def clean_output(txt):
    """Parse the first whitespace-separated token of a model reply as a float.

    Thousands separators (``,``) are removed before conversion.

    Args:
        txt: Raw text generated by the model.

    Returns:
        The parsed number as a ``float``, or ``None`` when the reply is empty /
        whitespace-only or its first token is not a valid number.
    """
    tokens = txt.strip().split()
    if not tokens:
        # An empty reply would raise IndexError on ``[0]``; treat it as unparseable.
        return None
    try:
        return float(tokens[0].replace(',', ''))
    except ValueError:
        return None

In [None]:
# Prompt template for every arithmetic query: the instruction is wrapped in
# [INST] ... [/INST] tags and `{inst}` is filled with the expression (e.g. "2 + 2").
# NOTE(review): "witn" is a typo for "with"; it is left untouched here because the
# string is part of the text sent to the model.
PROMPT = "[INST] Return only result witn no explanation: {inst} [/INST] = "

def calculate(dataset, operation):
    """Ask the model to evaluate ``a <operation> b`` for every row of ``dataset``.

    Args:
        dataset: Iterable of ``(a, b, expected)`` triples; ``expected`` is ignored.
        operation: Operator symbol placed between the operands, e.g. ``'+'``.

    Returns:
        A list holding the raw text of the first output of each generation result.
    """
    questions = []
    for a, b, _ in dataset:
        questions.append(PROMPT.format(inst=f'{a} {operation} {b}'))
    completions = llm.generate(questions, sampling_params)
    return [completion.outputs[0].text for completion in completions]

# Generate dataset

In [None]:
def add(a, b):
    """Return the sum of ``a`` and ``b``."""
    total = a + b
    return total


def subtract(a, b):
    """Return ``a`` minus ``b``."""
    difference = a - b
    return difference


def multiply(a, b):
    """Return the product of ``a`` and ``b``."""
    product = a * b
    return product


def divide(a, b):
    """Return ``a / b`` rounded to two decimals, or ``None`` when ``b`` is zero."""
    if b == 0:
        return None  # Division by zero has no result
    quotient = a / b
    return round(quotient, 2)


def generate_dataset(start, end, function):
    """Build ``(i, j, function(i, j))`` triples for every ordered pair in ``[start, end]``.

    Both bounds are inclusive; ``i`` is the outer index and ``j`` the inner one.
    """
    rows = []
    for i in range(start, end + 1):
        for j in range(start, end + 1):
            rows.append((i, j, function(i, j)))
    return rows

## Test addition

In [None]:
# Every (a, b) pair with 1 <= a, b <= 100, together with the true sum.
add_dataset = generate_dataset(1, 100, add)
# Raw model completions, then parsed into numbers.
results_raw = calculate(add_dataset, '+')
results = list(map(clean_output, results_raw))

In [None]:
# Ground truth is the third element of every dataset row.
y_true = np.array([expected for _, _, expected in add_dataset])
results = np.array(results)
# Fraction of predictions that equal the ground truth.
acc = sum(results == y_true) / len(results)

In [None]:
# Display the addition accuracy.
acc

In [None]:
import matplotlib.pyplot as plt

def plot_heatmap(dataset, weights):
    """Plot a 2-D heatmap of ``weights`` binned by the operands ``a`` and ``b``.

    Args:
        dataset: Sequence of ``(a, b, expected)`` triples; only ``a`` and ``b``
            are used.
        weights: One weight per dataset row (e.g. a boolean "prediction was
            correct" mask). Weights that fall into the same bin are summed.
    """
    a_values, b_values, _ = zip(*dataset)

    # Bin edges 1, 11, ..., 101 -> ten bins of width 10
    a_bins = np.arange(1, 110, 10)
    b_bins = np.arange(1, 110, 10)

    # 2D histogram: each bin holds the sum of the weights of its (a, b) pairs
    heatmap, xedges, yedges = np.histogram2d(a_values, b_values, bins=[a_bins, b_bins], weights=weights)

    # histogram2d bins `a` along the first (row) axis, but imshow draws rows
    # along the vertical axis. Transpose so `a` runs horizontally and `b`
    # vertically, matching the axis labels and `extent` below.
    plt.imshow(heatmap.T, extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]], origin='lower', cmap='viridis', aspect='auto')
    plt.colorbar(label='Sum of True values')
    plt.xlabel('a')
    plt.ylabel('b')
    plt.title('Heatmap with Bin Size 10')

    plt.show()

In [None]:
# The boolean "prediction is correct" mask is passed as weights, so each bin
# shows how many additions in it were answered correctly.
plot_heatmap(add_dataset, results==y_true)

## Test multiplication

In [None]:
# Every (a, b) pair with 1 <= a, b <= 100, together with the true product.
mul_dataset = generate_dataset(1, 100, multiply)
# Raw model completions, then parsed into numbers.
results_raw = calculate(mul_dataset, '*')
results = list(map(clean_output, results_raw))

In [None]:
# Ground truth is the third element of every dataset row.
y_true = np.array([expected for _, _, expected in mul_dataset])
results = np.array(results)
# Fraction of predictions that equal the ground truth.
acc = sum(results == y_true) / len(results)

In [None]:
# Display the multiplication accuracy.
acc

In [None]:
# The boolean "prediction is correct" mask is passed as weights, so each bin
# shows how many multiplications in it were answered correctly.
plot_heatmap(mul_dataset, results==y_true)

# Finetune

Fine-tune the model with Unsloth, QLoRA, and DPO.

### Kaggle

In [None]:
%%capture
# Kaggle setup. `%%capture` must stay on the first line of the cell.
# Install the CUDA toolkit, xformers, bitsandbytes and PyTorch (pytorch-cuda 11.8) with mamba.
!mamba install cudatoolkit xformers bitsandbytes pytorch pytorch-cuda=11.8 \
    -c pytorch -c nvidia -c xformers -c conda-forge -y
# Install Unsloth with its Kaggle extra directly from GitHub.
!pip install "unsloth[kaggle] @ git+https://github.com/unslothai/unsloth.git"
# Remove and reinstall `datasets`.
!pip uninstall datasets -y
!pip install datasets

import os
# Sets WANDB_DISABLED, presumably to turn off Weights & Biases logging during training.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# restart after this 
# !pip install bitsandbytes
# !pip install xformers

### Colab

In [None]:
%%capture
# Colab setup: choose the Unsloth extra based on the GPU's CUDA compute capability.
import torch
major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install "unsloth[colab-ampere] @ git+https://github.com/unslothai/unsloth.git"
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install "unsloth[colab] @ git+https://github.com/unslothai/unsloth.git"
pass

In [None]:
# !pip install -q accelerate
# !pip install bitsandbytes

# Finetune dataset

Multiplication examples with a and b in the range 60–100.

In [None]:
import random


def _build_dpo_split(dataset, rng):
    """Turn ``(a, b, product)`` rows into prompt/chosen/rejected columns for DPO.

    Args:
        dataset: Sequence of ``(a, b, product)`` triples.
        rng: ``random.Random`` instance used to pick the error added to each
            rejected answer.

    Returns:
        A dict with the keys ``"prompt"``, ``"chosen"`` and ``"rejected"``,
        each mapping to a list with one string per dataset row.
    """
    # Non-zero offsets, so a rejected answer never equals the correct one.
    offsets = [-5, -4, -3, -2, -1, 1, 2, 3, 4, 5]
    return {
        "prompt": [PROMPT.format(inst=f'{a} * {b}') for a, b, _ in dataset],
        "chosen": [str(product) for _, _, product in dataset],
        "rejected": [str(product + rng.choice(offsets)) for _, _, product in dataset],
    }


# Train on operands 60..100, validate on the unseen operands 101..110.
train_dataset = generate_dataset(60, 100, multiply)
val_dataset = generate_dataset(101, 110, multiply)

# A dedicated, seeded generator keeps the rejected answers reproducible across
# runs without touching the global `random` state.
_dpo_rng = random.Random(0)

dpo_dataset_train = _build_dpo_split(train_dataset, _dpo_rng)
dpo_dataset_eval = _build_dpo_split(val_dataset, _dpo_rng)

In [None]:
from datasets import Dataset

# Convert the plain dicts of columns into `datasets.Dataset` objects; these are
# later passed to the trainer as train and eval datasets.
dpo_dataset_train = Dataset.from_dict(dpo_dataset_train)
dpo_dataset_eval = Dataset.from_dict(dpo_dataset_eval)

## Finetune

In [None]:
# Apply Unsloth's DPOTrainer patch. NOTE(review): presumably this has to run
# before `DPOTrainer` is imported/constructed in the next cell — confirm.
from unsloth import PatchDPOTrainer
PatchDPOTrainer()

In [None]:
# ~22min for 3 epochs
# NOTE(review): num_train_epochs below is 2, so the timing above may be outdated — confirm.
import torch
from transformers import TrainingArguments
from trl import DPOTrainer
from unsloth import FastLanguageModel

# Sequence length passed to the model loader.
max_seq_length = 256

# Load model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = None, # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit = True, # Use 4bit quantization to reduce memory usage. Can be False.
)

# Do model patching and add fast LoRA weights
model = FastLanguageModel.get_peft_model(
    model,
    r = 64,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 64,
    lora_dropout = 0, # Dropout = 0 is currently optimized
    bias = "none",    # Bias = "none" is currently optimized
    use_gradient_checkpointing = True,
)

# Training configuration: 2 epochs, batch size 8 for train and eval, logging every
# 100 steps, no experiment tracker. Exactly one of fp16/bf16 is enabled, depending
# on whether the GPU supports bfloat16.
training_args = TrainingArguments(learning_rate=5e-05,
                                  num_train_epochs=2,
                                  logging_steps=100,
                                  per_device_eval_batch_size=8,
                                  per_device_train_batch_size=8,
                                  warmup_ratio=0.0,
                                  output_dir="./output", 
                                  report_to='none',
                                  fp16=not torch.cuda.is_bf16_supported(),
                                  bf16=torch.cuda.is_bf16_supported(),)

# Build the DPO trainer on the prompt/chosen/rejected datasets. No explicit
# reference model is passed here.
dpo_trainer = DPOTrainer(
    model,
    args=training_args,
    beta=0.1,
    train_dataset=dpo_dataset_train,
    eval_dataset=dpo_dataset_eval,
    tokenizer=tokenizer,
)
dpo_trainer.train()

In [None]:
# Display the full training configuration.
training_args

In [None]:
# Kaggle only: store your Hugging Face access token under Add-ons -> Secrets
# (secret name "hf_token"); it is read here through `kaggle_secrets`.
# If you get an error, remove the outputs saved by the trainer and retry.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("hf_token")

# Alternative save methods (merged 16-bit weights, locally or on the Hub), kept for reference:
# model.save_pretrained_merged("dpo_calc_mistral", tokenizer, save_method = "merged_16bit",)
# model.push_to_hub_merged("adriata/dpo_calc_mistral", tokenizer, save_method = "merged_16bit", token = hf_token)
# Push to the Hub with save_method="lora".
model.push_to_hub_merged("adriata/dpo_calc_mistral", tokenizer, save_method = "lora", token = hf_token)

In [None]:
# Merge the LoRA adapter into a base model and save the merged weights locally.
from peft import PeftModel
from transformers import AutoModelForCausalLM

# NOTE(review): the original cell used these names without defining them.
# `lora_adapter` matches the repo pushed in the previous cell; `base_model` is
# assumed to be the non-quantized Mistral instruct checkpoint — confirm.
base_model = "mistralai/Mistral-7B-Instruct-v0.2"
lora_adapter = "adriata/dpo_calc_mistral"
merged_model_dir = "dpo_calc_mistral_merged"

model_to_merge = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(base_model).to("cuda"),
    lora_adapter,
)

merged_model = model_to_merge.merge_and_unload()
# save_pretrained expects an output directory, not the model object itself.
merged_model.save_pretrained(merged_model_dir)

# Check model 

In [None]:
# Display the tokenizer loaded together with the model.
tokenizer

In [None]:
# Tokenize a single multiplication prompt and move the tensors to the GPU.
inputs = tokenizer([PROMPT.format(inst='22 * 3')], return_tensors="pt").to("cuda")

# Generate at most 12 new tokens and decode the result back to text.
outputs = model.generate(**inputs, max_new_tokens=12, use_cache=True)
tokenizer.batch_decode(outputs)