In [41]:
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
!pip install  --upgrade \
  "transformers[sentencepiece]==4.37.2" \
  "datasets==2.16.1" \
  "accelerate==0.26.1" \
  "evaluate==0.4.1" \
  "bitsandbytes==0.42.0" \
  "trl==0.7.11" \
  "peft==0.8.2" \
  "pillow"

In [4]:
import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'
# install flash-attn
!pip install ninja packaging
!MAX_JOBS=1 pip install flash-attn --no-build-isolation

In [7]:
from huggingface_hub import login

login(
  token=  # ADD YOUR TOKEN HERE
  # add_to_git_credential=True
)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [58]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Load Tokenizer from the hub

model_id = "cognitivecomputations/dolphin-2.1-mistral-7b"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load dataset from the hub
dataset = load_dataset("argilla/ultrafeedback-binarized-preferences-cleaned", split="train")
dataset = dataset.shuffle().select(range(13750))


def rec_extract_assistant_messages(messages, index=-1):
  """Recursively extract the last assistant messages from the end of the conversation."""
  if messages[index]["role"] == "assistant":
    return [messages[index]]
  else:
    return rec_extract_assistant_messages(messages, index-1)

# System message used if there is no system message at the beginning of the conversation
# Can be repelaced and modified as needed
DEFAULT_SYSTEM_MESSAGE = "You are Dolphin, a helpful AI assistant."

def create_triplets(example, tokenizer, default_system_message=DEFAULT_SYSTEM_MESSAGE):
  """Create the triplets (prompt, chosen, rejected)"""
  # Extract the N-1 turns to form the prompt
  # Prepend a system message if the first message is not a system message
  prompt_messages = example["chosen"][:-1]
  if example["chosen"][0]["role"] != "system":
      prompt_messages.insert(0, {"role": "system", "content": default_system_message})
  # Now we extract the final assistant turn to define chosen/rejected responses
  chosen_messages = rec_extract_assistant_messages(example["chosen"])
  rejected_messages = rec_extract_assistant_messages(example["rejected"])

  # apply template to the messages and return the triplets
  return {
    "prompt": tokenizer.apply_chat_template(prompt_messages, tokenize=False),
    "chosen": tokenizer.apply_chat_template(chosen_messages, tokenize=False),
    "rejected": tokenizer.apply_chat_template(rejected_messages, tokenize=False)
  }

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [59]:
len(dataset) # Org

13750

In [60]:
dataset_100 = dataset.select(range(100))

In [61]:
dataset_100 = dataset_100.map(create_triplets, remove_columns=dataset_100.features, fn_kwargs={"tokenizer": tokenizer})
# split dataset into 11,000 training samples and 2,750 test samples
dataset_100 = dataset_100.train_test_split(test_size=0.2)

# print sample cut of
print(dataset_100["train"][0]["prompt"][:50])
print(dataset_100["train"][0]["chosen"][:50])
print(dataset_100["train"][0]["rejected"][:50])

# save datasets to disk
dataset_100["train"].to_json("train_dataset.json", orient="records")
dataset_100["test"].to_json("test_dataset.json", orient="records")

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

<|im_start|>system
You are Dolphin, a helpful AI a
<|im_start|>assistant
As an AI language model, I'm
<|im_start|>assistant
Dear Client,

We appreciate 


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

79930

In [43]:
print(dataset_100["train"][1]["prompt"][:500])
print("\n")
print(dataset_100["train"][1]["chosen"][:500])
print("\n")
print(dataset_100["train"][1]["rejected"][:500])

<|im_start|>system
You are Dolphin, a helpful AI assistant.<|im_end|>
<|im_start|>user
Can you summarize the findings of the experiment comparing the area/velocity method and turbine flow meter method for calculating flow rate? Answer according to: This experiment investigated the area/velocity method and turbine flow meter method of calculating flow rate. Its aim was to compare accuracy and practical usage for each method. Initial hypotheses predicted that the turbine flow meter would have a hi


<|im_start|>assistant
The experiment aimed to compare the accuracy and practical usage of the area/velocity method and the turbine flow meter method for calculating flow rates. It was hypothesized that the turbine flow meter would be more accurate, and the results confirmed this hypothesis.

However, while the turbine flow meter proved to have greater accuracy and is well-suited for applications requiring high accuracy and installations in pipes, such as water supplies or factories, it is not

In [18]:
from datasets import load_dataset

# Load jsonl data from disk
train_dataset = load_dataset("json", data_files="train_dataset.json", split="train")
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [19]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hugging Face model id
model_id = "cognitivecomputations/dolphin-2.1-mistral-7b" # replace with your model id

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    use_cache=False,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left' # to prevent errors with FA
tokenizer.truncation_side = 'left' # to prevent cutting off last generation

config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
prompt_length = 1024
max_seq_length = 1512

In [21]:
from peft import LoraConfig

# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
        lora_alpha=128,
        lora_dropout=0.05,
        r=256,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
)

In [22]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="doplhin-dpo",               # directory to save and repository id
    num_train_epochs=1,                     # number of training epochs
    per_device_train_batch_size=4,          # batch size per device during training - Org 12
    per_device_eval_batch_size=1,           # batch size for evaluation - Org - 4
    gradient_accumulation_steps=1,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    learning_rate=5e-5,                     # 10x higher LR than QLoRA paper
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.1,                       # warmup ratio based on QLoRA paper
    lr_scheduler_type="cosine",             # use cosine learning rate scheduler
    logging_steps=25,                       # log every 25 steps
    save_steps=500,                         # when to save checkpoint
    save_total_limit=2,                     # limit the total amount of checkpoints
    evaluation_strategy="steps",            # evaluate every 1000 steps
    eval_steps=700,                         # when to evaluate
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision
    push_to_hub=False,                      # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
)

dpo_args = {
    "beta": 0.1,                            # The beta factor in DPO loss. Higher beta means less divergence
    "loss_type": "sigmoid"                  # The loss type for DPO.
}

In [23]:
from trl import DPOTrainer

trainer = DPOTrainer(
    model,
    ref_model=None, # set to none since we use peft
    peft_config=peft_config,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    max_length=max_seq_length,
    max_prompt_length=prompt_length,
    beta=dpo_args["beta"],
    loss_type=dpo_args["loss_type"],
)



Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [24]:
# start training, the model will be automatically saved to the hub and the output directory
trainer.train()

# save model at the end of training
trainer.save_model()

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss


In [32]:
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"
# # above used otherwise NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

# !zip -r '/content/doplhin-dpo-zip' '/content/doplhin-dpo'

  adding: content/doplhin-dpo/ (stored 0%)
  adding: content/doplhin-dpo/tokenizer.json (deflated 74%)
  adding: content/doplhin-dpo/README.md (deflated 66%)
  adding: content/doplhin-dpo/runs/ (stored 0%)
  adding: content/doplhin-dpo/runs/Mar25_13-13-41_28c0613a8476/ (stored 0%)
  adding: content/doplhin-dpo/runs/Mar25_13-13-41_28c0613a8476/events.out.tfevents.1711372447.28c0613a8476.1407.0 (deflated 59%)
  adding: content/doplhin-dpo/special_tokens_map.json (deflated 70%)
  adding: content/doplhin-dpo/training_args.bin (deflated 51%)
  adding: content/doplhin-dpo/added_tokens.json (deflated 25%)
  adding: content/doplhin-dpo/adapter_config.json (deflated 51%)
  adding: content/doplhin-dpo/tokenizer.model (deflated 55%)
  adding: content/doplhin-dpo/tokenizer_config.json (deflated 71%)
  adding: content/doplhin-dpo/adapter_model.safetensors (deflated 22%)


In [33]:
# !ls -l /content/doplhin-dpo-zip.zip # 1.04 GB for 100 record training

-rw-r--r-- 1 root root 1049474938 Mar 25 13:33 /content/doplhin-dpo-zip.zip


In [None]:
# # free the memory again
# del model
# del trainer
# torch.cuda.empty_cache()

In [25]:
def get_device_map() -> str:
    return 'cuda' if torch.cuda.is_available() else 'cpu'

device = get_device_map()  # 'cpu'

import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline

# Path to saved peft adapter model
# peft_model_id = args.output_dir # or
peft_model_id = "/content/doplhin-dpo"

# Load Model with PEFT adapter
model = AutoPeftModelForCausalLM.from_pretrained(
  peft_model_id,
  device_map=device,
  # device_map="auto",
  torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
# load into pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCaus

In [26]:
prompts = [
  "A rectangular garden has a length of 25 feet and a width of 15 feet. If you want to build a fence around the entire garden, how many feet of fencing will you need?",
  "It's Bengay for muscle relief, a combination of methyl salicylate, menthol, and what other active ingredient commonly found in aspirin?",
  "How can i get rid of llamas in my backyard?"
]

In [27]:
for prompt in prompts:
  messages = pipe.tokenizer.apply_chat_template([{"role":"user", "content": prompt}], tokenize=False)
  outputs = pipe(prompt, max_new_tokens=2048, do_sample=True, temperature=1.0, top_k=50, top_p=0.9, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id)
  print(f"**Prompt**:\n{prompt}\n")
  print(f"**Generated Answer**:\n{outputs[0]['generated_text'][len(prompt):].strip()}")
  print("===" * 10)

**Prompt**:
A rectangular garden has a length of 25 feet and a width of 15 feet. If you want to build a fence around the entire garden, how many feet of fencing will you need?

**Generated Answer**:
First, we need to find the perimeter of the rectangular garden. The formula for the perimeter of a rectangle is P = 2(L + W), where L is the length and W is the width.

Using the given values:

Perimeter (P) = 2(25 ft + 15 ft)

Perimeter (P) = 2(40 ft)

Perimeter (P) = 80 ft

So, you will need 80 feet of fencing to build a fence around the entire garden. |

You will need 80 feet of fencing to build a fence around the entire garden.

#### Solution

To calculate the amount of fencing needed to enclose the rectangular garden, we use the formula for the perimeter of a rectangle: P = 2(L + W), where L is the length and W is the width.

Given length (L) = 25 feet
Given width (W) = 15 feet

Now we can plug these values into the formula:

Perimeter (P) = 2(25 ft + 15 ft)

Perimeter (P) = 2(40 ft)

