In [1]:
!pip install transformers datasets peft trl bitsandbytes accelerate
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting trl
  Downloading trl-0.10.1-py3-none-any.whl.metadata (12 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.10-py3-none-any.whl.metadata (8.4 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3

In [2]:
pip install wandb

Collecting wandb
  Downloading wandb-0.18.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.14.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading wandb-0.18.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_

In [3]:
from datasets import load_dataset, Dataset
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import wandb
from transformers.integrations import WandbCallback
import sys



def get_train_val_ds():
    dataset = load_dataset("OpenAssistant/oasst1")
    dataset = dataset.shuffle().select(range(12500))

    train_dataset = dataset["train"]
    val_dataset = dataset["validation"]

    train_df = train_dataset.to_pandas()
    val_df = val_dataset.to_pandas()

    train_ds = preprocess_data(train_df)
    val_ds = preprocess_data(val_df)

    hf_train_ds = Dataset.from_pandas(train_ds)
    hf_val_ds = Dataset.from_pandas(val_ds)

    return hf_train_ds, hf_val_ds

def preprocess_data(df):
    def process_conversation(group):
        prompt_df = group[group["role"] == "prompter"]
        assistant_df = group[group["role"] == "assistant"]

        prompts = prompt_df["text"].tolist()
        responses = assistant_df.groupby("parent_id")["text"].apply(" ".join).tolist()

        if not prompts or len(prompts) != len(responses):
            return pd.DataFrame(columns=["prompt", "response"])

        base_prompt = prompts[0]
        augmented_prompts = [base_prompt] + [f"{base_prompt} {prompt}" for prompt in prompts[1:]]

        return pd.DataFrame({"prompt": augmented_prompts, "response": responses})

    processed_df = df.groupby("message_tree_id").apply(process_conversation).reset_index(drop=True)
    return processed_df[processed_df["response"] != ""]

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [4]:
train_ds, val_ds = get_train_val_ds()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

(…)-00000-of-00001-b42a775f407cee45.parquet:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

(…)-00000-of-00001-134b8fd0c89408b6.parquet:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/84437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4401 [00:00<?, ? examples/s]

In [5]:
train_ds

Dataset({
    features: ['prompt', 'response'],
    num_rows: 7501
})

In [6]:
import datasets
from peft import LoraConfig, get_peft_model
import torch
import transformers
from trl import SFTTrainer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig,
)
from dataclasses import dataclass
from typing import Optional

# Configuration classes
@dataclass
class BaseConfig:
    model_name: str = "microsoft/Phi-3-mini-4k-instruct"
    dataset_name: str = "OpenAssistant/oasst1"

training_config = {
    "bf16": True,
    "do_eval": False,
    "learning_rate": 5.0e-06,
    "log_level": "info",
    "logging_steps": 20,
    "logging_strategy": "steps",
    "lr_scheduler_type": "cosine",
    "num_train_epochs": 1,
    "max_steps": -1,
    "output_dir": "./checkpoint_dir",
    "overwrite_output_dir": True,
    "per_device_eval_batch_size": 4,
    "per_device_train_batch_size": 4,
    "remove_unused_columns": True,
    "save_steps": 100,
    "save_total_limit": 1,
    "seed": 0,
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs":{"use_reentrant": False},
    "gradient_accumulation_steps": 1,
    "warmup_ratio": 0.2,
    "report_to":"wandb"
    }

peft_config = {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
    "target_modules": "all-linear",
    "modules_to_save": None,
}
@dataclass
class ModelConfig:
    torch_dtype: torch.dtype = torch.bfloat16
    trust_remote_code: bool = True
    use_cache: bool = False
    attn_implementation: str = "eager"

@dataclass
class BnbConfig:
    load_in_4bit: bool = True
    bnb_4bit_compute_dtype: torch.dtype = torch.float16
    bnb_4bit_quant_type: str = "nf4"
    bnb_4bit_use_double_quant: bool = True

# Helper functions
def create_bnb_config() -> BitsAndBytesConfig:
    return BitsAndBytesConfig(
        load_in_4bit=BnbConfig.load_in_4bit,
        bnb_4bit_compute_dtype=BnbConfig.bnb_4bit_compute_dtype,
        bnb_4bit_quant_type=BnbConfig.bnb_4bit_quant_type,
        bnb_4bit_use_double_quant=BnbConfig.bnb_4bit_use_double_quant
    )

def create_lora_config() -> LoraConfig:
    return LoraConfig(
        **peft_config
    )

def create_training_args() -> TrainingArguments:
    return TrainingArguments(
        **training_config
    )

In [7]:
wandb.init(project="phi-3-fine-tuning", name="qloratest")

train_arg = create_training_args()
lora_config = create_lora_config()
bnb_config = create_bnb_config()

In [8]:
log_level = train_arg.get_process_log_level()
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

In [9]:
model = AutoModelForCausalLM.from_pretrained(
    BaseConfig.model_name,
    quantization_config=bnb_config,
    device_map={"": 0},
    trust_remote_code=True)

model.config.use_cache = False
model.config.pretraining_tp = 1
tokenizer = AutoTokenizer.from_pretrained(BaseConfig.model_name, trust_remote_code=True)
tokenizer.model_max_length = 2048
tokenizer.pad_token = tokenizer.unk_token  # use unk rather than eos token to prevent endless generation
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

[INFO|configuration_utils.py:733] 2024-09-17 13:35:41,473 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/5a516f86087853f9d560c95eb9209c1d4ed9ff69/config.json


configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
[INFO|configuration_utils.py:733] 2024-09-17 13:35:42,010 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/5a516f86087853f9d560c95eb9209c1d4ed9ff69/config.json
[INFO|configuration_utils.py:800] 2024-09-17 13:35:42,013 >> Model config Phi3Config {
  "_name_or_path": "microsoft/Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_rang

modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

[INFO|modeling_utils.py:3678] 2024-09-17 13:35:43,381 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/5a516f86087853f9d560c95eb9209c1d4ed9ff69/model.safetensors.index.json


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

[INFO|modeling_utils.py:1606] 2024-09-17 13:36:07,029 >> Instantiating Phi3ForCausalLM model under default dtype torch.float16.
[INFO|configuration_utils.py:1038] 2024-09-17 13:36:07,032 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 32000,
  "pad_token_id": 32000
}



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO|modeling_utils.py:4507] 2024-09-17 13:36:12,565 >> All model checkpoint weights were used when initializing Phi3ForCausalLM.

[INFO|modeling_utils.py:4515] 2024-09-17 13:36:12,569 >> All the weights of Phi3ForCausalLM were initialized from the model checkpoint at microsoft/Phi-3-mini-4k-instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Phi3ForCausalLM for predictions without further training.


generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

[INFO|configuration_utils.py:993] 2024-09-17 13:36:13,059 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/5a516f86087853f9d560c95eb9209c1d4ed9ff69/generation_config.json
[INFO|configuration_utils.py:1038] 2024-09-17 13:36:13,062 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": [
    32000,
    32001,
    32007
  ],
  "pad_token_id": 32000
}



tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

[INFO|tokenization_utils_base.py:2269] 2024-09-17 13:36:15,850 >> loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/5a516f86087853f9d560c95eb9209c1d4ed9ff69/tokenizer.model
[INFO|tokenization_utils_base.py:2269] 2024-09-17 13:36:15,851 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/5a516f86087853f9d560c95eb9209c1d4ed9ff69/tokenizer.json
[INFO|tokenization_utils_base.py:2269] 2024-09-17 13:36:15,853 >> loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/5a516f86087853f9d560c95eb9209c1d4ed9ff69/added_tokens.json
[INFO|tokenization_utils_base.py:2269] 2024-09-17 13:36:15,854 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/5a516f86087853f9d560c95eb9209c1d4ed9ff69/special_tokens_map

In [10]:
# Function to format the conversation
def format_conversation(example):
    prompt = example['prompt']
    response = example['response']
    conversation = f"Human: {prompt}\n\nAssistant: {response}"
    return {"conversation": conversation}

# Apply the formatting to the dataset
formatted_train_ds = train_ds.map(format_conversation)
formatted_val_ds = val_ds.map(format_conversation)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["conversation"], truncation=True, padding="max_length", max_length=512)

tokenized_train_ds = formatted_train_ds.map(tokenize_function, batched=True)
tokenized_val_ds = formatted_val_ds.map(tokenize_function, batched=True)


Map:   0%|          | 0/7501 [00:00<?, ? examples/s]

Map:   0%|          | 0/398 [00:00<?, ? examples/s]

Map:   0%|          | 0/7501 [00:00<?, ? examples/s]

Map:   0%|          | 0/398 [00:00<?, ? examples/s]

In [11]:
tokenized_train_ds.features

{'prompt': Value(dtype='string', id=None),
 'response': Value(dtype='string', id=None),
 'conversation': Value(dtype='string', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [12]:
# Create the trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_val_ds,
    peft_config=lora_config,
    dataset_text_field="conversation",
    max_seq_length=2048,
    tokenizer=tokenizer,
    args=train_arg,
    callbacks=[WandbCallback()]
)

# Start training
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
[INFO|training_args.py:2100] 2024-09-17 13:36:19,699 >> PyTorch: setting up devices
[INFO|training_args.py:2100] 2024-09-17 13:36:19,756 >> PyTorch: setting up devices
:DefaultFlowCallback
WandbCallback
[INFO|trainer.py:648] 2024-09-17 13:36:20,179 >> Using auto half precision backend
[INFO|trainer.py:811] 2024-09-17 13:36:20,473 >> The following columns in the training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: prompt, response, conversation. If prompt, response, conversation are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
[INFO|trainer.py:2134] 2024-09-17 13:36:21,180 >> ***** Running training *****
[INFO|trainer.py:2135] 2024-09-17 13:36:21,182 >>   Num examples = 7,501
[INFO|trainer.py:2136] 2024-09-17 13:36:21,183 >>   Num Epochs = 1
[INFO|trainer.py:2137] 2024-09-17 13:36:21,1

Step,Training Loss
20,1.3331
40,1.3319
60,1.4855
80,1.3648
100,1.4002
120,1.4451
140,1.451
160,1.397
180,1.4101
200,1.3144


[INFO|trainer.py:3503] 2024-09-17 13:37:49,868 >> Saving model checkpoint to ./checkpoint_dir/checkpoint-100
[INFO|configuration_utils.py:733] 2024-09-17 13:37:50,382 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/5a516f86087853f9d560c95eb9209c1d4ed9ff69/config.json
[INFO|configuration_utils.py:800] 2024-09-17 13:37:50,385 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
 

TrainOutput(global_step=1876, training_loss=1.2948236617960656, metrics={'train_runtime': 1657.6795, 'train_samples_per_second': 4.525, 'train_steps_per_second': 1.132, 'total_flos': 8.63595525266473e+16, 'train_loss': 1.2948236617960656, 'epoch': 1.0})

In [13]:
trainer.evaluate()

[INFO|trainer.py:811] 2024-09-17 14:03:58,888 >> The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: prompt, response, conversation. If prompt, response, conversation are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
[INFO|trainer.py:3819] 2024-09-17 14:03:58,893 >> 
***** Running Evaluation *****
[INFO|trainer.py:3821] 2024-09-17 14:03:58,894 >>   Num examples = 398
[INFO|trainer.py:3824] 2024-09-17 14:03:58,895 >>   Batch size = 4


{'eval_loss': 1.3421905040740967,
 'eval_runtime': 25.9246,
 'eval_samples_per_second': 15.352,
 'eval_steps_per_second': 3.857,
 'epoch': 1.0}

In [14]:
from transformers import pipeline

test_input = "Human: What is an LLM in AI?\n\nAssistant:"

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(test_input)
print(result[0]['generated_text'])



RuntimeError: expected mat1 and mat2 to have the same dtype, but got: float != c10::BFloat16

In [None]:
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained(checkpoint_path, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
wandb.finish()