In [1]:
!pip install transformers datasets peft trl bitsandbytes accelerate
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting trl
  Downloading trl-0.10.1-py3-none-any.whl.metadata (12 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.10-py3-none-any.whl.metadata (8.4 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3

In [2]:
from datasets import load_dataset, Dataset
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline


class PromptResponseExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns a flat message table into prompt/response pairs.

    The input frame must provide the columns ``message_tree_id``, ``role``,
    ``text`` and ``parent_id``. When a ``message_id`` column is also present it
    is used to pair every prompter message with the replies that point at it.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Build one ``(prompt, response)`` row per prompter message.

        Per message tree:
          * assistant texts sharing a ``parent_id`` are joined with a space into
            one response;
          * the tree is dropped when it has no prompter message, or when the
            number of prompter messages differs from the number of distinct
            assistant parents;
          * every prompt after the first is prefixed with the first prompt of
            the tree.

        Returns:
            DataFrame with columns ``prompt`` and ``response`` and a fresh
            0..n-1 index (empty, with the same columns, if nothing qualifies).
        """
        output_columns = ["prompt", "response"]

        def process_tree(group):
            prompt_df = group[group["role"] == "prompter"]
            assistant_df = group[group["role"] == "assistant"]

            prompts = prompt_df["text"].tolist()
            # Series indexed by parent_id. groupby sorts its keys, so the order
            # of this Series is NOT the row order of the prompts.
            joined = assistant_df.groupby("parent_id")["text"].apply(" ".join)

            if not prompts or len(prompts) != len(joined):
                return pd.DataFrame(columns=output_columns)

            if "message_id" in group.columns:
                # Pair each prompt with the replies whose parent_id is that
                # prompt's own id, instead of relying on positional order.
                aligned = joined.reindex(prompt_df["message_id"])
                if aligned.isna().any():
                    # Some prompt has no reply of its own: counts matched only
                    # by coincidence, so the tree cannot be paired reliably.
                    return pd.DataFrame(columns=output_columns)
                responses = aligned.tolist()
            else:
                # No ids to align on: keep the positional pairing.
                responses = joined.tolist()

            base_prompt = prompts[0]
            augmented_prompts = [base_prompt] + [
                f"{base_prompt} {prompt}" for prompt in prompts[1:]
            ]

            return pd.DataFrame({"prompt": augmented_prompts, "response": responses})

        # Iterate the groups explicitly (same key-sorted order as groupby.apply)
        # and skip empty results so the concat stays well-defined.
        frames = []
        for _, tree in X.groupby("message_tree_id"):
            pairs = process_tree(tree)
            if not pairs.empty:
                frames.append(pairs)

        if not frames:
            return pd.DataFrame(columns=output_columns)
        return pd.concat(frames, ignore_index=True)


def create_preprocessing_pipeline():
    """Return a Pipeline whose only step is the prompt/response extractor."""
    steps = [("prompt_response_extractor", PromptResponseExtractor())]
    return Pipeline(steps)


# Usage
def preprocess_data(df):
    """Extract prompt/response pairs from ``df`` and drop rows with an empty response.

    Args:
        df: message table accepted by the preprocessing pipeline.

    Returns:
        DataFrame with ``prompt`` and ``response`` columns and a fresh 0..n-1
        index.
    """
    extracted = create_preprocessing_pipeline().fit_transform(df)
    non_empty = extracted[extracted["response"] != ""]
    # Re-number the rows: a filtered frame keeps gaps in its index, and
    # Dataset.from_pandas stores a non-range index as an extra column.
    return non_empty.reset_index(drop=True)


def get_train_val_ds():
    """Load oasst1 and return its preprocessed (train, validation) splits.

    Each split is converted to pandas, run through ``preprocess_data`` and
    wrapped back into a Hugging Face ``Dataset``.
    """
    raw_splits = load_dataset("OpenAssistant/oasst1")

    def to_hf_dataset(split_name):
        # split -> pandas -> prompt/response pairs -> Dataset
        messages = raw_splits[split_name].to_pandas()
        return Dataset.from_pandas(preprocess_data(messages))

    # Raw sizes: train has 84437 messages (95%), validation has 4401 (5%).
    hf_train_ds = to_hf_dataset("train")
    hf_val_ds = to_hf_dataset("validation")
    return hf_train_ds, hf_val_ds


In [3]:
# Download oasst1 and build the prompt/response datasets used by every later cell.
train_ds, val_ds = get_train_val_ds()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

(…)-00000-of-00001-b42a775f407cee45.parquet:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

(…)-00000-of-00001-134b8fd0c89408b6.parquet:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/84437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4401 [00:00<?, ? examples/s]

In [4]:
# Show the columns and row count of the processed training split.
train_ds

Dataset({
    features: ['prompt', 'response'],
    num_rows: 7501
})

In [5]:
import datasets
from peft import LoraConfig, get_peft_model
import torch
import transformers
from trl import SFTTrainer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig,
)
from dataclasses import dataclass
from typing import Optional

# Configuration classes
@dataclass
class BaseConfig:
    """Identifiers of the base model and the dataset."""

    # Hub id of the checkpoint loaded by the model and tokenizer cell.
    model_name: str = "microsoft/Phi-3-mini-4k-instruct"
    # NOTE(review): not referenced in the visible cells; get_train_val_ds()
    # hard-codes the same dataset id.
    dataset_name: str = "OpenAssistant/oasst1"

@dataclass
class PeftConfig:
    """LoRA hyperparameters, read as class attributes by create_lora_config().

    The scalar settings are annotated so that they are real dataclass fields
    (``PeftConfig(lora_r=8)`` works); their defaults remain readable on the
    class itself, e.g. ``PeftConfig.lora_r``.
    """

    lora_alpha: int = 16
    lora_dropout: float = 0.1
    lora_r: int = 64
    task_type: str = "CAUSAL_LM"
    # Left un-annotated on purpose: a list default is not allowed on a dataclass
    # field, and this value has to stay readable as PeftConfig.target_modules.
    target_modules = ["o_proj", "qkv_proj"]

@dataclass
class TrainingConfig:
    """Training hyperparameters, read as class attributes by the notebook.

    Every setting is annotated so that it is a real dataclass field and can be
    overridden per instance (``TrainingConfig(max_steps=5)``); the defaults
    remain readable on the class itself, e.g. ``TrainingConfig.max_steps``.
    """

    output_dir: str = "/content/results"
    per_device_train_batch_size: int = 1
    gradient_accumulation_steps: int = 16
    optim: str = "paged_adamw_32bit"
    save_steps: int = 10
    logging_steps: int = 10
    learning_rate: float = 2e-4
    max_grad_norm: float = 0.3
    max_steps: int = 100  # Reduced for quicker testing
    warmup_ratio: float = 0.03
    lr_scheduler_type: str = "constant"
    # Passed to SFTTrainer directly, not to TrainingArguments.
    max_seq_length: int = 512

@dataclass
class ModelConfig:
    """Model-loading options.

    NOTE(review): none of these fields is read in the visible cells; the model
    loading cell passes its own literal arguments instead. Confirm whether
    they were meant to be wired in.
    """

    torch_dtype: torch.dtype = torch.float16
    trust_remote_code: bool = True
    use_cache: bool = False
    attn_implementation: str = "eager"

@dataclass
class BnbConfig:
    """Quantization settings forwarded to BitsAndBytesConfig by create_bnb_config()."""

    load_in_4bit: bool = True
    bnb_4bit_compute_dtype: torch.dtype = torch.float16
    bnb_4bit_quant_type: str = "nf4"
    bnb_4bit_use_double_quant: bool = True

# Helper functions
def create_bnb_config() -> BitsAndBytesConfig:
    """Build the BitsAndBytesConfig from the defaults declared on BnbConfig."""
    # Each BnbConfig attribute carries the name of the keyword it feeds.
    option_names = (
        "load_in_4bit",
        "bnb_4bit_compute_dtype",
        "bnb_4bit_quant_type",
        "bnb_4bit_use_double_quant",
    )
    quant_kwargs = {option: getattr(BnbConfig, option) for option in option_names}
    return BitsAndBytesConfig(**quant_kwargs)

def create_lora_config() -> LoraConfig:
    """Build the LoraConfig from the values declared on PeftConfig."""
    lora_kwargs = {
        "r": PeftConfig.lora_r,
        "lora_alpha": PeftConfig.lora_alpha,
        "target_modules": PeftConfig.target_modules,
        "lora_dropout": PeftConfig.lora_dropout,
        "task_type": PeftConfig.task_type,
    }
    return LoraConfig(**lora_kwargs)

def create_training_args() -> TrainingArguments:
    """Build TrainingArguments from the values declared on TrainingConfig.

    ``TrainingConfig.max_seq_length`` is deliberately not forwarded here; it is
    handed to the trainer separately.
    """
    # Each forwarded TrainingConfig attribute carries the name of its keyword.
    forwarded = (
        "output_dir",
        "per_device_train_batch_size",
        "gradient_accumulation_steps",
        "optim",
        "save_steps",
        "logging_steps",
        "learning_rate",
        "max_grad_norm",
        "max_steps",
        "warmup_ratio",
        "lr_scheduler_type",
    )
    # NOTE(review): warmup_ratio is set together with the "constant" scheduler;
    # confirm the warmup is actually applied with that scheduler type.
    arguments = {name: getattr(TrainingConfig, name) for name in forwarded}
    return TrainingArguments(**arguments)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [6]:
# Materialize the three config objects used by the model-loading and trainer cells.
train_arg = create_training_args()
lora_config = create_lora_config()
bnb_config = create_bnb_config()

In [7]:
# Load the base checkpoint with the 4-bit quantization config; the device map
# assigns the whole model to device 0.
# NOTE(review): ModelConfig (torch_dtype, attn_implementation, use_cache, ...) is
# defined in the config cell but is not passed here — confirm whether intended.
model = AutoModelForCausalLM.from_pretrained(
    BaseConfig.model_name,
    quantization_config=bnb_config,
    device_map={"": 0},
    trust_remote_code=True)

# Turn off the generation cache on the model config before training.
model.config.use_cache = False
# NOTE(review): pretraining_tp looks like a setting carried over from another
# model family — confirm it has any effect for this checkpoint.
model.config.pretraining_tp = 1
tokenizer = AutoTokenizer.from_pretrained(BaseConfig.model_name, trust_remote_code=True)
# Reuse the end-of-sequence token for padding and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

In [8]:
def format_conversation(example):
    """Render one prompt/response record as a single "Human/Assistant" string.

    Args:
        example: mapping with the keys ``prompt`` and ``response``.

    Returns:
        A dict with the single key ``conversation`` holding the rendered text.
    """
    human_turn, assistant_turn = example['prompt'], example['response']
    return {"conversation": f"Human: {human_turn}\n\nAssistant: {assistant_turn}"}

# Add a "conversation" column to both splits (one record at a time).
formatted_train_ds = train_ds.map(format_conversation)
formatted_val_ds = val_ds.map(format_conversation)

def tokenize_function(examples):
    """Tokenize the ``conversation`` column of a batch with the notebook tokenizer.

    Sequences are truncated and padded to ``TrainingConfig.max_seq_length`` so
    that the tokenized length cannot drift from the length given to the trainer.

    Args:
        examples: batch mapping that contains a ``conversation`` entry.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["conversation"],
        truncation=True,
        padding="max_length",
        max_length=TrainingConfig.max_seq_length,
    )

# Tokenize both splits in batches; the original text columns are kept alongside
# the new token columns.
tokenized_train_ds = formatted_train_ds.map(tokenize_function, batched=True)
tokenized_val_ds = formatted_val_ds.map(tokenize_function, batched=True)


Map:   0%|          | 0/7501 [00:00<?, ? examples/s]

Map:   0%|          | 0/398 [00:00<?, ? examples/s]

Map:   0%|          | 0/7501 [00:00<?, ? examples/s]

Map:   0%|          | 0/398 [00:00<?, ? examples/s]

In [9]:
# Inspect the schema of the tokenized training split.
tokenized_train_ds.features

{'prompt': Value(dtype='string', id=None),
 'response': Value(dtype='string', id=None),
 'conversation': Value(dtype='string', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [10]:
# Create the supervised fine-tuning trainer with the LoRA config attached.
# NOTE(review): the datasets passed here are already tokenized (they carry
# input_ids) while dataset_text_field is also given — confirm which of the two
# the trainer actually uses.
# NOTE(review): the run emits a deprecation warning asking for these arguments
# to be set through SFTConfig instead.
# NOTE(review): an eval_dataset is supplied, but create_training_args() sets no
# evaluation schedule — confirm evaluation ever runs.
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_val_ds,
    peft_config=lora_config,
    dataset_text_field="conversation",
    max_seq_length=TrainingConfig.max_seq_length,
    tokenizer=tokenizer,
    args=train_arg,
)

# Run the training loop; it is bounded by max_steps from the training arguments.
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
10,1.4503
20,1.4543
30,1.3593
40,1.3051
50,1.3304
60,1.3729
70,1.2881
80,1.3479
90,1.4065
100,1.3315


TrainOutput(global_step=100, training_loss=1.3646246528625487, metrics={'train_runtime': 4295.6696, 'train_samples_per_second': 0.372, 'train_steps_per_second': 0.023, 'total_flos': 1.8482762612736e+16, 'train_loss': 1.3646246528625487, 'epoch': 0.21330489268097588})

In [13]:
from transformers import pipeline

# Smoke-test the fine-tuned model with a prompt in the same "Human/Assistant"
# layout that format_conversation() produces for training.
test_input = "Human: What is an LLM in AI?\n\nAssistant:"

# NOTE(review): max_length=200 presumably bounds prompt plus generated tokens,
# which would explain the sample output stopping mid-sentence — confirm.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(test_input)
print(result[0]['generated_text'])

Human: What is an LLM in AI?

Assistant: An LLM (Language Model) is a type of AI model that is trained on a large corpus of text data and can generate text that is similar to the input text. LLMs are commonly used in natural language processing (NLP) tasks such as text generation, translation, and summarization. They are also used in chatbots and virtual assistants to generate responses to user queries. LLMs are trained using a variety of techniques, including unsupervised learning, supervised learning, and reinforcement learning. The training process involves feeding the model large amounts of text data and adjusting the model's parameters to minimize the difference between the model's output and the expected output. LLMs are typically trained on a combination of text data from books, articles, and websites, as well as other sources such as social media and news articles. The model is then fine


In [None]:
# Merge the trained LoRA adapter into the base model and build an inference pipeline.
# PeftModel is not part of the notebook's import cell, so import it here.
from peft import PeftModel

# Adapter checkpoint written by the training run. With save_steps=10 and
# max_steps=100 the last checkpoint is expected at <output_dir>/checkpoint-100.
# NOTE(review): confirm this directory exists before running the cell.
new_model = f"{TrainingConfig.output_dir}/checkpoint-{TrainingConfig.max_steps}"
# NOTE(review): assumes the trainer also saved the tokenizer files in the same
# checkpoint directory; otherwise point this at BaseConfig.model_name.
checkpoint_path = new_model

# Reload the base weights without quantization so the adapter can be merged
# into ordinary fp16 weights rather than into the 4-bit training copy.
base_model = AutoModelForCausalLM.from_pretrained(
    BaseConfig.model_name,
    torch_dtype=torch.float16,
    device_map={"": 0},
    trust_remote_code=True,
)

model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained(checkpoint_path, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)