In [None]:
# 1.intsalling libraies 
pip install accelerate datasets bitsandbyes transformers

In [None]:
# 2. Import libraries
import os
import torch
from dataclasses import dataclass, field
from typing import Optional
from datasets import load_dataset
from peft import AutoPeftModelForCausalLM, LoraConfig
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser
# The supervised fine-tuning trainer exported by trl is SFTTrainer.
from trl import SFTTrainer
from trl.trainer import ConstantLengthDataset

In [None]:
# 3. Log in to the Hugging Face Hub
# Authenticate from inside the notebook; load_dataset below is called with
# use_auth_token=True, which relies on a stored Hub token.
from huggingface_hub import notebook_login 
notebook_login()

In [None]:
# 4. Processing 
def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """Estimate the average number of characters per token.

    Args:
        dataset: Iterable of samples accepted by ``prepare_sample_text``.
        tokenizer: Tokenizer exposing ``is_fast`` and either a callable
            interface whose result has ``tokens()`` (fast path) or a
            ``tokenize`` method (slow path).
        nb_examples: Maximum number of samples to inspect.

    Returns:
        Total characters divided by total tokens over the inspected samples.
    """
    char_count = 0
    token_count = 0
    # Pairing with range() caps the scan at nb_examples without needing len().
    sampled = zip(range(nb_examples), iter(dataset))
    for _, sample in tqdm(sampled, total=nb_examples):
        sample_text = prepare_sample_text(sample)
        char_count += len(sample_text)
        if not tokenizer.is_fast:
            pieces = tokenizer.tokenize(sample_text)
        else:
            pieces = tokenizer(sample_text).tokens()
        token_count += len(pieces)
    return char_count / token_count


def print_trainable_parameters(model):
    """Print the number of trainable parameters in ``model``.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` has ``numel()``
            and a ``requires_grad`` flag.

    Prints the trainable count, the total count, and the trainable
    percentage. A model with no parameters reports 0% instead of raising
    ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        # Only parameters that receive gradients count as trainable.
        if param.requires_grad:
            trainable_params += count

    percentage = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percentage}"
    )


def prepare_sample_text(example):
    """Format one dataset sample as a single training string.

    Args:
        example: Mapping with ``'prompt'`` and ``'response'`` entries.

    Returns:
        The prompt, a newline followed by one space, then the response.
    """
    prompt = example['prompt']
    response = example['response']
    return f"{prompt}\n {response}"

def create_datasets(tokenizer):
    """Build the packed training and validation datasets for fine-tuning.

    Loads the ``test`` split of ``Dahoas/full-hh-rlhf``, holds out 0.5% of it
    for validation, estimates the characters-per-token ratio on the training
    portion, and wraps both portions in ``ConstantLengthDataset`` with a
    sequence length of 256 tokens.

    Args:
        tokenizer: Tokenizer used for the ratio estimate and for packing.

    Returns:
        A ``(train_dataset, valid_dataset)`` tuple of ``ConstantLengthDataset``.
    """
    # NOTE(review): only the Hub "test" split is loaded and then re-split
    # below; confirm this is intentional rather than split="train".
    # NOTE(review): newer `datasets` releases replace `use_auth_token` with
    # `token`; verify against the installed version.
    dataset = load_dataset(
        "Dahoas/full-hh-rlhf",
        split="test",
        use_auth_token=True,
    )
    # NOTE(review): seed=None leaves the split unseeded; pass a fixed seed if
    # the train/validation partition must be reproducible.
    dataset = dataset.train_test_split(test_size=0.005, seed=None)
    train_data = dataset["train"]
    valid_data = dataset["test"]
    print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")

    chars_per_token = chars_token_ratio(train_data, tokenizer)
    print(f"The ratio is {chars_per_token:.2f}")

    train_dataset = ConstantLengthDataset(
        tokenizer,
        train_data,
        formatting_func=prepare_sample_text,
        infinite=True,
        seq_length=256,
        chars_per_token=chars_per_token,
    )
    # The validation stream must be finite, otherwise an evaluation pass over
    # it never reaches the end of the data.
    valid_dataset = ConstantLengthDataset(
        tokenizer,
        valid_data,
        formatting_func=prepare_sample_text,
        infinite=False,
        seq_length=256,
        chars_per_token=chars_per_token,
    )
    return train_dataset, valid_dataset
