In [1]:
import os
import re
from pprint import pprint

import numpy as np
import pandas as pd
import torch

from datasets import Dataset, load_dataset
from peft import AutoPeftModelForCausalLM, LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hub identifiers: the base checkpoint that gets fine-tuned and the dataset
# used for supervised training.
MODEL_NAME = "mistralai/Mistral-7B-v0.1"
DATASET_NAME = "dyumat/databricks-dolly-5k-rag-split"

## Load Dataset


In [8]:
# Fetch the dataset; later cells read its "train" and "validation" splits.
# NOTE(review): trust_remote_code=True permits code shipped with the dataset
# repository to run locally — confirm it is actually required for this dataset.
data = load_dataset(
    DATASET_NAME,
    trust_remote_code=True,
)

In [66]:
# Tokenizer whose chat template renders the training prompts below
# (generate_training_prompt reads this global).
# NOTE(review): this repo id is the same one the final cell pushes the
# tokenizer to, so a first run on a fresh setup presumably needs that repo to
# exist already — confirm.
tokenizer = AutoTokenizer.from_pretrained("dyumat/mistral-7b-chat-pdf")

tokenizer_config.json: 100%|██████████| 1.69k/1.69k [00:00<00:00, 5.90MB/s]
tokenizer.json: 100%|██████████| 1.80M/1.80M [00:00<00:00, 10.9MB/s]
special_tokens_map.json: 100%|██████████| 414/414 [00:00<00:00, 2.94MB/s]


In [67]:
# System message placed at the start of every training prompt unless the
# caller supplies its own.
DEFAULT_SYSTEM_PROMPT = (
    "You are a helpful AI assistant. Answer the user questions based only on the context provided.\n"
    "Be concise and respectful."
)

In [68]:
def clean_text(text):
    """Remove noisy tokens from ``text`` and normalize its whitespace.

    The following are deleted: tokens starting with ``http`` (URLs), tokens
    starting with ``@`` and tokens starting with ``^``. Afterwards every run
    of whitespace (spaces, tabs, newlines) is collapsed to a single space.

    Leading/trailing whitespace is collapsed but not stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\S+", "", text)
    text = re.sub(r"\^\S+", "", text)
    # Collapse whitespace last: every removal above can leave two separators
    # side by side, and collapsing before the caret removal left a double
    # space behind each deleted "^token".
    return re.sub(r"\s+", " ", text)

def generate_training_prompt(
    context: str, question: str, answer: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    """Render one training example as a single chat-formatted string.

    The messages are emitted in the order system, user, context, assistant
    and rendered with the chat template of the module-level ``tokenizer``.
    NOTE(review): this relies on that template accepting the non-standard
    "context" role — confirm if the tokenizer is ever swapped.

    Args:
        context: Supporting passage the answer must be based on.
        question: The user's question.
        answer: The reference answer.
        system_prompt: Content of the system message. Defaults to
            ``DEFAULT_SYSTEM_PROMPT``.

    Returns:
        The rendered prompt as text (not token ids), without a trailing
        generation prompt.
    """

    def _content(value: str) -> str:
        # Empty or missing fields are replaced by a single space rather than
        # being passed to clean_text.
        return clean_text(value) if value else " "

    messages = [
        # Honour the caller-supplied system prompt; it used to be ignored in
        # favour of DEFAULT_SYSTEM_PROMPT.
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": _content(question)},
        {"role": "context", "content": _content(context)},
        {"role": "assistant", "content": _content(answer)},
    ]
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )

def generate_text(data_point):
    """Build the ``text`` field for one dataset row.

    Reads the row's "context", "instruction" and "response" entries and
    returns a dict with the rendered prompt under the key "text".
    """
    prompt = generate_training_prompt(
        context=data_point["context"],
        question=data_point["instruction"],
        answer=data_point["response"],
    )
    return {"text": prompt}


In [69]:
# Sanity check: render the prompt for one randomly picked training example.
# The index is drawn from NumPy's unseeded global RNG, so the example shown
# differs between runs.
sample_index = np.random.randint(len(data["train"]))
train_data = data["train"][sample_index]
sample_prompt = generate_training_prompt(
    context=train_data["context"],
    question=train_data["instruction"],
    answer=train_data["response"],
)
print(sample_prompt)

<|system|>
You are a helpful AI assistant. Answer the user questions based only on the context provided.
Be concise and respectful.</s>
<|user|>
Given this paragraph about the longest living person, what was their name and when were they born?</s>
<|context|>
The longest documented and verified human lifespan is that of Jeanne Calment of France (1875–1997), a woman who lived to age 122 years and 164 days. She claimed to have met Vincent van Gogh when she was 12 or 13. She received news media attention in 1985, after turning 110. Calment's claim was investigated and authenticated by Jean-Marie Robine and Dr Michel Allard for the GRG. Her longevity claim was put into question in 2018, but the original assessing team stood by their judgement.</s>
<|assistant|>
Jeanne Calment is the longest living person to be verified. She was born in 1875 and lived to be 122 years old.</s>



In [9]:
def process_dataset(data: Dataset):
    """Shuffle a split and attach the rendered training prompt to every row.

    Steps: shuffle with a fixed seed, map ``generate_text`` over the rows
    (adds the "text" column), then drop the "category" column if present.

    Args:
        data: One dataset split.

    Returns:
        The processed split.
    """
    processed = data.shuffle(seed=15445).map(generate_text)
    # The calling cell overwrites data[...] with this result, so on a re-run
    # the column is already gone; removing it unconditionally would raise.
    if "category" in processed.column_names:
        processed = processed.remove_columns("category")
    return processed
# Replace both splits with their processed versions (in place on `data`).
for split_name in ("train", "validation"):
    data[split_name] = process_dataset(data[split_name])

## Prepare Model

In [10]:
def create_model_and_tokenizer():
    """Load the base checkpoint in 4-bit precision together with its tokenizer.

    The model is loaded from ``MODEL_NAME`` with an NF4 bitsandbytes
    quantization config (bfloat16 compute dtype) and ``device_map="auto"``.
    The tokenizer is loaded from the same checkpoint, pads on the right and
    reuses the EOS token as its padding token.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    quant_cfg = BitsAndBytesConfig(
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        load_in_4bit=True,
    )

    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quant_cfg,
        device_map="auto",
        use_safetensors=True,
        trust_remote_code=True,
    )

    base_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Pad on the right, using EOS as the padding token.
    base_tokenizer.padding_side = "right"
    base_tokenizer.pad_token = base_tokenizer.eos_token

    return base_model, base_tokenizer

In [11]:
# Load the quantized base model and the base checkpoint's tokenizer.
# NOTE(review): this rebinds the global `tokenizer`, which
# generate_training_prompt also reads; before this cell it refers to the
# tokenizer loaded from "dyumat/mistral-7b-chat-pdf". Any prompt rendered
# after this point uses the base tokenizer instead — confirm this is intended.
model, tokenizer = create_model_and_tokenizer()
# Turn off the generation cache for training — presumably because gradient
# checkpointing is enabled in the training arguments; confirm.
model.config.use_cache = False
# Display the effective quantization settings as the cell output.
model.config.quantization_config.to_dict()

Loading checkpoint shards: 100%|██████████| 2/2 [00:21<00:00, 10.85s/it]


{'quant_method': <QuantizationMethod.BITS_AND_BYTES: 'bitsandbytes'>,
 '_load_in_8bit': False,
 '_load_in_4bit': True,
 'llm_int8_threshold': 6.0,
 'llm_int8_skip_modules': None,
 'llm_int8_enable_fp32_cpu_offload': False,
 'llm_int8_has_fp16_weight': False,
 'bnb_4bit_quant_type': 'nf4',
 'bnb_4bit_use_double_quant': False,
 'bnb_4bit_compute_dtype': 'bfloat16',
 'load_in_4bit': True,
 'load_in_8bit': False}

In [12]:
# LoRA hyper-parameters: adapter rank, alpha and dropout.
lora_r = 16
lora_alpha = 16
lora_dropout = 0.1

# Names of the projection modules that receive an adapter.
lora_target_modules = [
    "q_proj", "up_proj", "o_proj", "k_proj",
    "down_proj", "gate_proj", "v_proj",
]

# Adapter configuration handed to the trainer; bias terms are left untouched.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
)

## Training

In [13]:
# Directory that receives training checkpoints and the final adapter.
# The default is a cluster-specific scratch path; set the
# MISTRAL_RAG_OUTPUT_DIR environment variable to run the notebook elsewhere
# without editing this cell.
OUTPUT_DIR = os.environ.get(
    "MISTRAL_RAG_OUTPUT_DIR",
    "/scratch/engin_root/engin1/asaklani/experiments/mistral-rag-pdf-1",
)

In [14]:
training_arguments = TrainingArguments(
    # --- where results go -------------------------------------------------
    output_dir=OUTPUT_DIR,
    report_to="tensorboard",
    logging_steps=1,
    save_strategy="epoch",
    save_safetensors=True,
    # --- batching ---------------------------------------------------------
    # 4 sequences x 16 accumulation steps = 64 sequences per optimizer step
    # on each device.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=16,
    gradient_checkpointing=True,
    group_by_length=True,
    # --- optimization -----------------------------------------------------
    num_train_epochs=1,
    optim="paged_adamw_8bit",
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_grad_norm=0.3,
    fp16=False,
    # --- evaluation -------------------------------------------------------
    evaluation_strategy="steps",
    eval_steps=0.5,
    eval_accumulation_steps=16,
    # --- reproducibility --------------------------------------------------
    seed=42,
)

In [15]:
# Supervised fine-tuning trainer: wraps `model` with the LoRA adapter config
# and trains on the pre-rendered "text" column of each split.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_arguments,
    train_dataset=data["train"],
    eval_dataset=data["validation"],
    dataset_text_field="text",
    max_seq_length=4096,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# Run fine-tuning, then persist the trained weights.
# The merge cell reloads the result from OUTPUT_DIR, so save_model() is
# presumably writing to the trainer's configured output_dir — confirm.
trainer.train()
trainer.save_model()



Step,Training Loss,Validation Loss


In [None]:
# AutoPeftModelForCausalLM is imported in the notebook's top import cell.

# Local directory that receives the merged model and the tokenizer files.
MERGED_MODEL_DIR = "mistral-7b-chat-pdf"

# Reload the fine-tuned adapter (and its base model) from the training output.
trained_model = AutoPeftModelForCausalLM.from_pretrained(
    OUTPUT_DIR,
    low_cpu_mem_usage=True,
)

# Fold the adapter weights into the base model and save a standalone copy.
merged_model = trained_model.merge_and_unload()
merged_model.save_pretrained(MERGED_MODEL_DIR, safe_serialization=True)
# NOTE(review): `tokenizer` is whichever object the global name is bound to
# when this cell runs (the base tokenizer after create_model_and_tokenizer()
# has been called) — confirm it is the one meant to ship with the model.
tokenizer.save_pretrained(MERGED_MODEL_DIR)

In [65]:
# Publish to the Hub repository "mistral-7b-chat-pdf".
# Uploading the merged model weights is currently disabled; only the
# tokenizer is pushed:
# merged_model.push_to_hub("mistral-7b-chat-pdf")
tokenizer.push_to_hub("mistral-7b-chat-pdf")

README.md: 100%|██████████| 5.18k/5.18k [00:00<00:00, 15.8MB/s]


CommitInfo(commit_url='https://huggingface.co/dyumat/mistral-7b-chat-pdf/commit/414257bf380fa0c24a660983362e82af28e379ad', commit_message='Upload tokenizer', commit_description='', oid='414257bf380fa0c24a660983362e82af28e379ad', pr_url=None, pr_revision=None, pr_num=None)