In [16]:
import json
import re
from pprint import pprint

import pandas as pd
import numpy as np
import torch

from datasets import Dataset, load_dataset
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

# Torch device string: the first CUDA GPU when one is available, otherwise the CPU.
# NOTE(review): DEVICE is not referenced by any other cell visible in this notebook.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

In [83]:
# Hub identifier of the base causal language model that gets fine-tuned.
MODEL_NAME = "mistralai/Mistral-7B-v0.1"
# Hub identifier of the instruction/context/response dataset used for training.
DATASET_NAME = "dyumat/databricks-dolly-5k-rag-split"

## Load Dataset


In [84]:
# Download (or load from cache) every split of the dataset named in DATASET_NAME.
# NOTE(review): trust_remote_code=True lets a loading script from the dataset
# repository run locally -- confirm this dataset actually needs it.
data = load_dataset(
    DATASET_NAME,
    trust_remote_code=True,
)

In [85]:
# Tokenizer of the base model; the cells below attach a chat template to it and
# use it to render the training prompts as plain text.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [86]:
# NOTE(review): this is a bare string expression -- it is only echoed as the cell
# output and is never assigned or used. It appears to be an alternative chat
# template based on <|user|>/<|system|>/<|assistant|> markers, kept for reference.
"{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"

"{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"

In [87]:
# Custom Jinja chat template assigned to the tokenizer. As written it:
#   * folds an optional leading system message into the first user turn,
#     wrapped in <<SYS>> ... <</SYS>>;
#   * renders user turns as bos_token + "[INST] <content> [/INST]";
#   * renders assistant turns as "\n[ASST] <content> " followed by eos_token;
#   * calls raise_exception when roles do not alternate user/assistant;
#   * appends "\n[ASST]" when add_generation_prompt is set and the last message
#     is not an assistant message.
template = "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ '\n[ASST] '  + content.strip() + ' ' + eos_token }}{% endif %}\n{% if loop.last and add_generation_prompt and message['role'] != 'assistant' %}\n{{ '\n[ASST]' }}\n{% endif %}\n{% endfor %}"
tokenizer.chat_template = template

In [88]:
# Default system message used when building training prompts (two lines, no
# leading or trailing whitespace).
DEFAULT_SYSTEM_PROMPT = (
    "You are a helpful AI assistant. "
    "Answer the user questions based only on the context provided.\n"
    "Be concise and respectful."
)

In [89]:
def clean_text(text: str) -> str:
    """Remove noisy tokens from ``text`` and normalise its whitespace.

    The following are deleted, in order:

    * anything starting with ``http`` up to the next whitespace (URLs),
    * ``@`` followed by non-whitespace characters (mentions),
    * ``^`` followed by non-whitespace characters (caret-prefixed markers).

    Every run of whitespace is then collapsed to a single space. The collapse
    runs last so that deleting a token in the middle of a sentence cannot
    leave two consecutive spaces behind.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string. Leading/trailing single spaces are not stripped.
    """
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\S+", "", text)
    # \S (rather than "not a space") keeps the match inside one token even
    # though newlines and tabs have not been collapsed yet.
    text = re.sub(r"\^\S+", "", text)
    return re.sub(r"\s+", " ", text)

def generate_training_prompt(
    context: str, question: str, answer: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    """Render one (context, question, answer) triple as chat-formatted text.

    The conversation has three messages: a system message, a user message
    holding the question followed by the context wrapped in
    ``<<CONTEXT>> ... <</CONTEXT>>``, and an assistant message holding the
    answer. Each field is passed through ``clean_text``; a falsy field
    (``None`` or ``""``) is replaced by a single space.

    Args:
        context: Supporting passage for the question.
        question: The user instruction/question.
        answer: The reference response.
        system_prompt: System message to use; defaults to
            ``DEFAULT_SYSTEM_PROMPT``.

    Returns:
        The conversation rendered as a string by the notebook-level
        ``tokenizer``'s chat template (not tokenized).
    """
    placeholder = " "
    question_text = clean_text(question) if question else placeholder
    context_text = clean_text(context) if context else placeholder
    answer_text = clean_text(answer) if answer else placeholder

    messages = [
        # Honour the caller-supplied system prompt instead of always using the default.
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": f"{question_text}\n <<CONTEXT>> \n {context_text}\n<</CONTEXT>>",
        },
        {"role": "assistant", "content": answer_text},
    ]
    # The sample already ends with the assistant answer, so no generation
    # prompt is requested for training text.
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )

def generate_text(data_point):
    """Build the ``text`` column for one dataset row.

    Reads the ``context``, ``instruction`` and ``response`` fields of
    ``data_point`` and returns ``{"text": <rendered training prompt>}``.
    """
    prompt = generate_training_prompt(
        context=data_point["context"],
        question=data_point["instruction"],
        answer=data_point["response"],
    )
    return {"text": prompt}


In [90]:
# Sanity check: pick one training row at random and print its rendered prompt.
# NOTE(review): the index comes from NumPy's global RNG and no seed is set in
# the cells visible here, so the example shown differs between runs.
train_data = data["train"][np.random.randint(len(data["train"]))]
print(
    generate_training_prompt(
        context=train_data["context"],
        question=train_data["instruction"],
        answer=train_data["response"],
    )
)

<s>[INST] <<SYS>>
You are a helpful AI assistant. Answer the user questions based only on the context provided.
Be concise and respectful.
<</SYS>>

Given these paragraphs about Multiomics, what is panomics?
 <<CONTEXT>> 
 Multiomics, multi-omics, integrative omics, "panomics" or "pan-omics" is a biological analysis approach in which the data sets are multiple "omes", such as the genome, proteome, transcriptome, epigenome, metabolome, and microbiome (i.e., a meta-genome and/or meta-transcriptome, depending upon how it is sequenced); in other words, the use of multiple omics technologies to study life in a concerted way. By combining these "omes", scientists can analyze complex biological big data to find novel associations between biological entities, pinpoint relevant biomarkers and build elaborate markers of disease and physiology. In doing so, multiomics integrates diverse omics data to find a coherently matching geno-pheno-envirotype relationship or association. The OmicTools servi

In [91]:
def process_dataset(data: Dataset) -> Dataset:
    """Shuffle a split, add the rendered ``text`` column and drop ``category``.

    Args:
        data: One dataset split whose rows carry the fields that
            ``generate_text`` reads.

    Returns:
        The shuffled split (fixed seed 15445) with a ``text`` column added and
        the ``category`` column removed when present.
    """
    processed = data.shuffle(seed=15445).map(generate_text)
    # The notebook overwrites data["train"] / data["validation"] with this
    # function's output, so on a re-run the column is already gone. Dropping
    # it unconditionally would then raise; guard to keep the cell re-runnable.
    if "category" in processed.column_names:
        processed = processed.remove_columns("category")
    return processed
# Replace both splits in place with their processed versions.
for split_name in ("train", "validation"):
    data[split_name] = process_dataset(data[split_name])

Map: 100%|██████████| 259/259 [00:00<00:00, 4216.33 examples/s]


### Prepare model

In [93]:
def create_model_and_tokenizer():
    """Load the quantized base model together with a training-ready tokenizer.

    The model named by ``MODEL_NAME`` is loaded with a 4-bit NF4
    ``BitsAndBytesConfig`` (bfloat16 compute dtype) and ``device_map="auto"``.
    The tokenizer is loaded fresh, given the notebook's custom chat template,
    uses the EOS token as padding token and pads on the right.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    # NOTE(review): trust_remote_code=True allows code shipped in the model
    # repository to run locally -- confirm it is needed for this checkpoint.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        use_safetensors=True,
        trust_remote_code=True,
        quantization_config=bnb_config,
        device_map="auto",
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # A freshly loaded tokenizer does not have the chat template that was
    # assigned to the earlier tokenizer instance. Re-apply the notebook-level
    # `template` so the tokenizer returned here (and later saved with the
    # merged model) formats conversations the same way as the training text.
    tokenizer.chat_template = template
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return model, tokenizer

In [95]:
model, tokenizer = create_model_and_tokenizer()
# Turn off the generation cache on the model config for training
# (gradient checkpointing is enabled in the training arguments further down).
model.config.use_cache = False

ImportError: Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes: `pip install -i https://pypi.org/simple/ bitsandbytes`

In [None]:
# Display the quantization settings stored on the loaded model's config as a plain dict.
model.config.quantization_config.to_dict()

In [None]:
# LoRA hyper-parameters, kept as named values so they are easy to tune.
lora_r = 8
lora_alpha = 16
lora_dropout = 0.1

# Names of the modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "up_proj", "o_proj", "k_proj",
    "down_proj", "gate_proj", "v_proj",
]

# Adapter configuration handed to the trainer further down.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
)

## Training

In [None]:
OUTPUT_DIR = "experiments/gemma_2b_dqa"

%load_ext tensorboard
%tensorboard --logdir experiments/runs

In [None]:
# Hyper-parameters for the fine-tuning run, grouped by concern.
# NOTE(review): fp16 mixed precision is requested here while the quantization
# config uses bfloat16 as compute dtype -- confirm the mismatch is intended.
training_arguments = TrainingArguments(
    # Where results go and how they are reported.
    output_dir=OUTPUT_DIR,
    report_to="tensorboard",
    logging_steps=1,
    save_strategy="epoch",
    save_safetensors=True,
    # Batching and memory.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    gradient_checkpointing=True,
    eval_accumulation_steps=1,
    group_by_length=True,
    fp16=True,
    # Optimisation schedule.
    optim="paged_adamw_8bit",
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_grad_norm=0.3,
    num_train_epochs=2,
    # Evaluation cadence (eval_steps is given as a float below 1).
    evaluation_strategy="steps",
    eval_steps=0.2,
    # Reproducibility.
    seed=42,
)

In [None]:
# Supervised fine-tuning trainer: trains LoRA adapters on the rendered "text"
# column of the processed splits.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=data["train"],
    eval_dataset=data["validation"],
    dataset_text_field="text",
    max_seq_length=2048,
)

In [None]:
# Run the fine-tuning loop, then persist the result. save_model() is called
# without a path -- presumably it writes to the trainer's output_dir (OUTPUT_DIR).
trainer.train()
trainer.save_model()

In [None]:
from peft import AutoPeftModelForCausalLM

# Reload the fine-tuned adapter (and its base model) from the training output directory.
trained_model = AutoPeftModelForCausalLM.from_pretrained(
    OUTPUT_DIR,
    low_cpu_mem_usage=True,
)

# Merge the adapter weights of the model that was just reloaded. The previous
# code called merge_and_unload() on `model`, leaving `trained_model` unused.
merged_model = trained_model.merge_and_unload()
merged_model.save_pretrained("merged_model", safe_serialization=True)
# Store the tokenizer next to the merged weights so the folder is self-contained.
tokenizer.save_pretrained("merged_model")