In [2]:
import json
import re
from pprint import pprint

import pandas as pd
import torch

from datasets import Dataset, load_dataset,load_from_disk
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

from utils import *

# Preferred torch device string: the first CUDA GPU when one is available, otherwise the CPU.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"


In [4]:
# Hugging Face Hub identifier of the base checkpoint; used for both the tokenizer and the model.
MODEL_NAME= "mistralai/Mistral-7B-Instruct-v0.2"

In [15]:
# Load the RAG dataset; it exposes 'context', 'question' and 'answer' columns
# in a 'train' and a 'test' split (see the cell output below).
data = load_dataset('neural-bridge/rag-dataset-12000',trust_remote_code=True)

In [16]:
# Show the available splits, their columns and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 9600
    })
    test: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 2400
    })
})

In [6]:
# Convenience handle on the raw training split (only referenced by the commented-out cell below).
train_data = data['train']

In [10]:
# Instruction text prepended to every user turn. It tells the model to answer only from the
# text wrapped in [CONTEXT] ... [/CONTEXT] and to refuse when the question is unrelated.
# The refusal sentence reads "Information is not found ..." (typo "in" corrected), because
# this text is part of every training example.
DEFAULT_SYSTEM_PROMPT = """
You are a helpful AI assistant. Answer the user questions based only on the context provided between [CONTEXT] and [/CONTEXT] tags. 
If the question is not relevant to the context, do not answer anything and say Information is not found in the context provided.
Be concise.
""".strip()

In [None]:
# train_dataset = []
# empty_str = " "
# for i,data in enumerate(train_data):
#     # print(i)
#     train_dataset.append(
#         [{"role":"user","content": f'{DEFAULT_SYSTEM_PROMPT} \n [CONTEXT] \n {clean_text(data["context"]) if data["context"] else empty_str} [/CONTEXT] \n { clean_text(data["question"]) if data["question"] else empty_str}'},
#         {"role":"assistant","content":clean_text(data["answer"]) if data["answer"]  else empty_str}
#                     ]
#     )
    
    

In [8]:
# Tokenizer of the base checkpoint; generate_training_prompt uses its chat template
# to render the training text.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [9]:
def generate_training_prompt(
    context: str, question: str,answer:str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    """Render one (context, question, answer) triple as a chat-formatted training string.

    The user turn is made of the system prompt, the cleaned context wrapped in
    ``[CONTEXT]`` / ``[/CONTEXT]`` tags and the cleaned question; the assistant turn
    is the cleaned answer. The two turns are rendered with the module-level
    ``tokenizer``'s chat template.

    Args:
        context: Passage the answer must be grounded in. Falsy values become a single space.
        question: User question. Falsy values become a single space.
        answer: Reference answer. Falsy values become a single space.
        system_prompt: Instruction text placed at the start of the user turn.
            Defaults to ``DEFAULT_SYSTEM_PROMPT``.

    Returns:
        The rendered conversation as text (not token ids).
    """
    empty_str = " "
    context_text = clean_text(context) if context else empty_str
    question_text = clean_text(question) if question else empty_str
    answer_text = clean_text(answer) if answer else empty_str

    messages = [
        {
            "role": "user",
            # Honour the caller-supplied system prompt instead of always using the default.
            "content": f"{system_prompt} \n [CONTEXT] \n {context_text} [/CONTEXT] \n {question_text}",
        },
        {"role": "assistant", "content": answer_text},
    ]
    # The conversation already ends with the assistant answer, so no generation prompt
    # (an opening for a further assistant turn) must be appended to the training text.
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )


In [11]:

def clean_text(text):
    """Strip noisy tokens from ``text`` and collapse its whitespace.

    The substitutions run in the listed order:

    1. remove anything starting with ``http`` up to the next whitespace,
    2. remove ``@``-prefixed tokens,
    3. collapse every whitespace run into a single space,
    4. remove ``^``-prefixed tokens (up to the next space).

    Because step 4 runs after the whitespace collapse, removing a ``^`` token can
    leave two adjacent spaces behind.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r"http\S+", ""),
        (r"@[^\s]+", ""),
        (r"\s+", " "),
        (r"\^[^ ]+", ""),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def generate_text(data_point):
    """Build the ``text`` field for one dataset row.

    Args:
        data_point: Mapping with ``"context"``, ``"question"`` and ``"answer"`` entries.

    Returns:
        A dict with a single ``"text"`` key holding the rendered training prompt.
    """
    prompt = generate_training_prompt(
        context=data_point["context"],
        question=data_point["question"],
        answer=data_point["answer"],
    )
    return {"text": prompt}

     


In [17]:
def process_dataset(data: Dataset):
    """Shuffle a split with a fixed seed and add the rendered ``text`` column.

    Args:
        data: Split whose rows provide the fields read by ``generate_text``.

    Returns:
        The result of mapping ``generate_text`` over the shuffled split.
    """
    shuffled = data.shuffle(seed=42)
    return shuffled.map(generate_text)

# Replace both original splits in place with their shuffled, prompt-rendered versions.
data["train"] = process_dataset(data["train"])
data["test"] = process_dataset(data["test"])

In [18]:
# Hold out 10% of the training split for validation. The split is seeded so the
# train/validation partition is the same on every run.
# NOTE: this cell overwrites data["train"]; re-running it without reloading the
# dataset shrinks the training split again.
ds = data["train"].train_test_split(test_size=0.1, shuffle=True, seed=42)
data["train"] = ds["train"]
data["validation"] = ds["test"]

In [20]:
# Persist all splits (train / test / validation) under datasets/rag.
data.save_to_disk("datasets/rag")

Saving the dataset (1/1 shards): 100%|██████████| 8640/8640 [00:01<00:00, 7817.50 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 2400/2400 [00:00<00:00, 12992.40 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 960/960 [00:00<00:00, 9339.01 examples/s]


In [None]:
def create_model_and_tokenizer():
    """Load ``MODEL_NAME`` as a 4-bit quantized causal LM together with its tokenizer.

    The model is loaded with NF4 4-bit quantization (float16 compute dtype),
    safetensors weights and automatic device placement. The tokenizer reuses its
    EOS token as the padding token and pads on the right.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quantization,
        device_map="auto",
        use_safetensors=True,
        trust_remote_code=True,
    )

    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Reuse EOS as the padding token and pad on the right-hand side.
    tok.pad_token = tok.eos_token
    tok.padding_side = "right"

    return base_model, tok

In [None]:
# Instantiate the quantized model; this also rebinds the module-level `tokenizer`
# to the padded-right variant returned by the factory.
model, tokenizer = create_model_and_tokenizer()
# Turn off the generation cache on the model config
# (presumably because gradient checkpointing is enabled for training below — confirm).
model.config.use_cache = False

In [None]:
# Inspect the quantization settings actually attached to the loaded model.
model.config.quantization_config.to_dict()

In [None]:
# LoRA hyper-parameters: adapter rank, scaling factor and dropout applied to the adapters.
lora_r = 8
lora_alpha = 16
lora_dropout = 0.1
# Names of the modules that receive LoRA adapters
# (presumably the attention and MLP projection layers of the base model — confirm against the architecture).
lora_target_modules = [
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj",
]


# Adapter configuration handed to SFTTrainer; no bias terms are trained.
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)

## Training

In [None]:
OUTPUT_DIR = "experiments/gemma_2b_dqa"

%load_ext tensorboard
%tensorboard --logdir experiments/runs

In [None]:
# Hyper-parameters for the supervised fine-tuning run.
training_arguments = TrainingArguments(
    # One sequence per device, accumulated over 16 steps -> 16 sequences per optimizer update per device.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    gradient_checkpointing = True,
    eval_accumulation_steps=1,
    optim="paged_adamw_8bit",
    logging_steps=1,
    learning_rate=1e-4,
    fp16=True,
    max_grad_norm=0.3,
    num_train_epochs=2,
    evaluation_strategy="steps",
    # Fractional value: per the transformers documentation a value below 1 is read as a
    # ratio of the total training steps — confirm for the installed version.
    eval_steps=0.2,
    warmup_ratio=0.05,
    save_strategy="epoch",
    group_by_length=True,
    output_dir=OUTPUT_DIR,
    report_to="tensorboard",
    save_safetensors=True,
    lr_scheduler_type="cosine",
    seed=42,
)

In [None]:
# Supervised fine-tuning trainer: trains LoRA adapters (peft_config) on the "text"
# column of the train split and evaluates on the held-out validation split.
trainer = SFTTrainer(
    model=model,
    train_dataset=data["train"],
    eval_dataset=data["validation"],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=2048,
    tokenizer=tokenizer,
    args=training_arguments,
)

In [None]:
# Run the fine-tuning loop, then write the final model to the trainer's output directory (OUTPUT_DIR).
trainer.train()
trainer.save_model()

In [None]:
from peft import AutoPeftModelForCausalLM

# Reload the base model together with the adapter that trainer.save_model() wrote to OUTPUT_DIR.
trained_model = AutoPeftModelForCausalLM.from_pretrained(
    OUTPUT_DIR,
    low_cpu_mem_usage=True,
)

# Fold the adapter weights into the reloaded PEFT model. The merge must run on
# `trained_model`, not on `model`, which is the base model object that was loaded
# 4-bit quantized for training.
merged_model = trained_model.merge_and_unload()
# Export the merged weights and the tokenizer side by side so the directory is self-contained.
merged_model.save_pretrained("merged_model", safe_serialization=True)
tokenizer.save_pretrained("merged_model")

In [None]:
def generate_prompt(
    conversation: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    """Format an instruction-style prompt with Instruction / Input / Response sections.

    NOTE(review): this layout differs from the chat template used by
    ``generate_training_prompt``; confirm which format inference is meant to use.

    Args:
        conversation: Text placed under the ``### Input:`` header; surrounding
            whitespace is stripped.
        system_prompt: Text placed after ``### Instruction:``.

    Returns:
        The assembled prompt with leading and trailing whitespace removed.
    """
    sections = (
        f"### Instruction: {system_prompt}",
        "",
        "### Input:",
        conversation.strip(),
        "",
        "### Response:",
        "",
    )
    return "\n".join(sections).strip()