In [1]:
import json
import re
from pprint import pprint

import pandas as pd
import numpy as np
import torch

from datasets import Dataset, load_dataset
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
MODEL_NAME= "dyumat/mistral-7b-chat-pdf"
DATASET_NAME = "dyumat/databricks-dolly-5k-rag-split"

In [3]:
data = load_dataset(DATASET_NAME,trust_remote_code=True)

In [4]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer.json: 100%|██████████| 1.80M/1.80M [00:00<00:00, 17.4MB/s]


In [5]:
DEFAULT_SYSTEM_PROMPT = """
You are a helpful AI assistant. Answer the user questions based only on the context provided.
Be respectful.
""".strip()

In [8]:
def clean_text(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@[^\s]+", "", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\^[^ ]+", "", text)

    return text

def generate_test_prompt(
    context: str, question: str,answer:str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    empty_str =" "
    cntxt = "<<CONTEXT>>"
    msg = [{"role":"system", "content": DEFAULT_SYSTEM_PROMPT},
         {"role":"user","content": f'{clean_text(question) if question else empty_str}\n <<CONTEXT>> \n {clean_text(context) if context else empty_str}\n<</CONTEXT>>'},]
    return tokenizer.apply_chat_template(msg,tokenize=False,add_generation_prompt=True)

def generate_text(data_point):
        return {"text":generate_test_prompt(context=data_point["context"],question=data_point["instruction"],answer=data_point["response"])}


In [9]:
test_data = data["test"][np.random.randint(len(data["test"]))]
print(generate_test_prompt(context=test_data["context"],question=test_data["instruction"],answer=test_data["response"]))

<s>[INST] <<SYS>>
You are a helpful AI assistant. Answer the user questions based only on the context provided.
Be respectful.
<</SYS>>

Given this paragraph about the planet Jupiter, tell me what it is made of.
 <<CONTEXT>> 
 Jupiter's upper atmosphere is about 90% hydrogen and 10% helium by volume. Since helium atoms are more massive than hydrogen molecules, Jupiter's atmosphere is approximately 24% helium by mass. The atmosphere also contains trace amounts of methane, water vapour, ammonia, and silicon-based compounds. There are also fractional amounts of carbon, ethane, hydrogen sulfide, neon, oxygen, phosphine, and sulfur. The outermost layer of the atmosphere contains crystals of frozen ammonia. Through infrared and ultraviolet measurements, trace amounts of benzene and other hydrocarbons have also been found. The interior of Jupiter contains denser materials—by mass it is roughly 71% hydrogen, 24% helium, and 5% other elements.
<</CONTEXT>> [/INST]


In [10]:
def process_dataset(data: Dataset):
    return (
        data.shuffle(seed=15445)
        .map(generate_text)
        .remove_columns("category")
    )
data["test"] = process_dataset(data["test"])

Map: 100%|██████████| 259/259 [00:01<00:00, 252.14 examples/s]


### Prepare model

In [11]:
def create_model_and_tokenizer():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        use_safetensors=True,
        trust_remote_code=True,
        quantization_config=bnb_config,
        device_map="auto",
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return model, tokenizer

In [12]:
model, tokenizer = create_model_and_tokenizer()
model.config.use_cache = False
model.config.quantization_config.to_dict()

config.json: 100%|██████████| 642/642 [00:00<00:00, 4.90MB/s]
model.safetensors.index.json: 100%|██████████| 23.9k/23.9k [00:00<00:00, 108MB/s]
Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]
model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s][A
model-00001-of-00003.safetensors:   0%|          | 10.5M/4.94G [00:00<01:26, 57.3MB/s][A
model-00001-of-00003.safetensors:   0%|          | 21.0M/4.94G [00:00<02:01, 40.5MB/s][A
model-00001-of-00003.safetensors:   1%|          | 41.9M/4.94G [00:00<01:12, 67.8MB/s][A
model-00001-of-00003.safetensors:   1%|          | 52.4M/4.94G [00:00<01:06, 73.8MB/s][A
model-00001-of-00003.safetensors:   1%|▏         | 73.4M/4.94G [00:01<01:08, 71.5MB/s][A
model-00001-of-00003.safetensors:   2%|▏         | 83.9M/4.94G [00:01<01:47, 45.4MB/s][A
model-00001-of-00003.safetensors:   2%|▏         | 105M/4.94G [00:01<01:22, 58.7MB/s] [A
model-00001-of-00003.safetensors:   2%|▏         | 115M/4.94G [00:02<01:25, 56.6MB/s][

{'quant_method': <QuantizationMethod.BITS_AND_BYTES: 'bitsandbytes'>,
 '_load_in_8bit': False,
 '_load_in_4bit': True,
 'llm_int8_threshold': 6.0,
 'llm_int8_skip_modules': None,
 'llm_int8_enable_fp32_cpu_offload': False,
 'llm_int8_has_fp16_weight': False,
 'bnb_4bit_quant_type': 'nf4',
 'bnb_4bit_use_double_quant': False,
 'bnb_4bit_compute_dtype': 'bfloat16',
 'load_in_4bit': True,
 'load_in_8bit': False}

In [13]:
lora_r = 16
lora_alpha = 16
lora_dropout = 0.1
lora_target_modules = [
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj",
]


peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)

### Inference

In [44]:
text = data["test"]["text"][3] + "\n[ASST]"
text

"<s>[INST] <<SYS>>\nYou are a helpful AI assistant. Answer the user questions based only on the context provided.\nBe respectful.\n<</SYS>>\n\nGive a comma separated list of all the people listed in this passage about Jesse LaFollette\n <<CONTEXT>> \n LaFollette was born near Morristown, New Jersey in 1781. His father, Joseph, and grandfather, Jean, were Huguenots who had escaped the persecution in France, traveling first to Jersey and then to the colonies where they operated a small farm near the Wallkill River in northern New Jersey. Jean was killed during the French and Indian Wars. Joseph married Phoebe Gobel of Morristown, New Jersey, whose father's farm along with other neighboring farms in Jockey Hollow was used by George Washington and his troops during the winter of 1780. After serving with Count Casimir Pulaski during the Revolutionary War, Joseph and his family joined the pioneers who trekked westward through the Cumberland Gap.\n<</CONTEXT>> [/INST]\n[ASST]"

In [45]:
inputs = tokenizer(text, return_tensors="pt").input_ids

In [49]:
outputs = model.generate(inputs, max_new_tokens=200, do_sample=True, top_k=50, top_p=0.95)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [50]:
ans = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

In [51]:
print(ans)

[INST] <<SYS>>
You are a helpful AI assistant. Answer the user questions based only on the context provided.
Be respectful.
<</SYS>>

Give a comma separated list of all the people listed in this passage about Jesse LaFollette
 <<CONTEXT>> 
 LaFollette was born near Morristown, New Jersey in 1781. His father, Joseph, and grandfather, Jean, were Huguenots who had escaped the persecution in France, traveling first to Jersey and then to the colonies where they operated a small farm near the Wallkill River in northern New Jersey. Jean was killed during the French and Indian Wars. Joseph married Phoebe Gobel of Morristown, New Jersey, whose father's farm along with other neighboring farms in Jockey Hollow was used by George Washington and his troops during the winter of 1780. After serving with Count Casimir Pulaski during the Revolutionary War, Joseph and his family joined the pioneers who trekked westward through the Cumberland Gap.
<</CONTEXT>> [/INST]
[ASST]
Joseph, Jean, Phoebe Gobel, C

In [56]:
from transformers import pipeline
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=500)

In [58]:
result = pipe(text)
print(result[0])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


{'generated_text': "<s>[INST] <<SYS>>\nYou are a helpful AI assistant. Answer the user questions based only on the context provided.\nBe respectful.\n<</SYS>>\n\nGive a comma separated list of all the people listed in this passage about Jesse LaFollette\n <<CONTEXT>> \n LaFollette was born near Morristown, New Jersey in 1781. His father, Joseph, and grandfather, Jean, were Huguenots who had escaped the persecution in France, traveling first to Jersey and then to the colonies where they operated a small farm near the Wallkill River in northern New Jersey. Jean was killed during the French and Indian Wars. Joseph married Phoebe Gobel of Morristown, New Jersey, whose father's farm along with other neighboring farms in Jockey Hollow was used by George Washington and his troops during the winter of 1780. After serving with Count Casimir Pulaski during the Revolutionary War, Joseph and his family joined the pioneers who trekked westward through the Cumberland Gap.\n<</CONTEXT>> [/INST]\n[ASS