<div class="alert alert-block alert-info">
⚠️ We will use ml.g5.2xlarge to run this notebook
</div>

In [2]:
!python3 -m pip install -q accelerate==0.20.3 transformers==4.33.0 gradio bitsandbytes accelerate google-search-results sentencepiece

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import gc
from threading import Thread
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from transformers import GenerationConfig, TextIteratorStreamer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Hugging Face Hub id of the checkpoint that the model and tokenizer cells load.
MODEL_ID = "NousResearch/Nous-Hermes-Llama2-13b"
# MODEL_ID = "tiiuae/falcon-40b-instruct"

In [5]:
# bitsandbytes settings: 4-bit NF4 weights with double quantization,
# using bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# device_map="auto" leaves the placement of the weights to the library.
# Alternatives tried while authoring: load_in_8bit=True, trust_remote_code=True.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=bnb_config,
)

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Loading checkpoint shards: 100%|██████████| 3/3 [01:27<00:00, 29.12s/it]


In [6]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [7]:
def generate_dolly_like_prompt(user_question, user_context):
    """
    Build a Dolly-style prompt for the model.

    The prompt has an instruction section (a fixed assistant preamble followed
    by the user's question), an optional input section holding the context,
    and an empty response section for the model to complete. Sections are
    separated by a blank line.
    """
    system_preamble = "You are an assistant for question-answering tasks. You are helpful, friendly and only answer the question you are asked."

    sections = [f"### Instruction:\n{system_preamble}\n\n{user_question}"]
    if user_context:
        # A falsy context (None or an empty string) is left out entirely.
        sections.append(f"### Input:\n{user_context}")
    sections.append("### Response:\n")

    return "\n\n".join(sections)

def tokenize(tokenizer, prompt, device=None):
    """
    Tokenize a prompt and place its input ids on the target device.

    Args:
        tokenizer: callable tokenizer; it is called as
            ``tokenizer(prompt, return_tensors="pt")`` and its result must
            expose ``.input_ids``.
        prompt: text to encode.
        device: device to move the input ids to. When None, the device of the
            notebook-level ``model`` is used, so that model must already be
            loaded.

    Returns:
        A ``(tokenized, input_ids)`` tuple: the tokenizer output exactly as
        returned by the tokenizer, and the input ids moved to ``device``.
    """
    if device is None:
        # Backward-compatible default: follow the globally loaded model.
        device = model.device

    tokenized = tokenizer(prompt, return_tensors="pt")
    input_ids = tokenized.input_ids.to(device)

    return tokenized, input_ids

### Convert Input into a Prompt

In [8]:
# First example: a bare question with no supporting context, so the prompt
# will not contain an "### Input:" section.
user_question = "How can learn to drive a car?"
user_context = None

prompt = generate_dolly_like_prompt(user_question=user_question, user_context=user_context)

In [9]:
# print() renders the newlines in the prompt, so the section layout is visible.
print(prompt)

### Instruction:
You are an assistant for question-answering tasks. You are helpful, friendly and only answer the question you are asked.

How can learn to drive a car?

### Response:



### Tokenize

In [10]:
# Only the input ids are needed here; the full tokenizer output is discarded.
_, input_ids = tokenize(tokenizer=tokenizer, prompt=prompt)

In [11]:
# Show the token ids for the prompt and the device they were moved to.
print(input_ids)

tensor([[    1,   835,  2799,  4080, 29901,    13,  3492,   526,   385, 20255,
           363,  1139, 29899, 12011,   292,  9595, 29889,   887,   526,  8444,
         29892, 19780,   322,   871,  1234,   278,  1139,   366,   526,  4433,
         29889,    13,    13,  5328,   508,  5110,   304,  7899,   263,  1559,
         29973,    13,    13,  2277, 29937, 13291, 29901,    13]],
       device='cuda:0')


## Generate Response as a Blocking Process

When you prompt a model to generate a response, it takes time — sometimes several minutes, depending on the size of the model.

In [12]:
# Blocking call: nothing is returned until generation has finished.
with torch.inference_mode():
    outputs = model.generate(
        input_ids=input_ids,
        temperature=0.01,  # with do_sample=True this is close to greedy decoding
        top_p=0.95,
        max_new_tokens=512,
        do_sample=True,
        # Reuse cached key/value states between decoding steps instead of
        # re-running attention over the whole sequence for every new token.
        use_cache=True,
        # Same choice as the tokenizer setup (pad_token = eos_token).
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=1 # generate multiple responses from the model with values > 1
    )



In [13]:
# Decode every returned sequence, keep it in `responses`, and print it.
# Each decoded sequence contains the prompt followed by the completion.
responses = []
for i, _output in enumerate(outputs):
    response = tokenizer.decode(
        _output,
        skip_special_tokens=True
    )
    responses.append(response)
    print(f"#################### Response: {i+1} ####################\n")
    print(response)
    print("\n")

#################### Response: 1 ####################

### Instruction:
You are an assistant for question-answering tasks. You are helpful, friendly and only answer the question you are asked.

How can learn to drive a car?

### Response:
 To learn to drive a car, you can follow these steps:

1. Research and understand the laws and requirements for obtaining a driver's license in your area.
2. Find a reputable driving school or instructor who can teach you the basics of driving and help you develop good driving habits.
3. Practice driving in a safe and controlled environment, such as an empty parking lot or a driving range.
4. Gradually move on to driving on quiet streets and in low-traffic areas to gain confidence and experience.
5. Take a driver's education course if one is available in your area.
6. Once you feel comfortable, take a driving test to obtain your driver's license.

Remember to always use caution and be attentive while driving, and to always follow traffic laws and safe

How can we improve the process so that users do not have to wait several seconds to minutes for a response? One option is to stream the response from the model as it is generated.

## Stream your outputs

In [14]:
# create a model generator config
generation_config = GenerationConfig(
    temperature=0.5,
    top_p=0.95,
    max_new_tokens=512,
    do_sample=True,
    use_cache=False,
    num_return_sequences=1
)

In [15]:
# streaming handler to handle tokens output from the model
streamer = TextIteratorStreamer(
    tokenizer, 
    skip_prompt=False, 
    skip_special_tokens=True
)

### Generate a new prompt

In [16]:
# Second example question, again without any supporting context.
user_question = "How can I learn to cook?"

prompt = generate_dolly_like_prompt(user_question=user_question, user_context=None)

In [17]:
# print() renders the newlines in the prompt, so the section layout is visible.
print(prompt)

### Instruction:
You are an assistant for question-answering tasks. You are helpful, friendly and only answer the question you are asked.

How can I learn to cook?

### Response:



In [18]:
# Keep the full tokenizer output this time: its attention mask is passed to
# model.generate together with the input ids.
_tokenized, _input_ids = tokenize(tokenizer=tokenizer, prompt=prompt)

In [19]:
# Keyword arguments for model.generate, collected in a dict so they can be
# handed to the worker thread that runs generation.
generate_kwargs = {
    # prompt
    "input_ids": _input_ids,
    "attention_mask": _tokenized.attention_mask,
    # decoding settings and the streamer that receives the tokens
    "generation_config": generation_config,
    "streamer": streamer,
    # special tokens (padding reuses the end-of-sequence token)
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.eos_token_id,
    "bos_token_id": tokenizer.bos_token_id,
    # output format
    "return_dict_in_generate": True,
    "output_scores": True,
}

### Stream your Output

In [20]:
# model.generate blocks until it is done, so it runs on a worker thread while
# this thread reads text from the streamer as it becomes available.
thread = Thread(target=model.generate, kwargs=generate_kwargs)
thread.start()

# Print each piece of text as soon as it arrives, without adding line breaks.
for new_text in streamer:
    print(new_text, end="")

# The streamer is exhausted; wait for the worker thread to finish.
thread.join()

### Instruction:
You are an assistant for question-answering tasks. You are helpful, friendly and only answer the question you are asked.

How can I learn to cook?

### Response:
To learn to cook, start by mastering basic techniques and recipes. Practice regularly and experiment with different ingredients and flavors. Consider taking cooking classes or watching online tutorials to learn new skills and get inspiration.

In [21]:
# https://modal.com/docs/guide/ex/falcon_bitsandbytes
class StreamingAgent:
    """
    Wrap a causal language model and its tokenizer and stream generated text.

    Either pass ``model_name`` to download and load a 4-bit quantized model and
    its tokenizer, or pass an already loaded ``model`` / ``tokenizer`` pair and
    leave ``model_name`` as None.
    """

    def __init__(
        self,
        model,
        tokenizer,
        model_name=None
    ):
        """
        Args:
            model: already loaded model; used only when ``model_name`` is None.
            tokenizer: tokenizer for ``model``; used only when ``model_name``
                is None.
            model_name: model id to download and load. When given, ``model``
                and ``tokenizer`` are ignored.
        """
        if model_name is not None:
            print(f"Downloading and Loading {model_name}!")
            self.model_name = model_name
            self.quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16
            )

            # load model into memory locally
            self._model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                quantization_config=self.quantization_config,
                device_map="auto"
            )
            self._model.eval()

            # load tokenizer into memory
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.tokenizer.pad_token = self.tokenizer.eos_token

        else:
            print("Loading local model provided by a user ...")
            # Keep a reference under the same attribute name as the download
            # path so both construction paths expose the same attributes.
            self._model = model
            # user provided tokenizer
            self.tokenizer = tokenizer

        # NOTE(review): generate() and .device are reached through the compiled
        # wrapper; whether generation actually benefits from torch.compile
        # here has not been verified.
        self.local_model = torch.compile(self._model)

    def prompt_template(self, user_question):
        """
        Build the prompt sent to the model: the user's question as the
        instruction, a fixed style guideline as the context, and an empty
        answer section for the model to complete.
        """
        user_context = "Answer the question truthfully, honestly and to the point. Also, try to be funny when you answer the question."

        instruction = f"### Instruction\n{user_question}"
        context = f"### Context\n{user_context}" if user_context else None
        response = "### Answer\n"

        prompt = "\n\n".join([i for i in [instruction, context, response] if i is not None])

        return prompt

    def generate(self, user_query: str):
        """
        Stream the model's answer to ``user_query``.

        Generation runs on a worker thread while this generator yields the
        decoded text chunks as the streamer produces them. The prompt itself
        is not echoed.

        Args:
            user_query: the question to answer.

        Yields:
            str: consecutive pieces of the generated answer (a piece may be an
            empty string).
        """
        prompt = self.prompt_template(user_query)

        device = self.local_model.device
        tokenized = self.tokenizer(prompt, return_tensors="pt")
        input_ids = tokenized.input_ids.to(device)
        # Keep the mask on the same device as the ids it describes.
        attention_mask = tokenized.attention_mask.to(device)

        generation_config = GenerationConfig(
            temperature=1.5,
            top_k=120,
            top_p=0.9,
            max_new_tokens=512,
            do_sample=True,
            num_return_sequences=1
        )

        # skip_prompt=True makes the streamer drop the prompt itself, so no
        # per-instance bookkeeping is needed to discard the first chunk.
        streamer = TextIteratorStreamer(
            self.tokenizer,
            skip_prompt=True,
            skip_special_tokens=True
        )

        generate_kwargs = dict(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id,
            bos_token_id=self.tokenizer.bos_token_id,
            attention_mask=attention_mask,
            output_scores=True,
            streamer=streamer,
        )

        thread = Thread(
            target=self.local_model.generate,
            kwargs=generate_kwargs
        )

        thread.start()
        for new_text in streamer:
            yield new_text

        thread.join()

    def delete_model(self):
        """
        Drop this agent's references to the model and tokenizer, then run the
        garbage collector and release cached CUDA memory.

        Safe to call more than once. Objects that are still referenced
        elsewhere (for example a model that was passed in by the caller) stay
        alive until those references are gone too.
        """
        for attr in ("local_model", "_model", "tokenizer"):
            if hasattr(self, attr):
                delattr(self, attr)
        gc.collect()
        torch.cuda.empty_cache()

In [22]:
# Reuse the model and tokenizer that are already loaded instead of downloading again.
stream_agent = StreamingAgent(model=model, tokenizer=tokenizer, model_name=None)

Loading local model provided by a user ...


## Build a local UI

In [23]:
# https://github.com/gradio-app/gradio/blob/main/demo/chatbot_simple/run.py
import gradio as gr
import random
import time

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def user(user_message, history):
        """Clear the textbox and append the user's turn with a pending (None) reply."""
        return "", history + [[user_message, None]]

    def bot(history):
        """Stream the model's reply into the last chat turn, yielding the history after each chunk."""
        last_user_message = history[-1][0]
        print("Asking the model: ", last_user_message)
        history[-1][1] = ""

        # One generator per request; empty chunks are skipped so the UI only
        # refreshes when there is new text.
        for pred_words in stream_agent.generate(last_user_message):
            if pred_words:
                history[-1][1] += pred_words
                yield history

    # First record the user's message (unqueued), then stream the reply.
    msg.submit(
        user, [msg, chatbot], [msg, chatbot],
        queue=False
    ).then(
        bot, chatbot, chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)

# NOTE(review): share=True also serves the app on a public gradio.live URL,
# so anyone with the link can query the model; use share=False to keep it local.
demo.queue().launch(
    share=True
)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://b9cfe9d1391a7cc533.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


