In [None]:
import gradio as gr

def yes_man(message, history):
    if message.endswith("?"):
        return "Yes"
    else:
        return "No"

gr.ChatInterface(
    yes_man,
    textbox=gr.Textbox(placeholder="Ask me a question about the LOTR", container=False, scale=7),
    title="Yes Man",
    description="Ask Yes Man any question",
    theme="soft",
    examples=["Hello", "Am I cool?", "Are tomatoes vegetarian?"],
    cache_examples=True,
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
).launch()

In [13]:
from transformers import AutoTokenizer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


from llama_index.core import StorageContext, load_index_from_storage


from llama_index.core import set_global_tokenizer

set_global_tokenizer(
    AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf").encode)

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")


# rebuild storage context
storage_context = StorageContext.from_defaults(persist_dir="index")

# load index
index = load_index_from_storage(storage_context, embed_model= embed_model)

In [None]:
import torch

from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)


model_url = "https://huggingface.co/TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF/resolve/main/capybarahermes-2.5-mistral-7b.Q4_0.gguf"


llm = LlamaCPP(
    model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    model_kwargs={
        "n_gpu_layers": -1,
        "torch_dtype": torch.float16,
        "load_in_8bit": True,
    },
    # transform inputs into Llama2 format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

In [18]:
query_engine = index.as_query_engine(
    llm=llm,
    chat_mode="condense_question",
    streaming=False
)

In [19]:
response =  query_engine.query("what happens to frodo")


llama_print_timings:        load time =   19003.12 ms
llama_print_timings:      sample time =      34.67 ms /   137 runs   (    0.25 ms per token,  3951.54 tokens per second)
llama_print_timings: prompt eval time =   89086.94 ms /  1943 tokens (   45.85 ms per token,    21.81 tokens per second)
llama_print_timings:        eval time =   19367.95 ms /   136 runs   (  142.41 ms per token,     7.02 tokens per second)
llama_print_timings:       total time =  108787.03 ms /  2079 tokens


In [22]:
response.response

" Frodo becomes less active in Shire affairs after his return from Middle Earth, and he gradually becomes more reclusive. In the text, it is mentioned that Frodo drops out of sight and people don't know much about his deeds and adventures. He also falls ill twice but manages to conceal it. In March 1421, Frodo gets sick again, but Sam has other things to think about as Sam's wife Rosie gives birth to their child on the same day. Frodo plans a trip with Sam to Bilbo's birthday celebration, indicating that he wants to see his old friend one last time. [</INST]"

In [38]:
def predict(message, history):
    history = []
    for human, assistant in history:
        history.append({"role": "user", "content": human })
        history.append({"role": "assistant", "content":assistant})
    history.append({"role": "user", "content": message})
    response = query_engine.query(str(history))
    return str(response.response)
gr.ChatInterface(predict).launch()

Running on local URL:  http://127.0.0.1:7870

To create a public link, set `share=True` in `launch()`.




Llama.generate: prefix-match hit

llama_print_timings:        load time =   19003.12 ms
llama_print_timings:      sample time =      27.12 ms /   115 runs   (    0.24 ms per token,  4240.26 tokens per second)
llama_print_timings: prompt eval time =   70762.24 ms /  1762 tokens (   40.16 ms per token,    24.90 tokens per second)
llama_print_timings:        eval time =   16320.56 ms /   114 runs   (  143.16 ms per token,     6.99 tokens per second)
llama_print_timings:       total time =   87366.99 ms /  1876 tokens


In [None]:
query_engine.query(str([{'role': 'user', 'content': 'who died'}]))

In [37]:
predict("who died", "")

Llama.generate: prefix-match hit

llama_print_timings:        load time =   19003.12 ms
llama_print_timings:      sample time =       5.19 ms /    24 runs   (    0.22 ms per token,  4621.61 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    3291.96 ms /    24 runs   (  137.16 ms per token,     7.29 tokens per second)
llama_print_timings:       total time =    3345.45 ms /    25 tokens


' Gandalf mentions that Théoden, the king, died and has both honor and peace. /INST]'

In [42]:
import gradio as gr


def predict(message, history):
    history = []
    for human, assistant in history:
        history.append({"role": "user", "content": human})
        history.append({"role": "assistant", "content": assistant})
    history.append({"role": "user", "content": message})
    response = query_engine.query(str(history))
    with open(
        "user_queries.txt",
        "a",
        encoding="utf-8",
    ) as f:
        f.write("q: " + message + " : " + "{}".format(response.response) + "\n")
    return str(response.response)


gr.ChatInterface(
    predict,
    textbox=gr.Textbox(
        placeholder="Ask me a question about the LOTR", container=False, scale=7
    ),
    title="Ilya horyas men carë úvië ná i carë lúmenen yan me ná antaina",
    description="Ask the bot stuff. Follow-up questions do not work, new question every time is what works. It's not chatGPT. and like me it sometimes stops mid sentence.",
    theme="soft",
    examples=["What food is mentioned in LOTR?", "Are Hobbits Vegan?", "What's the deal with the blue wizards?"],
    cache_examples=True,
    retry_btn="Re-generate",
    undo_btn="Delete Previous",
    clear_btn="Clear",
).launch()

Caching examples at: '/home/darthfader/code/medical-chatbots/gradio_cached_examples/245'
Caching example 1/3


Llama.generate: prefix-match hit

llama_print_timings:        load time =   19003.12 ms
llama_print_timings:      sample time =      10.09 ms /    43 runs   (    0.23 ms per token,  4262.91 tokens per second)
llama_print_timings: prompt eval time =   88067.73 ms /  1788 tokens (   49.25 ms per token,    20.30 tokens per second)
llama_print_timings:        eval time =    5741.56 ms /    42 runs   (  136.70 ms per token,     7.32 tokens per second)
llama_print_timings:       total time =   93921.05 ms /  1830 tokens
Llama.generate: prefix-match hit


Caching example 2/3



llama_print_timings:        load time =   19003.12 ms
llama_print_timings:      sample time =      38.64 ms /   152 runs   (    0.25 ms per token,  3933.34 tokens per second)
llama_print_timings: prompt eval time =   95143.47 ms /  1781 tokens (   53.42 ms per token,    18.72 tokens per second)
llama_print_timings:        eval time =   21063.89 ms /   151 runs   (  139.50 ms per token,     7.17 tokens per second)
llama_print_timings:       total time =  116641.77 ms /  1932 tokens
Llama.generate: prefix-match hit


Caching example 3/3



llama_print_timings:        load time =   19003.12 ms
llama_print_timings:      sample time =      23.53 ms /    96 runs   (    0.25 ms per token,  4079.55 tokens per second)
llama_print_timings: prompt eval time =   95885.94 ms /  1787 tokens (   53.66 ms per token,    18.64 tokens per second)
llama_print_timings:        eval time =   13601.06 ms /    95 runs   (  143.17 ms per token,     6.98 tokens per second)
llama_print_timings:       total time =  109753.57 ms /  1882 tokens


Running on local URL:  http://127.0.0.1:7874

To create a public link, set `share=True` in `launch()`.




Llama.generate: prefix-match hit

llama_print_timings:        load time =   19003.12 ms
llama_print_timings:      sample time =      64.20 ms /   255 runs   (    0.25 ms per token,  3972.27 tokens per second)
llama_print_timings: prompt eval time =   93528.68 ms /  1781 tokens (   52.51 ms per token,    19.04 tokens per second)
llama_print_timings:        eval time =   37174.33 ms /   254 runs   (  146.36 ms per token,     6.83 tokens per second)
llama_print_timings:       total time =  131451.24 ms /  2035 tokens
