[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/DoctorGPT-colab/blob/main/DoctorGPT_colab.ipynb)

In [None]:
# Environment setup: work from /content, install the aria2 downloader and the
# Python packages used below.
%cd /content
!apt -y install -qq aria2
!pip install -q transformers gradio bitsandbytes accelerate

# Fetch the model weights, tokenizer and config files from the Hugging Face
# repo 4bit/medllama2_7b into /content/medllama2_7b.
# aria2c flags: -c resumes a partial download, -x 16 / -s 16 use up to 16
# connections / splits per file, -k 1M sets the minimum split size.
# NOTE(review): the weight shards fetched are pytorch_model-*.bin, but the
# index fetched is model.safetensors.index.json — confirm the loader finds an
# index that matches the .bin shards.
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/4bit/medllama2_7b/resolve/main/pytorch_model-00001-of-00002.bin -d /content/medllama2_7b -o pytorch_model-00001-of-00002.bin
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/4bit/medllama2_7b/resolve/main/pytorch_model-00002-of-00002.bin -d /content/medllama2_7b -o pytorch_model-00002-of-00002.bin
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/4bit/medllama2_7b/raw/main/model.safetensors.index.json -d /content/medllama2_7b -o model.safetensors.index.json
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/4bit/medllama2_7b/raw/main/special_tokens_map.json -d /content/medllama2_7b -o special_tokens_map.json
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/4bit/medllama2_7b/resolve/main/tokenizer.model -d /content/medllama2_7b -o tokenizer.model
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/4bit/medllama2_7b/raw/main/tokenizer_config.json -d /content/medllama2_7b -o tokenizer_config.json
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/4bit/medllama2_7b/raw/main/config.json -d /content/medllama2_7b -o config.json
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/4bit/medllama2_7b/raw/main/generation_config.json -d /content/medllama2_7b -o generation_config.json

import gradio as gr
import transformers
from torch import bfloat16
from threading import Thread
from gradio.themes.utils.colors import Color

# Local directory the checkpoint files were downloaded into; the tokenizer,
# config and weights are all loaded from here.
model_id = "/content/medllama2_7b"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
model_config = transformers.AutoConfig.from_pretrained(model_id)

# bitsandbytes quantization settings: 4-bit NF4 weights with double
# quantization, computing in bfloat16.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)
# Load the causal LM with the quantization config above.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    # NOTE(review): permits running custom modelling code shipped with the
    # checkpoint — confirm this is actually required for this model.
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    # Device placement is left to the library rather than pinned to one device.
    device_map='auto'
)

# System prompts offered in the UI dropdown.
prompts = ["You are a helpful AI Doctor."]

def prompt_build(system_prompt, user_inp, hist):
    """Assemble the full text prompt for one chat turn.

    The prompt is a ``### System:`` section, followed by one
    ``### User:`` / ``### Assistant:`` section pair per entry in ``hist``,
    and finally the new user message with an open ``### Assistant:`` header
    for the model to complete.

    Args:
        system_prompt: Text placed in the system section.
        user_inp: The new user message.
        hist: Iterable of past turns; element 0 of each turn is the user
            message and element 1 is the assistant reply.

    Returns:
        The assembled prompt string.
    """
    sections = [f"### System:\n{system_prompt}\n\n"]
    sections.extend(
        f"### User:\n{turn[0]}\n\n### Assistant:\n{turn[1]}\n\n"
        for turn in hist
    )
    sections.append(f"### User:\n{user_inp}\n\n### Assistant:")
    return "".join(sections)

def chat(user_input, history, system_prompt):
    """Stream the model's reply to ``user_input`` as a growing string.

    Generation runs in a background thread that feeds a
    ``TextIteratorStreamer``; this generator consumes the streamer and yields
    the reply accumulated so far after every new chunk.

    Args:
        user_input: The latest user message.
        history: Previous turns, passed unchanged to ``prompt_build``.
        system_prompt: System instruction placed at the top of the prompt.

    Yields:
        The partial reply text, extended by one streamed chunk each time.

    Returns:
        The complete reply (as the generator's return value).
    """
    prompt = prompt_build(system_prompt, user_input, history)
    # Put the inputs on the model's own device instead of a hard-coded
    # "cuda": the model is loaded with device_map='auto', so its placement is
    # decided at load time.
    model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    # skip_prompt drops the echoed prompt so only the new reply is streamed;
    # timeout bounds how long the consumer waits for the next chunk.
    streamer = transformers.TextIteratorStreamer(
        tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True
    )

    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        # max_length counts prompt + generated tokens. Setting max_new_tokens
        # (e.g. 512) would override it and cap only the reply.
        max_length=2048,
        do_sample=True,
        top_p=0.95,
        temperature=0.8,
        top_k=50
    )
    # generate() blocks until finished, so it runs in a worker thread while
    # this generator drains the streamer.
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    model_output = ""
    for new_text in streamer:
        model_output += new_text
        yield model_output

    # The streamer is exhausted only once generation has ended; join so the
    # worker thread is reaped before the generator finishes.
    worker.join()
    return model_output

# Chat UI: a dropdown for choosing (or typing) the system prompt, wired into
# the chat interface as an additional input to `chat`.
with gr.Blocks() as demo:
    dropdown = gr.Dropdown(
        choices=prompts,
        label="Type your own or select a system prompt",
        value="You are a helpful AI Doctor.",
        allow_custom_value=True,
    )
    chatbot = gr.ChatInterface(
        fn=chat,
        additional_inputs=[dropdown],
    )

# Enable the request queue with API access closed, then launch with a public
# share link and the API page hidden.
queued_demo = demo.queue(api_open=False)
queued_demo.launch(show_api=False, share=True)