This notebook uses **unsloth**, a framework that helps fine-tune LLMs faster and with less memory.

<a href="https://github.com/unslothai/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>

In [1]:
%%capture
# The %%capture cell magic above captures this cell's output, so pip's logs are not shown.
# Install unsloth straight from its GitHub repository with the "colab-new" extra.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps installs only the listed packages, without resolving their own
# dependencies; xformers is additionally capped below 0.0.26.
!pip install --no-deps "xformers<0.0.26" trl peft accelerate bitsandbytes
# Gradio provides the chat UI built at the end of the notebook.
!pip install gradio

In [2]:
import time
from huggingface_hub import get_token, whoami
from unsloth import FastLanguageModel
import gradio as gr

In [None]:
# Pull the fine-tuned checkpoint and its tokenizer from the Hub. The repo id is
# "<account name reported by whoami()>/fine-tuned-model"; change the model's
# name below if necessary.
finetuned_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="/".join((whoami()["name"], "fine-tuned-model")),
    token=get_token(),
    dtype=None,
    load_in_4bit=True,
    max_seq_length=256,
)
# Hand the loaded model to unsloth's inference setup before generating with it.
FastLanguageModel.for_inference(finetuned_model)

In [None]:
def _build_prompt(history, message):
    """Render the past turns plus the new user message as one prompt string.

    Each turn is written as ``<|user|>\\n{user}</s>\\n<|assistant|>\\n`` followed,
    when the assistant already answered, by ``{assistant}</s>\\n``. The prompt
    therefore ends on an open ``<|assistant|>`` header for the model to complete.
    NOTE(review): presumably this matches the chat template the model was
    fine-tuned with -- confirm against the training notebook.

    Args:
        history: list of ``(user, assistant)`` pairs from the Chatbot (may be None).
        message: the new user message.

    Returns:
        The full prompt as a single string.
    """
    turns = list(history or []) + [(message, None)]
    parts = []
    for user, assistant in turns:
        parts.append("<|user|>\n" + user + "</s>\n" + "<|assistant|>\n")
        if assistant is not None:
            parts.append(assistant + "</s>\n")
    return "".join(parts)


with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def respond(message, history):
        """Generate the assistant's reply to `message` and append it to `history`.

        Args:
            message: text submitted through the textbox.
            history: list of ``(user, assistant)`` pairs shown in the Chatbot.

        Returns:
            A ``(textbox_value, history)`` tuple: an empty string to clear the
            textbox, and the history extended with the new exchange.
        """
        history = history or []
        # Nothing to answer: leave the conversation untouched.
        if not message or not message.strip():
            return "", history

        prompt = _build_prompt(history, message)
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        outputs = finetuned_model.generate(**inputs, max_new_tokens=256, use_cache=True)

        # Decode only the newly generated tokens and drop special tokens, so the
        # reply contains neither the echoed prompt nor a trailing "</s>" (which
        # would otherwise be displayed and duplicated in the next prompt).
        prompt_length = inputs["input_ids"].shape[1]
        bot_message = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        history.append((message, bot_message))
        return "", history

    msg.submit(respond, [msg, chatbot], [msg, chatbot])
    # Wire the "Clear" button: reset the Chatbot's conversation.
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch(share=True, debug=True)