# Install the necessary packages

In [1]:
%%capture
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
!pip install triton
!pip install unsloth-zoo
!pip install -U xformers
!pip install -U bitsandbytes

!pip install -q gradio

# Import the necessary packages

In [2]:
import os

from transformers import AutoModel, AutoTokenizer
from transformers import TextStreamer
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

import random
import gradio as gr
from google.colab import drive

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


### Deploy the UI on HuggingFace

In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
os.path.exists("/content/drive/MyDrive/lora-llama/app.py")

True

In [13]:
!gradio deploy

Need [32m'write'[0m access token to create a Spaces repo.

    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Creating new Spaces Repo in [32m'/content'[0m. Collecting metadata, press Enter to accept default value.
Enter Spaces app title [content]: lora-llama
Enter Gradio app file : drive/MyDrive/lora-llama/app.py
Enter Spaces hardware (cpu-basic, cpu-upgrade, t4-small, t4-medium, l4x1, l

# Load the model from HuggingFace, in 4-bit

In [6]:
max_seq_length = 2048
dtype = None
load_in_4bit = True

In [7]:
model_name = "celdot/lora_llama_model_4"

In [8]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

Unsloth 2024.12.4 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


# Test the prediction on 2 prompts :

### *What is capitalism* and *Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,*

### We use the code for inference that was in the template notebook to fine-tune the llama model.
### We test those promts with a smaller number of new tokens to generate, for the sake of speed.

In [9]:
def inference(message, model=model, tokenizer=tokenizer):
  tokenizer = get_chat_template(
      tokenizer,
      chat_template="llama-3.1",
  )

  FastLanguageModel.for_inference(model) # Enable native 2x faster inference

  messages = [
      {"role": "user", "content": message},
  ]

  inputs = tokenizer.apply_chat_template(
      messages,
      tokenize=True,
      add_generation_prompt=True,  # Must add for generation
      return_tensors="pt",
  ).to("cuda")

  outputs = model.generate(
      input_ids=inputs,
      max_new_tokens=128,  # Adjust the number of new tokens to generate
      use_cache=True,
      temperature=1.5,   # Adjust temperature for randomness
      min_p=0.1,         # Adjust nucleus sampling probability
  )

  generated_text = tokenizer.batch_decode(outputs)

  return generated_text

In [10]:
inference("What is capitalism")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is capitalism<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nCapitalism is an economic system in which individuals, businesses, and governments play significant roles in economic transactions, where there is a large portion of the population involved in the economy. The goal of capitalism is to create wealth through the production, distribution, and trade of goods and services. In a capitalist system, private ownership and initiative play a crucial role, and the government does not directly manage the economy.\n\nCapitalism is a market-based system, where production is directed by free market forces. It encourages competition and promotes innovation through the creation of new products and services. The benefits of capitalism are seen in higher economic efficiency, free market competitio

### We also use a `TextStreamer` for continuous inference - so we can see the generation token by token, instead of waiting the whole time

In [11]:
def stream_inference(message, model=model, tokenizer=tokenizer):

  FastLanguageModel.for_inference(model) # Enable native 2x faster inference

  messages = [
      {"role": "user", "content": message},
  ]

  inputs = tokenizer.apply_chat_template(
      messages,
      tokenize = True,
      add_generation_prompt = True, # Must add for generation
      return_tensors = "pt",
  ).to("cuda")


  text_streamer = TextStreamer(tokenizer, skip_prompt = True)
  _ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                    use_cache = True, temperature = 1.5, min_p = 0.1)

In [12]:
stream_inference("What is capitalism")

Capitalism is an economic system that focuses on the production, exchange, and distribution 

KeyboardInterrupt: 

In [None]:
inference("Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,")

In [None]:
stream_inference("Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,")

# Set up the Chatbot UI with Gradio

### Define the ```respond``` function that defines how the chatbot is going to answer

### We use the HuggingFace template to set up a chatbot, coupled with the previous functions to make inference.

In [None]:
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    tokenizer=tokenizer,
    model=model,
):

    tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.1",
    )

    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

    # Construct the messages for the bot to analyze
    messages = [{"role": "system", "content": system_message}]
    for val in history:
        if val["role"] == "user":
            messages.append({"role": "user", "content": val["content"]})
        if val["role"] == "assistant":
            messages.append({"role": "assistant", "content": val["content"]})
    messages.append({"role": "user", "content": message})

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Must add for generation
        return_tensors="pt",
    ).to("cuda")

    outputs = model.generate(
        input_ids = inputs,
        max_new_tokens=256,  # Adjust the number of new tokens to generate
        use_cache=True,
        temperature=1.5,   # Adjust temperature for randomness
        min_p=0.1,         # Adjust nucleus sampling probability
    )

    generated_text = tokenizer.batch_decode(outputs
                                            , skip_special_tokens=True
                                            )

    # Extract the bot's response
    full_text = generated_text[0]
    bot_response = full_text.split("assistant\n\n")[-1]  # Remove the user message part

    # Yield the response incrementally
    response = ""
    for token in bot_response:
        response += token
        yield response


In [None]:
demo = gr.ChatInterface(
    respond,
    type='messages',
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=128, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

In [None]:
demo.launch(share=True, debug=True)