<a href="https://colab.research.google.com/github/ashwath-tech/llama-3.2-grumpy-it-finetune/blob/main/Inference/GradioUI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Uninstall any conflicting versions
!pip uninstall -y -q torch torchvision torchaudio transformers peft

# 2. Install the "Golden Combination" (Versions guaranteed to work together)
!pip install -q -U torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
!pip install -q -U "transformers>=4.46.0" "peft>=0.7.1" "bitsandbytes>=0.41.0" "accelerate>=0.26.0" "gradio"

# 3. Force Restart
import os
print("âœ… Libraries installed. The runtime will now CRASH to restart. This is expected!")
os.kill(os.getpid(), 9)

In [None]:
import ast
from getpass import getpass
from threading import Thread

import gradio as gr
import torch
from huggingface_hub import login
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig

# --- 1. AUTHENTICATION ---
# The token is read with getpass so it is not echoed into the cell output
# (and therefore not stored in the saved/shared notebook).
# An empty answer skips login() and relies on whatever credentials already exist.
print("👇 PASTE TOKEN (or hit Enter if already logged in) 👇")
hf_token = getpass("HF Token: ").strip()
if hf_token:
    login(token=hf_token)

# --- 2. Config ---
# Hub repo of the fine-tuned adapter that is applied on top of the base model.
ADAPTER_ID = "NotSure123/grumpy-llama-3.2-3B"
# Hub repo of the base model; the tokenizer is loaded from here as well.
BASE_MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"

# --- 3. Load Model ---

# Quantization settings: 4-bit NF4 weights with double quantization,
# float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# The tokenizer comes from the base model repo, not from the adapter repo.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

# Base model: quantized on load, with automatic device placement.
load_kwargs = {
    "quantization_config": bnb_config,
    "device_map": "auto",
    "torch_dtype": torch.float16,
}
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, **load_kwargs)

# Wrap the base model with the fine-tuned adapter and switch to eval mode,
# since this notebook only runs inference.
model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
model.eval()

# --- 4. The History Cleaner (Fixes the JSON blobs) ---
def recursive_clean(text):
    """Flatten a chat message into plain, stripped text.

    Chat history entries may arrive as plain strings, as lists/dicts, or as
    the string representation of such a list/dict. This function digs the
    readable text out of those structures.

    Args:
        text: The message. May be a ``str``, ``list``, ``dict``, ``None`` or
            any other object (which is converted with ``str()``).

    Returns:
        str: The cleaned text.
            * ``None`` gives ``""`` (not the literal text ``"None"``).
            * A list gives the cleaned items joined by single spaces, with
              empty items dropped.
            * A dict gives the cleaned value of the first truthy key among
              ``'text'``, ``'content'``, ``'value'``.
            * Anything that cannot be parsed or has none of those keys is
              returned as its stripped string form.
    """
    if text is None:
        return ""

    if isinstance(text, (list, dict)):
        # Already structured: walk it directly instead of str() + re-parse.
        parsed = text
        fallback = str(text).strip()
    else:
        fallback = (text if isinstance(text, str) else str(text)).strip()
        if not fallback.startswith(("[", "{")):
            return fallback
        try:
            # literal_eval only accepts Python literals, so it is safe to run
            # on user-supplied text (unlike eval).
            parsed = ast.literal_eval(fallback)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Looked like a list/dict but is not one: keep the text as typed.
            return fallback

    if isinstance(parsed, list):
        pieces = (recursive_clean(item) for item in parsed)
        return " ".join(piece for piece in pieces if piece)

    if isinstance(parsed, dict):
        candidate = parsed.get('text') or parsed.get('content') or parsed.get('value')
        if candidate:
            return recursive_clean(candidate)

    return fallback

In [None]:
# Generation settings, kept here so they can be tuned in one place.
SYSTEM_PROMPT = "You are a cynical, grumpy IT Systems Administrator. Refuse non-technical requests."
MAX_NEW_TOKENS = 256
TEMPERATURE = 0.7


def _build_conversation(message, history):
    """Convert chat history plus the new message into chat-template messages.

    Args:
        message: The message the user just sent.
        history: Previous turns, either ``(user, bot)`` pairs or
            ``{"role": ..., "content": ...}`` dicts. ``None`` is treated as
            an empty history.

    Returns:
        list[dict]: Messages starting with the system prompt and ending with
        the current user message.
    """
    conversation = [{"role": "system", "content": SYSTEM_PROMPT}]

    for turn in history or []:
        if isinstance(turn, (list, tuple)) and len(turn) == 2:
            # Pair format. Skip incomplete turns (e.g. a missing bot reply).
            if turn[0] is None or turn[1] is None:
                continue
            user_clean = recursive_clean(turn[0])
            bot_clean = recursive_clean(turn[1])
            if user_clean and bot_clean:
                conversation.append({"role": "user", "content": user_clean})
                conversation.append({"role": "assistant", "content": bot_clean})
        elif isinstance(turn, dict):
            # Dict format. Only user/assistant turns with real content are
            # kept; the system prompt is already at the top.
            raw_content = turn.get('content', '')
            if raw_content is None:
                continue
            role = turn.get('role', 'user')
            content = recursive_clean(raw_content)
            if role in ("user", "assistant") and content:
                conversation.append({"role": role, "content": content})

    conversation.append({"role": "user", "content": recursive_clean(message)})
    return conversation


def chat_response(message, history):
    """Stream the model's reply to ``message`` for the Gradio chat UI.

    Args:
        message: The message the user just sent.
        history: Previous turns as supplied by the chat UI (pair or dict
            format).

    Yields:
        str: The full reply generated so far (not just the newest piece),
        growing with every yield.

    Raises:
        Exception: Any error raised by ``model.generate`` in the background
            thread is re-raised here once the stream has ended.
    """
    conversation = _build_conversation(message, history)

    # Tokenize
    inputs = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )

    # Streamer
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        input_ids=inputs["input_ids"].to(model.device),
        # Pass the mask explicitly instead of letting generate() guess it.
        attention_mask=inputs["attention_mask"].to(model.device),
        streamer=streamer,
        max_new_tokens=MAX_NEW_TOKENS,
        # temperature is only used when sampling is on, so make that explicit.
        do_sample=True,
        temperature=TEMPERATURE,
    )

    failures = []

    def _generate():
        # Runs in the background thread. If generation fails, close the
        # stream so the consumer loop below does not wait forever.
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            failures.append(exc)
            streamer.end()

    thread = Thread(target=_generate)
    thread.start()

    partial_message = ""
    for new_text in streamer:
        partial_message += new_text
        # We yield the FULL message so far, not just the new piece
        yield partial_message

    # The stream has ended, so the worker is finished (or has failed).
    thread.join()
    if failures:
        raise failures[0]

In [None]:
# Build the chat UI around chat_response, which yields the growing reply.
demo = gr.ChatInterface(
    fn=chat_response,
    title="😒 Grumpy IT Support",
    chatbot=gr.Chatbot(height=400),
    examples=["My printer is broken", "I spilled coffee on the server"],
)

# share=True requests a public link; debug=True keeps the cell running and
# shows errors in the cell output.
demo.launch(share=True, debug=True)