# 🍕 OrderBot — Offline with **llama.cpp** (Auto-Download Model)

This version automatically downloads the required GGUF model file if it's missing — no `wget` needed!

### Steps
1. Run this notebook directly. It will create a `models/` folder and download the model automatically.
2. Everything runs **fully offline** after the first run.

### Dependencies
```bash
pip install --upgrade llama-cpp-python panel jupyter_bokeh
```


In [1]:
pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.16.tar.gz (50.7 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m50.7/50.7 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m45.5/45.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml)

In [None]:
import os

# (repo_id, filename) pairs to try on the Hugging Face Hub, in listed order.
CANDIDATES = [
    # Official Microsoft repo (preferred). Q4 file name is "q4.gguf".
    (
        "microsoft/Phi-3-mini-4k-instruct-gguf",
        "Phi-3-mini-4k-instruct-q4.gguf",
    ),
    # Community mirror with Q4_K_M naming
    (
        "bartowski/Phi-3-mini-4k-instruct-GGUF",
        "Phi-3-mini-4k-instruct-Q4_K_M.gguf",
    ),
]

# Local folder that stores the model; a no-op when it already exists.
os.makedirs("models", exist_ok=True)

def try_download():
    """Return a local path to a Phi-3 GGUF model, downloading only if needed.

    Every file name in ``CANDIDATES`` is first looked up under ``models/``.
    If any of them is already on disk, its path is returned immediately,
    without importing ``huggingface_hub`` or touching the network, so re-runs
    work offline regardless of which candidate was downloaded earlier.
    Otherwise each candidate is requested from the Hugging Face Hub in order
    and the first successful download wins.

    Returns:
        Path of the model file (existing or freshly downloaded).

    Raises:
        RuntimeError: If no candidate exists locally and every download
            attempt fails. The last underlying error is chained as the cause.
    """
    # Reuse if already present (either name). This must happen for ALL
    # candidates before any download starts; otherwise a model fetched from
    # the mirror would trigger a fresh multi-GB download of the first entry.
    for _, filename in CANDIDATES:
        target_path = os.path.join("models", filename)
        if os.path.isfile(target_path):
            print("‚úî Model already present at", target_path)
            return target_path

    # Only needed when a download is actually required.
    try:
        from huggingface_hub import hf_hub_download
    except ImportError:
        print("Installing huggingface_hub...")
        import sys, subprocess
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "huggingface_hub"])
        from huggingface_hub import hf_hub_download

    last_err = None
    for repo_id, filename in CANDIDATES:
        print(f"Attempting download from {repo_id} ‚Üí {filename} ...")
        try:
            path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir="models",
            )
            print("‚úÖ Model downloaded to:", path)
            return path
        except Exception as e:
            # Best effort: remember the failure and fall through to the next mirror.
            last_err = e
            print(f"  ‚Ü™Ô∏è  Failed for {repo_id}/{filename}: {e.__class__.__name__}: {e}")

    # If we reach here, every attempt failed
    raise RuntimeError(
        "All download attempts failed. If you are on Colab, run `huggingface-cli login` "
        "with a free token, then re-run this cell. You can also manually download from:\n"
        "‚Ä¢ microsoft/Phi-3-mini-4k-instruct-gguf (file: Phi-3-mini-4k-instruct-q4.gguf)\n"
        "‚Ä¢ bartowski/Phi-3-mini-4k-instruct-GGUF (file: Phi-3-mini-4k-instruct-Q4_K_M.gguf)\n"
        f"Last error: {last_err}"
    ) from last_err

# Resolve the model file once; the llama.cpp backend cell loads it from this path.
model_path = try_download()



Attempting download from microsoft/Phi-3-mini-4k-instruct-gguf ‚Üí Phi-3-mini-4k-instruct-q4.gguf ...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Phi-3-mini-4k-instruct-q4.gguf:   0%|          | 0.00/2.39G [00:00<?, ?B/s]

In [6]:
# Initialize llama.cpp backend
from typing import List, Dict
from llama_cpp import Llama

class LlamaChat:
    """Minimal chat wrapper around a local llama.cpp model.

    Chat messages (dicts with ``role`` and ``content`` keys) are flattened
    into a single text prompt using ``<<SYS>>`` / ``### User:`` /
    ``### Assistant:`` markers, and the model's raw completion is returned.
    """

    # Turn markers used in the prompt; passed as stop strings so generation
    # ends before the model starts writing another turn.
    STOP_SEQUENCES = ('### User:', '### Assistant:', '<<SYS>>', '<</SYS>>')

    def __init__(self, model_path: str, n_ctx: int = 4096, n_gpu_layers: int = 0, verbose: bool = False):
        """Load the model at ``model_path``.

        Args:
            model_path: Path to the GGUF model file.
            n_ctx: Context size forwarded to ``Llama`` (default 4096).
            n_gpu_layers: GPU layer count forwarded to ``Llama`` (default 0).
            verbose: Verbosity flag forwarded to ``Llama`` (default False).
        """
        self.llm = Llama(model_path=model_path, n_ctx=n_ctx, n_gpu_layers=n_gpu_layers, verbose=verbose)

    @staticmethod
    def format_prompt(messages: List[Dict]) -> str:
        """Flatten chat messages into the text prompt sent to the model.

        A missing ``role`` is treated as ``'user'`` and a missing ``content``
        as an empty string. Any role other than ``'system'`` or ``'user'`` is
        rendered as an assistant turn. The prompt always ends with an open
        ``### Assistant:`` marker for the model to continue from.
        """
        parts = []
        for m in messages:
            role, content = m.get('role', 'user'), m.get('content', '')
            if role == 'system':
                parts.append(f"<<SYS>>\n{content}\n<</SYS>>")
            elif role == 'user':
                parts.append(f"### User:\n{content}")
            else:
                parts.append(f"### Assistant:\n{content}")
        parts.append('### Assistant:')
        return '\n\n'.join(parts)

    def chat(self, messages: List[Dict], max_tokens: int = 512, temperature: float = 0.2, top_p: float = 0.95) -> str:
        """Generate the assistant's next reply for ``messages``.

        Args:
            messages: Conversation so far, oldest first.
            max_tokens: Upper bound on generated tokens.
            temperature: Sampling temperature.
            top_p: Nucleus-sampling threshold.

        Returns:
            The generated text with surrounding whitespace stripped.
        """
        prompt = self.format_prompt(messages)
        out = self.llm(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stop=list(self.STOP_SEQUENCES),
        )
        return out['choices'][0]['text'].strip()

# Load the model once; the completion helpers defined next all share this instance.
LLM = LlamaChat(model_path)
print('Backend in use: llama.cpp (local)')

def get_completion(prompt, temperature=0):
    """Answer a single user prompt with the shared ``LLM`` backend.

    The prompt is wrapped as a one-message conversation with role ``user``
    and the backend's reply is returned unchanged.
    """
    single_turn = [dict(role="user", content=prompt)]
    return LLM.chat(single_turn, temperature=temperature)

def get_completion_from_messages(messages, temperature=0):
    """Return the shared ``LLM`` backend's reply for a full message list."""
    reply = LLM.chat(messages, temperature=temperature)
    return reply


Backend in use: llama.cpp (local)


In [7]:
# Panel interface (for Jupyter or Colab with jupyter_bokeh)
import panel as pn
pn.extension()

# Styling for chat rows that carry css_classes=['my-markdown'].
custom_css = """
.my-markdown {
    background-color: #F6F6F6;
    padding: 10px;
    border-radius: 5px;
}
"""
# Register the stylesheet only once so re-running this cell stays idempotent
# instead of appending a duplicate copy on every execution.
if custom_css not in pn.config.raw_css:
    pn.config.raw_css.append(custom_css)

# System prompt: the bot's instructions plus the menu. The trailing backslashes
# join the physical lines so the model receives them as continuous text.
SYSTEM_PROMPT = """
You are OrderBot, an automated service to collect orders for a pizza restaurant. \
You greet the customer, collect their full order including item, size, and extras, \
then ask if it‚Äôs pickup or delivery. If delivery, ask for the address. \
Finally, summarize the order and collect payment. \
Menu: \
pepperoni pizza 12.95, 10.00, 7.00; \
cheese pizza 10.95, 9.25, 6.50; \
eggplant pizza 11.95, 9.75, 6.75; \
fries 4.50, 3.50; greek salad 7.25; \
Toppings: extra cheese 2.00, mushrooms 1.50, sausage 3.00, \
canadian bacon 3.50, AI sauce 1.50, peppers 1.00; \
Drinks: coke 3.00, 2.00, 1.00; sprite 3.00, 2.00, 1.00; bottled water 5.00
"""

# Running conversation history, seeded with the system message.
context = [{'role': 'system', 'content': SYSTEM_PROMPT}]

# Rendered chat rows, accumulated across turns.
panels = []

def collect_messages(_):
    """Handle one chat turn and return the refreshed conversation view.

    Reads the text box, clears it, sends the conversation to the model,
    records both turns in ``context`` and appends their rendered rows to
    ``panels``.

    Args:
        _: Value supplied by ``pn.bind`` for the bound button; unused.

    Returns:
        A ``pn.Column`` containing every chat row rendered so far. When the
        text box is empty or whitespace-only, nothing is sent to the model
        and the existing rows are returned unchanged.

    Raises:
        Whatever ``get_completion_from_messages`` raises; in that case the
        just-added user turn is removed from ``context`` first.
    """
    # value_input follows keystrokes; fall back to the committed value in case
    # it has not been populated.
    prompt = inp.value_input or inp.value or ''
    inp.value = ''

    # Nothing typed: do not append an empty user turn or spend a model call.
    if not prompt.strip():
        return pn.Column(*panels)

    context.append({'role':'user', 'content': prompt})
    try:
        response = get_completion_from_messages(context)
    except Exception:
        # Roll back so a failed generation does not leave an unanswered
        # user turn in the history.
        context.pop()
        raise
    context.append({'role':'assistant', 'content': response})

    panels.append(pn.Row('User:', pn.pane.Markdown(prompt, width=600)))
    panels.append(pn.Row('Assistant:', pn.pane.Markdown(response, width=600, css_classes=['my-markdown'])))

    return pn.Column(*panels)

# Input widgets: the text box (pre-filled with "Hi") and the send button.
inp = pn.widgets.TextInput(placeholder='Enter text here‚Ä¶', value="Hi")
button_conversation = pn.widgets.Button(name="Chat!")

# Bind the button to collect_messages so it supplies the handler's argument.
interactive_conversation = pn.bind(collect_messages, button_conversation)

# Conversation area, shown with a loading indicator.
conversation_view = pn.panel(interactive_conversation, loading_indicator=True)

dashboard = pn.Column(inp, pn.Row(button_conversation), conversation_view)

dashboard.servable()



    !pip install jupyter_bokeh

and try again.
  pn.extension()


