In [None]:
import os
import json
import time
from llama_cpp import Llama

# LOAD OFFLINE MODEL

In [None]:
# Load the local GGUF model file once; the chat loop below reuses this instance.
llm = Llama(
    n_gpu_layers=0,  # keep every layer on the CPU; raise this to offload layers if a compatible GPU is available
    n_threads=8,     # CPU threads used by the model
    n_ctx=8192,      # context window size, in tokens
    model_path="model/mistral-7b-instruct-v0.1.Q4_K_M.gguf",
)

llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from model/mistral-7b-instruct-v0.1.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llam

# TOOLS (OFFLINE)

In [11]:
def now():
    """Return the current Unix timestamp wrapped in a dict under the key ``"ts"``."""
    timestamp = time.time()
    return dict(ts=timestamp)


# Registry of callable tools, keyed by the name the model puts in its "tool" field.
TOOLS = dict(now=now)

# SYSTEM PROMPT (STRICT)

In [None]:
# System prompt used as the first message of the conversation built in run().
# It lists the tool names the model may request and the exact JSON shape of a
# tool call; is_tool_call() relies on that shape (a JSON object with a "tool"
# key) to tell tool requests apart from plain-text answers.
# Keep the "Valid tools" list in sync with the TOOLS registry.
SYSTEM_PROMPT = """
You are an assistant that can call tools.

Valid tools:
- "now"

WHEN you want to use a tool:
Respond ONLY with strict JSON:
{"tool": "now", "args": {}}

If you do NOT want to call a tool:
Respond normally with text.

Never invent new tools.
Never output JSON unless you are calling a tool.
"""

# HELPER: Detect if reply is a tool-call

In [13]:
def is_tool_call(text):
    """Return True when ``text`` is a JSON object that contains a ``"tool"`` key.

    Args:
        text: The raw model reply. Anything that ``json.loads`` cannot parse
            (plain prose, an empty string, ``None``) is treated as "not a
            tool call".

    Returns:
        bool: True only for a JSON *object* with a ``"tool"`` key; False for
        non-JSON text and for JSON values of any other shape (lists, numbers,
        strings, objects without ``"tool"``).
    """
    try:
        obj = json.loads(text)
    # Catch only what bad input can raise: ValueError covers JSONDecodeError,
    # TypeError covers non-string input, RecursionError covers absurdly nested
    # JSON. A bare ``except`` would also swallow KeyboardInterrupt/SystemExit
    # and make the chat loop impossible to interrupt here.
    except (ValueError, TypeError, RecursionError):
        return False
    return isinstance(obj, dict) and "tool" in obj

In [None]:
def _execute_tool(tool_name, args):
    """Run the registered tool ``tool_name`` and return its outcome as a JSON string.

    ``args`` comes straight from model output, so any failure (``args`` that is
    not a mapping, unexpected keyword names, a tool that raises, a result that
    ``json.dumps`` cannot encode) is reported as ``{"error": ...}`` instead of
    crashing the chat loop.
    """
    try:
        return json.dumps(TOOLS[tool_name](**args))
    except Exception as exc:  # deliberate: every tool failure becomes feedback for the model
        return json.dumps({"error": f"{type(exc).__name__}: {exc}"})


def run(temperature=0.6, max_tokens=256):
    """Interactive chat loop with optional tool calls.

    Reads user input until ``exit``/``quit`` (or EOF / Ctrl-C at the prompt),
    sends the conversation to ``llm`` and prints the answer. When the reply is
    a tool call (see ``is_tool_call``), the tool is executed and the model is
    queried a second time so it can phrase the final answer.

    Args:
        temperature: Sampling temperature passed to both completion calls.
        max_tokens: Maximum number of tokens generated per completion call.

    Returns:
        None. All output goes to stdout.

    Notes:
        * The conversation history grows without bound; a very long session
          can eventually exceed the context size the model was loaded with.
        * The tool result is fed back as a ``user`` turn, preceded by the
          model's own tool-call turn, rather than as a ``"tool"`` role
          message. llama-cpp-python's llama-2 style formatter (the fallback
          when the GGUF file carries no chat template) only maps
          system/user/assistant roles, so a ``"tool"`` message would not reach
          the model. TODO confirm against the chat format actually in use.
    """
    msgs = [{"role": "system", "content": SYSTEM_PROMPT}]

    def complete():
        # Single place for the completion call so both rounds use the same settings.
        resp = llm.create_chat_completion(
            messages=msgs,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        return resp["choices"][0]["message"]["content"]

    while True:
        try:
            user = input("> ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt at the prompt ends the session cleanly.
            break

        if user.lower() in {"exit", "quit"}:
            break
        if not user:
            # Nothing to send; ask again instead of spending a model call.
            continue

        msgs.append({"role": "user", "content": user})

        # First model call: either a plain answer or a tool request.
        reply = complete()

        # No tool requested - plain text answer.
        if not is_tool_call(reply):
            print(reply)
            msgs.append({"role": "assistant", "content": reply})
            continue

        data = json.loads(reply)
        tool_name = data["tool"]
        # The model may send "args": null or omit the key entirely.
        args = data.get("args") or {}

        # isinstance guard: an unhashable name (e.g. a list) would make the
        # membership test itself raise TypeError.
        if not isinstance(tool_name, str) or tool_name not in TOOLS:
            print(f"[Unknown tool: {tool_name}]")
            msgs.append({"role": "assistant", "content": "That tool does not exist."})
            continue

        # Record the tool request itself so the history keeps alternating
        # user/assistant turns and the model can see what it asked for.
        msgs.append({"role": "assistant", "content": reply})

        tool_output = _execute_tool(tool_name, args)
        msgs.append({
            "role": "user",
            "content": (
                f'Tool "{tool_name}" returned: {tool_output}\n'
                "Use this result to answer my previous message in plain text."
            ),
        })

        # Second model call: final answer based on the tool result.
        answer = complete()
        print(answer)
        msgs.append({"role": "assistant", "content": answer})

In [None]:
# Start the interactive chat loop. run() blocks on input() until the user
# types "exit" or "quit".
if __name__ == "__main__":
    run()