In [None]:
!pip install transformers accelerate sentencepiece --quiet
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import List, Tuple, Optional
import textwrap, json, os

In [None]:
# Hugging Face Hub identifier of the checkpoint this notebook loads.
MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"

# Standing instructions for the assistant. Each sentence is listed on its own
# line and the pieces are joined with single spaces into one string.
BASE_SYSTEM_PROMPT = " ".join(
    [
        "You are a custom GPT running locally.",
        "Follow user instructions carefully.",
        "Be concise and structured.",
        "If something is unclear, say it is unclear.",
        "Prefer practical examples over corporate examples unless explicitly asked.",
        "When asked for code, give runnable code.",
    ]
)

# Cap on the number of tokens produced for a single reply.
MAX_NEW_TOKENS = 256

In [None]:
# Load the tokenizer and the causal-LM weights for MODEL_NAME.
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# If the tokenizer defines no pad token, reuse the end-of-sequence id for padding.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    # `torch_dtype` is deprecated in current transformers releases (the loader
    # emits "`torch_dtype` is deprecated! Use `dtype` instead!"), so pass `dtype`.
    # Half precision when CUDA is available, full precision otherwise.
    dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)
# Switch to evaluation mode; this notebook only runs generation.
model.eval()
print("Model loaded.")

In [None]:
# A conversation is an ordered list of (role, content) string pairs.
ConversationHistory = List[Tuple[str, str]]

# Module-level transcript shared by the cells below; it starts with the
# system prompt as its only entry.
history: ConversationHistory = []
history.append(("system", BASE_SYSTEM_PROMPT))

def wrap_text(s: str, w: int = 100) -> str:
    """Wrap ``s`` to at most ``w`` columns while keeping its own line breaks.

    Each input line is wrapped independently, so bullet lists, numbered steps
    and paragraph breaks survive instead of being merged into one paragraph
    (which is what wrapping the whole string at once does).

    Args:
        s: Text to wrap; may contain newlines.
        w: Maximum width of each output line.

    Returns:
        The wrapped text, with lines joined by ``"\\n"``.
    """
    wrapped_lines = []
    for line in s.splitlines():
        # textwrap.wrap() returns [] for an empty/whitespace-only line; emit an
        # empty string instead so blank separator lines are preserved.
        wrapped_lines.extend(textwrap.wrap(line, width=w) or [""])
    return "\n".join(wrapped_lines)

def build_chat_prompt(history: ConversationHistory, user_msg: str) -> str:
    """Render the conversation plus a new user message as a single prompt string.

    Every turn is written as ``<|role|>\\n{content}<|end|>\\n``. The ``<|end|>``
    terminator is part of the Phi-3 instruct chat format; without it the model
    has no marker for where one turn stops and the next begins. The prompt ends
    with an open ``<|assistant|>\\n`` header for the model to continue from.

    Args:
        history: Prior turns as (role, content) pairs. Entries whose role is
            not "system", "user" or "assistant" are skipped.
        user_msg: The new user message, appended after the history.

    Returns:
        The full prompt text.
    """
    known_roles = ("system", "user", "assistant")
    # Replay the recognised prior turns, then add the new user message.
    turns = [(role, content) for role, content in history if role in known_roles]
    turns.append(("user", user_msg))
    prompt_parts = [f"<|{role}|>\n{content}<|end|>\n" for role, content in turns]
    # Open assistant header: generation continues from here.
    prompt_parts.append("<|assistant|>\n")
    return "".join(prompt_parts)

In [None]:
def local_tool_router(user_msg: str) -> Optional[str]:
    """Return canned tool output for ``search:`` / ``docs:`` messages.

    The command word is matched case-insensitively, ignoring leading
    whitespace, and must be followed immediately by a colon. Everything after
    the first colon, stripped of surrounding whitespace, is used as the
    argument.

    Args:
        user_msg: The raw user message.

    Returns:
        A placeholder result string for a recognised command, otherwise None.
    """
    command, colon, argument = user_msg.partition(":")
    if not colon:
        # No colon at all, so the message cannot be a tool command.
        return None

    command = command.lstrip().lower()
    argument = argument.strip()

    if command == "search":
        return f"Search results about '{argument}':\n- Key point 1\n- Key point 2\n- Key point 3"
    if command == "docs":
        return f"Documentation extract on '{argument}':\n1. The agent orchestrates tools.\n2. The model consumes output.\n3. Responses become memory."
    return None

In [None]:
def generate_reply(history: ConversationHistory, user_msg: str) -> str:
    """Generate the assistant's reply to ``user_msg`` and record the turn.

    If ``local_tool_router`` returns context for the message, that context is
    appended to the user message before the prompt is built. The (possibly
    augmented) user message and the reply are then appended to ``history``.

    Args:
        history: Conversation so far; mutated in place with the new turn.
        user_msg: The new user message.

    Returns:
        The generated reply text, stripped of surrounding whitespace.
    """
    tool_context = local_tool_router(user_msg)
    if tool_context:
        user_msg = user_msg + "\n\nUseful context:\n" + tool_context
    prompt = build_chat_prompt(history, user_msg)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=True,
            top_p=0.9,
            temperature=0.6,
            pad_token_id=tokenizer.eos_token_id,
        )
    # The returned sequence starts with the prompt tokens. Slice them off by
    # token count and decode only what was generated. Searching the decoded
    # text for "<|assistant|>" or slicing by len(prompt) is unreliable because
    # skip_special_tokens=True can remove the role markers, so the decoded
    # text no longer lines up with the prompt string.
    prompt_token_count = inputs["input_ids"].shape[-1]
    new_token_ids = output_ids[0][prompt_token_count:]
    reply = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
    history.append(("user", user_msg))
    history.append(("assistant", reply))
    return reply

def save_history(history: ConversationHistory, path: str = "chat_history.json") -> None:
    """Write the conversation to ``path`` as an indented JSON array.

    Each turn becomes an object with "role" and "content" keys.

    Args:
        history: Turns as (role, content) pairs.
        path: Destination file; overwritten if it already exists.
    """
    records = []
    for role, content in history:
        records.append({"role": role, "content": content})
    with open(path, "w") as out_file:
        json.dump(records, out_file, indent=2)

def load_history(path: str = "chat_history.json") -> ConversationHistory:
    """Read a conversation from the JSON file at ``path``.

    The file is expected to hold a list of objects with "role" and "content"
    keys. When ``path`` does not exist, a fresh history containing only the
    system prompt is returned instead.

    Args:
        path: JSON file to read.

    Returns:
        The conversation as a list of (role, content) tuples.
    """
    if os.path.exists(path):
        with open(path, "r") as in_file:
            records = json.load(in_file)
        restored = []
        for entry in records:
            restored.append((entry["role"], entry["content"]))
        return restored
    # Nothing saved yet: start from the system prompt alone.
    return [("system", BASE_SYSTEM_PROMPT)]

In [None]:
# Demo turn 1: a plain question with no tool prefix.
print("\n--- Demo turn 1 ---")
demo_reply_1 = generate_reply(
    history=history,
    user_msg="Explain what this custom GPT setup is doing in 5 bullet points.",
)
print(wrap_text(demo_reply_1))

# Demo turn 2: the "search:" prefix makes the local tool router add context.
print("\n--- Demo turn 2 ---")
demo_reply_2 = generate_reply(
    history=history,
    user_msg="search: agentic ai with local models",
)
print(wrap_text(demo_reply_2))

def interactive_chat() -> None:
    """Run a read-reply loop on stdin until the user leaves.

    The loop ends when the user types "exit", "quit" or "q" (any case), when
    input reaches end-of-file, or when the prompt is interrupted. Blank input
    is ignored so that empty messages are neither sent to the model nor
    recorded in the shared ``history``.
    """
    print("\nChat ready. Type 'exit' to stop.")
    while True:
        try:
            user_msg = input("\nUser: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input, or an interrupt while waiting at the prompt
            # (e.g. a kernel interrupt): end the chat without a traceback.
            break
        if user_msg.lower() in ("exit", "quit", "q"):
            break
        if not user_msg:
            # Nothing typed: skip generation and ask again.
            continue
        reply = generate_reply(history, user_msg)
        print("\nAssistant:\n" + wrap_text(reply))

# Uncomment the next line to start the interactive chat loop; it blocks on
# input() until the user exits, so it is left disabled for non-interactive runs.
# interactive_chat()
print("\nCustom GPT initialized successfully.")

Loading model... this may take a bit in Colab.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]



Model loaded.

--- Demo turn 1 ---
