In [1]:
# Cell 1: install dependencies (run once)
import sys, subprocess

def pip_install(pkg):
    """Install `pkg` with pip, using the interpreter that runs this kernel.

    Going through `sys.executable -m pip` targets the kernel's own environment.
    Raises subprocess.CalledProcessError when pip exits with a non-zero status.
    """
    command = [sys.executable, "-m", "pip", "install", pkg]
    subprocess.check_call(command)

# FastAPI + uvicorn + ngrok + transformers/hf, installed one by one in this order.
REQUIRED_PACKAGES = (
    "fastapi",
    "uvicorn[standard]",
    "pyngrok",
    "huggingface_hub",
    "transformers>=4.30.0",
    "accelerate",
    "safetensors",
    # Optional llama.cpp fallback (only needed for a local GGUF model)
    "llama-cpp-python",
)

for requirement in REQUIRED_PACKAGES:
    pip_install(requirement)


In [7]:
# Cell 2: configuration (adjust as needed)
import os

# Hugging Face model id (text-generation / causal LM). Replace with the model you want.
# Examples: "gpt2" (smoke test), or a larger one such as
# "meta-llama/Llama-2-13b-chat-hf" (needs a token and enough VRAM).
HF_MODEL = os.getenv("HF_MODEL", "gpt2")
HF_TOKEN = os.getenv("HF_HUB_TOKEN", "")  # recommended: set it in the Colab runtime environment
PORT = int(os.getenv("PORT", 8000))
USE_NGROK = True
NGROK_TOKEN = os.getenv("NGROK_TOKEN", "")

# Local GGUF fallback path, for when the HF model is unavailable
GGUF_PATH = os.getenv("GGUF_PATH", "/content/models/model.gguf")


In [4]:
# Cell 3: load the Hugging Face model correctly with accelerate (do NOT pass `device` to the pipeline)

import torch
from huggingface_hub import login
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline
)

# Informational only: with device_map="auto" accelerate decides placement, so this
# index is printed for the reader and never handed to the model or the pipeline.
device = 0 if torch.cuda.is_available() else -1
print("Torch CUDA available:", torch.cuda.is_available(), "device:", device)

# Module-level handles read by the API cell. Exactly one backend is expected to be set.
tokenizer = None
text_gen = None
llama_llm = None
# Keeps the original load failure so the final RuntimeError can chain to it.
hf_load_error = None

try:
    # Authenticate only when a token was configured (needed for gated/private models).
    if HF_TOKEN:
        login(HF_TOKEN)

    print("Loading HF model:", HF_MODEL)

    tokenizer = AutoTokenizer.from_pretrained(HF_MODEL, use_fast=True)

    # Load through accelerate (device_map="auto"); do NOT call .to("cuda") afterwards.
    # NOTE(review): trust_remote_code=True executes Python shipped in the model repo.
    # Only point HF_MODEL at repositories you trust, or set this to False.
    model = AutoModelForCausalLM.from_pretrained(
        HF_MODEL,
        torch_dtype=torch.float16 if torch.cuda.is_available() else None,
        device_map="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True
    )

    # IMPORTANT: no `device=` argument here. The model is already placed by
    # accelerate, and passing a device to the pipeline as well makes it fail.
    text_gen = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        return_full_text=False
    )

    print("‚úÖ HF model loaded OK (accelerate + pipeline)")

except Exception as e:
    print("‚ö†Ô∏è HF model failed:", e)
    hf_load_error = e
    # No GGUF fallback is loaded in this cell; llama_llm stays None.
    llama_llm = None

if text_gen is None and llama_llm is None:
    # Chain the real cause so the traceback shows why the HF load failed.
    raise RuntimeError("No model available ‚Äî HF failed and no GGUF provided.") from hf_load_error


Torch CUDA available: True device: 0
Loading HF model: gpt2


Device set to use cuda:0


‚úÖ HF model loaded OK (accelerate + pipeline)


In [5]:
# Cell 4: FastAPI server exposing the /v1/chat/completions endpoint
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
import asyncio
import json

# Application object shared by every route below. CORS is left wide open
# (any origin, method and header) so browser clients can call the API directly.
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

def messages_to_prompt(messages):
    """Flatten chat messages into one plain-text prompt.

    `messages` is a list of dicts such as {'role': 'system'|'user'|'assistant',
    'content': ...}. A missing role is treated as 'user' and missing content as
    an empty string. System turns are rendered under a "[SYSTEM]" header, user
    turns as "User: ...", and every other role as "Assistant: ...". The result
    ends with a trailing "Assistant:" cue so the model continues as the
    assistant. An empty or missing list yields "". Adjust the templates here to
    change the prompt policy.
    """
    if not messages:
        return ""

    def _render(entry):
        speaker = entry.get("role", "user")
        body = entry.get("content", "")
        if speaker == "system":
            return f"[SYSTEM]\n{body}\n"
        if speaker == "user":
            return f"User: {body}\n"
        return f"Assistant: {body}\n"

    # Join the rendered turns, then append the cue for the assistant's reply.
    return "\n".join(_render(entry) for entry in messages) + "\nAssistant:"

@app.get("/health")
async def health():
    """Liveness probe: always reports "ok" plus whether a model backend is set."""
    backend_ready = bool(text_gen or llama_llm)
    return {"status": "ok", "model_loaded": backend_ready}

@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Generate an assistant reply for a chat-style or plain-prompt request.

    Request body (JSON object):
        messages: optional list of {"role", "content"} dicts (OpenAI style).
        prompt / question / message: plain-text alternatives when `messages`
            is absent; the first non-empty one is used.
        system: optional system text, prepended to the conversation.
        temperature: float, default 0.2; values <= 0 select greedy decoding.
        max_tokens: int, default 256; maximum number of new tokens.

    Returns:
        {"choices": [{"message": {"role": "assistant", "content": ...}}]} on
        success. Errors are returned as {"error": ...} with a real HTTP status:
        400 for a malformed request, 503 when no model backend is loaded, and
        500 when generation fails.
    """
    # Local import keeps this block self-contained. A bare `return body, status`
    # does NOT set the status code in FastAPI; the tuple would be serialised as a
    # JSON array with HTTP 200.
    from fastapi.responses import JSONResponse

    def _error(status_code, message):
        return JSONResponse(status_code=status_code, content={"error": message})

    try:
        payload = await request.json()
    except Exception:
        return _error(400, "Request body must be valid JSON")
    if not isinstance(payload, dict):
        return _error(400, "Request body must be a JSON object")

    # Support both 'messages' (OpenAI style) and 'prompt' (simple) inputs.
    messages = payload.get("messages")
    prompt = payload.get("prompt") or payload.get("question") or payload.get("message")
    system = payload.get("system", "")
    try:
        temperature = float(payload.get("temperature", 0.2))
        max_tokens = int(payload.get("max_tokens", 256))
    except (TypeError, ValueError):
        return _error(400, "'temperature' and 'max_tokens' must be numeric")

    if messages:
        if not isinstance(messages, list) or not all(isinstance(m, dict) for m in messages):
            return _error(400, "'messages' must be a list of objects")
        # Put a separately supplied system text first.
        if system:
            messages = [{"role": "system", "content": system}] + messages
        text_prompt = messages_to_prompt(messages)
    else:
        # Minimal prompt: optional system header followed by the raw prompt text.
        text_prompt = (("[SYSTEM]\n" + system + "\n") if system else "") + str(prompt or "")

    try:
        if text_gen:
            # Hugging Face pipeline backend.
            sampling = temperature > 0
            gen_kwargs = {"do_sample": sampling, "max_new_tokens": max_tokens}
            if sampling:
                # Sampling knobs are only meaningful (and only valid) when sampling;
                # sending temperature=0 alongside greedy decoding triggers warnings.
                gen_kwargs["temperature"] = temperature
                gen_kwargs["top_p"] = 0.9
            if tokenizer is not None:
                gen_kwargs["eos_token_id"] = tokenizer.eos_token_id
            gen = text_gen(text_prompt, **gen_kwargs)
            out_text = gen[0]["generated_text"]
            # If the pipeline echoed the prompt, strip it from the reply.
            if out_text.startswith(text_prompt):
                out_text = out_text[len(text_prompt):].strip()
            return {"choices": [{"message": {"role": "assistant", "content": out_text}}]}
        elif llama_llm:
            # llama_cpp backend: try the chat API first, then the plain completion call.
            try:
                chat_messages = list(messages or [{"role": "user", "content": text_prompt}])
                r = llama_llm.create_chat_completion(
                    messages=chat_messages,
                    temperature=temperature,
                    max_tokens=max_tokens,
                )
                out_text = r["choices"][0]["message"]["content"]
            except Exception:
                # Best-effort fallback kept from the original behaviour.
                r = llama_llm(text_prompt, max_tokens=max_tokens, temperature=temperature)
                out_text = r.get("choices", [{}])[0].get("text", "")
            return {"choices": [{"message": {"role": "assistant", "content": out_text}}]}
        else:
            return _error(503, "No model available")
    except Exception as e:
        return _error(500, f"Generation failed: {str(e)}")

import threading


def _run_server():
    """Serve `app` with uvicorn on every interface at PORT; blocks until it exits."""
    uvicorn.run(app, host="0.0.0.0", port=PORT, log_level="info")


# Run the server on a daemon thread so this cell returns immediately instead of
# blocking the notebook.
t = threading.Thread(daemon=True, target=_run_server)
t.start()
print("FastAPI server started on port", PORT)


FastAPI server started on port 8000


In [8]:
# Never commit a real token in the notebook. Use NGROK_TOKEN from the environment
# when it is already set; otherwise prompt for it without echoing the input.
import getpass

if not os.environ.get("NGROK_TOKEN", "").strip():
    os.environ["NGROK_TOKEN"] = getpass.getpass("ngrok authtoken: ").strip()
NGROK_TOKEN = os.environ["NGROK_TOKEN"]


In [12]:
import socket

def find_free_port():
    """Return a TCP port number that the OS reports as currently unused.

    Binding to port 0 lets the OS choose the port. The socket is closed before
    returning, even if bind/getsockname raises, so the port is NOT reserved:
    another process may claim it before the caller binds to it.
    """
    with socket.socket() as s:
        s.bind(('', 0))   # port 0: the OS picks a free port
        return s.getsockname()[1]

# Demo only: find_free_port() closes its socket before returning, so the port
# printed here is not reserved for any later call.
print("Free port:", find_free_port())


Free port: 47489


In [13]:
# Cell 5 - stable ngrok v3 setup (self-checks that ngrok is running, retries, reliably fetches the URL)

import os
import subprocess, threading, time, requests, psutil

# The tunnel must target the port the FastAPI server listens on (PORT, used by
# uvicorn.run). A freshly picked free port has nothing listening on it, so the
# public URL would never reach the API.
PUBLIC_PORT = PORT

# Read the token from the environment and fail early when it is missing.
NGROK_TOKEN = os.environ.get("NGROK_TOKEN", "").strip()
if not NGROK_TOKEN:
    raise ValueError("‚ùå NGROK_TOKEN ch∆∞a ƒë∆∞·ª£c ƒë·∫∑t trong m√¥i tr∆∞·ªùng!")

print("‚úîÔ∏è ƒê√£ l·∫•y NGROK_TOKEN t·ª´ m√¥i tr∆∞·ªùng.")

# --- Step 1: Download ngrok v3 ---
# Fetches the linux-amd64 zip quietly and unpacks it into the working directory,
# overwriting any earlier copy (-o) and discarding unzip's output.
!wget -q -O ngrok.zip https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.zip
!unzip -o ngrok.zip >/dev/null 2>&1

# --- Step 2: Configure the auth token ---
# IPython substitutes the Python variable NGROK_TOKEN into the shell command.
!./ngrok config add-authtoken $NGROK_TOKEN

# --- Step 3: Start the tunnel ---
def run_ngrok():
    """Launch `./ngrok http <PUBLIC_PORT>` in the background.

    The agent's stdout and stderr are written to ngrok.log in the working
    directory. Unread pipes would hide start-up errors and could block the
    agent once the pipe buffer fills.

    Returns:
        The subprocess.Popen handle, so the caller can poll() or terminate() it.
    """
    with open("ngrok.log", "wb") as log_file:
        # The child keeps its own copy of the descriptor; closing ours is safe.
        return subprocess.Popen(
            ["./ngrok", "http", str(PUBLIC_PORT)],
            stdout=log_file,
            stderr=subprocess.STDOUT,
        )

# Popen returns immediately, so no helper thread is needed. Keeping the handle
# lets a later cell stop the agent with ngrok_process.terminate().
ngrok_process = run_ngrok()

print("‚è≥ ƒêang kh·ªüi ƒë·ªông ngrok...")

# --- Step 4: Wait until ngrok is fully up ---
def wait_for_ngrok(timeout=20):
    """Poll ngrok's local API until it reports at least one tunnel.

    Args:
        timeout: maximum number of polling attempts, about one second apart.

    Returns:
        The parsed JSON from http://localhost:4040/api/tunnels once its
        "tunnels" list is non-empty, or None if that never happens in time.
    """
    for _ in range(timeout):
        try:
            r = requests.get("http://localhost:4040/api/tunnels", timeout=1)
            if r.status_code == 200:
                data = r.json()
                # The API can answer before the tunnel is registered, so keep
                # waiting until the list actually contains a tunnel.
                if isinstance(data, dict) and data.get("tunnels"):
                    return data
        except (requests.RequestException, ValueError):
            # Agent not listening yet, or the body was not valid JSON: retry.
            pass
        time.sleep(1)
    return None

tunnels = wait_for_ngrok()

# --- Step 5: Print the URL or report the failure ---
# Guard against both "no API response" (None) and an API response that lists
# no tunnels yet; indexing [0] on an empty list would raise IndexError.
active_tunnels = tunnels.get("tunnels") if tunnels else None
if active_tunnels:
    public_url = active_tunnels[0]["public_url"]
    print("üîó NGROK URL:", public_url)
    print("üîó Health:", public_url + "/health")
else:
    print("‚ùå Ngrok KH√îNG kh·ªüi ƒë·ªông ƒë∆∞·ª£c.")
    print("üî• G·ª£i √Ω s·ª≠a l·ªói:")
    print("1) Ki·ªÉm tra token c√≥ ƒë√∫ng ƒë·ªãnh d·∫°ng v3 (b·∫Øt ƒë·∫ßu b·∫±ng 2P...) kh√¥ng?")
    print("2) Colab c√≥ block port (th·ª≠ PORT = 8081 ho·∫∑c 9000)")
    print("3) Ch·∫°y l·∫°i cell 5")


‚úîÔ∏è ƒê√£ l·∫•y NGROK_TOKEN t·ª´ m√¥i tr∆∞·ªùng.
Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
‚è≥ ƒêang kh·ªüi ƒë·ªông ngrok...
‚ùå Ngrok KH√îNG kh·ªüi ƒë·ªông ƒë∆∞·ª£c.
üî• G·ª£i √Ω s·ª≠a l·ªói:
1) Ki·ªÉm tra token c√≥ ƒë√∫ng ƒë·ªãnh d·∫°ng v3 (b·∫Øt ƒë·∫ßu b·∫±ng 2P...) kh√¥ng?
2) Colab c√≥ block port (th·ª≠ PORT = 8081 ho·∫∑c 9000)
3) Ch·∫°y l·∫°i cell 5
