# OpenCLI GPU Inference Server

One-click setup: installs dependencies, connects to your NAS via FRP tunnel,
and runs a FastAPI inference server accessible at `http://dtok.io:9530`.

**Requirements:**
- Google Colab with GPU runtime (T4 free, A100 paid)
- FRP server running on your NAS (dtok.io:7000)

**Usage:** Runtime > Run all

In [None]:
#@title 1. Install Dependencies
!pip install -q torch torchvision --index-url https://download.pytorch.org/whl/cu121
!pip install -q diffusers transformers accelerate safetensors fastapi uvicorn Pillow

import torch
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    props = torch.cuda.get_device_properties(0)
    vram = getattr(props, 'total_memory', None) or getattr(props, 'total_mem', 0)
    print(f"VRAM: {vram / 1024**3:.1f} GB")

In [None]:
#@title 2. Mount Google Drive (model cache)
import os
from google.colab import drive

# Mount Drive; the model cache directory below lives inside it.
drive.mount('/content/drive')

MODELS_DIR = '/content/drive/MyDrive/opencli_models'
os.makedirs(MODELS_DIR, exist_ok=True)

# Point the Hugging Face cache variables at the Drive-backed directory.
os.environ.update({
    'HF_HOME': MODELS_DIR,
    'TRANSFORMERS_CACHE': os.path.join(MODELS_DIR, 'transformers'),
    'DIFFUSERS_CACHE': os.path.join(MODELS_DIR, 'diffusers'),
})

# Report what is already present in the cache (first ten entries only).
if os.path.exists(MODELS_DIR):
    cached = os.listdir(MODELS_DIR)
else:
    cached = []
print(f"Models dir: {MODELS_DIR}")
print(f"Cached entries: {len(cached)}")
for entry in cached[:10]:
    print(f"  - {entry}")

In [None]:
#@title 3. Setup FRP Tunnel to NAS
#@markdown Connects Colab to your FRP server so the daemon can reach this GPU.

FRP_SERVER = "dtok.io"  #@param {type:"string"}
FRP_PORT = 7000  #@param {type:"integer"}
FRP_TOKEN = "5DJC7hmZkcNspCXZ"  #@param {type:"string"}
REMOTE_PORT = 9530  #@param {type:"integer"}

import subprocess, os

# Download FRP client
FRP_VERSION = "0.61.1"
FRP_DIR = f"frp_{FRP_VERSION}_linux_amd64"
if not os.path.exists(f"{FRP_DIR}/frpc"):
    print("Downloading FRP client...")
    !wget -q https://github.com/fatedier/frp/releases/download/v{FRP_VERSION}/frp_{FRP_VERSION}_linux_amd64.tar.gz
    !tar xzf frp_{FRP_VERSION}_linux_amd64.tar.gz
    print("FRP downloaded.")
else:
    print("FRP already downloaded.")

# Write config
frpc_toml = f"""serverAddr = "{FRP_SERVER}"
serverPort = {FRP_PORT}

[auth]
method = "token"
token = "{FRP_TOKEN}"

[[proxies]]
name = "colab-inference"
type = "tcp"
localIP = "127.0.0.1"
localPort = 8000
remotePort = {REMOTE_PORT}
"""

with open(f"{FRP_DIR}/frpc.toml", "w") as f:
    f.write(frpc_toml)

# Kill any existing frpc
!pkill -f frpc 2>/dev/null || true

# Start FRP in background
proc = subprocess.Popen(
    [f"./{FRP_DIR}/frpc", "-c", f"{FRP_DIR}/frpc.toml"],
    stdout=subprocess.PIPE, stderr=subprocess.STDOUT
)

import time
time.sleep(3)

if proc.poll() is None:
    print(f"FRP connected! Tunnel: http://{FRP_SERVER}:{REMOTE_PORT} -> localhost:8000")
else:
    out = proc.stdout.read().decode()
    print(f"FRP failed to start:\n{out}")

In [None]:
#@title 4. Define Inference Server

import json
import base64
import io
import time
import os
import gc
from typing import Any

import torch
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from PIL import Image

app = FastAPI(title="OpenCLI GPU Inference")
# Permissive CORS: every origin, method and header is allowed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# ── Model cache ──────────────────────────────────────────────────────────
# Pipelines that are already loaded, keyed by the model id they were
# requested under.
_loaded_models: dict[str, Any] = dict()

# Directory passed as `cache_dir` when loading weights; follows HF_HOME
# when that variable is set.
MODELS_DIR = os.environ.get('HF_HOME', '/content/drive/MyDrive/opencli_models')

# Short model ids accepted by the API, mapped to Hugging Face repo ids.
MODEL_REPOS = dict(
    animagine_xl="cagliostrolab/animagine-xl-3.1",
    waifu_diffusion="hakurei/waifu-diffusion",
    sd15_base="stable-diffusion-v1-5/stable-diffusion-v1-5",
    pony_diffusion="AstraliteHeart/pony-diffusion-v6-xl",
)

def _get_vram_bytes():
    """Return the total memory of CUDA device 0, in bytes.

    Reads ``total_memory`` from the device properties; when that attribute
    is missing or falsy, falls back to ``total_mem`` and finally to 0.
    """
    device_props = torch.cuda.get_device_properties(0)
    total = getattr(device_props, 'total_memory', None)
    if not total:
        total = getattr(device_props, 'total_mem', 0)
    return total

def _get_pipeline(model_id: str):
    """Load a diffusers pipeline for *model_id*, or return the cached one.

    ``model_id`` is looked up in ``MODEL_REPOS``; an id that is not listed
    there is passed to ``from_pretrained`` unchanged. The SDXL pipeline class
    is chosen when "xl" occurs in the id or in the resolved repo name,
    otherwise the plain Stable Diffusion pipeline class is used.

    At most two pipelines are kept in ``_loaded_models``. The oldest entries
    are evicted *before* the new pipeline is loaded, so a third set of
    weights is never placed on the GPU while two others still occupy it.

    Returns:
        The pipeline object, moved to "cuda" with attention slicing enabled.
    """
    if model_id in _loaded_models:
        return _loaded_models[model_id]

    from diffusers import StableDiffusionPipeline, StableDiffusionXLPipeline

    repo = MODEL_REPOS.get(model_id, model_id)
    is_xl = "xl" in model_id.lower() or "xl" in repo.lower()
    pipe_cls = StableDiffusionXLPipeline if is_xl else StableDiffusionPipeline

    # Make room first: drop the oldest cached pipelines (dicts iterate in
    # insertion order) and release their memory before loading new weights.
    if len(_loaded_models) >= 2:
        while len(_loaded_models) >= 2:
            oldest = next(iter(_loaded_models))
            del _loaded_models[oldest]
        gc.collect()
        torch.cuda.empty_cache()

    pipe = pipe_cls.from_pretrained(
        repo,
        torch_dtype=torch.float16,
        cache_dir=MODELS_DIR,
    ).to("cuda")
    pipe.enable_attention_slicing()

    _loaded_models[model_id] = pipe
    return pipe


def _image_to_base64(img: Image.Image, fmt: str = "PNG") -> str:
    """Serialize *img* in image format *fmt* and return it as base64 text."""
    with io.BytesIO() as buffer:
        img.save(buffer, format=fmt)
        raw_bytes = buffer.getvalue()
    return base64.b64encode(raw_bytes).decode()


def _base64_to_image(b64: str) -> Image.Image:
    """Decode the base64 string *b64* and open the bytes as a PIL image."""
    return Image.open(io.BytesIO(base64.b64decode(b64)))


# ── Inference handlers ───────────────────────────────────────────────────

def handle_generate_image(params: dict) -> dict:
    """Generate one image from a text prompt.

    Keys read from *params* (default in parentheses):
        model ("animagine_xl"), prompt (""), width (1024), height (1024),
        steps (25), seed (None, meaning no fixed seed).

    The image is saved as a PNG under ``/content/output`` and also returned
    inline.

    Returns:
        A dict with ``success``, ``image_base64`` (PNG), ``path``, ``model``,
        and the ``width`` / ``height`` of the produced image.
    """
    model = params.get("model", "animagine_xl")
    prompt = params.get("prompt", "")
    width = params.get("width", 1024)
    height = params.get("height", 1024)
    steps = params.get("steps", 25)
    seed = params.get("seed")

    pipe = _get_pipeline(model)

    # Compare against None rather than truthiness so that an explicit
    # seed of 0 is honoured instead of being treated as "no seed".
    generator = None
    if seed is not None:
        generator = torch.Generator("cuda").manual_seed(seed)

    result = pipe(
        prompt=prompt,
        width=width,
        height=height,
        num_inference_steps=steps,
        generator=generator,
    )
    img = result.images[0]

    # Save to disk; the file name is the current time in milliseconds.
    os.makedirs("/content/output", exist_ok=True)
    out_path = f"/content/output/img_{int(time.time()*1000)}.png"
    img.save(out_path)

    return {
        "success": True,
        "image_base64": _image_to_base64(img),
        "path": out_path,
        "model": model,
        "width": img.width,
        "height": img.height,
    }


def handle_generate_video_v3(params: dict) -> dict:
    """AnimateDiff V3 video generation on CUDA.

    Keys read from *params* (default in parentheses):
        prompt (""), frames (16, capped at 32), width (512), height (512),
        steps (20), guidance_scale (7.5), seed (None, meaning no fixed seed).

    The motion adapter and pipeline are loaded for this call only and are
    released again before returning, including when generation fails.

    Returns:
        A dict with ``success``, ``video_base64`` (the MP4 file contents),
        ``path``, ``model``, ``frames`` (number of frames written) and the
        requested ``width`` / ``height``.
    """
    from diffusers import AnimateDiffPipeline, MotionAdapter, DDIMScheduler
    import imageio
    import numpy as np

    prompt = params.get("prompt", "")
    frames = min(params.get("frames", 16), 32)
    width = params.get("width", 512)
    height = params.get("height", 512)
    steps = params.get("steps", 20)
    guidance_scale = params.get("guidance_scale", 7.5)
    seed = params.get("seed")

    adapter = None
    pipe = None
    try:
        # Load motion adapter
        adapter = MotionAdapter.from_pretrained(
            "guoyww/animatediff-motion-adapter-v1-5-3",
            torch_dtype=torch.float16,
            cache_dir=MODELS_DIR,
        )

        pipe = AnimateDiffPipeline.from_pretrained(
            "stable-diffusion-v1-5/stable-diffusion-v1-5",
            motion_adapter=adapter,
            torch_dtype=torch.float16,
            cache_dir=MODELS_DIR,
        ).to("cuda")

        pipe.scheduler = DDIMScheduler.from_config(
            pipe.scheduler.config,
            beta_schedule="linear",
            clip_sample=False,
        )
        pipe.enable_attention_slicing()

        # Compare against None so that an explicit seed of 0 is honoured.
        generator = None
        if seed is not None:
            generator = torch.Generator("cuda").manual_seed(seed)

        output = pipe(
            prompt=prompt,
            num_frames=frames,
            width=width,
            height=height,
            num_inference_steps=steps,
            guidance_scale=guidance_scale,
            generator=generator,
        )

        frames_list = output.frames[0]  # list of PIL images
        np_frames = [np.array(frame) for frame in frames_list]
    finally:
        # Cleanup runs on success and on failure, so a failed request does
        # not leave this pipeline's weights referenced from this call.
        del pipe, adapter
        gc.collect()
        torch.cuda.empty_cache()

    # Export to MP4 via imageio
    os.makedirs("/content/output", exist_ok=True)
    out_path = f"/content/output/vid_{int(time.time()*1000)}.mp4"
    imageio.mimwrite(out_path, np_frames, fps=12, quality=8)

    with open(out_path, "rb") as f:
        video_b64 = base64.b64encode(f.read()).decode()

    return {
        "success": True,
        "video_base64": video_b64,
        "path": out_path,
        "model": "animatediff_v3",
        "frames": len(np_frames),
        "width": width,
        "height": height,
    }


def handle_style_transfer(params: dict) -> dict:
    """Handle a style-transfer request (AnimeGAN).

    Expects the input picture as base64 under ``image_base64``. As written,
    the picture is decoded and re-encoded as PNG without any model being
    applied; the ``model`` field in the reply simply echoes the requested
    name (default "animegan_v3").

    Returns an error dict when no image is supplied.
    """
    encoded_input = params.get("image_base64", "")
    if encoded_input:
        picture = _base64_to_image(encoded_input)
        return {
            "success": True,
            "image_base64": _image_to_base64(picture),
            "model": params.get("model", "animegan_v3"),
        }
    return {"success": False, "error": "No image provided"}


# ── Action dispatcher ────────────────────────────────────────────────────

# Maps the "action" field of an /infer request to the function handling it.
# Key order is preserved and is the order reported as supported actions.
ACTION_MAP = dict(
    generate_image=handle_generate_image,
    generate_video_v3=handle_generate_video_v3,
    generate_video=handle_generate_video_v3,  # alias
    style_transfer=handle_style_transfer,
)


# ── FastAPI routes ───────────────────────────────────────────────────────

@app.get("/health")
async def health():
    """Report GPU name, VRAM totals/usage (GB), loaded models and actions.

    When CUDA is unavailable the GPU is reported as "none" with zero VRAM.
    """
    gib = 1024**3
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        vram_total = _get_vram_bytes() / gib
        vram_used = torch.cuda.memory_allocated(0) / gib
    else:
        gpu_name, vram_total, vram_used = "none", 0, 0

    report = {"status": "ok", "gpu": gpu_name}
    report["vram_total_gb"] = round(vram_total, 1)
    report["vram_used_gb"] = round(vram_used, 1)
    report["models_loaded"] = list(_loaded_models.keys())
    report["supported_actions"] = list(ACTION_MAP.keys())
    return report


@app.post("/infer")
async def infer(request: dict):
    """Dispatch an inference request to the handler named by its "action".

    The whole request dict is passed to the handler. Unknown actions and
    exceptions raised by a handler are both reported as
    ``{"success": False, "error": ...}`` rather than raised; handler
    exceptions are additionally printed with their traceback.

    NOTE(review): the handler is called synchronously inside this coroutine.
    """
    import traceback

    action = request.get("action", "")
    handler = ACTION_MAP.get(action)

    if handler:
        try:
            return handler(request)
        except Exception as exc:
            traceback.print_exc()
            return {"success": False, "error": str(exc)}

    supported = list(ACTION_MAP.keys())
    return {"success": False, "error": f"Unknown action: {action}. Supported: {supported}"}


# Only definitions happen in this cell; nothing is listening yet.
print("Inference server defined. Run next cell to start.")

In [None]:
#@title 5. Start Server
#@markdown This cell blocks while the server runs. To stop, interrupt the kernel.

import uvicorn
import asyncio

print("Starting inference server on port 8000...")
print(f"Remote access: http://66.29.128.32:9530/health")
print(f"Local access:  http://localhost:8000/health")
print()

config = uvicorn.Config(app, host="0.0.0.0", port=8000, log_level="info")
server = uvicorn.Server(config)
await server.serve()