# 🧠 Qwen 32B on Colab with AirLLM

Run **Qwen2.5-Coder-32B** on limited VRAM using **AirLLM's layer-by-layer inference**.

> ⚠️ GLM-4.7 is not supported: it requires a newer transformers release that is incompatible with AirLLM.

---

In [None]:
#@title 1Ô∏è‚É£ Setup (~2 min)
!nvidia-smi --query-gpu=name,memory.free --format=csv 2>/dev/null || echo 'No GPU'

# Pinned versions for AirLLM compatibility
!pip install -q "optimum<2.0" "transformers<4.49" 2>&1 | tail -1
!pip install -q airllm accelerate bitsandbytes sentencepiece flask einops 2>&1 | tail -1
!wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb && dpkg -i cloudflared-linux-amd64.deb 2>/dev/null

import torch
print(f"\n‚úÖ Ready! GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

In [None]:
#@title 2️⃣ Select Model
import re

# Qwen models work great with AirLLM
# The trailing "#@param" annotations are Colab form markers for these settings.
# A non-blank CUSTOM_MODEL (repo id or Hugging Face URL) overrides PRESET_MODEL
# when MODEL_ID is computed further down in this cell.
PRESET_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"  #@param ["Qwen/Qwen2.5-Coder-32B-Instruct", "Qwen/Qwen2.5-Coder-14B-Instruct", "Qwen/Qwen2.5-Coder-7B-Instruct", "Qwen/Qwen2.5-72B-Instruct", "THUDM/chatglm3-6b", "Custom"]
CUSTOM_MODEL = ""  #@param {type:"string"}
# Forwarded as the `compression` argument when the model is loaded; the
# string "none" is translated to None at that point.
COMPRESSION = "4bit"  #@param ["4bit", "8bit", "none"]

def parse_hf_url(url):
    """Return the ``owner/name`` repo id for a Hugging Face model link.

    Recognises ``huggingface.co`` and ``hf.co`` links, with or without an
    ``http(s)://`` scheme or a ``www.`` prefix. Anything after the repo name
    (``/tree/main``, ``?query``, ``#fragment``) is ignored.

    Args:
        url: A Hugging Face URL or an ``owner/name`` repo id.

    Returns:
        The extracted repo id, or the input with surrounding whitespace
        stripped when it is not a recognised link (it is then assumed to be
        a repo id already).
    """
    text = url.strip()
    # Path segments stop at '/', '?', '#' or whitespace so query strings and
    # fragments never leak into the repo id.
    match = re.match(
        r'(?:https?://)?(?:www\.)?(?:huggingface|hf)\.co/([^/?#\s]+/[^/?#\s]+)',
        text,
    )
    return match.group(1) if match else text

# A non-blank CUSTOM_MODEL (repo id or URL) takes precedence over the preset.
if CUSTOM_MODEL.strip():
    MODEL_ID = parse_hf_url(CUSTOM_MODEL)
elif PRESET_MODEL == "Custom":
    # "Custom" is only a placeholder entry in the preset list, not a repo id;
    # fail here rather than later while trying to download a model named "Custom".
    raise ValueError('PRESET_MODEL is "Custom" but CUSTOM_MODEL is empty; enter a model id or URL.')
else:
    MODEL_ID = PRESET_MODEL
# Short name: last path segment, lower-cased, with '-' and '.' replaced by '_'.
MODEL_NAME = MODEL_ID.split('/')[-1].lower().replace('-', '_').replace('.', '_')
print(f"📦 Model: {MODEL_ID} | Compression: {COMPRESSION}")

In [None]:
#@title 3️⃣ Load Model with AirLLM (⏳ 15-30 min first time)
from airllm import AutoModel
import gc, torch

# Drop unreachable Python objects and cached CUDA blocks before loading.
gc.collect()
torch.cuda.empty_cache()
print(f"🔄 Loading {MODEL_ID}...")
print("⏳ First download: 15-30 min for 32B model\n")

# The form uses the string 'none' for "no compression"; AirLLM is given None then.
compression = None if COMPRESSION == 'none' else COMPRESSION
model = AutoModel.from_pretrained(MODEL_ID, compression=compression)
print(f"\n✅ Loaded: {MODEL_ID}")

In [None]:
#@title 4️⃣ Test Generation
prompt = "Write a Python function to check if a number is prime:"
print(f"Testing: {prompt[:50]}...\n")

# Prefer the tokenizer's own chat template; fall back to a plain transcript
# when applying it fails.
try:
    formatted = model.tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        tokenize=False,
        add_generation_prompt=True,
    )
except Exception:  # not a bare except: a kernel interrupt must still propagate
    formatted = f"User: {prompt}\nAssistant:"

ids = model.tokenizer(formatted, return_tensors="pt").input_ids.to('cuda')
out = model.generate(ids, max_new_tokens=256, do_sample=True, temperature=0.7)
# The whole returned sequence is decoded here, without slicing off the input ids.
print(model.tokenizer.decode(out[0], skip_special_tokens=True))

In [None]:
#@title 5️⃣ Start API Server
from flask import Flask, request, jsonify
import threading, time, uuid

# Flask application on which the route handlers in this cell are registered.
app = Flask(__name__)

@app.route('/v1/models', methods=['GET'])
def models_list():
    """Return a JSON ``list`` object whose only entry is the loaded model."""
    entry = {'id': MODEL_NAME, 'object': 'model'}
    listing = {'object': 'list', 'data': [entry]}
    return jsonify(listing)

@app.route('/v1/chat/completions', methods=['POST'])
def chat():
    """Generate one assistant reply for a chat-completions style request.

    Reads ``messages``, ``max_tokens`` (default 512, clamped to 1..2048) and
    ``temperature`` (default 0.7) from the JSON body. A temperature of 0 or
    below selects greedy decoding instead of sampling.

    Returns:
        A ``chat.completion`` JSON object with a single choice, or an error
        object with HTTP 400 when the body is not a JSON object, ``messages``
        is not a non-empty list of objects, or a numeric field is not a number.
    """
    def bad_request(message):
        # Small local helper so every validation failure has the same shape.
        return jsonify({'error': {'message': message, 'type': 'invalid_request_error'}}), 400

    # force=True parses the body regardless of Content-Type; silent=True yields
    # None instead of raising on malformed JSON.
    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        return bad_request('Request body must be a JSON object.')

    msgs = data.get('messages')
    if not isinstance(msgs, list) or not msgs or not all(isinstance(m, dict) for m in msgs):
        return bad_request('`messages` must be a non-empty list of message objects.')

    # Clients may send explicit nulls; treat them like missing fields.
    raw_max_tokens = data.get('max_tokens')
    raw_temperature = data.get('temperature')
    try:
        max_tokens = 512 if raw_max_tokens is None else int(raw_max_tokens)
        temperature = 0.7 if raw_temperature is None else float(raw_temperature)
    except (TypeError, ValueError):
        return bad_request('`max_tokens` and `temperature` must be numbers.')

    try:
        fmt = model.tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
    except Exception:
        # Chat template could not be applied: fall back to a plain transcript.
        fmt = "\n".join(
            [f"{str(m.get('role', 'user')).title()}: {m.get('content', '')}" for m in msgs]
        ) + "\nAssistant:"

    gen_kwargs = {'max_new_tokens': max(1, min(max_tokens, 2048))}
    if temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature)
    else:
        # Sampling requires a strictly positive temperature; 0 means greedy.
        gen_kwargs['do_sample'] = False

    ids = model.tokenizer(fmt, return_tensors="pt").input_ids.to('cuda')
    out = model.generate(ids, **gen_kwargs)
    # Skip the prompt tokens so only the newly generated text is returned.
    txt = model.tokenizer.decode(out[0][len(ids[0]):], skip_special_tokens=True)
    return jsonify({
        'id': f'chatcmpl-{uuid.uuid4().hex[:8]}',
        'object': 'chat.completion',
        'created': int(time.time()),
        'model': MODEL_NAME,
        'choices': [{
            'index': 0,
            'message': {'role': 'assistant', 'content': txt.strip()},
            'finish_reason': 'stop',
        }],
    })

@app.route('/health', methods=['GET'])
def health():
    """Return a JSON status object naming the served model."""
    status = {'status': 'ok', 'model': MODEL_NAME}
    return jsonify(status)

# Run Flask in a daemon thread so the cell returns while the server keeps
# serving on port 5000.
server_thread = threading.Thread(
    target=app.run,
    kwargs={'host': '0.0.0.0', 'port': 5000, 'threaded': True},
    daemon=True,
)
server_thread.start()
time.sleep(2)  # presumably gives the server a moment to come up before reporting
print("✅ API Ready: /v1/chat/completions")

In [None]:
#@title 6️⃣ Start Tunnel 🌐 (Keep Running!)
import subprocess, re
from IPython.display import display, HTML

print("🚀 Starting tunnel...\n")
# stderr is merged into stdout so all of cloudflared's output is read from one pipe.
tunnel = subprocess.Popen(
    ['cloudflared', 'tunnel', '--url', 'http://localhost:5000'],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
)

# A line may mention the domain without containing an https URL; such lines
# are skipped instead of failing on a None match.
url_pattern = re.compile(r'https://[^\s]+\.trycloudflare\.com')
for line in tunnel.stdout:
    found = url_pattern.search(line)
    if found is None:
        continue
    url = found.group()
    display(HTML(f'<div style="background:#10b981;padding:15px;border-radius:10px;color:white"><h3>🎉 API Live!</h3><code>{url}/v1</code><br><small>Model: {MODEL_NAME}</small></div>'))
    print("\n📋 OpenCode Config:")
    print(f"   Base URL: {url}/v1")
    print(f"   Model: {MODEL_NAME}")
    print("   API Key: sk-dummy")
    break
else:
    # The output stream ended without a URL, i.e. cloudflared exited early.
    print("cloudflared exited before reporting a tunnel URL.")

# Keep echoing cloudflared's output; this loop runs until the process closes its output.
for line in tunnel.stdout:
    print(line, end='')