# Fast Inference Test: Qwen3-4B-Thinking-2507

Tests `fast_inference=True` with vLLM backend on Qwen3-4B-Thinking-2507.

**Key features tested:**
- FastLanguageModel loading with fast_inference mode
- Thinking model output with `<think>...</think>` tags
- Parsing and displaying thinking vs response separately

**Important:** This notebook includes a kernel shutdown cell at the end.
vLLM does not release GPU memory in single-process mode (Jupyter), so a kernel
restart is required between tests of different models.

In [None]:
# Environment Setup (quiet mode)
# Silences warnings, progress bars, and library loggers, imports the inference
# stack, and prints a one-line summary of the library versions and the GPU.
# Statement order matters here: the quieting steps run before the imports
# they are meant to affect.
import warnings
import os
import sys
import logging

# Suppress all verbose output
# Blanket filter: every warning category is ignored from this point on.
warnings.filterwarnings("ignore")
# Both flags are placed in the environment before the heavy imports below.
# NOTE(review): presumably read by huggingface_hub and tqdm respectively - confirm.
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["TQDM_DISABLE"] = "1"
# Only ERROR and above is emitted by these three named loggers.
logging.getLogger("unsloth").setLevel(logging.ERROR)
logging.getLogger("vllm").setLevel(logging.ERROR)
logging.getLogger("transformers").setLevel(logging.ERROR)

# Load variables from a .env file (if one is found) into the process environment.
# NOTE(review): presumably used for credentials such as a Hugging Face token - confirm.
from dotenv import load_dotenv
load_dotenv()

# Suppress unsloth banner during import
# Python-level stdout/stderr are pointed at throwaway in-memory buffers for
# the duration of the import, so anything printed there is discarded.
from contextlib import redirect_stdout, redirect_stderr
from io import StringIO
with redirect_stdout(StringIO()), redirect_stderr(StringIO()):
    import unsloth
    from unsloth import FastLanguageModel

# Imported after unsloth; these imports are not wrapped in the redirection above.
import vllm
import torch

# Suppress model loading verbosity
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

# Single-line environment summary
# Falls back to the literal "CPU" when CUDA is unavailable; otherwise reports
# the name of device index 0.
gpu = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
print(f"Environment: unsloth {unsloth.__version__}, vLLM {vllm.__version__}, {gpu}")

In [None]:
# Test Qwen3-4B-Thinking-2507 with fast_inference=True
# Loads the 4-bit checkpoint through FastLanguageModel with fast_inference
# enabled. File descriptors 1 and 2 are pointed at the null device while the
# model loads, so output written at the fd level is silenced as well as
# Python-level prints.
MODEL_NAME = "unsloth/Qwen3-4B-Thinking-2507-unsloth-bnb-4bit"
print(f"\nTesting {MODEL_NAME.split('/')[-1]} with fast_inference=True...")

from vllm import SamplingParams
import time
import os
import sys

# Flush Python's own buffers BEFORE repointing the descriptors. Otherwise text
# that is still buffered (such as the line printed above) could be written
# while fd 1/2 point at the null device and silently disappear.
sys.stdout.flush()
sys.stderr.flush()

# Suppress verbose model loading output by redirecting to /dev/null
# Keep duplicates of the real stdout/stderr so they can be restored afterwards.
_stdout_fd = os.dup(1)
_stderr_fd = os.dup(2)
_devnull = os.open(os.devnull, os.O_WRONLY)

try:
    # The redirection itself sits inside the try so that a failure part-way
    # through still restores the original descriptors in the finally clause.
    os.dup2(_devnull, 1)
    os.dup2(_devnull, 2)

    model, tokenizer = FastLanguageModel.from_pretrained(
        MODEL_NAME,
        max_seq_length=1024,  # Increased for thinking content
        load_in_4bit=True,
        fast_inference=True,
    )
finally:
    # Drain anything buffered during loading while the descriptors still point
    # at the null device, so it is discarded instead of appearing after the
    # restore.
    sys.stdout.flush()
    sys.stderr.flush()
    os.dup2(_stdout_fd, 1)
    os.dup2(_stderr_fd, 2)
    os.close(_devnull)
    os.close(_stdout_fd)
    os.close(_stderr_fd)

print(f"Model loaded: {type(model).__name__}")

In [None]:
# Test generation with thinking model
# Builds a single-turn chat prompt, runs one generation through the model's
# fast_generate path, times it, and prints the raw text. `elapsed` and
# `raw_output` are left in the notebook namespace for the following cells.
FastLanguageModel.for_inference(model)

# Use a prompt that encourages reasoning
messages = [{"role": "user", "content": "What is 15 + 27? Show your thinking."}]
# tokenize=False returns the rendered prompt as text; add_generation_prompt=True
# appends the assistant-turn prefix from the chat template.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

sampling_params = SamplingParams(
    temperature=0.6,  # Recommended for thinking models
    top_p=0.95,
    top_k=20,
    # NOTE(review): 256 new tokens may end generation before the closing
    # </think> tag is produced - confirm this budget is sufficient.
    max_tokens=256,  # Allow room for thinking
)

import time
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; time.time() is wall-clock time and can jump if the system clock
# is adjusted while the generation is running.
start = time.perf_counter()
outputs = model.fast_generate([prompt], sampling_params=sampling_params)
elapsed = time.perf_counter() - start

# Get the raw output
# One prompt was submitted, so the first request's first completion is read.
raw_output = outputs[0].outputs[0].text
print(f"Generation time: {elapsed:.2f}s")
print(f"\n{'='*60}")
print("RAW OUTPUT:")
print(f"{'='*60}")
print(raw_output)

In [None]:
# Parse thinking content vs final response
import re

def parse_thinking_response(text: str) -> tuple[str, str]:
    """Split thinking-model output into its reasoning and its final answer.

    Four shapes of output are recognised, checked in this order:

    1. A complete ``<think>...</think>`` block: the text between the tags is
       the thinking and everything after ``</think>`` is the response.
    2. Only a closing ``</think>`` (the chat template may already have emitted
       the opening tag as part of the prompt): everything before the tag is
       the thinking and everything after it is the response.
    3. Only an opening ``<think>`` (generation stopped before the block was
       closed, e.g. the token limit was reached): everything after the tag is
       the thinking and any text before it is the response.
    4. No tags at all: the whole text is the response.

    Args:
        text: Raw generated text.

    Returns:
        A ``(thinking, response)`` tuple of whitespace-stripped strings.
        Either element is ``""`` when that part is absent.
    """
    open_tag = "<think>"
    close_tag = "</think>"

    # Case 1: a properly delimited block. DOTALL lets the reasoning span
    # multiple lines; the non-greedy group stops at the first closing tag.
    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if think_match:
        return think_match.group(1).strip(), text[think_match.end():].strip()

    # Case 2: closing tag without a preceding opening tag.
    thinking, found_close, response = text.partition(close_tag)
    if found_close:
        return thinking.strip(), response.strip()

    # Case 3: opening tag that was never closed. Treat what follows as
    # (truncated) thinking rather than reporting it, tag included, as the
    # final response.
    before, found_open, after = text.partition(open_tag)
    if found_open:
        return after.strip(), before.strip()

    # Case 4: plain text with no thinking markers.
    return "", text.strip()

# Separate the raw generation into reasoning and answer, then show each part
# under its own banner (with a placeholder when a part is empty).
thinking_content, response_content = parse_thinking_response(raw_output)

print("=" * 60)
print("THINKING CONTENT:")
print("=" * 60)
print(thinking_content or "(No thinking content found)")

print("\n" + "=" * 60)
print("FINAL RESPONSE:")
print("=" * 60)
print(response_content or "(No response found)")

In [None]:
# Verification summary
# A part counts as present when its parsed string is non-empty.
has_thinking, has_response = bool(thinking_content), bool(response_content)

# One print call, one report line per argument.
print(
    "\n" + "=" * 60,
    "VERIFICATION SUMMARY",
    "=" * 60,
    f"Model: {MODEL_NAME}",
    "FastInference: ✅ SUPPORTED",
    "Thinking tags present: " + ("✅ YES" if has_thinking else "❌ NO"),
    "Response generated: " + ("✅ YES" if has_response else "❌ NO"),
    f"Generation time: {elapsed:.2f}s",
    "=" * 60,
    sep="\n",
)

# The test passes only when both the reasoning and the answer were produced.
if not (has_thinking and has_response):
    print("\n⚠️ Test completed but thinking output may need review")
else:
    print("\n✅ Qwen3-4B-Thinking-2507 Fast Inference Test PASSED")

## Test Complete

The Qwen3-4B-Thinking-2507 fast_inference test has completed. The kernel will now shut down to release all GPU memory.

### What Was Verified
- FastLanguageModel loading with fast_inference mode (vLLM backend)
- Thinking model generates `<think>...</think>` content
- Parsing separates thinking from final response
- Self-questioning reasoning style in the thinking block (check the printed thinking content by eye; the notebook does not assert this automatically)

### Ready for Production
If this test passed, your environment is ready for:
- Thinking model inference with vLLM acceleration
- Chain-of-thought reasoning workflows
- Training notebooks that require thinking output parsing

In [None]:
# Shutdown kernel to release all GPU memory
# Looks up the running application singleton and asks its kernel to shut down
# without restarting. When the code runs outside a Jupyter kernel (for example
# as an exported script), the application object has no `kernel` attribute;
# that case is reported instead of raising AttributeError.
import IPython
app = IPython.Application.instance()
_kernel = getattr(app, "kernel", None)
if _kernel is None:
    print("No Jupyter kernel detected; skipping kernel shutdown.")
else:
    print("Shutting down kernel to release GPU memory...")
    _kernel.do_shutdown(restart=False)