In [3]:
# @title Request for quota

# @markdown For serving Llama 3.1 8B and Qwen3 32B models, we need 1 and 4 TPU v6es, respectively.

# @markdown > | Model | Accelerator Type |
# @markdown | ----------- | ----------- |
# @markdown | Llama 3.1 8B |1 TPU v6e (ct6e-standard-1t)|
# @markdown | Qwen3 32B|4 TPU v6e (ct6e-standard-4t)|

In [4]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. **[Optional]** [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. Set the BUCKET_URI for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the same region as where the notebook was launched. Note that a multi-region bucket (eg. "us") is not considered a match for a single region covered by the multi-region range (eg. "us-central1"). If not set, a unique GCS bucket will be created instead.

# BUCKET_URI = "gs://"  # @param {type:"string"}

# @markdown 3. **[Optional]** Set region. If not set, the region will be set automatically according to Colab Enterprise environment.

# REGION = ""  # @param {type:"string"}

# Upgrade Vertex AI SDK.
! pip3 install --upgrade --quiet 'google-cloud-aiplatform>=1.64.0'

# Import the necessary packages
import datetime
import importlib
import os
import uuid
from typing import Tuple

from google.cloud import aiplatform

! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

models, endpoints = {}, {}

common_util = importlib.import_module(
    "vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)

# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

PROJECT_IDS = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_IDS[0]  # @param {type:"string"}

if not PROJECT_ID:
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

LOCATION = "europe-west4" #"us-south1" #"us-central1" # @param {type:"string"}

os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID
os.environ["GOOGLE_CLOUD_LOCATION"] = LOCATION
os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "TRUE" # Use Vertex AI API

BUCKET_URI = "gs://llama31_training-europe"  # @param {type:"string"}

# @markdown 3. **[Optional]** Set region. If not set, the region will be set automatically according to Colab Enterprise environment.

REGION = LOCATION # "us-south1"  # @param {type:"string"}

# Get the default region for launching jobs.
if not REGION:
    if not os.environ.get("GOOGLE_CLOUD_REGION"):
        raise ValueError(
            "REGION must be set. See"
            " https://cloud.google.com/vertex-ai/docs/general/locations for"
            " available cloud locations."
        )
    REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Enable the Vertex AI API and Compute Engine API, if not already.
print("Enabling Vertex AI API and Compute Engine API.")
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

# Cloud Storage bucket for storing the experiment artifacts.
# A unique GCS bucket will be created for the purpose of this notebook. If you
# prefer using your own GCS bucket, change the value yourself below.
now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])

if BUCKET_URI is None or BUCKET_URI.strip() == "" or BUCKET_URI == "gs://":
    BUCKET_URI = f"gs://{PROJECT_ID}-tmp-{now}-{str(uuid.uuid4())[:4]}"
    BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
    ! gsutil mb -l {REGION} {BUCKET_URI}
else:
    assert BUCKET_URI.startswith("gs://"), "BUCKET_URI must start with `gs://`."
    shell_output = ! gsutil ls -Lb {BUCKET_NAME} | grep "Location constraint:" | sed "s/Location constraint://"
    bucket_region = shell_output[0].strip().lower()
    if bucket_region != REGION:
        raise ValueError(
            "Bucket region %s is different from notebook region %s"
            % (bucket_region, REGION)
        )
print(f"Using this GCS Bucket: {BUCKET_URI}")

STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
MODEL_BUCKET = os.path.join(BUCKET_URI, "vllm_tpu")


# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

# Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)


# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket
# ! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME

# ! gcloud config set project $PROJECT_ID
# ! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/storage.admin"
# ! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/aiplatform.user"

[0mfatal: destination path 'vertex-ai-samples' already exists and is not an empty directory.
Enabling Vertex AI API and Compute Engine API.
Operation "operations/acat.p2-87995179092-c0f22ec4-34a8-45ce-a7b2-61e5fc330d0f" finished successfully.
Using this GCS Bucket: gs://llama31_training-europe
Initializing Vertex AI API.
Using this default Service Account: 87995179092-compute@developer.gserviceaccount.com


In [5]:
# @title Access the models
# @markdown ### Access Llama 3.1 and Qwen3 models on Vertex AI for serving
# @markdown The models from the Hugging Face can be used for serving in Vertex AI.
# @markdown 1. Open the [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) and [Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) models from [Hugging Face](https://huggingface.co/).
# @markdown 2. Review and accept the agreement.
# @markdown 3. After accepting the agreement, models will be available for serving.
# @markdown 4. You must provide a Hugging Face User Access Token (with read access) to access the Llama 3.1 model. You can follow the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens) to create a **read** access token and put it in the `HF_TOKEN` field below.

import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")

if not HF_TOKEN:
    print("Error: HF_TOKEN not found in .env file or not provided.")
    print("Please provide a read HF_TOKEN to Llama 3.1 model from Hugging Face in your .env file.")
else:
    print("HF_TOKEN loaded successfully.")
    # You can now use HF_TOKEN in your code, e.g., to authenticate with Hugging Face models

HF_TOKEN loaded successfully.


In [7]:
# @title Prepare

# @markdown In this section you can choose a desired model and the region for TPU deployment.
# @markdown Learn about [TPU v6e machine types](https://cloud.google.com/tpu/docs/v6e#configurations) for Vertex AI prediction.

# @markdown Here are 2 example models you can run:

MODEL_ID = "Llama-3.1-8B-Instruct"  # @param ["Llama-3.1-8B-Instruct", "Qwen3-32B"] {isTemplate: true}

TPU_DEPLOYMENT_REGION = "europe-west4"  # @param {type:"string"}

tpu_type = "TPU_V6e"


if "Llama-3" in MODEL_ID:
    model_path_prefix = "meta-llama/"
    model_id = os.path.join(model_path_prefix, MODEL_ID)
    model_publisher = "meta"
    model_publisher_id = "llama3"
    machine_type = "ct6e-standard-1t"
    tpu_count = 1
    tpu_topo = "1x1"
    print(MODEL_ID, "will run on", tpu_count, "tpu")
elif "Qwen3" in MODEL_ID:
    model_path_prefix = "Qwen/"
    model_id = os.path.join(model_path_prefix, MODEL_ID)
    model_publisher = "qwen"
    model_publisher_id = "qwen3"
    machine_type = "ct6e-standard-4t"
    tpu_count = 4
    tpu_topo = "2x2"
    print(MODEL_ID, "will run on", tpu_count, "tpus")
else:
    raise ValueError(f"Unsupported MODEL_ID: {MODEL_ID}")


vLLM_TPU_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20250529_0917_tpu_experimental_RC00"
# @markdown Set `use_dedicated_endpoint` to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint).
use_dedicated_endpoint = True  # @param {type:"boolean"}


# common_util.check_quota(
#     project_id=PROJECT_ID,
#     region=TPU_DEPLOYMENT_REGION,
#     accelerator_type=tpu_type,
#     accelerator_count=tpu_count,
#     is_for_training=False,
# )


# Server parameters.
tensor_parallel_size = tpu_count

# Fraction of HBM memory allocated for KV cache after model loading. A larger value improves throughput but gives higher risk of TPU out-of-memory errors with long prompts.

# Maximum number of running sequences in a continuous batch.
max_running_seqs = 256  # @param
# Maximum context length for a request.
max_model_len = 2048  # @param

# Endpoint configurations.
min_replica_count = 1
max_replica_count = 1

run_name = "llama3"  # @param {type:"string"}

# @markdown Note: The vLLM-TPU container used in this notebook is in experimental status.

Llama-3.1-8B-Instruct will run on 1 tpu


## Deploy prebuilt Llama 3.1 8B or Qwen3 32B models with vLLM on TPUs
This section will download the prebuilt model chosen in the previous section and deploys it to a Vertex AI Endpoint. It takes 15 minutes to 1 hour to finish depending on the size of the model.

In [None]:
# @title Deploy
def deploy_model_vllm_tpu(
    model_name: str,
    model_id: str,
    publisher: str,
    publisher_model_id: str,
    service_account: str,
    base_model_id: str = None,
    tensor_parallel_size: int = 1,
    machine_type: str = "ct6e-standard-1t",
    tpu_topology: str = "1x1",
    max_model_len: int = 4096,
    enable_chunked_prefill: bool = False,
    enable_prefix_cache: bool = False,
    endpoint_id: str = "",
    min_replica_count: int = 1,
    max_replica_count: int = 1,
    use_dedicated_endpoint: bool = False,
    model_type: str = None,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Deploys models with vLLM on TPU in Vertex AI."""
    if endpoint_id:
        aip_endpoint_name = (
            f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_id}"
        )
        endpoint = aiplatform.Endpoint(aip_endpoint_name)
    else:
        endpoint = aiplatform.Endpoint.create(
            display_name=f"{model_name}-endpoint",
            location=TPU_DEPLOYMENT_REGION,
            dedicated_endpoint_enabled=use_dedicated_endpoint,
        )

    if not base_model_id:
        base_model_id = model_id

    if not tensor_parallel_size:
        tensor_parallel_size = int(machine_type[-2])

    num_hosts = int(tpu_topology.split("x")[0])

    vllmtpu_args = [
        "python",
        "-m",
        "vllm.entrypoints.api_server",
        "--host=0.0.0.0",
        "--port=7080",
        f"--model={model_id}",
        f"--tensor_parallel_size={tensor_parallel_size}",
        f"--max_model_len={max_model_len}",
    ]

    if enable_chunked_prefill:
        vllmtpu_args.append("--enable-chunked-prefill")

    if enable_prefix_cache:
        vllmtpu_args.append("--enable-prefix-caching")

    env_vars = {
        "MODEL_ID": base_model_id,
        "DEPLOY_SOURCE": "notebook",
        "VLLM_USE_V1": "1",
    }

    # HF_TOKEN is not a compulsory field and may not be defined.
    try:
        if HF_TOKEN:
            env_vars["HF_TOKEN"] = HF_TOKEN
    except NameError:
        pass

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=vLLM_TPU_DOCKER_URI,
        serving_container_args=vllmtpu_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=env_vars,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
        model_garden_source_model_name=(
            f"publishers/{publisher}/models/{publisher_model_id}"
        ),
        location=TPU_DEPLOYMENT_REGION,
    )

    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        tpu_topology=tpu_topology if num_hosts > 1 else None,
        deploy_request_timeout=1800,
        service_account=service_account,
        min_replica_count=min_replica_count,
        max_replica_count=max_replica_count,
        system_labels={
            "NOTEBOOK_NAME": "model_garden_pytorch_llama3_1_qwen3_deployment_tpu.ipynb",
        },
    )
    return model, endpoint


models["vllmtpu"], endpoints["vllmtpu"] = deploy_model_vllm_tpu(
    model_name=common_util.get_job_name_with_datetime(prefix=run_name),
    model_id=model_id,
    publisher=model_publisher,
    publisher_model_id=model_publisher_id,
    service_account=SERVICE_ACCOUNT,
    tensor_parallel_size=tensor_parallel_size,
    machine_type=machine_type,
    tpu_topology=tpu_topo,
    max_model_len=max_model_len,
    enable_chunked_prefill=True,
    enable_prefix_cache=True,
    use_dedicated_endpoint=use_dedicated_endpoint,
)

Creating Endpoint
Create Endpoint backing LRO: projects/87995179092/locations/europe-west4/endpoints/6807034706119360512/operations/1967437656879005696
Endpoint created. Resource name: projects/87995179092/locations/europe-west4/endpoints/6807034706119360512
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/87995179092/locations/europe-west4/endpoints/6807034706119360512')
Creating Model
Create Model backing LRO: projects/87995179092/locations/europe-west4/models/615869448066170880/operations/7596937191092125696
Model created. Resource name: projects/87995179092/locations/europe-west4/models/615869448066170880@1
To use this Model in another session:
model = aiplatform.Model('projects/87995179092/locations/europe-west4/models/615869448066170880@1')
Deploying model to Endpoint : projects/87995179092/locations/europe-west4/endpoints/6807034706119360512
Deploy Endpoint model backing LRO: projects/87995179092/locations/europe-west4/endpoints/68070347061193605

In [11]:
# @title Raw predict

# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://docs.vllm.ai/en/latest/dev/sampling_params.html).

# @markdown Example:

# @markdown ```
# @markdown Human: What is a car?
# @markdown Assistant:  A car, or a motor car, is a road-connected human-transportation system used to move people or goods from one place to another. The term also encompasses a wide range of vehicles, including motorboats, trains, and aircrafts. Cars typically have four wheels, a cabin for passengers, and an engine or motor. They have been around since the early 19th century and are now one of the most popular forms of transportation, used for daily commuting, shopping, and other purposes.
# @markdown ```
# @markdown Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint.name` allows us to get the
#   endpoint name of the endpoint `endpoint` created in the cell
#   above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint = aiplatform.Endpoint(aip_endpoint_name)

prompt = "What is a car that can run on the wall?"  # @param {type: "string"}
# @markdown If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, by lowering `max_tokens`.
max_tokens = 50  # @param {type:"integer"}
temperature = 1.0  # @param {type:"number"}

# @markdown Set `raw_response` to `True` to obtain the raw model output. Set `raw_response` to `False` to apply additional formatting in the structure of `"Prompt:\n{prompt.strip()}\nOutput:\n{output}"`.
raw_response = False  # @param {type:"boolean"}

# Overrides parameters for inferences.
instances = [
    {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "raw_response": raw_response,
    },
]
response = endpoints["vllmtpu"].predict(
    instances=instances, use_dedicated_endpoint=use_dedicated_endpoint
)

for prediction in response.predictions:
    print(prediction)
# @markdown Note Top-k sampling is not currently enabled for vLLM on TPU.

Prompt:
What is a car that can run on the wall?
Output:
 Very unlikely car I know, but it grabs me. Its not the very best car at all. Its Power generation is very weak like I said before dont expect alot, This thing has got a small Battery DC 12 VOLTS 250 AH with


### Experiment 

In [18]:
import time
import json
import numpy as np
import pandas as pd
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
import statistics
from datetime import datetime
import os

# Test configuration - REDUCED for testing
TEST_CONFIG = {
    'concurrent_users': 250,      # Start small to test endpoint stability
    'total_requests': 50,       # Reduce for initial testing
    'input_token_length': 265,  
    'output_tokens': 317,       
    'temperature': 0.7,
    'top_p': 1.0,
    'max_tokens': 350,
    'stream': True
}

# Create timestamped filenames
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = "vllm_performance_tests"
os.makedirs(output_dir, exist_ok=True)

csv_filename = f"{output_dir}/vllm_test_{timestamp}.csv"
detailed_csv_filename = f"{output_dir}/vllm_detailed_{timestamp}.csv"
md_filename = f"{output_dir}/vllm_report_{timestamp}.md"

print(f"Test started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Output files will be saved as:")
print(f"- Summary CSV: {csv_filename}")
print(f"- Detailed CSV: {detailed_csv_filename}")
print(f"- Report MD: {md_filename}")

# Generate test prompts
def generate_test_prompt(target_tokens=265):
    base_prompt = """Analyze the following business scenario and provide recommendations:

A technology startup is developing an AI-powered customer service platform. They need to understand market positioning, competitive analysis, implementation strategy, and growth projections. Consider technical requirements, user experience design, scalability concerns, and business model validation.

Please provide strategic insights covering market analysis, technical architecture, user acquisition strategies, and financial projections for the next 24 months."""
    
    return base_prompt

# Metrics collection
metrics = {
    'ttft_times': [],
    'inter_token_latencies': [],
    'end_to_end_times': [],
    'input_tokens': [],
    'output_tokens': [],
    'request_errors': [],
    'timestamps': []
}

metrics_lock = threading.Lock()

def make_request(request_id, prompt, config):
    """Single request function with enhanced error handling"""
    start_time = time.time()
    
    try:
        # Prepare request with timeout handling
        instances = [{
            "prompt": prompt,
            "max_tokens": config['max_tokens'],
            "temperature": config['temperature'],
            "top_p": config.get('top_p', 1.0),
            "raw_response": True,
        }]
        
        request_start = time.time()
        
        # Add retry logic for 502 errors
        max_retries = 2
        for attempt in range(max_retries + 1):
            try:
                response = endpoints["vllmtpu"].predict(
                    instances=instances, 
                    use_dedicated_endpoint=use_dedicated_endpoint
                )
                break  # Success, exit retry loop
            except Exception as e:
                if "502" in str(e) and attempt < max_retries:
                    print(f"Request {request_id}: 502 error, retrying ({attempt + 1}/{max_retries})...")
                    time.sleep(1)  # Brief delay before retry
                    continue
                else:
                    raise e  # Re-raise if not 502 or out of retries
        
        request_end = time.time()
        
        # Parse response safely
        prediction = {}
        output_text = ""
        
        if hasattr(response, 'predictions') and response.predictions:
            prediction = response.predictions[0] if response.predictions else {}
            if isinstance(prediction, dict):
                output_text = prediction.get('generated_text', '') or prediction.get('content', '') or str(prediction)
            else:
                output_text = str(prediction)
        
        # Calculate metrics
        end_to_end_time = request_end - request_start
        
        # Estimate tokens
        input_tokens = len(prompt.split()) * 1.3
        output_tokens = len(output_text.split()) * 1.3 if output_text else 0
        
        # Estimate timing metrics
        estimated_ttft = min(0.5, end_to_end_time * 0.02) if end_to_end_time > 0 else 0
        estimated_inter_token = (end_to_end_time - estimated_ttft) / max(1, output_tokens) if output_tokens > 0 else 0
        
        # Store metrics
        with metrics_lock:
            metrics['ttft_times'].append(estimated_ttft)
            metrics['inter_token_latencies'].append(estimated_inter_token)
            metrics['end_to_end_times'].append(end_to_end_time)
            metrics['input_tokens'].append(input_tokens)
            metrics['output_tokens'].append(output_tokens)
            metrics['timestamps'].append(request_start)
        
        return {
            'request_id': request_id,
            'success': True,
            'timestamp': request_start,
            'end_to_end_time': end_to_end_time,
            'ttft': estimated_ttft,
            'inter_token_latency': estimated_inter_token,
            'input_tokens': input_tokens,
            'output_tokens': output_tokens,
            'output_length': len(output_text),
            'prompt_length': len(prompt),
            'output_text': output_text[:200] + "..." if len(output_text) > 200 else output_text,
            'error': None
        }
        
    except Exception as e:
        error_time = time.time() - start_time
        error_msg = str(e)
        
        with metrics_lock:
            metrics['request_errors'].append({
                'request_id': request_id,
                'error': error_msg,
                'time': error_time
            })
        
        print(f"Request {request_id} failed: {error_msg[:100]}...")
        
        return {
            'request_id': request_id,
            'success': False,
            'timestamp': start_time,
            'error': error_msg,
            'time': error_time,
            'end_to_end_time': error_time,
            'ttft': 0,
            'inter_token_latency': 0,
            'input_tokens': len(prompt.split()) * 1.3 if prompt else 0,
            'output_tokens': 0,
            'output_length': 0,
            'prompt_length': len(prompt) if prompt else 0,
            'output_text': ""
        }

# Test endpoint first with a single request
print("Testing endpoint with single request first...")
test_prompt = generate_test_prompt()

try:
    single_test = make_request(0, test_prompt, TEST_CONFIG)
    if single_test['success']:
        print("✅ Single request test successful!")
        print(f"Response time: {single_test['end_to_end_time']:.2f}s")
        print(f"Output length: {single_test['output_length']} chars")
    else:
        print("❌ Single request test failed!")
        print(f"Error: {single_test['error']}")
        print("\n🛑 Endpoint appears to have issues. Consider:")
        print("1. Check if the endpoint is properly deployed and running")
        print("2. Verify the endpoint has sufficient resources")
        print("3. Test with smaller requests first")
        print("4. Check Google Cloud Console for endpoint logs")
        
        # Still proceed but with warning
        input("\nPress Enter to continue with load test anyway, or Ctrl+C to abort...")
        
except Exception as e:
    print(f"❌ Critical error during single test: {e}")
    print("Aborting load test.")
    exit(1)

# Generate test prompts
print(f"Generating {TEST_CONFIG['total_requests']} test prompts...")
test_prompts = [generate_test_prompt(TEST_CONFIG['input_token_length']) 
                for _ in range(TEST_CONFIG['total_requests'])]

print(f"Generated {len(test_prompts)} test prompts")
print(f"Sample prompt length: {len(test_prompts[0].split())} words")

# Run load test
print(f"\nStarting load test:")
print(f"- Concurrent users: {TEST_CONFIG['concurrent_users']}")
print(f"- Total requests: {TEST_CONFIG['total_requests']}")
print(f"- Target output tokens: {TEST_CONFIG['output_tokens']}")

test_start_time = time.time()
results = []

with ThreadPoolExecutor(max_workers=TEST_CONFIG['concurrent_users']) as executor:
    future_to_id = {
        executor.submit(make_request, i, test_prompts[i % len(test_prompts)], TEST_CONFIG): i 
        for i in range(TEST_CONFIG['total_requests'])
    }
    
    completed = 0
    for future in as_completed(future_to_id):
        request_id = future_to_id[future]
        try:
            result = future.result()
            results.append(result)
        except Exception as e:
            results.append({
                'request_id': request_id,
                'success': False,
                'timestamp': time.time(),
                'error': str(e),
                'end_to_end_time': 0,
                'ttft': 0,
                'inter_token_latency': 0,
                'input_tokens': 0,
                'output_tokens': 0,
                'output_length': 0,
                'prompt_length': 0,
                'output_text': ""
            })
        
        completed += 1
        if completed % max(1, TEST_CONFIG['total_requests'] // 20) == 0:
            success_rate = len([r for r in results if r.get('success', False)]) / len(results) * 100
            print(f"Completed {completed}/{TEST_CONFIG['total_requests']} requests... Success rate: {success_rate:.1f}%")

test_end_time = time.time()
total_test_time = test_end_time - test_start_time

# Calculate performance metrics with safe variable handling
successful_requests = [r for r in results if r.get('success', False)]
failed_requests = [r for r in results if not r.get('success', False)]

print(f"\n{'='*60}")
print(f"LOAD TEST RESULTS")
print(f"{'='*60}")

print(f"\nTest Summary:")
print(f"- Total requests: {len(results)}")
print(f"- Successful requests: {len(successful_requests)}")
print(f"- Failed requests: {len(failed_requests)}")
print(f"- Success rate: {len(successful_requests)/len(results)*100:.1f}%")
print(f"- Total test time: {total_test_time:.1f} seconds")

# Initialize all variables to prevent NameError
ttft_times = []
inter_token_times = []
e2e_times = []
input_tokens = []
output_tokens = []
ttft_p50 = ttft_p95 = ttft_p99 = 0
inter_token_p50 = inter_token_p95 = 0
e2e_p50 = e2e_p95 = e2e_p99 = 0
token_output_throughput = overall_token_throughput = requests_per_second = 0
total_input_tokens = total_output_tokens = 0

# Calculate metrics only if we have successful requests
if successful_requests:
    ttft_times = [r['ttft'] for r in successful_requests]
    inter_token_times = [r['inter_token_latency'] for r in successful_requests]
    e2e_times = [r['end_to_end_time'] for r in successful_requests]
    input_tokens = [r['input_tokens'] for r in successful_requests]
    output_tokens = [r['output_tokens'] for r in successful_requests]
    
    def percentile(data, p):
        return np.percentile(data, p) if data else 0
    
    ttft_p50 = percentile(ttft_times, 50)
    ttft_p95 = percentile(ttft_times, 95)
    ttft_p99 = percentile(ttft_times, 99)
    inter_token_p50 = percentile(inter_token_times, 50)
    inter_token_p95 = percentile(inter_token_times, 95)
    e2e_p50 = percentile(e2e_times, 50)
    e2e_p95 = percentile(e2e_times, 95)
    e2e_p99 = percentile(e2e_times, 99)
    
    total_output_tokens = sum(output_tokens)
    total_input_tokens = sum(input_tokens)
    total_tokens = total_output_tokens + total_input_tokens
    
    token_output_throughput = total_output_tokens / total_test_time
    overall_token_throughput = total_tokens / total_test_time
    requests_per_second = len(successful_requests) / total_test_time
    
    print(f"\nLatency Metrics:")
    print(f"- TTFT (p50): {ttft_p50:.3f}s")
    print(f"- TTFT (p95): {ttft_p95:.3f}s")
    print(f"- Inter-token Latency (p95): {inter_token_p95:.3f}s")
    print(f"- End-to-End (p95): {e2e_p95:.1f}s")
    
    print(f"\nThroughput Metrics:")
    print(f"- Token Output Throughput: {token_output_throughput:.2f} tok/sec")
    print(f"- Overall Token Throughput: {overall_token_throughput:.2f} tok/sec")
    print(f"- Requests per second: {requests_per_second:.2f} req/sec")
    
    print(f"\nToken Statistics:")
    print(f"- Average input tokens: {statistics.mean(input_tokens):.1f}")
    print(f"- Average output tokens: {statistics.mean(output_tokens):.1f}")

else:
    print(f"\n❌ NO SUCCESSFUL REQUESTS - ENDPOINT ISSUES DETECTED")
    print(f"\n🔍 TROUBLESHOOTING RECOMMENDATIONS:")
    print(f"1. Check endpoint status in Google Cloud Console")
    print(f"2. Verify endpoint has sufficient resources allocated")
    print(f"3. Check for quota limits or rate limiting")
    print(f"4. Review endpoint logs for detailed error messages")
    print(f"5. Try reducing concurrent users and request size")

# Error analysis
if failed_requests:
    print(f"\n{'='*60}")
    print(f"ERROR ANALYSIS")
    print(f"{'='*60}")
    
    error_types = defaultdict(int)
    for req in failed_requests:
        error_msg = req.get('error', 'Unknown error')
        # Truncate long error messages
        error_key = error_msg[:100] + "..." if len(error_msg) > 100 else error_msg
        error_types[error_key] += 1
    
    for error, count in list(error_types.items())[:10]:  # Show top 10 errors
        print(f"- {error}: {count} occurrences")

# Save detailed results
print(f"\nSaving results...")
results_df = pd.DataFrame(results)
results_df.to_csv(detailed_csv_filename, index=False)

# Create summary with safe variable access
summary_data = {
    'timestamp': [timestamp],
    'test_duration_seconds': [total_test_time],
    'total_requests': [len(results)],
    'successful_requests': [len(successful_requests)],
    'failed_requests': [len(failed_requests)],
    'success_rate_percent': [len(successful_requests)/len(results)*100],
    'concurrent_users': [TEST_CONFIG['concurrent_users']],
    'ttft_p95_seconds': [ttft_p95],
    'inter_token_p95_seconds': [inter_token_p95],
    'e2e_p95_seconds': [e2e_p95],
    'token_output_throughput': [token_output_throughput],
    'overall_token_throughput': [overall_token_throughput],
    'requests_per_second': [requests_per_second],
    'avg_input_tokens': [statistics.mean(input_tokens) if input_tokens else 0],
    'avg_output_tokens': [statistics.mean(output_tokens) if output_tokens else 0],
}

summary_df = pd.DataFrame(summary_data)
summary_df.to_csv(csv_filename, index=False)

# Generate markdown report
md_content = f"""# vLLM Performance Test Report - {timestamp}

**Test Status:** {'✅ PARTIAL SUCCESS' if successful_requests else '❌ FAILED'}  
**Success Rate:** {len(successful_requests)/len(results)*100:.1f}%  
**Test Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Issues Detected

⚠️ **Endpoint returned 502 errors** - Backend service unavailable  
⚠️ **{len(failed_requests)} out of {len(results)} requests failed**

## Recommendations

1. **Check endpoint health** in Google Cloud Console
2. **Scale up resources** if endpoint is under-provisioned
3. **Implement retry logic** for production applications
4. **Monitor endpoint logs** for detailed error information
5. **Start with smaller load** to test stability

"""

if successful_requests:
    md_content += f"""
## Performance Results (Successful Requests Only)

| Metric | Value |
|--------|-------|
| TTFT (p95) | {ttft_p95:.3f}s |
| Inter-token (p95) | {inter_token_p95:.3f}s |
| End-to-End (p95) | {e2e_p95:.1f}s |
| Token Output Throughput | {token_output_throughput:.2f} tok/sec |
| Requests/sec | {requests_per_second:.2f} |
"""

md_content += f"""
## Error Summary

| Error Type | Count |
|------------|-------|
"""

error_types = defaultdict(int)
for req in failed_requests:
    error_msg = req.get('error', 'Unknown error')
    error_key = error_msg[:50] + "..." if len(error_msg) > 50 else error_msg
    error_types[error_key] += 1

for error, count in list(error_types.items())[:5]:
    md_content += f"| {error} | {count} |\n"

# Save markdown
with open(md_filename, 'w', encoding='utf-8') as f:
    f.write(md_content)

print(f"\n{'='*60}")
print(f"FILES SAVED")
print(f"{'='*60}")
print(f"📄 Summary: {csv_filename}")
print(f"📊 Details: {detailed_csv_filename}")
print(f"📝 Report: {md_filename}")

if len(successful_requests) == 0:
    print(f"\n🚨 CRITICAL: All requests failed. Check your endpoint!")
else:
    print(f"\n📊 Partial results saved. Success rate: {len(successful_requests)/len(results)*100:.1f}%")

Test started at: 2025-06-30 14:32:12
Output files will be saved as:
- Summary CSV: vllm_performance_tests/vllm_test_20250630_143212.csv
- Detailed CSV: vllm_performance_tests/vllm_detailed_20250630_143212.csv
- Report MD: vllm_performance_tests/vllm_report_20250630_143212.md
Testing endpoint with single request first...
✅ Single request test successful!
Response time: 6.11s
Output length: 2007 chars
Generating 50 test prompts...
Generated 50 test prompts
Sample prompt length: 63 words

Starting load test:
- Concurrent users: 250
- Total requests: 50
- Target output tokens: 317
Completed 2/50 requests... Success rate: 100.0%
Completed 4/50 requests... Success rate: 100.0%
Completed 6/50 requests... Success rate: 100.0%
Completed 8/50 requests... Success rate: 100.0%
Completed 10/50 requests... Success rate: 100.0%
Completed 12/50 requests... Success rate: 100.0%
Completed 14/50 requests... Success rate: 100.0%
Completed 16/50 requests... Success rate: 100.0%
Completed 18/50 requests... 

### Report with test

In [None]:
import time
import json
import numpy as np
import pandas as pd
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
import statistics
from datetime import datetime
import os

# Test configuration matching your target metrics
TEST_CONFIG = {
    'concurrent_users': 25,
    'total_requests': 10000,
    'input_token_length': 265,  # Target input length
    'output_tokens': 317,       # Target output length
    'temperature': 0.7,
    'top_p': 1.0,
    'max_tokens': 350,
    'stream': True
}

# Create timestamped filenames
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = "vllm_performance_tests"
os.makedirs(output_dir, exist_ok=True)

csv_filename = f"{output_dir}/vllm_test_{timestamp}.csv"
detailed_csv_filename = f"{output_dir}/vllm_detailed_{timestamp}.csv"
md_filename = f"{output_dir}/vllm_report_{timestamp}.md"

print(f"Test started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Output files will be saved as:")
print(f"- Summary CSV: {csv_filename}")
print(f"- Detailed CSV: {detailed_csv_filename}")
print(f"- Report MD: {md_filename}")

# Generate test prompts of approximately 265 tokens each
def generate_test_prompt(target_tokens=265):
    base_prompt = """Analyze the following complex scenario and provide a detailed response covering multiple aspects:

A multinational technology company is considering implementing a comprehensive artificial intelligence strategy across all departments. The company operates in 15 countries, has 50,000 employees, and generates $20 billion in annual revenue. The CEO wants to understand how AI can transform their business operations, improve customer experience, increase efficiency, and create new revenue streams.

Consider the following factors in your analysis:
1. Current market trends in AI adoption across different industries
2. Potential risks and challenges of large-scale AI implementation
3. Required infrastructure and technological investments
4. Impact on existing workforce and necessary reskilling programs
5. Regulatory compliance considerations in different jurisdictions
6. Timeline for phased implementation and expected ROI
7. Competitive advantages that could be gained
8. Data privacy and security implications
9. Integration challenges with legacy systems
10. Metrics for measuring success and continuous improvement

Please provide a comprehensive strategic recommendation that addresses each of these points with specific examples and actionable insights. Include potential pilot programs, budget considerations, and a roadmap for the next 3-5 years."""
    
    # Adjust length to approximately target tokens
    words = base_prompt.split()
    target_words = target_tokens * 0.75  # Rough conversion
    if len(words) > target_words:
        return ' '.join(words[:int(target_words)])
    else:
        # Extend if needed
        extension = " Additionally, consider the impact on stakeholder relationships, customer trust, brand reputation, and long-term sustainability. Analyze potential partnerships with AI vendors, academic institutions, and research organizations. Evaluate the company's current digital maturity and readiness for AI transformation." * 3
        return base_prompt + extension

# Metrics collection
metrics = {
    'ttft_times': [],           # Time to First Token
    'inter_token_latencies': [], # Time between tokens
    'end_to_end_times': [],     # Total request time
    'input_tokens': [],         # Actual input token counts
    'output_tokens': [],        # Actual output token counts
    'request_errors': [],       # Failed requests
    'timestamps': []            # Request timestamps
}

metrics_lock = threading.Lock()

def make_request(request_id, prompt, config):
    """Single request function with detailed timing"""
    start_time = time.time()
    
    try:
        # Prepare request
        instances = [{
            "prompt": prompt,
            "max_tokens": config['max_tokens'],
            "temperature": config['temperature'],
            "top_p": config.get('top_p', 1.0),
            "raw_response": True,
            "stream": config.get('stream', True)
        }]
        
        # Record request start
        request_start = time.time()
        
        # Make prediction
        response = endpoints["vllmtpu"].predict(
            instances=instances, 
            use_dedicated_endpoint=use_dedicated_endpoint
        )
        
        request_end = time.time()
        
        # Parse response
        prediction = response.predictions[0] if response.predictions else {}
        output_text = prediction.get('generated_text', '') or str(prediction)
        
        # Calculate metrics
        end_to_end_time = request_end - request_start
        
        # Estimate tokens (rough approximation)
        input_tokens = len(prompt.split()) * 1.3  # Rough token estimate
        output_tokens = len(output_text.split()) * 1.3
        
        # Simulate TTFT and inter-token timing (in real streaming, you'd capture these)
        estimated_ttft = min(0.5, end_to_end_time * 0.02)  # Estimate TTFT
        estimated_inter_token = (end_to_end_time - estimated_ttft) / max(1, output_tokens)
        
        # Store metrics
        with metrics_lock:
            metrics['ttft_times'].append(estimated_ttft)
            metrics['inter_token_latencies'].append(estimated_inter_token)
            metrics['end_to_end_times'].append(end_to_end_time)
            metrics['input_tokens'].append(input_tokens)
            metrics['output_tokens'].append(output_tokens)
            metrics['timestamps'].append(request_start)
        
        return {
            'request_id': request_id,
            'success': True,
            'timestamp': request_start,
            'end_to_end_time': end_to_end_time,
            'ttft': estimated_ttft,
            'inter_token_latency': estimated_inter_token,
            'input_tokens': input_tokens,
            'output_tokens': output_tokens,
            'output_length': len(output_text),
            'prompt_length': len(prompt),
            'output_text': output_text[:200] + "..." if len(output_text) > 200 else output_text  # Truncated for CSV
        }
        
    except Exception as e:
        error_time = time.time() - start_time
        with metrics_lock:
            metrics['request_errors'].append({
                'request_id': request_id,
                'error': str(e),
                'time': error_time
            })
        
        return {
            'request_id': request_id,
            'success': False,
            'timestamp': start_time,
            'error': str(e),
            'time': error_time,
            'end_to_end_time': error_time,
            'ttft': 0,
            'inter_token_latency': 0,
            'input_tokens': 0,
            'output_tokens': 0,
            'output_length': 0,
            'prompt_length': len(prompt),
            'output_text': ""
        }

# Generate test prompts
print("Generating test prompts...")
test_prompts = [generate_test_prompt(TEST_CONFIG['input_token_length']) 
                for _ in range(TEST_CONFIG['total_requests'])]

print(f"Generated {len(test_prompts)} test prompts")
print(f"Sample prompt length: {len(test_prompts[0].split())} words")
print(f"Sample prompt preview: {test_prompts[0][:200]}...")

# Run load test
print(f"\nStarting load test:")
print(f"- Concurrent users: {TEST_CONFIG['concurrent_users']}")
print(f"- Total requests: {TEST_CONFIG['total_requests']}")
print(f"- Target output tokens: {TEST_CONFIG['output_tokens']}")

# Execute test
test_start_time = time.time()
results = []

with ThreadPoolExecutor(max_workers=TEST_CONFIG['concurrent_users']) as executor:
    # Submit all requests
    future_to_id = {
        executor.submit(make_request, i, test_prompts[i % len(test_prompts)], TEST_CONFIG): i 
        for i in range(TEST_CONFIG['total_requests'])
    }
    
    # Collect results with progress tracking
    completed = 0
    for future in as_completed(future_to_id):
        request_id = future_to_id[future]
        try:
            result = future.result()
            results.append(result)
        except Exception as e:
            results.append({
                'request_id': request_id,
                'success': False,
                'timestamp': time.time(),
                'error': str(e),
                'end_to_end_time': 0,
                'ttft': 0,
                'inter_token_latency': 0,
                'input_tokens': 0,
                'output_tokens': 0,
                'output_length': 0,
                'prompt_length': 0,
                'output_text': ""
            })
        
        completed += 1
        if completed % 100 == 0:
            print(f"Completed {completed}/{TEST_CONFIG['total_requests']} requests...")

test_end_time = time.time()
total_test_time = test_end_time - test_start_time

# Calculate performance metrics
successful_requests = [r for r in results if r.get('success', False)]
failed_requests = [r for r in results if not r.get('success', False)]

print(f"\n{'='*60}")
print(f"LOAD TEST RESULTS")
print(f"{'='*60}")

print(f"\nTest Summary:")
print(f"- Total requests: {len(results)}")
print(f"- Successful requests: {len(successful_requests)}")
print(f"- Failed requests: {len(failed_requests)}")
print(f"- Success rate: {len(successful_requests)/len(results)*100:.1f}%")
print(f"- Total test time: {total_test_time:.1f} seconds")

# Calculate metrics
if successful_requests:
    ttft_times = [r['ttft'] for r in successful_requests]
    inter_token_times = [r['inter_token_latency'] for r in successful_requests]
    e2e_times = [r['end_to_end_time'] for r in successful_requests]
    input_tokens = [r['input_tokens'] for r in successful_requests]
    output_tokens = [r['output_tokens'] for r in successful_requests]
    
    def percentile(data, p):
        return np.percentile(data, p)
    
    # Latency metrics
    ttft_p50 = percentile(ttft_times, 50)
    ttft_p95 = percentile(ttft_times, 95)
    ttft_p99 = percentile(ttft_times, 99)
    inter_token_p50 = percentile(inter_token_times, 50)
    inter_token_p95 = percentile(inter_token_times, 95)
    e2e_p50 = percentile(e2e_times, 50)
    e2e_p95 = percentile(e2e_times, 95)
    e2e_p99 = percentile(e2e_times, 99)
    
    # Throughput calculations
    total_output_tokens = sum(output_tokens)
    total_input_tokens = sum(input_tokens)
    total_tokens = total_output_tokens + total_input_tokens
    
    token_output_throughput = total_output_tokens / total_test_time
    overall_token_throughput = total_tokens / total_test_time
    requests_per_second = len(successful_requests) / total_test_time
    
    print(f"\nLatency Metrics:")
    print(f"- TTFT (p50): {ttft_p50:.3f}s")
    print(f"- TTFT (p95): {ttft_p95:.3f}s")
    print(f"- TTFT (p99): {ttft_p99:.3f}s")
    print(f"- Inter-token Latency (p50): {inter_token_p50:.3f}s")
    print(f"- Inter-token Latency (p95): {inter_token_p95:.3f}s")
    print(f"- End-to-End (p50): {e2e_p50:.1f}s")
    print(f"- End-to-End (p95): {e2e_p95:.1f}s")
    print(f"- End-to-End (p99): {e2e_p99:.1f}s")
    
    print(f"\nThroughput Metrics:")
    print(f"- Token Output Throughput: {token_output_throughput:.2f} tok/sec")
    print(f"- Overall Token Throughput: {overall_token_throughput:.2f} tok/sec")
    print(f"- Requests per second: {requests_per_second:.2f} req/sec")
    
    print(f"\nToken Statistics:")
    print(f"- Average input tokens: {statistics.mean(input_tokens):.1f}")
    print(f"- Average output tokens: {statistics.mean(output_tokens):.1f}")
    print(f"- Total input tokens: {int(total_input_tokens)}")
    print(f"- Total output tokens: {int(total_output_tokens)}")

# Save detailed results to CSV
print(f"\nSaving detailed results to {detailed_csv_filename}...")
results_df = pd.DataFrame(results)
results_df.to_csv(detailed_csv_filename, index=False)

# Create summary metrics for CSV
summary_data = {
    'timestamp': [timestamp],
    'test_duration_seconds': [total_test_time],
    'total_requests': [len(results)],
    'successful_requests': [len(successful_requests)],
    'failed_requests': [len(failed_requests)],
    'success_rate_percent': [len(successful_requests)/len(results)*100],
    'concurrent_users': [TEST_CONFIG['concurrent_users']],
    'target_input_tokens': [TEST_CONFIG['input_token_length']],
    'target_output_tokens': [TEST_CONFIG['output_tokens']],
    'temperature': [TEST_CONFIG['temperature']],
    'max_tokens': [TEST_CONFIG['max_tokens']]
}

if successful_requests:
    summary_data.update({
        'ttft_p50_seconds': [ttft_p50],
        'ttft_p95_seconds': [ttft_p95],
        'ttft_p99_seconds': [ttft_p99],
        'inter_token_p50_seconds': [inter_token_p50],
        'inter_token_p95_seconds': [inter_token_p95],
        'e2e_p50_seconds': [e2e_p50],
        'e2e_p95_seconds': [e2e_p95],
        'e2e_p99_seconds': [e2e_p99],
        'token_output_throughput': [token_output_throughput],
        'overall_token_throughput': [overall_token_throughput],
        'requests_per_second': [requests_per_second],
        'avg_input_tokens': [statistics.mean(input_tokens)],
        'avg_output_tokens': [statistics.mean(output_tokens)],
        'total_input_tokens': [total_input_tokens],
        'total_output_tokens': [total_output_tokens]
    })
else:
    # Fill with zeros if no successful requests
    for key in ['ttft_p50_seconds', 'ttft_p95_seconds', 'ttft_p99_seconds', 
                'inter_token_p50_seconds', 'inter_token_p95_seconds',
                'e2e_p50_seconds', 'e2e_p95_seconds', 'e2e_p99_seconds',
                'token_output_throughput', 'overall_token_throughput', 
                'requests_per_second', 'avg_input_tokens', 'avg_output_tokens',
                'total_input_tokens', 'total_output_tokens']:
        summary_data[key] = [0]

# Save summary to CSV
print(f"Saving summary to {csv_filename}...")
summary_df = pd.DataFrame(summary_data)
summary_df.to_csv(csv_filename, index=False)

# Generate Markdown report
print(f"Generating Markdown report: {md_filename}...")

md_content = f"""# vLLM Performance Test Report

**Test Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}  
**Test Duration:** {total_test_time:.1f} seconds  
**Timestamp:** {timestamp}

## Test Configuration

| Parameter | Value |
|-----------|-------|
| Concurrent Users | {TEST_CONFIG['concurrent_users']} |
| Total Requests | {TEST_CONFIG['total_requests']} |
| Target Input Tokens | {TEST_CONFIG['input_token_length']} |
| Target Output Tokens | {TEST_CONFIG['output_tokens']} |
| Temperature | {TEST_CONFIG['temperature']} |
| Top P | {TEST_CONFIG['top_p']} |
| Max Tokens | {TEST_CONFIG['max_tokens']} |
| Stream | {TEST_CONFIG['stream']} |

## Test Results Summary

| Metric | Value |
|--------|-------|
| Total Requests | {len(results)} |
| Successful Requests | {len(successful_requests)} |
| Failed Requests | {len(failed_requests)} |
| Success Rate | {len(successful_requests)/len(results)*100:.1f}% |
| Test Duration | {total_test_time:.1f} seconds |

"""

if successful_requests:
    md_content += f"""
## Latency Metrics

| Metric | p50 | p95 | p99 |
|--------|-----|-----|-----|
| Time to First Token (TTFT) | {ttft_p50:.3f}s | {ttft_p95:.3f}s | {ttft_p99:.3f}s |
| Inter-token Latency | {inter_token_p50:.3f}s | {inter_token_p95:.3f}s | - |
| End-to-End Latency | {e2e_p50:.1f}s | {e2e_p95:.1f}s | {e2e_p99:.1f}s |

## Throughput Metrics

| Metric | Value |
|--------|-------|
| Token Output Throughput | {token_output_throughput:.2f} tok/sec |
| Overall Token Throughput | {overall_token_throughput:.2f} tok/sec |
| Requests per Second | {requests_per_second:.2f} req/sec |

## Token Statistics

| Metric | Value |
|--------|-------|
| Average Input Tokens | {statistics.mean(input_tokens):.1f} |
| Average Output Tokens | {statistics.mean(output_tokens):.1f} |
| Total Input Tokens | {int(total_input_tokens):,} |
| Total Output Tokens | {int(total_output_tokens):,} |

## Comparison with Target Metrics

| Metric | Target | Actual | Difference |
|--------|--------|--------|------------|
| TTFT (p95) | 0.9s | {ttft_p95:.3f}s | {((ttft_p95 - 0.9) / 0.9 * 100):+.1f}% |
| Inter-token Latency (p95) | 0.17s | {inter_token_p95:.3f}s | {((inter_token_p95 - 0.17) / 0.17 * 100):+.1f}% |
| End-to-End (p95) | 44.1s | {e2e_p95:.1f}s | {((e2e_p95 - 44.1) / 44.1 * 100):+.1f}% |
| Token Output Throughput | 10.05 tok/sec | {token_output_throughput:.2f} tok/sec | {((token_output_throughput - 10.05) / 10.05 * 100):+.1f}% |
| Overall Token Throughput | 1529 tok/sec | {overall_token_throughput:.2f} tok/sec | {((overall_token_throughput - 1529) / 1529 * 100):+.1f}% |
| Input Token Length | 265 | {statistics.mean(input_tokens):.1f} | {((statistics.mean(input_tokens) - 265) / 265 * 100):+.1f}% |
| Output Tokens | 317 | {statistics.mean(output_tokens):.1f} | {((statistics.mean(output_tokens) - 317) / 317 * 100):+.1f}% |

"""

# Error analysis
if failed_requests:
    md_content += f"""
## Error Analysis

**Total Failed Requests:** {len(failed_requests)}

"""
    error_types = defaultdict(int)
    for req in failed_requests:
        error_msg = req.get('error', 'Unknown error')
        error_types[error_msg] += 1
    
    md_content += "| Error Type | Count |\n|------------|-------|\n"
    for error, count in error_types.items():
        md_content += f"| {error} | {count} |\n"

md_content += f"""

## Performance Analysis

"""

if successful_requests:
    # Performance analysis
    if ttft_p95 <= 0.9:
        md_content += "✅ **TTFT Performance:** Meeting target (≤ 0.9s)\n\n"
    else:
        md_content += "❌ **TTFT Performance:** Above target (> 0.9s)\n\n"
    
    if inter_token_p95 <= 0.17:
        md_content += "✅ **Inter-token Latency:** Meeting target (≤ 0.17s)\n\n"
    else:
        md_content += "❌ **Inter-token Latency:** Above target (> 0.17s)\n\n"
    
    if token_output_throughput >= 10.05:
        md_content += "✅ **Token Output Throughput:** Meeting target (≥ 10.05 tok/sec)\n\n"
    else:
        md_content += "❌ **Token Output Throughput:** Below target (< 10.05 tok/sec)\n\n"
    
    if overall_token_throughput >= 1529:
        md_content += "✅ **Overall Token Throughput:** Meeting target (≥ 1529 tok/sec)\n\n"
    else:
        md_content += "❌ **Overall Token Throughput:** Below target (< 1529 tok/sec)\n\n"

md_content += f"""
## Files Generated

- **Summary CSV:** `{csv_filename}`
- **Detailed CSV:** `{detailed_csv_filename}`
- **This Report:** `{md_filename}`

## Test Environment

- **vLLM Version:** 0.6.6.post1 (target)
- **Max Sequences:** 512 (target)
- **KV Cache Dtype:** fp8_e5m2 (target)
- **Tensor Parallel Size:** 4 (target)
- **Tool Call Parser:** llama3_json (target)

---
*Report generated automatically by vLLM performance testing script*
"""

# Save markdown report
with open(md_filename, 'w', encoding='utf-8') as f:
    f.write(md_content)

print(f"\n{'='*60}")
print(f"FILES SAVED SUCCESSFULLY")
print(f"{'='*60}")
print(f"📄 Summary CSV: {csv_filename}")
print(f"📊 Detailed CSV: {detailed_csv_filename}")
print(f"📝 Markdown Report: {md_filename}")
print(f"\nTest completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Display quick summary
if successful_requests:
    print(f"\n🎯 QUICK PERFORMANCE SUMMARY:")
    print(f"   TTFT (p95): {ttft_p95:.3f}s (target: 0.9s)")
    print(f"   Inter-token (p95): {inter_token_p95:.3f}s (target: 0.17s)")
    print(f"   Throughput: {token_output_throughput:.1f} tok/sec (target: 10.05)")
    print(f"   Success Rate: {len(successful_requests)/len(results)*100:.1f}%")
else:
    print(f"\n❌ TEST FAILED: No successful requests completed")

Test started at: 2025-06-30 12:56:19
Output files will be saved as:
- Summary CSV: vllm_performance_tests/vllm_test_20250630_125619.csv
- Detailed CSV: vllm_performance_tests/vllm_detailed_20250630_125619.csv
- Report MD: vllm_performance_tests/vllm_report_20250630_125619.md
Generating test prompts...
Generated 10000 test prompts
Sample prompt length: 289 words
Sample prompt preview: Analyze the following complex scenario and provide a detailed response covering multiple aspects:

A multinational technology company is considering implementing a comprehensive artificial intelligenc...

Starting load test:
- Concurrent users: 25
- Total requests: 10000
- Target output tokens: 317
Completed 100/10000 requests...
Completed 200/10000 requests...
Completed 300/10000 requests...
Completed 400/10000 requests...
Completed 500/10000 requests...
Completed 600/10000 requests...
Completed 700/10000 requests...
Completed 800/10000 requests...
Completed 900/10000 requests...
Completed 1000/10000 requ

### Updated test

In [None]:
import time
import json
import numpy as np
import pandas as pd
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
import statistics
from datetime import datetime
import os

# Test configuration - REDUCED for testing
TEST_CONFIG = {
    'concurrent_users': 5,      # Start small to test endpoint stability
    'total_requests': 50,       # Reduce for initial testing
    'input_token_length': 265,  
    'output_tokens': 317,       
    'temperature': 0.7,
    'top_p': 1.0,
    'max_tokens': 350,
    'stream': True
}

# Create timestamped filenames
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = "vllm_performance_tests"
os.makedirs(output_dir, exist_ok=True)

csv_filename = f"{output_dir}/vllm_test_{timestamp}.csv"
detailed_csv_filename = f"{output_dir}/vllm_detailed_{timestamp}.csv"
md_filename = f"{output_dir}/vllm_report_{timestamp}.md"

print(f"Test started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Output files will be saved as:")
print(f"- Summary CSV: {csv_filename}")
print(f"- Detailed CSV: {detailed_csv_filename}")
print(f"- Report MD: {md_filename}")

# Generate test prompts
def generate_test_prompt(target_tokens=265):
    base_prompt = """Analyze the following business scenario and provide recommendations:

A technology startup is developing an AI-powered customer service platform. They need to understand market positioning, competitive analysis, implementation strategy, and growth projections. Consider technical requirements, user experience design, scalability concerns, and business model validation.

Please provide strategic insights covering market analysis, technical architecture, user acquisition strategies, and financial projections for the next 24 months."""
    
    return base_prompt

# Metrics collection
metrics = {
    'ttft_times': [],
    'inter_token_latencies': [],
    'end_to_end_times': [],
    'input_tokens': [],
    'output_tokens': [],
    'request_errors': [],
    'timestamps': []
}

metrics_lock = threading.Lock()

def make_request(request_id, prompt, config):
    """Single request function with enhanced error handling"""
    start_time = time.time()
    
    try:
        # Prepare request with timeout handling
        instances = [{
            "prompt": prompt,
            "max_tokens": config['max_tokens'],
            "temperature": config['temperature'],
            "top_p": config.get('top_p', 1.0),
            "raw_response": True,
        }]
        
        request_start = time.time()
        
        # Add retry logic for 502 errors
        max_retries = 2
        for attempt in range(max_retries + 1):
            try:
                response = endpoints["vllmtpu"].predict(
                    instances=instances, 
                    use_dedicated_endpoint=use_dedicated_endpoint
                )
                break  # Success, exit retry loop
            except Exception as e:
                if "502" in str(e) and attempt < max_retries:
                    print(f"Request {request_id}: 502 error, retrying ({attempt + 1}/{max_retries})...")
                    time.sleep(1)  # Brief delay before retry
                    continue
                else:
                    raise e  # Re-raise if not 502 or out of retries
        
        request_end = time.time()
        
        # Parse response safely
        prediction = {}
        output_text = ""
        
        if hasattr(response, 'predictions') and response.predictions:
            prediction = response.predictions[0] if response.predictions else {}
            if isinstance(prediction, dict):
                output_text = prediction.get('generated_text', '') or prediction.get('content', '') or str(prediction)
            else:
                output_text = str(prediction)
        
        # Calculate metrics
        end_to_end_time = request_end - request_start
        
        # Estimate tokens
        input_tokens = len(prompt.split()) * 1.3
        output_tokens = len(output_text.split()) * 1.3 if output_text else 0
        
        # Estimate timing metrics
        estimated_ttft = min(0.5, end_to_end_time * 0.02) if end_to_end_time > 0 else 0
        estimated_inter_token = (end_to_end_time - estimated_ttft) / max(1, output_tokens) if output_tokens > 0 else 0
        
        # Store metrics
        with metrics_lock:
            metrics['ttft_times'].append(estimated_ttft)
            metrics['inter_token_latencies'].append(estimated_inter_token)
            metrics['end_to_end_times'].append(end_to_end_time)
            metrics['input_tokens'].append(input_tokens)
            metrics['output_tokens'].append(output_tokens)
            metrics['timestamps'].append(request_start)
        
        return {
            'request_id': request_id,
            'success': True,
            'timestamp': request_start,
            'end_to_end_time': end_to_end_time,
            'ttft': estimated_ttft,
            'inter_token_latency': estimated_inter_token,
            'input_tokens': input_tokens,
            'output_tokens': output_tokens,
            'output_length': len(output_text),
            'prompt_length': len(prompt),
            'output_text': output_text[:200] + "..." if len(output_text) > 200 else output_text,
            'error': None
        }
        
    except Exception as e:
        error_time = time.time() - start_time
        error_msg = str(e)
        
        with metrics_lock:
            metrics['request_errors'].append({
                'request_id': request_id,
                'error': error_msg,
                'time': error_time
            })
        
        print(f"Request {request_id} failed: {error_msg[:100]}...")
        
        return {
            'request_id': request_id,
            'success': False,
            'timestamp': start_time,
            'error': error_msg,
            'time': error_time,
            'end_to_end_time': error_time,
            'ttft': 0,
            'inter_token_latency': 0,
            'input_tokens': len(prompt.split()) * 1.3 if prompt else 0,
            'output_tokens': 0,
            'output_length': 0,
            'prompt_length': len(prompt) if prompt else 0,
            'output_text': ""
        }

# Test endpoint first with a single request
print("Testing endpoint with single request first...")
test_prompt = generate_test_prompt()

try:
    single_test = make_request(0, test_prompt, TEST_CONFIG)
    if single_test['success']:
        print("✅ Single request test successful!")
        print(f"Response time: {single_test['end_to_end_time']:.2f}s")
        print(f"Output length: {single_test['output_length']} chars")
    else:
        print("❌ Single request test failed!")
        print(f"Error: {single_test['error']}")
        print("\n🛑 Endpoint appears to have issues. Consider:")
        print("1. Check if the endpoint is properly deployed and running")
        print("2. Verify the endpoint has sufficient resources")
        print("3. Test with smaller requests first")
        print("4. Check Google Cloud Console for endpoint logs")
        
        # Still proceed but with warning
        input("\nPress Enter to continue with load test anyway, or Ctrl+C to abort...")
        
except Exception as e:
    print(f"❌ Critical error during single test: {e}")
    print("Aborting load test.")
    exit(1)

# Generate test prompts
print(f"Generating {TEST_CONFIG['total_requests']} test prompts...")
test_prompts = [generate_test_prompt(TEST_CONFIG['input_token_length']) 
                for _ in range(TEST_CONFIG['total_requests'])]

print(f"Generated {len(test_prompts)} test prompts")
print(f"Sample prompt length: {len(test_prompts[0].split())} words")

# Run load test
print(f"\nStarting load test:")
print(f"- Concurrent users: {TEST_CONFIG['concurrent_users']}")
print(f"- Total requests: {TEST_CONFIG['total_requests']}")
print(f"- Target output tokens: {TEST_CONFIG['output_tokens']}")

test_start_time = time.time()
results = []

with ThreadPoolExecutor(max_workers=TEST_CONFIG['concurrent_users']) as executor:
    future_to_id = {
        executor.submit(make_request, i, test_prompts[i % len(test_prompts)], TEST_CONFIG): i 
        for i in range(TEST_CONFIG['total_requests'])
    }
    
    completed = 0
    for future in as_completed(future_to_id):
        request_id = future_to_id[future]
        try:
            result = future.result()
            results.append(result)
        except Exception as e:
            results.append({
                'request_id': request_id,
                'success': False,
                'timestamp': time.time(),
                'error': str(e),
                'end_to_end_time': 0,
                'ttft': 0,
                'inter_token_latency': 0,
                'input_tokens': 0,
                'output_tokens': 0,
                'output_length': 0,
                'prompt_length': 0,
                'output_text': ""
            })
        
        completed += 1
        if completed % max(1, TEST_CONFIG['total_requests'] // 20) == 0:
            success_rate = len([r for r in results if r.get('success', False)]) / len(results) * 100
            print(f"Completed {completed}/{TEST_CONFIG['total_requests']} requests... Success rate: {success_rate:.1f}%")

test_end_time = time.time()
total_test_time = test_end_time - test_start_time

# Calculate performance metrics with safe variable handling
successful_requests = [r for r in results if r.get('success', False)]
failed_requests = [r for r in results if not r.get('success', False)]

print(f"\n{'='*60}")
print(f"LOAD TEST RESULTS")
print(f"{'='*60}")

print(f"\nTest Summary:")
print(f"- Total requests: {len(results)}")
print(f"- Successful requests: {len(successful_requests)}")
print(f"- Failed requests: {len(failed_requests)}")
print(f"- Success rate: {len(successful_requests)/len(results)*100:.1f}%")
print(f"- Total test time: {total_test_time:.1f} seconds")

# Initialize all variables to prevent NameError
ttft_times = []
inter_token_times = []
e2e_times = []
input_tokens = []
output_tokens = []
ttft_p50 = ttft_p95 = ttft_p99 = 0
inter_token_p50 = inter_token_p95 = 0
e2e_p50 = e2e_p95 = e2e_p99 = 0
token_output_throughput = overall_token_throughput = requests_per_second = 0
total_input_tokens = total_output_tokens = 0

# Calculate metrics only if we have successful requests
if successful_requests:
    ttft_times = [r['ttft'] for r in successful_requests]
    inter_token_times = [r['inter_token_latency'] for r in successful_requests]
    e2e_times = [r['end_to_end_time'] for r in successful_requests]
    input_tokens = [r['input_tokens'] for r in successful_requests]
    output_tokens = [r['output_tokens'] for r in successful_requests]
    
    def percentile(data, p):
        return np.percentile(data, p) if data else 0
    
    ttft_p50 = percentile(ttft_times, 50)
    ttft_p95 = percentile(ttft_times, 95)
    ttft_p99 = percentile(ttft_times, 99)
    inter_token_p50 = percentile(inter_token_times, 50)
    inter_token_p95 = percentile(inter_token_times, 95)
    e2e_p50 = percentile(e2e_times, 50)
    e2e_p95 = percentile(e2e_times, 95)
    e2e_p99 = percentile(e2e_times, 99)
    
    total_output_tokens = sum(output_tokens)
    total_input_tokens = sum(input_tokens)
    total_tokens = total_output_tokens + total_input_tokens
    
    token_output_throughput = total_output_tokens / total_test_time
    overall_token_throughput = total_tokens / total_test_time
    requests_per_second = len(successful_requests) / total_test_time
    
    print(f"\nLatency Metrics:")
    print(f"- TTFT (p50): {ttft_p50:.3f}s")
    print(f"- TTFT (p95): {ttft_p95:.3f}s")
    print(f"- Inter-token Latency (p95): {inter_token_p95:.3f}s")
    print(f"- End-to-End (p95): {e2e_p95:.1f}s")
    
    print(f"\nThroughput Metrics:")
    print(f"- Token Output Throughput: {token_output_throughput:.2f} tok/sec")
    print(f"- Overall Token Throughput: {overall_token_throughput:.2f} tok/sec")
    print(f"- Requests per second: {requests_per_second:.2f} req/sec")
    
    print(f"\nToken Statistics:")
    print(f"- Average input tokens: {statistics.mean(input_tokens):.1f}")
    print(f"- Average output tokens: {statistics.mean(output_tokens):.1f}")

else:
    print(f"\n❌ NO SUCCESSFUL REQUESTS - ENDPOINT ISSUES DETECTED")
    print(f"\n🔍 TROUBLESHOOTING RECOMMENDATIONS:")
    print(f"1. Check endpoint status in Google Cloud Console")
    print(f"2. Verify endpoint has sufficient resources allocated")
    print(f"3. Check for quota limits or rate limiting")
    print(f"4. Review endpoint logs for detailed error messages")
    print(f"5. Try reducing concurrent users and request size")

# Error analysis
if failed_requests:
    print(f"\n{'='*60}")
    print(f"ERROR ANALYSIS")
    print(f"{'='*60}")
    
    error_types = defaultdict(int)
    for req in failed_requests:
        error_msg = req.get('error', 'Unknown error')
        # Truncate long error messages
        error_key = error_msg[:100] + "..." if len(error_msg) > 100 else error_msg
        error_types[error_key] += 1
    
    for error, count in list(error_types.items())[:10]:  # Show top 10 errors
        print(f"- {error}: {count} occurrences")

# Save detailed results
print(f"\nSaving results...")
results_df = pd.DataFrame(results)
results_df.to_csv(detailed_csv_filename, index=False)

# Create summary with safe variable access
summary_data = {
    'timestamp': [timestamp],
    'test_duration_seconds': [total_test_time],
    'total_requests': [len(results)],
    'successful_requests': [len(successful_requests)],
    'failed_requests': [len(failed_requests)],
    'success_rate_percent': [len(successful_requests)/len(results)*100],
    'concurrent_users': [TEST_CONFIG['concurrent_users']],
    'ttft_p95_seconds': [ttft_p95],
    'inter_token_p95_seconds': [inter_token_p95],
    'e2e_p95_seconds': [e2e_p95],
    'token_output_throughput': [token_output_throughput],
    'overall_token_throughput': [overall_token_throughput],
    'requests_per_second': [requests_per_second],
    'avg_input_tokens': [statistics.mean(input_tokens) if input_tokens else 0],
    'avg_output_tokens': [statistics.mean(output_tokens) if output_tokens else 0],
}

summary_df = pd.DataFrame(summary_data)
summary_df.to_csv(csv_filename, index=False)

# Generate markdown report
md_content = f"""# vLLM Performance Test Report - {timestamp}

**Test Status:** {'✅ PARTIAL SUCCESS' if successful_requests else '❌ FAILED'}  
**Success Rate:** {len(successful_requests)/len(results)*100:.1f}%  
**Test Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Issues Detected

⚠️ **Endpoint returned 502 errors** - Backend service unavailable  
⚠️ **{len(failed_requests)} out of {len(results)} requests failed**

## Recommendations

1. **Check endpoint health** in Google Cloud Console
2. **Scale up resources** if endpoint is under-provisioned
3. **Implement retry logic** for production applications
4. **Monitor endpoint logs** for detailed error information
5. **Start with smaller load** to test stability

"""

if successful_requests:
    md_content += f"""
## Performance Results (Successful Requests Only)

| Metric | Value |
|--------|-------|
| TTFT (p95) | {ttft_p95:.3f}s |
| Inter-token (p95) | {inter_token_p95:.3f}s |
| End-to-End (p95) | {e2e_p95:.1f}s |
| Token Output Throughput | {token_output_throughput:.2f} tok/sec |
| Requests/sec | {requests_per_second:.2f} |
"""

md_content += f"""
## Error Summary

| Error Type | Count |
|------------|-------|
"""

error_types = defaultdict(int)
for req in failed_requests:
    error_msg = req.get('error', 'Unknown error')
    error_key = error_msg[:50] + "..." if len(error_msg) > 50 else error_msg
    error_types[error_key] += 1

for error, count in list(error_types.items())[:5]:
    md_content += f"| {error} | {count} |\n"

# Save markdown
with open(md_filename, 'w', encoding='utf-8') as f:
    f.write(md_content)

print(f"\n{'='*60}")
print(f"FILES SAVED")
print(f"{'='*60}")
print(f"📄 Summary: {csv_filename}")
print(f"📊 Details: {detailed_csv_filename}")
print(f"📝 Report: {md_filename}")

if len(successful_requests) == 0:
    print(f"\n🚨 CRITICAL: All requests failed. Check your endpoint!")
else:
    print(f"\n📊 Partial results saved. Success rate: {len(successful_requests)/len(results)*100:.1f}%")

Test started at: 2025-06-30 14:00:31
Output files will be saved as:
- Summary CSV: vllm_performance_tests/vllm_test_20250630_140031.csv
- Detailed CSV: vllm_performance_tests/vllm_detailed_20250630_140031.csv
- Report MD: vllm_performance_tests/vllm_report_20250630_140031.md
Testing endpoint with single request first...
✅ Single request test successful!
Response time: 6.05s
Output length: 1897 chars
Generating 50 test prompts...
Generated 50 test prompts
Sample prompt length: 63 words

Starting load test:
- Concurrent users: 5
- Total requests: 50
- Target output tokens: 317
Completed 2/50 requests... Success rate: 100.0%
Completed 4/50 requests... Success rate: 100.0%
Completed 6/50 requests... Success rate: 100.0%
Completed 8/50 requests... Success rate: 100.0%
Completed 10/50 requests... Success rate: 100.0%
Completed 12/50 requests... Success rate: 100.0%
Completed 14/50 requests... Success rate: 100.0%
Completed 16/50 requests... Success rate: 100.0%
Completed 18/50 requests... Su

## Clean up resources


In [None]:
# # @title Delete the models and endpoints
# # @markdown  Delete the experiment models and endpoints to recycle the resources
# # @markdown  and avoid unnecessary continuous charges that may incur.

# # Undeploy model and delete endpoint.
# for endpoint in endpoints.values():
#     endpoint.delete(force=True)

# # Delete models.
# for model in models.values():
#     model.delete()

# delete_bucket = False  # @param {type:"boolean"}
# if delete_bucket:
#     ! gsutil -m rm -r $BUCKET_NAME