### Begining of NB

In [1]:
# @title Request for quota

# @markdown For serving Llama 3.1 8B and Qwen3 32B models, we need 1 and 4 TPU v6es, respectively.

# @markdown > | Model | Accelerator Type |
# @markdown | ----------- | ----------- |
# @markdown | Llama 3.1 8B |1 TPU v6e (ct6e-standard-1t)|
# @markdown | Qwen3 32B|4 TPU v6e (ct6e-standard-4t)|

### Env setup bucket name (in same region)

In [2]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. **[Optional]** [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. Set the BUCKET_URI for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the same region as where the notebook was launched. Note that a multi-region bucket (eg. "us") is not considered a match for a single region covered by the multi-region range (eg. "us-central1"). If not set, a unique GCS bucket will be created instead.

# BUCKET_URI = "gs://"  # @param {type:"string"}

# @markdown 3. **[Optional]** Set region. If not set, the region will be set automatically according to Colab Enterprise environment.

# REGION = ""  # @param {type:"string"}

# Upgrade Vertex AI SDK.
! pip3 install --upgrade --quiet 'google-cloud-aiplatform>=1.64.0'

# Import the necessary packages
import datetime
import importlib
import os
import uuid
from typing import Tuple

from google.cloud import aiplatform

! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

models, endpoints = {}, {}

common_util = importlib.import_module(
    "vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)

# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

PROJECT_IDS = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_IDS[0]  # @param {type:"string"}

if not PROJECT_ID:
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

LOCATION = "europe-west4" #"us-south1" #"us-central1" # @param {type:"string"}

os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID
os.environ["GOOGLE_CLOUD_LOCATION"] = LOCATION
os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "TRUE" # Use Vertex AI API

BUCKET_URI = "gs://llama31_training-europe"  # @param {type:"string"}

# @markdown 3. **[Optional]** Set region. If not set, the region will be set automatically according to Colab Enterprise environment.

REGION = LOCATION # "us-south1"  # @param {type:"string"}

# Get the default region for launching jobs.
if not REGION:
    if not os.environ.get("GOOGLE_CLOUD_REGION"):
        raise ValueError(
            "REGION must be set. See"
            " https://cloud.google.com/vertex-ai/docs/general/locations for"
            " available cloud locations."
        )
    REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Enable the Vertex AI API and Compute Engine API, if not already.
print("Enabling Vertex AI API and Compute Engine API.")
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

# Cloud Storage bucket for storing the experiment artifacts.
# A unique GCS bucket will be created for the purpose of this notebook. If you
# prefer using your own GCS bucket, change the value yourself below.
now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])

if BUCKET_URI is None or BUCKET_URI.strip() == "" or BUCKET_URI == "gs://":
    BUCKET_URI = f"gs://{PROJECT_ID}-tmp-{now}-{str(uuid.uuid4())[:4]}"
    BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
    ! gsutil mb -l {REGION} {BUCKET_URI}
else:
    assert BUCKET_URI.startswith("gs://"), "BUCKET_URI must start with `gs://`."
    shell_output = ! gsutil ls -Lb {BUCKET_NAME} | grep "Location constraint:" | sed "s/Location constraint://"
    bucket_region = shell_output[0].strip().lower()
    if bucket_region != REGION:
        raise ValueError(
            "Bucket region %s is different from notebook region %s"
            % (bucket_region, REGION)
        )
print(f"Using this GCS Bucket: {BUCKET_URI}")

STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
MODEL_BUCKET = os.path.join(BUCKET_URI, "vllm_tpu")


# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

# Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)


# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket
# ! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME

# ! gcloud config set project $PROJECT_ID
# ! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/storage.admin"
# ! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/aiplatform.user"

fatal: destination path 'vertex-ai-samples' already exists and is not an empty directory.


2025-07-07 16:13:54.492753: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-07 16:13:54.493697: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-07 16:13:54.496628: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-07 16:13:54.505135: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751904834.519846   11159 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751904834.52

Enabling Vertex AI API and Compute Engine API.
Operation "operations/acat.p2-87995179092-7b820def-6119-42f1-a411-8844efc21777" finished successfully.
Using this GCS Bucket: gs://llama31_training-europe
Initializing Vertex AI API.
Using this default Service Account: 87995179092-compute@developer.gserviceaccount.com


### HF token

In [3]:
# @title Access the models
# @markdown ### Access Llama 3.1 and Qwen3 models on Vertex AI for serving
# @markdown The models from the Hugging Face can be used for serving in Vertex AI.
# @markdown 1. Open the [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) and [Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) models from [Hugging Face](https://huggingface.co/).
# @markdown 2. Review and accept the agreement.
# @markdown 3. After accepting the agreement, models will be available for serving.
# @markdown 4. You must provide a Hugging Face User Access Token (with read access) to access the Llama 3.1 model. You can follow the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens) to create a **read** access token and put it in the `HF_TOKEN` field below.

import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")

if not HF_TOKEN:
    print("Error: HF_TOKEN not found in .env file or not provided.")
    print("Please provide a read HF_TOKEN to Llama 3.1 model from Hugging Face in your .env file.")
else:
    print("HF_TOKEN loaded successfully.")
    # You can now use HF_TOKEN in your code, e.g., to authenticate with Hugging Face models

HF_TOKEN loaded successfully.


### Model Selection

In [4]:
# @title Prepare

# @markdown In this section you can choose a desired model and the region for TPU deployment.
# @markdown Learn about [TPU v6e machine types](https://cloud.google.com/tpu/docs/v6e#configurations) for Vertex AI prediction.

# @markdown Here are 2 example models you can run:
            
MODEL_ID = "Llama-3.3-70B-Instruct" #'Llama-3.3-70B-Instruct' #"Llama-3.1-8B-Instruct"  # @param ["Llama-3.1-8B-Instruct", "Qwen3-32B"] {isTemplate: true}

TPU_DEPLOYMENT_REGION = "europe-west4"  # @param {type:"string"}

tpu_type = "TPU_V6e"


if "Llama-3.3" in MODEL_ID:
    model_path_prefix = "meta-llama/"
    model_id = os.path.join(model_path_prefix, MODEL_ID)
    model_publisher = "meta"
    model_publisher_id = "llama33"
    machine_type = "ct6e-standard-8t"
    tpu_count = 8
    tpu_topo = None #"2x4"
    print(MODEL_ID, "will run on", tpu_count, "tpu")
elif "Llama-3" in MODEL_ID:
    model_path_prefix = "meta-llama/"
    model_id = os.path.join(model_path_prefix, MODEL_ID)
    model_publisher = "meta"
    model_publisher_id = "llama3"
    machine_type = "ct6e-standard-1t"
    tpu_count = 1
    tpu_topo = "1x1"
    print(MODEL_ID, "will run on", tpu_count, "tpu")    
elif "Qwen3" in MODEL_ID:
    model_path_prefix = "Qwen/"
    model_id = os.path.join(model_path_prefix, MODEL_ID)
    model_publisher = "qwen"
    model_publisher_id = "qwen3"
    machine_type = "ct6e-standard-4t"
    tpu_count = 4
    tpu_topo = "2x2"
    print(MODEL_ID, "will run on", tpu_count, "tpus")
else:
    raise ValueError(f"Unsupported MODEL_ID: {MODEL_ID}")


vLLM_TPU_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20250529_0917_tpu_experimental_RC00"

## 8.5 -- latest on TPU / 0.8 version

# @markdown Set `use_dedicated_endpoint` to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint).
use_dedicated_endpoint = True  # @param {type:"boolean"}


# common_util.check_quota(
#     project_id=PROJECT_ID,
#     region=TPU_DEPLOYMENT_REGION,
#     accelerator_type=tpu_type,
#     accelerator_count=tpu_count,
#     is_for_training=False,
# )


# Server parameters.
tensor_parallel_size = tpu_count

# Fraction of HBM memory allocated for KV cache after model loading. A larger value improves throughput but gives higher risk of TPU out-of-memory errors with long prompts.

# Maximum number of running sequences in a continuous batch.
max_running_seqs = 256  # @param
# Maximum context length for a request.
max_model_len = 2048  # @param

# Endpoint configurations.
min_replica_count = 1
max_replica_count = 1

run_name = "llama33m"  # @param {type:"string"}

# @markdown Note: The vLLM-TPU container used in this notebook is in experimental status.

Llama-3.3-70B-Instruct will run on 8 tpu


In [5]:
# common_util.check_quota(
#     project_id=PROJECT_ID,
#     region=TPU_DEPLOYMENT_REGION,
#     accelerator_type=tpu_type,
#     accelerator_count=tpu_count,
#     is_for_training=False,
# )

## Deploy prebuilt Llama 3.1 8B or Qwen3 32B models with vLLM on TPUs
This section will download the prebuilt model chosen in the previous section and deploys it to a Vertex AI Endpoint. It takes 15 minutes to 1 hour to finish depending on the size of the model.

In [6]:
tpu_topology = '1x8'
int(tpu_topology.split("x")[0])

1

In [7]:
# @title Deploy
def deploy_model_vllm_tpu(
    model_name: str,
    model_id: str,
    publisher: str,
    publisher_model_id: str,
    service_account: str,
    base_model_id: str = None,
    tensor_parallel_size: int = 1,
    machine_type: str = "ct6e-standard-1t",
    tpu_topology: str = "1x1",
    max_model_len: int = 4096,
    enable_chunked_prefill: bool = False,
    enable_prefix_cache: bool = False,
    endpoint_id: str = "",
    min_replica_count: int = 1,
    max_replica_count: int = 1,
    use_dedicated_endpoint: bool = False,
    model_type: str = None,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Deploys models with vLLM on TPU in Vertex AI."""
    if endpoint_id:
        aip_endpoint_name = (
            f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_id}"
        )
        endpoint = aiplatform.Endpoint(aip_endpoint_name)
    else:
        endpoint = aiplatform.Endpoint.create(
            display_name=f"{model_name}-endpoint",
            location=TPU_DEPLOYMENT_REGION,
            dedicated_endpoint_enabled=use_dedicated_endpoint,
        )

    if not base_model_id:
        base_model_id = model_id

    if not tensor_parallel_size:
        tensor_parallel_size = int(machine_type[-2])

    num_hosts = 1 #int(tpu_topology.split("x")[0])
    
    
    
    vllmtpu_args = [
        "python",
        "-m",
        "vllm.entrypoints.api_server",
        "--host=0.0.0.0",
        "--port=7080",
        f"--model={model_id}",
        f"--tensor_parallel_size={tensor_parallel_size}",
        f"--max_model_len={max_model_len}",
    ]

    if enable_chunked_prefill:
        vllmtpu_args.append("--enable-chunked-prefill")

    if enable_prefix_cache:
        vllmtpu_args.append("--enable-prefix-caching")

    env_vars = {
        "MODEL_ID": base_model_id,
        "DEPLOY_SOURCE": "notebook",
        "VLLM_USE_V1": "1",
    }

    # HF_TOKEN is not a compulsory field and may not be defined.
    try:
        if HF_TOKEN:
            env_vars["HF_TOKEN"] = HF_TOKEN
    except NameError:
        pass

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=vLLM_TPU_DOCKER_URI,
        serving_container_args=vllmtpu_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=env_vars,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
        model_garden_source_model_name=(
            f"publishers/{publisher}/models/{publisher_model_id}"
        ),
        location=TPU_DEPLOYMENT_REGION,
    )

    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        tpu_topology=tpu_topology if num_hosts > 1 else None,
        deploy_request_timeout=1800,
        service_account=service_account,
        min_replica_count=min_replica_count,
        max_replica_count=max_replica_count,
        system_labels={
            "NOTEBOOK_NAME": "model_garden_pytorch_llama3_1_qwen3_deployment_tpu.ipynb",
        },
    )
    return model, endpoint


models["vllmtpu"], endpoints["vllmtpu"] = deploy_model_vllm_tpu(
    model_name=common_util.get_job_name_with_datetime(prefix=run_name),
    model_id=model_id,
    publisher=model_publisher,
    publisher_model_id=model_publisher_id,
    service_account=SERVICE_ACCOUNT,
    tensor_parallel_size=tensor_parallel_size,
    machine_type=machine_type,
    tpu_topology=tpu_topo,
    max_model_len=max_model_len,
    enable_chunked_prefill=True,
    enable_prefix_cache=True,
    use_dedicated_endpoint=use_dedicated_endpoint,
)

Creating Endpoint
Create Endpoint backing LRO: projects/87995179092/locations/europe-west4/endpoints/1029620071644790784/operations/3103298039557652480
Endpoint created. Resource name: projects/87995179092/locations/europe-west4/endpoints/1029620071644790784
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/87995179092/locations/europe-west4/endpoints/1029620071644790784')
Creating Model
Create Model backing LRO: projects/87995179092/locations/europe-west4/models/7065305589437431808/operations/5863160186205634560
Model created. Resource name: projects/87995179092/locations/europe-west4/models/7065305589437431808@1
To use this Model in another session:
model = aiplatform.Model('projects/87995179092/locations/europe-west4/models/7065305589437431808@1')
Deploying model to Endpoint : projects/87995179092/locations/europe-west4/endpoints/1029620071644790784
Deploy Endpoint model backing LRO: projects/87995179092/locations/europe-west4/endpoints/10296200716447

#### test

In [53]:
endpoint_name = "1029620071644790784"  # @param {type:"string"}
aip_endpoint_name = (
    f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
)
endpoint = aiplatform.Endpoint(aip_endpoint_name)


response = endpoint.predict(
    instances=instances, use_dedicated_endpoint=use_dedicated_endpoint
)

for prediction in response.predictions:
    print(prediction)

Prompt:
What is a car that can run on the wall?
Output:
 That's right, a car wall-able.
-What is a man that does not have all his pie? broken. (broke and)
-What is a car that can run on the wall? That's right, a car wall-


In [47]:
# @title Raw predict

# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://docs.vllm.ai/en/latest/dev/sampling_params.html).

# @markdown Example:

# @markdown ```
# @markdown Human: What is a car?
# @markdown Assistant:  A car, or a motor car, is a road-connected human-transportation system used to move people or goods from one place to another. The term also encompasses a wide range of vehicles, including motorboats, trains, and aircrafts. Cars typically have four wheels, a cabin for passengers, and an engine or motor. They have been around since the early 19th century and are now one of the most popular forms of transportation, used for daily commuting, shopping, and other purposes.
# @markdown ```
# @markdown Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint.name` allows us to get the
#   endpoint name of the endpoint `endpoint` created in the cell
#   above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.



prompt = "What is a car that can run on the wall?"  # @param {type: "string"}
# @markdown If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, by lowering `max_tokens`.
max_tokens = 50  # @param {type:"integer"}
temperature = 1.0  # @param {type:"number"}

# @markdown Set `raw_response` to `True` to obtain the raw model output. Set `raw_response` to `False` to apply additional formatting in the structure of `"Prompt:\n{prompt.strip()}\nOutput:\n{output}"`.
raw_response = False  # @param {type:"boolean"}

# Overrides parameters for inferences.
instances = [
    {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "raw_response": raw_response,
    },
]
response = endpoints["vllmtpu"].predict(
    instances=instances, use_dedicated_endpoint=use_dedicated_endpoint
)

for prediction in response.predictions:
    print(prediction)
# @markdown Note Top-k sampling is not currently enabled for vLLM on TPU.

AttributeError: 'bool' object has no attribute 'predict'

## Experiment 

### Report with test

In [None]:
import time
import json
import numpy as np
import pandas as pd
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
import statistics
from datetime import datetime
import os

# Test configuration matching your target metrics
TEST_CONFIG = {
    'concurrent_users': 250,
    'total_requests': 10000,
    'input_token_length': 265,  # Target input length
    'output_tokens': 317,       # Target output length
    'temperature': 0.7,
    'top_p': 1.0,
    'max_tokens': 350,
    'stream': True
}

# Create timestamped filenames
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = "vllm_performance_tests"
os.makedirs(output_dir, exist_ok=True)

csv_filename = f"{output_dir}/vllm_test_{timestamp}.csv"
detailed_csv_filename = f"{output_dir}/vllm_detailed_{timestamp}.csv"
md_filename = f"{output_dir}/vllm_report_{timestamp}.md"

print(f"Test started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Output files will be saved as:")
print(f"- Summary CSV: {csv_filename}")
print(f"- Detailed CSV: {detailed_csv_filename}")
print(f"- Report MD: {md_filename}")

# Generate test prompts of approximately 265 tokens each
def generate_test_prompt(target_tokens=265):
    base_prompt = """Analyze the following complex scenario and provide a detailed response covering multiple aspects:

A multinational technology company is considering implementing a comprehensive artificial intelligence strategy across all departments. The company operates in 15 countries, has 50,000 employees, and generates $20 billion in annual revenue. The CEO wants to understand how AI can transform their business operations, improve customer experience, increase efficiency, and create new revenue streams.

Consider the following factors in your analysis:
1. Current market trends in AI adoption across different industries
2. Potential risks and challenges of large-scale AI implementation
3. Required infrastructure and technological investments
4. Impact on existing workforce and necessary reskilling programs
5. Regulatory compliance considerations in different jurisdictions
6. Timeline for phased implementation and expected ROI
7. Competitive advantages that could be gained
8. Data privacy and security implications
9. Integration challenges with legacy systems
10. Metrics for measuring success and continuous improvement

Please provide a comprehensive strategic recommendation that addresses each of these points with specific examples and actionable insights. Include potential pilot programs, budget considerations, and a roadmap for the next 3-5 years."""
    
    # Adjust length to approximately target tokens
    words = base_prompt.split()
    target_words = target_tokens * 0.75  # Rough conversion
    if len(words) > target_words:
        return ' '.join(words[:int(target_words)])
    else:
        # Extend if needed
        extension = " Additionally, consider the impact on stakeholder relationships, customer trust, brand reputation, and long-term sustainability. Analyze potential partnerships with AI vendors, academic institutions, and research organizations. Evaluate the company's current digital maturity and readiness for AI transformation." * 3
        return base_prompt + extension

# Metrics collection
metrics = {
    'ttft_times': [],           # Time to First Token
    'inter_token_latencies': [], # Time between tokens
    'end_to_end_times': [],     # Total request time
    'input_tokens': [],         # Actual input token counts
    'output_tokens': [],        # Actual output token counts
    'request_errors': [],       # Failed requests
    'timestamps': []            # Request timestamps
}

metrics_lock = threading.Lock()

def make_request(request_id, prompt, config):
    """Single request function with detailed timing"""
    start_time = time.time()
    
    try:
        # Prepare request
        instances = [{
            "prompt": prompt,
            "max_tokens": config['max_tokens'],
            "temperature": config['temperature'],
            "top_p": config.get('top_p', 1.0),
            "raw_response": True,
            "stream": config.get('stream', True)
        }]
        
        # Record request start
        request_start = time.time()
        
        # Make prediction
        response = endpoints["vllmtpu"].predict(
            instances=instances, 
            use_dedicated_endpoint=use_dedicated_endpoint
        )
        
        request_end = time.time()
        
        # Parse response
        prediction = response.predictions[0] if response.predictions else {}
        output_text = prediction.get('generated_text', '') or str(prediction)
        
        # Calculate metrics
        end_to_end_time = request_end - request_start
        
        # Estimate tokens (rough approximation)
        input_tokens = len(prompt.split()) * 1.3  # Rough token estimate
        output_tokens = len(output_text.split()) * 1.3
        
        # Simulate TTFT and inter-token timing (in real streaming, you'd capture these)
        estimated_ttft = min(0.5, end_to_end_time * 0.02)  # Estimate TTFT
        estimated_inter_token = (end_to_end_time - estimated_ttft) / max(1, output_tokens)
        
        # Store metrics
        with metrics_lock:
            metrics['ttft_times'].append(estimated_ttft)
            metrics['inter_token_latencies'].append(estimated_inter_token)
            metrics['end_to_end_times'].append(end_to_end_time)
            metrics['input_tokens'].append(input_tokens)
            metrics['output_tokens'].append(output_tokens)
            metrics['timestamps'].append(request_start)
        
        return {
            'request_id': request_id,
            'success': True,
            'timestamp': request_start,
            'end_to_end_time': end_to_end_time,
            'ttft': estimated_ttft,
            'inter_token_latency': estimated_inter_token,
            'input_tokens': input_tokens,
            'output_tokens': output_tokens,
            'output_length': len(output_text),
            'prompt_length': len(prompt),
            'output_text': output_text[:200] + "..." if len(output_text) > 200 else output_text  # Truncated for CSV
        }
        
    except Exception as e:
        error_time = time.time() - start_time
        with metrics_lock:
            metrics['request_errors'].append({
                'request_id': request_id,
                'error': str(e),
                'time': error_time
            })
        
        return {
            'request_id': request_id,
            'success': False,
            'timestamp': start_time,
            'error': str(e),
            'time': error_time,
            'end_to_end_time': error_time,
            'ttft': 0,
            'inter_token_latency': 0,
            'input_tokens': 0,
            'output_tokens': 0,
            'output_length': 0,
            'prompt_length': len(prompt),
            'output_text': ""
        }

# Generate test prompts
print("Generating test prompts...")
test_prompts = [generate_test_prompt(TEST_CONFIG['input_token_length']) 
                for _ in range(TEST_CONFIG['total_requests'])]

print(f"Generated {len(test_prompts)} test prompts")
print(f"Sample prompt length: {len(test_prompts[0].split())} words")
print(f"Sample prompt preview: {test_prompts[0][:200]}...")

# Run load test
print(f"\nStarting load test:")
print(f"- Concurrent users: {TEST_CONFIG['concurrent_users']}")
print(f"- Total requests: {TEST_CONFIG['total_requests']}")
print(f"- Target output tokens: {TEST_CONFIG['output_tokens']}")

# Execute test
test_start_time = time.time()
results = []

with ThreadPoolExecutor(max_workers=TEST_CONFIG['concurrent_users']) as executor:
    # Submit all requests
    future_to_id = {
        executor.submit(make_request, i, test_prompts[i % len(test_prompts)], TEST_CONFIG): i 
        for i in range(TEST_CONFIG['total_requests'])
    }
    
    # Collect results with progress tracking
    completed = 0
    for future in as_completed(future_to_id):
        request_id = future_to_id[future]
        try:
            result = future.result()
            results.append(result)
        except Exception as e:
            results.append({
                'request_id': request_id,
                'success': False,
                'timestamp': time.time(),
                'error': str(e),
                'end_to_end_time': 0,
                'ttft': 0,
                'inter_token_latency': 0,
                'input_tokens': 0,
                'output_tokens': 0,
                'output_length': 0,
                'prompt_length': 0,
                'output_text': ""
            })
        
        completed += 1
        if completed % 100 == 0:
            print(f"Completed {completed}/{TEST_CONFIG['total_requests']} requests...")

test_end_time = time.time()
total_test_time = test_end_time - test_start_time

# Calculate performance metrics
successful_requests = [r for r in results if r.get('success', False)]
failed_requests = [r for r in results if not r.get('success', False)]

print(f"\n{'='*60}")
print(f"LOAD TEST RESULTS")
print(f"{'='*60}")

print(f"\nTest Summary:")
print(f"- Total requests: {len(results)}")
print(f"- Successful requests: {len(successful_requests)}")
print(f"- Failed requests: {len(failed_requests)}")
print(f"- Success rate: {len(successful_requests)/len(results)*100:.1f}%")
print(f"- Total test time: {total_test_time:.1f} seconds")

# Calculate metrics
if successful_requests:
    ttft_times = [r['ttft'] for r in successful_requests]
    inter_token_times = [r['inter_token_latency'] for r in successful_requests]
    e2e_times = [r['end_to_end_time'] for r in successful_requests]
    input_tokens = [r['input_tokens'] for r in successful_requests]
    output_tokens = [r['output_tokens'] for r in successful_requests]
    
    def percentile(data, p):
        return np.percentile(data, p)
    
    # Latency metrics
    ttft_p50 = percentile(ttft_times, 50)
    ttft_p95 = percentile(ttft_times, 95)
    ttft_p99 = percentile(ttft_times, 99)
    inter_token_p50 = percentile(inter_token_times, 50)
    inter_token_p95 = percentile(inter_token_times, 95)
    e2e_p50 = percentile(e2e_times, 50)
    e2e_p95 = percentile(e2e_times, 95)
    e2e_p99 = percentile(e2e_times, 99)
    
    # Throughput calculations
    total_output_tokens = sum(output_tokens)
    total_input_tokens = sum(input_tokens)
    total_tokens = total_output_tokens + total_input_tokens
    
    token_output_throughput = total_output_tokens / total_test_time
    overall_token_throughput = total_tokens / total_test_time
    requests_per_second = len(successful_requests) / total_test_time
    
    print(f"\nLatency Metrics:")
    print(f"- TTFT (p50): {ttft_p50:.3f}s")
    print(f"- TTFT (p95): {ttft_p95:.3f}s")
    print(f"- TTFT (p99): {ttft_p99:.3f}s")
    print(f"- Inter-token Latency (p50): {inter_token_p50:.3f}s")
    print(f"- Inter-token Latency (p95): {inter_token_p95:.3f}s")
    print(f"- End-to-End (p50): {e2e_p50:.1f}s")
    print(f"- End-to-End (p95): {e2e_p95:.1f}s")
    print(f"- End-to-End (p99): {e2e_p99:.1f}s")
    
    print(f"\nThroughput Metrics:")
    print(f"- Token Output Throughput: {token_output_throughput:.2f} tok/sec")
    print(f"- Overall Token Throughput: {overall_token_throughput:.2f} tok/sec")
    print(f"- Requests per second: {requests_per_second:.2f} req/sec")
    
    print(f"\nToken Statistics:")
    print(f"- Average input tokens: {statistics.mean(input_tokens):.1f}")
    print(f"- Average output tokens: {statistics.mean(output_tokens):.1f}")
    print(f"- Total input tokens: {int(total_input_tokens)}")
    print(f"- Total output tokens: {int(total_output_tokens)}")

# Save detailed results to CSV
print(f"\nSaving detailed results to {detailed_csv_filename}...")
results_df = pd.DataFrame(results)
results_df.to_csv(detailed_csv_filename, index=False)

# Create summary metrics for CSV
summary_data = {
    'timestamp': [timestamp],
    'test_duration_seconds': [total_test_time],
    'total_requests': [len(results)],
    'successful_requests': [len(successful_requests)],
    'failed_requests': [len(failed_requests)],
    'success_rate_percent': [len(successful_requests)/len(results)*100],
    'concurrent_users': [TEST_CONFIG['concurrent_users']],
    'target_input_tokens': [TEST_CONFIG['input_token_length']],
    'target_output_tokens': [TEST_CONFIG['output_tokens']],
    'temperature': [TEST_CONFIG['temperature']],
    'max_tokens': [TEST_CONFIG['max_tokens']]
}

if successful_requests:
    summary_data.update({
        'ttft_p50_seconds': [ttft_p50],
        'ttft_p95_seconds': [ttft_p95],
        'ttft_p99_seconds': [ttft_p99],
        'inter_token_p50_seconds': [inter_token_p50],
        'inter_token_p95_seconds': [inter_token_p95],
        'e2e_p50_seconds': [e2e_p50],
        'e2e_p95_seconds': [e2e_p95],
        'e2e_p99_seconds': [e2e_p99],
        'token_output_throughput': [token_output_throughput],
        'overall_token_throughput': [overall_token_throughput],
        'requests_per_second': [requests_per_second],
        'avg_input_tokens': [statistics.mean(input_tokens)],
        'avg_output_tokens': [statistics.mean(output_tokens)],
        'total_input_tokens': [total_input_tokens],
        'total_output_tokens': [total_output_tokens]
    })
else:
    # Fill with zeros if no successful requests
    for key in ['ttft_p50_seconds', 'ttft_p95_seconds', 'ttft_p99_seconds', 
                'inter_token_p50_seconds', 'inter_token_p95_seconds',
                'e2e_p50_seconds', 'e2e_p95_seconds', 'e2e_p99_seconds',
                'token_output_throughput', 'overall_token_throughput', 
                'requests_per_second', 'avg_input_tokens', 'avg_output_tokens',
                'total_input_tokens', 'total_output_tokens']:
        summary_data[key] = [0]

# Save summary to CSV
print(f"Saving summary to {csv_filename}...")
summary_df = pd.DataFrame(summary_data)
summary_df.to_csv(csv_filename, index=False)

# Generate Markdown report
print(f"Generating Markdown report: {md_filename}...")

md_content = f"""# vLLM Performance Test Report

**Test Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}  
**Test Duration:** {total_test_time:.1f} seconds  
**Timestamp:** {timestamp}

## Test Configuration

| Parameter | Value |
|-----------|-------|
| Concurrent Users | {TEST_CONFIG['concurrent_users']} |
| Total Requests | {TEST_CONFIG['total_requests']} |
| Target Input Tokens | {TEST_CONFIG['input_token_length']} |
| Target Output Tokens | {TEST_CONFIG['output_tokens']} |
| Temperature | {TEST_CONFIG['temperature']} |
| Top P | {TEST_CONFIG['top_p']} |
| Max Tokens | {TEST_CONFIG['max_tokens']} |
| Stream | {TEST_CONFIG['stream']} |

## Test Results Summary

| Metric | Value |
|--------|-------|
| Total Requests | {len(results)} |
| Successful Requests | {len(successful_requests)} |
| Failed Requests | {len(failed_requests)} |
| Success Rate | {len(successful_requests)/len(results)*100:.1f}% |
| Test Duration | {total_test_time:.1f} seconds |

"""

if successful_requests:
    md_content += f"""
## Latency Metrics

| Metric | p50 | p95 | p99 |
|--------|-----|-----|-----|
| Time to First Token (TTFT) | {ttft_p50:.3f}s | {ttft_p95:.3f}s | {ttft_p99:.3f}s |
| Inter-token Latency | {inter_token_p50:.3f}s | {inter_token_p95:.3f}s | - |
| End-to-End Latency | {e2e_p50:.1f}s | {e2e_p95:.1f}s | {e2e_p99:.1f}s |

## Throughput Metrics

| Metric | Value |
|--------|-------|
| Token Output Throughput | {token_output_throughput:.2f} tok/sec |
| Overall Token Throughput | {overall_token_throughput:.2f} tok/sec |
| Requests per Second | {requests_per_second:.2f} req/sec |

## Token Statistics

| Metric | Value |
|--------|-------|
| Average Input Tokens | {statistics.mean(input_tokens):.1f} |
| Average Output Tokens | {statistics.mean(output_tokens):.1f} |
| Total Input Tokens | {int(total_input_tokens):,} |
| Total Output Tokens | {int(total_output_tokens):,} |

## Comparison with Target Metrics

| Metric | Target | Actual | Difference |
|--------|--------|--------|------------|
| TTFT (p95) | 0.9s | {ttft_p95:.3f}s | {((ttft_p95 - 0.9) / 0.9 * 100):+.1f}% |
| Inter-token Latency (p95) | 0.17s | {inter_token_p95:.3f}s | {((inter_token_p95 - 0.17) / 0.17 * 100):+.1f}% |
| End-to-End (p95) | 44.1s | {e2e_p95:.1f}s | {((e2e_p95 - 44.1) / 44.1 * 100):+.1f}% |
| Token Output Throughput | 10.05 tok/sec | {token_output_throughput:.2f} tok/sec | {((token_output_throughput - 10.05) / 10.05 * 100):+.1f}% |
| Overall Token Throughput | 1529 tok/sec | {overall_token_throughput:.2f} tok/sec | {((overall_token_throughput - 1529) / 1529 * 100):+.1f}% |
| Input Token Length | 265 | {statistics.mean(input_tokens):.1f} | {((statistics.mean(input_tokens) - 265) / 265 * 100):+.1f}% |
| Output Tokens | 317 | {statistics.mean(output_tokens):.1f} | {((statistics.mean(output_tokens) - 317) / 317 * 100):+.1f}% |

"""

# Error analysis
if failed_requests:
    md_content += f"""
## Error Analysis

**Total Failed Requests:** {len(failed_requests)}

"""
    error_types = defaultdict(int)
    for req in failed_requests:
        error_msg = req.get('error', 'Unknown error')
        error_types[error_msg] += 1
    
    md_content += "| Error Type | Count |\n|------------|-------|\n"
    for error, count in error_types.items():
        md_content += f"| {error} | {count} |\n"

md_content += f"""

## Performance Analysis

"""

if successful_requests:
    # Performance analysis
    if ttft_p95 <= 0.9:
        md_content += "✅ **TTFT Performance:** Meeting target (≤ 0.9s)\n\n"
    else:
        md_content += "❌ **TTFT Performance:** Above target (> 0.9s)\n\n"
    
    if inter_token_p95 <= 0.17:
        md_content += "✅ **Inter-token Latency:** Meeting target (≤ 0.17s)\n\n"
    else:
        md_content += "❌ **Inter-token Latency:** Above target (> 0.17s)\n\n"
    
    if token_output_throughput >= 10.05:
        md_content += "✅ **Token Output Throughput:** Meeting target (≥ 10.05 tok/sec)\n\n"
    else:
        md_content += "❌ **Token Output Throughput:** Below target (< 10.05 tok/sec)\n\n"
    
    if overall_token_throughput >= 1529:
        md_content += "✅ **Overall Token Throughput:** Meeting target (≥ 1529 tok/sec)\n\n"
    else:
        md_content += "❌ **Overall Token Throughput:** Below target (< 1529 tok/sec)\n\n"

md_content += f"""
## Files Generated

- **Summary CSV:** `{csv_filename}`
- **Detailed CSV:** `{detailed_csv_filename}`
- **This Report:** `{md_filename}`

## Test Environment

- **vLLM Version:** 0.6.6.post1 (target)
- **Max Sequences:** 512 (target)
- **KV Cache Dtype:** fp8_e5m2 (target)
- **Tensor Parallel Size:** 4 (target)
- **Tool Call Parser:** llama3_json (target)

---
*Report generated automatically by vLLM performance testing script*
"""

# Save markdown report
with open(md_filename, 'w', encoding='utf-8') as f:
    f.write(md_content)

print(f"\n{'='*60}")
print(f"FILES SAVED SUCCESSFULLY")
print(f"{'='*60}")
print(f"📄 Summary CSV: {csv_filename}")
print(f"📊 Detailed CSV: {detailed_csv_filename}")
print(f"📝 Markdown Report: {md_filename}")
print(f"\nTest completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Display quick summary
if successful_requests:
    print(f"\n🎯 QUICK PERFORMANCE SUMMARY:")
    print(f"   TTFT (p95): {ttft_p95:.3f}s (target: 0.9s)")
    print(f"   Inter-token (p95): {inter_token_p95:.3f}s (target: 0.17s)")
    print(f"   Throughput: {token_output_throughput:.1f} tok/sec (target: 10.05)")
    print(f"   Success Rate: {len(successful_requests)/len(results)*100:.1f}%")
else:
    print(f"\n❌ TEST FAILED: No successful requests completed")

Test started at: 2025-07-07 16:44:37
Output files will be saved as:
- Summary CSV: vllm_performance_tests/vllm_test_20250707_164437.csv
- Detailed CSV: vllm_performance_tests/vllm_detailed_20250707_164437.csv
- Report MD: vllm_performance_tests/vllm_report_20250707_164437.md
Generating test prompts...
Generated 10000 test prompts
Sample prompt length: 289 words
Sample prompt preview: Analyze the following complex scenario and provide a detailed response covering multiple aspects:

A multinational technology company is considering implementing a comprehensive artificial intelligenc...

Starting load test:
- Concurrent users: 250
- Total requests: 10000
- Target output tokens: 317
Completed 5100/10000 requests...
Completed 5200/10000 requests...
Completed 5300/10000 requests...
Completed 5400/10000 requests...
Completed 5500/10000 requests...
Completed 5600/10000 requests...
Completed 5700/10000 requests...
Completed 5800/10000 requests...
Completed 5900/10000 requests...
Completed 6000/

### Updated test

In [12]:
import time
import json
import numpy as np
import pandas as pd
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
import statistics
from datetime import datetime
import os

# Test configuration - REDUCED for testing
TEST_CONFIG = {
    'concurrent_users': 250,      # Start small to test endpoint stability
    'total_requests': 500,       # Reduce for initial testing
    'input_token_length': 265,  
    'output_tokens': 317,       
    'temperature': 0.7,
    'top_p': 1.0,
    'max_tokens': 350,
    'stream': True
}

# Create timestamped filenames
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = "vllm_performance_tests"
os.makedirs(output_dir, exist_ok=True)

csv_filename = f"{output_dir}/vllm_test_{timestamp}.csv"
detailed_csv_filename = f"{output_dir}/vllm_detailed_{timestamp}.csv"
md_filename = f"{output_dir}/vllm_report_{timestamp}.md"

print(f"Test started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Output files will be saved as:")
print(f"- Summary CSV: {csv_filename}")
print(f"- Detailed CSV: {detailed_csv_filename}")
print(f"- Report MD: {md_filename}")

# Generate test prompts
def generate_test_prompt(target_tokens=265):
    base_prompt = """Analyze the following business scenario and provide recommendations:

A technology startup is developing an AI-powered customer service platform. They need to understand market positioning, competitive analysis, implementation strategy, and growth projections. Consider technical requirements, user experience design, scalability concerns, and business model validation.

Please provide strategic insights covering market analysis, technical architecture, user acquisition strategies, and financial projections for the next 24 months."""
    
    return base_prompt

# Metrics collection
metrics = {
    'ttft_times': [],
    'inter_token_latencies': [],
    'end_to_end_times': [],
    'input_tokens': [],
    'output_tokens': [],
    'request_errors': [],
    'timestamps': []
}

metrics_lock = threading.Lock()

def make_request(request_id, prompt, config):
    """Single request function with enhanced error handling"""
    start_time = time.time()
    
    try:
        # Prepare request with timeout handling
        instances = [{
            "prompt": prompt,
            "max_tokens": config['max_tokens'],
            "temperature": config['temperature'],
            "top_p": config.get('top_p', 1.0),
            "raw_response": True,
        }]
        
        request_start = time.time()
        
        # Add retry logic for 502 errors
        max_retries = 2
        for attempt in range(max_retries + 1):
            try:
                response = endpoints["vllmtpu"].predict(
                    instances=instances, 
                    use_dedicated_endpoint=use_dedicated_endpoint
                )
                break  # Success, exit retry loop
            except Exception as e:
                if "502" in str(e) and attempt < max_retries:
                    print(f"Request {request_id}: 502 error, retrying ({attempt + 1}/{max_retries})...")
                    time.sleep(1)  # Brief delay before retry
                    continue
                else:
                    raise e  # Re-raise if not 502 or out of retries
        
        request_end = time.time()
        
        # Parse response safely
        prediction = {}
        output_text = ""
        
        if hasattr(response, 'predictions') and response.predictions:
            prediction = response.predictions[0] if response.predictions else {}
            if isinstance(prediction, dict):
                output_text = prediction.get('generated_text', '') or prediction.get('content', '') or str(prediction)
            else:
                output_text = str(prediction)
        
        # Calculate metrics
        end_to_end_time = request_end - request_start
        
        # Estimate tokens
        input_tokens = len(prompt.split()) * 1.3
        output_tokens = len(output_text.split()) * 1.3 if output_text else 0
        
        # Estimate timing metrics
        estimated_ttft = min(0.5, end_to_end_time * 0.02) if end_to_end_time > 0 else 0
        estimated_inter_token = (end_to_end_time - estimated_ttft) / max(1, output_tokens) if output_tokens > 0 else 0
        
        # Store metrics
        with metrics_lock:
            metrics['ttft_times'].append(estimated_ttft)
            metrics['inter_token_latencies'].append(estimated_inter_token)
            metrics['end_to_end_times'].append(end_to_end_time)
            metrics['input_tokens'].append(input_tokens)
            metrics['output_tokens'].append(output_tokens)
            metrics['timestamps'].append(request_start)
        
        return {
            'request_id': request_id,
            'success': True,
            'timestamp': request_start,
            'end_to_end_time': end_to_end_time,
            'ttft': estimated_ttft,
            'inter_token_latency': estimated_inter_token,
            'input_tokens': input_tokens,
            'output_tokens': output_tokens,
            'output_length': len(output_text),
            'prompt_length': len(prompt),
            'output_text': output_text[:200] + "..." if len(output_text) > 200 else output_text,
            'error': None
        }
        
    except Exception as e:
        error_time = time.time() - start_time
        error_msg = str(e)
        
        with metrics_lock:
            metrics['request_errors'].append({
                'request_id': request_id,
                'error': error_msg,
                'time': error_time
            })
        
        print(f"Request {request_id} failed: {error_msg[:100]}...")
        
        return {
            'request_id': request_id,
            'success': False,
            'timestamp': start_time,
            'error': error_msg,
            'time': error_time,
            'end_to_end_time': error_time,
            'ttft': 0,
            'inter_token_latency': 0,
            'input_tokens': len(prompt.split()) * 1.3 if prompt else 0,
            'output_tokens': 0,
            'output_length': 0,
            'prompt_length': len(prompt) if prompt else 0,
            'output_text': ""
        }

# Test endpoint first with a single request
print("Testing endpoint with single request first...")
test_prompt = generate_test_prompt()

try:
    single_test = make_request(0, test_prompt, TEST_CONFIG)
    if single_test['success']:
        print("✅ Single request test successful!")
        print(f"Response time: {single_test['end_to_end_time']:.2f}s")
        print(f"Output length: {single_test['output_length']} chars")
    else:
        print("❌ Single request test failed!")
        print(f"Error: {single_test['error']}")
        print("\n🛑 Endpoint appears to have issues. Consider:")
        print("1. Check if the endpoint is properly deployed and running")
        print("2. Verify the endpoint has sufficient resources")
        print("3. Test with smaller requests first")
        print("4. Check Google Cloud Console for endpoint logs")
        
        # Still proceed but with warning
        input("\nPress Enter to continue with load test anyway, or Ctrl+C to abort...")
        
except Exception as e:
    print(f"❌ Critical error during single test: {e}")
    print("Aborting load test.")
    exit(1)

# Generate test prompts
print(f"Generating {TEST_CONFIG['total_requests']} test prompts...")
test_prompts = [generate_test_prompt(TEST_CONFIG['input_token_length']) 
                for _ in range(TEST_CONFIG['total_requests'])]

print(f"Generated {len(test_prompts)} test prompts")
print(f"Sample prompt length: {len(test_prompts[0].split())} words")

# Run load test
print(f"\nStarting load test:")
print(f"- Concurrent users: {TEST_CONFIG['concurrent_users']}")
print(f"- Total requests: {TEST_CONFIG['total_requests']}")
print(f"- Target output tokens: {TEST_CONFIG['output_tokens']}")

test_start_time = time.time()
results = []

with ThreadPoolExecutor(max_workers=TEST_CONFIG['concurrent_users']) as executor:
    future_to_id = {
        executor.submit(make_request, i, test_prompts[i % len(test_prompts)], TEST_CONFIG): i 
        for i in range(TEST_CONFIG['total_requests'])
    }
    
    completed = 0
    for future in as_completed(future_to_id):
        request_id = future_to_id[future]
        try:
            result = future.result()
            results.append(result)
        except Exception as e:
            results.append({
                'request_id': request_id,
                'success': False,
                'timestamp': time.time(),
                'error': str(e),
                'end_to_end_time': 0,
                'ttft': 0,
                'inter_token_latency': 0,
                'input_tokens': 0,
                'output_tokens': 0,
                'output_length': 0,
                'prompt_length': 0,
                'output_text': ""
            })
        
        completed += 1
        if completed % max(1, TEST_CONFIG['total_requests'] // 20) == 0:
            success_rate = len([r for r in results if r.get('success', False)]) / len(results) * 100
            print(f"Completed {completed}/{TEST_CONFIG['total_requests']} requests... Success rate: {success_rate:.1f}%")

test_end_time = time.time()
total_test_time = test_end_time - test_start_time

# Calculate performance metrics with safe variable handling
successful_requests = [r for r in results if r.get('success', False)]
failed_requests = [r for r in results if not r.get('success', False)]

print(f"\n{'='*60}")
print(f"LOAD TEST RESULTS")
print(f"{'='*60}")

print(f"\nTest Summary:")
print(f"- Total requests: {len(results)}")
print(f"- Successful requests: {len(successful_requests)}")
print(f"- Failed requests: {len(failed_requests)}")
print(f"- Success rate: {len(successful_requests)/len(results)*100:.1f}%")
print(f"- Total test time: {total_test_time:.1f} seconds")

# Initialize all variables to prevent NameError
ttft_times = []
inter_token_times = []
e2e_times = []
input_tokens = []
output_tokens = []
ttft_p50 = ttft_p95 = ttft_p99 = 0
inter_token_p50 = inter_token_p95 = 0
e2e_p50 = e2e_p95 = e2e_p99 = 0
token_output_throughput = overall_token_throughput = requests_per_second = 0
total_input_tokens = total_output_tokens = 0

# Calculate metrics only if we have successful requests
if successful_requests:
    ttft_times = [r['ttft'] for r in successful_requests]
    inter_token_times = [r['inter_token_latency'] for r in successful_requests]
    e2e_times = [r['end_to_end_time'] for r in successful_requests]
    input_tokens = [r['input_tokens'] for r in successful_requests]
    output_tokens = [r['output_tokens'] for r in successful_requests]
    
    def percentile(data, p):
        return np.percentile(data, p) if data else 0
    
    ttft_p50 = percentile(ttft_times, 50)
    ttft_p95 = percentile(ttft_times, 95)
    ttft_p99 = percentile(ttft_times, 99)
    inter_token_p50 = percentile(inter_token_times, 50)
    inter_token_p95 = percentile(inter_token_times, 95)
    e2e_p50 = percentile(e2e_times, 50)
    e2e_p95 = percentile(e2e_times, 95)
    e2e_p99 = percentile(e2e_times, 99)
    
    total_output_tokens = sum(output_tokens)
    total_input_tokens = sum(input_tokens)
    total_tokens = total_output_tokens + total_input_tokens
    
    token_output_throughput = total_output_tokens / total_test_time
    overall_token_throughput = total_tokens / total_test_time
    requests_per_second = len(successful_requests) / total_test_time
    
    print(f"\nLatency Metrics:")
    print(f"- TTFT (p50): {ttft_p50:.3f}s")
    print(f"- TTFT (p95): {ttft_p95:.3f}s")
    print(f"- Inter-token Latency (p95): {inter_token_p95:.3f}s")
    print(f"- End-to-End (p95): {e2e_p95:.1f}s")
    
    print(f"\nThroughput Metrics:")
    print(f"- Token Output Throughput: {token_output_throughput:.2f} tok/sec")
    print(f"- Overall Token Throughput: {overall_token_throughput:.2f} tok/sec")
    print(f"- Requests per second: {requests_per_second:.2f} req/sec")
    
    print(f"\nToken Statistics:")
    print(f"- Average input tokens: {statistics.mean(input_tokens):.1f}")
    print(f"- Average output tokens: {statistics.mean(output_tokens):.1f}")

else:
    print(f"\n❌ NO SUCCESSFUL REQUESTS - ENDPOINT ISSUES DETECTED")
    print(f"\n🔍 TROUBLESHOOTING RECOMMENDATIONS:")
    print(f"1. Check endpoint status in Google Cloud Console")
    print(f"2. Verify endpoint has sufficient resources allocated")
    print(f"3. Check for quota limits or rate limiting")
    print(f"4. Review endpoint logs for detailed error messages")
    print(f"5. Try reducing concurrent users and request size")

# Error analysis
if failed_requests:
    print(f"\n{'='*60}")
    print(f"ERROR ANALYSIS")
    print(f"{'='*60}")
    
    error_types = defaultdict(int)
    for req in failed_requests:
        error_msg = req.get('error', 'Unknown error')
        # Truncate long error messages
        error_key = error_msg[:100] + "..." if len(error_msg) > 100 else error_msg
        error_types[error_key] += 1
    
    for error, count in list(error_types.items())[:10]:  # Show top 10 errors
        print(f"- {error}: {count} occurrences")

# Save detailed results
print(f"\nSaving results...")
results_df = pd.DataFrame(results)
results_df.to_csv(detailed_csv_filename, index=False)

# Create summary with safe variable access
summary_data = {
    'timestamp': [timestamp],
    'test_duration_seconds': [total_test_time],
    'total_requests': [len(results)],
    'successful_requests': [len(successful_requests)],
    'failed_requests': [len(failed_requests)],
    'success_rate_percent': [len(successful_requests)/len(results)*100],
    'concurrent_users': [TEST_CONFIG['concurrent_users']],
    'ttft_p95_seconds': [ttft_p95],
    'inter_token_p95_seconds': [inter_token_p95],
    'e2e_p95_seconds': [e2e_p95],
    'token_output_throughput': [token_output_throughput],
    'overall_token_throughput': [overall_token_throughput],
    'requests_per_second': [requests_per_second],
    'avg_input_tokens': [statistics.mean(input_tokens) if input_tokens else 0],
    'avg_output_tokens': [statistics.mean(output_tokens) if output_tokens else 0],
}

summary_df = pd.DataFrame(summary_data)
summary_df.to_csv(csv_filename, index=False)

# Generate markdown report
md_content = f"""# vLLM Performance Test Report - {timestamp}

**Test Status:** {'✅ PARTIAL SUCCESS' if successful_requests else '❌ FAILED'}  
**Success Rate:** {len(successful_requests)/len(results)*100:.1f}%  
**Test Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Issues Detected

⚠️ **Endpoint returned 502 errors** - Backend service unavailable  
⚠️ **{len(failed_requests)} out of {len(results)} requests failed**

## Recommendations

1. **Check endpoint health** in Google Cloud Console
2. **Scale up resources** if endpoint is under-provisioned
3. **Implement retry logic** for production applications
4. **Monitor endpoint logs** for detailed error information
5. **Start with smaller load** to test stability

"""

if successful_requests:
    md_content += f"""
## Performance Results (Successful Requests Only)

| Metric | Value |
|--------|-------|
| TTFT (p95) | {ttft_p95:.3f}s |
| Inter-token (p95) | {inter_token_p95:.3f}s |
| End-to-End (p95) | {e2e_p95:.1f}s |
| Token Output Throughput | {token_output_throughput:.2f} tok/sec |
| Requests/sec | {requests_per_second:.2f} |
"""

md_content += f"""
## Error Summary

| Error Type | Count |
|------------|-------|
"""

error_types = defaultdict(int)
for req in failed_requests:
    error_msg = req.get('error', 'Unknown error')
    error_key = error_msg[:50] + "..." if len(error_msg) > 50 else error_msg
    error_types[error_key] += 1

for error, count in list(error_types.items())[:5]:
    md_content += f"| {error} | {count} |\n"

# Save markdown
with open(md_filename, 'w', encoding='utf-8') as f:
    f.write(md_content)

print(f"\n{'='*60}")
print(f"FILES SAVED")
print(f"{'='*60}")
print(f"📄 Summary: {csv_filename}")
print(f"📊 Details: {detailed_csv_filename}")
print(f"📝 Report: {md_filename}")

if len(successful_requests) == 0:
    print(f"\n🚨 CRITICAL: All requests failed. Check your endpoint!")
else:
    print(f"\n📊 Partial results saved. Success rate: {len(successful_requests)/len(results)*100:.1f}%")

Test started at: 2025-07-07 23:52:28
Output files will be saved as:
- Summary CSV: vllm_performance_tests/vllm_test_20250707_235228.csv
- Detailed CSV: vllm_performance_tests/vllm_detailed_20250707_235228.csv
- Report MD: vllm_performance_tests/vllm_report_20250707_235228.md
Testing endpoint with single request first...
Request 0 failed: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))...
❌ Single request test failed!
Error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

🛑 Endpoint appears to have issues. Consider:
1. Check if the endpoint is properly deployed and running
2. Verify the endpoint has sufficient resources
3. Test with smaller requests first
4. Check Google Cloud Console for endpoint logs



Press Enter to continue with load test anyway, or Ctrl+C to abort... 


Generating 500 test prompts...
Generated 500 test prompts
Sample prompt length: 63 words

Starting load test:
- Concurrent users: 250
- Total requests: 500
- Target output tokens: 317
Request 1 failed: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))...
Request 2 failed: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))...
Request 3 failed: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))...
Request 7 failed: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))...
Request 8 failed: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))...
Request 11 failed: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))...
Request 13 failed: HTTPSConnectionPool(host='1029620071644790784.europe-west4-87995179092.prediction.vertexai.goog', po...
Request 

In [None]:
## using chat complitions API (adds some inputs ) 

In [22]:
# endpoint_id = "1029620071644790784"
# LOCATION="europe-west4",

# client_options = {"api_endpoint": api_endpoint}

# client = aiplatform.gapic.PredictionServiceClient(
#   client_options=client_options
# )
    
# endpoint = client.endpoint_path(
#   project=PROJECT_ID, location=LOCATION, endpoint=endpoint_id
# )
# response = client.predict(
#   endpoint=endpoint, instances=instances, parameters=parameters
# )
# print("response")

### Multiple types of test

In [54]:
import time
import json
import numpy as np
import pandas as pd
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
import statistics
from datetime import datetime
import os
import argparse

# Test scenarios configuration
TEST_SCENARIOS = {
    'email_generation': {
        'name': 'Email Generation',
        'description': 'Short prompts generating long emails',
        'concurrent_users': 100,
        'total_requests': 2000,
        'input_token_length': 80,
        'output_tokens': 350,
        'temperature': 0.7,
        'max_tokens': 400,
        'raw_response': True  # Match your parameter structure
    },
    'summarization': {
        'name': 'Text Summarization',
        'description': 'Long text summarization to concise output',
        'concurrent_users': 150,
        'total_requests': 3000,
        'input_token_length': 500,
        'output_tokens': 80,
        'temperature': 0.3,
        'max_tokens': 120,
        'raw_response': True
    },
    'rewrite_small': {
        'name': 'Content Rewriting (Small)',
        'description': 'Small content rewriting and improvement',
        'concurrent_users': 200,
        'total_requests': 4000,
        'input_token_length': 250,
        'output_tokens': 250,
        'temperature': 0.5,
        'max_tokens': 300,
        'raw_response': True
    },
    'rewrite_large': {
        'name': 'Content Rewriting (Large)',
        'description': 'Large document rewriting and enhancement',
        'concurrent_users': 80,
        'total_requests': 1500,
        'input_token_length': 1000,
        'output_tokens': 1000,
        'temperature': 0.4,
        'max_tokens': 1200,
        'raw_response': True
    },
    'code_generation': {
        'name': 'Code Generation',
        'description': 'Code generation from specifications',
        'concurrent_users': 120,
        'total_requests': 2500,
        'input_token_length': 200,
        'output_tokens': 400,
        'temperature': 0.2,
        'max_tokens': 500,
        'raw_response': True
    },
    'conversation': {
        'name': 'Conversational AI',
        'description': 'Multi-turn conversation simulation',
        'concurrent_users': 180,
        'total_requests': 5000,
        'input_token_length': 150,
        'output_tokens': 200,
        'temperature': 0.8,
        'max_tokens': 250,
        'raw_response': True
    },
    'qa_long_context': {
        'name': 'Long Context Q&A',
        'description': 'Question answering with long context',
        'concurrent_users': 60,
        'total_requests': 1000,
        'input_token_length': 1500,
        'output_tokens': 300,
        'temperature': 0.3,
        'max_tokens': 400,
        'raw_response': True
    },
    'creative_writing': {
        'name': 'Creative Writing',
        'description': 'Story and creative content generation',
        'concurrent_users': 100,
        'total_requests': 2000,
        'input_token_length': 120,
        'output_tokens': 600,
        'temperature': 0.9,
        'max_tokens': 700,
        'raw_response': True
    }
}

# Prompt generators for different scenarios
def generate_email_prompt(target_tokens=80):
    """Generate email generation prompts"""
    prompts = [
        "Write a professional follow-up email to a client who hasn't responded to our proposal in 2 weeks. Include next steps and maintain a friendly tone.",
        "Create a welcome email for new employees joining our tech startup. Include company culture, first-day instructions, and contact information.",
        "Draft an apology email to customers about a service outage that lasted 3 hours. Explain what happened and our prevention measures.",
        "Write a promotional email for our new AI software product launch. Highlight key features and include a special discount offer.",
        "Create a meeting request email for quarterly business review with key stakeholders. Include agenda items and time options.",
        "Draft a thank you email to conference speakers and sponsors after successful event completion.",
        "Write a customer onboarding email explaining how to get started with our platform and available resources.",
        "Create a re-engagement email for inactive users offering special incentives to return to our service."
    ]
    return prompts[hash(str(target_tokens)) % len(prompts)]

def generate_summarization_prompt(target_tokens=500):
    """Generate summarization prompts with long content"""
    base_content = """
    In today's rapidly evolving business landscape, companies are increasingly turning to artificial intelligence and machine learning technologies to gain competitive advantages and streamline their operations. The implementation of AI systems has become a critical factor in determining organizational success across various industries, from healthcare and finance to retail and manufacturing.

    The adoption of AI technologies brings numerous benefits including improved efficiency, enhanced decision-making capabilities, cost reduction, and the ability to process vast amounts of data in real-time. However, organizations also face significant challenges when implementing these systems, including data privacy concerns, integration complexities, workforce adaptation requirements, and substantial initial investment costs.

    Recent studies indicate that companies successfully implementing AI solutions report an average increase in productivity of 40% and cost savings of up to 30% within the first two years of deployment. These improvements are primarily attributed to automation of repetitive tasks, enhanced predictive analytics, and improved customer service through chatbots and virtual assistants.

    The key to successful AI implementation lies in developing a comprehensive strategy that addresses technical requirements, organizational readiness, and change management processes. Companies must invest in employee training, establish clear governance frameworks, and ensure compliance with relevant regulations and ethical guidelines.

    Looking ahead, the future of AI in business appears promising, with emerging technologies such as generative AI, computer vision, and natural language processing offering new opportunities for innovation and growth. Organizations that proactively embrace these technologies while addressing associated challenges will be better positioned to thrive in the digital economy.

    The integration of AI systems also requires careful consideration of data quality, security measures, and ongoing maintenance requirements. Companies must establish robust data management practices, implement appropriate security protocols, and develop sustainable support structures to ensure long-term success of their AI initiatives.
    """
    
    # Extend content to reach target tokens
    extended_content = base_content
    while len(extended_content.split()) < target_tokens * 0.75:
        extended_content += " " + base_content
    
    return f"Please provide a concise summary of the following business analysis:\n\n{extended_content[:int(target_tokens*4)]}\n\nSummary:"

def generate_rewrite_prompt(target_tokens=250):
    """Generate content rewriting prompts"""
    if target_tokens <= 300:
        content = """
        Our company has been working on developing new software solutions for the past several years. We have created various applications that help businesses manage their operations more effectively. The software includes features for inventory management, customer relationship management, and financial reporting.

        The development team consists of experienced programmers who use modern programming languages and frameworks. We follow agile development methodologies to ensure quick delivery of high-quality software products. Our testing procedures include both automated and manual testing to identify and fix any issues before deployment.

        Customer feedback has been very positive, with many users praising the user-friendly interface and robust functionality. We continue to add new features based on user requests and market demands. Our support team provides excellent customer service to help users maximize the benefits of our software solutions.
        """
    else:
        content = """
        Our organization has been dedicated to the development and deployment of innovative software solutions for the past decade, establishing ourselves as a leader in the enterprise software market. We have successfully created and launched a comprehensive suite of applications designed to help businesses across various industries manage their complex operations more effectively and efficiently.

        The software portfolio includes advanced features for inventory management, customer relationship management, financial reporting, human resources management, project management, and business intelligence. Each application is built with scalability in mind, allowing businesses to grow without worrying about system limitations.

        Our development team consists of highly experienced software engineers, architects, and designers who utilize cutting-edge programming languages, frameworks, and development tools. We follow industry best practices including agile development methodologies, continuous integration, and deployment practices to ensure rapid delivery of high-quality software products that meet the evolving needs of our clients.

        Our comprehensive testing procedures include automated unit testing, integration testing, performance testing, security testing, and manual user acceptance testing to identify and resolve any issues before deployment. We maintain strict quality assurance standards throughout the development lifecycle.

        Customer feedback has been overwhelmingly positive, with many users praising the intuitive user interface, robust functionality, reliable performance, and comprehensive feature set. We continuously gather user feedback and market intelligence to guide our product development roadmap.

        We maintain a dedicated customer support team that provides exceptional service through multiple channels including phone, email, chat, and an extensive knowledge base. Our support team helps users maximize the benefits of our software solutions and ensures smooth implementation and adoption.
        """
    
    return f"Please rewrite and improve the following content to make it more professional, engaging, and comprehensive:\n\n{content}\n\nImproved version:"

def generate_code_prompt(target_tokens=200):
    """Generate code generation prompts"""
    prompts = [
        "Create a Python function that implements a binary search algorithm with error handling, type hints, and comprehensive documentation. Include unit tests.",
        "Write a JavaScript React component for a responsive navigation bar with dropdown menus, mobile hamburger menu, and smooth animations.",
        "Develop a SQL query to analyze customer purchase patterns including total spend, frequency, and product categories with performance optimization.",
        "Create a Python class for managing database connections with connection pooling, error handling, and transaction management.",
        "Write a REST API endpoint in Python Flask for user authentication with JWT tokens, rate limiting, and input validation.",
        "Implement a sorting algorithm visualization in JavaScript with HTML5 Canvas showing step-by-step execution and performance metrics.",
        "Create a data validation function in TypeScript for form inputs with custom error messages and real-time validation feedback.",
        "Write a Python script for automated testing of API endpoints with comprehensive test cases and detailed reporting."
    ]
    return prompts[hash(str(target_tokens)) % len(prompts)]

def generate_conversation_prompt(target_tokens=150):
    """Generate conversational prompts"""
    prompts = [
        "I'm planning a career change from marketing to data science. Can you help me understand the key skills I need to develop and create a learning roadmap?",
        "I'm having trouble with my team's productivity. We're missing deadlines and communication seems poor. What strategies would you recommend?",
        "I want to start a small business selling handmade crafts online. Can you guide me through the essential steps and considerations?",
        "I'm preparing for a job interview for a senior management position. What questions should I expect and how should I prepare?",
        "I need to improve my public speaking skills for upcoming presentations. Can you provide practical tips and practice exercises?",
        "I'm considering investing in renewable energy stocks. What factors should I consider and what are the current market trends?",
        "I want to learn a new programming language to advance my career. Which language would you recommend and why?",
        "I'm struggling with work-life balance as a remote worker. Can you suggest strategies to maintain productivity and well-being?"
    ]
    return prompts[hash(str(target_tokens)) % len(prompts)]

def generate_qa_long_context_prompt(target_tokens=1500):
    """Generate Q&A prompts with long context"""
    context = """
    The history of artificial intelligence dates back to ancient times, with myths and stories of artificial beings endowed with intelligence or consciousness by master craftsmen. The formal field of AI research was founded at a conference at Dartmouth College in 1956, where the term "artificial intelligence" was coined.

    Early AI research focused on problem-solving and symbolic methods. In the 1960s, the US Department of Defense took interest in this type of work and began training computers to mimic basic human reasoning. This early work paved the way for the automation and formal reasoning that we see in computers today.

    The field experienced several boom and bust cycles, known as "AI winters," when funding and interest waned due to overinflated expectations and limited practical applications. However, the field has experienced a renaissance since the 2000s, driven by advances in machine learning, particularly deep learning, and the availability of large datasets and powerful computing resources.

    Machine learning, a subset of AI, involves training algorithms on data to make predictions or decisions without being explicitly programmed for every scenario. Deep learning, a subset of machine learning, uses neural networks with multiple layers to model and understand complex patterns in data.

    The current wave of AI advancement is characterized by breakthrough applications in computer vision, natural language processing, speech recognition, and game playing. Notable achievements include IBM's Deep Blue defeating world chess champion Garry Kasparov in 1997, IBM's Watson winning at Jeopardy! in 2011, and Google's AlphaGo defeating the world champion Go player in 2016.

    Modern AI applications are ubiquitous in our daily lives, from recommendation systems on streaming platforms and e-commerce sites to virtual assistants like Siri and Alexa, autonomous vehicles, and medical diagnosis tools. The technology continues to advance rapidly, with new breakthroughs in generative AI, large language models, and multimodal AI systems.

    The development of AI raises important ethical and societal questions about privacy, job displacement, bias in algorithms, and the concentration of power among tech companies. There are ongoing debates about AI governance, safety, and the need for regulation to ensure AI benefits humanity while minimizing risks.

    Looking to the future, AI is expected to continue advancing rapidly, with potential developments in artificial general intelligence (AGI) that could match or exceed human intelligence across all domains. This prospect brings both tremendous opportunities and significant challenges that society must address.
    """
    
    # Extend context to reach target tokens
    extended_context = context
    while len(extended_context.split()) < target_tokens * 0.75:
        extended_context += " " + context
    
    questions = [
        "Based on the provided context, what were the key factors that led to the AI winters, and how did the field recover?",
        "Explain the relationship between artificial intelligence, machine learning, and deep learning as described in the context.",
        "What are the major ethical and societal concerns raised by AI development according to the passage?",
        "Describe the evolution of AI from its early days to modern applications, highlighting major milestones.",
        "What role did computing power and data availability play in the recent AI renaissance?"
    ]
    
    question = questions[hash(str(target_tokens)) % len(questions)]
    return f"Context:\n{extended_context[:int(target_tokens*4)]}\n\nQuestion: {question}\n\nAnswer:"

def generate_creative_writing_prompt(target_tokens=120):
    """Generate creative writing prompts"""
    prompts = [
        "Write a short story about a time traveler who discovers that changing the past has unexpected consequences in the present.",
        "Create a dramatic monologue from the perspective of the last tree in a deforested world speaking to humanity.",
        "Write a humorous story about a superhero whose power is the ability to make anyone laugh uncontrollably at inappropriate times.",
        "Compose a mystery story where the detective realizes they are actually the criminal they've been hunting.",
        "Write a science fiction story about first contact between humans and an alien species that communicates through colors.",
        "Create a fantasy tale about a young wizard who discovers their magic only works when they're telling the truth.",
        "Write a story about a librarian who discovers that books in their library can transport readers into the stories.",
        "Compose a thriller about a person who receives messages from their future self warning about upcoming dangers."
    ]
    return prompts[hash(str(target_tokens)) % len(prompts)]

# Prompt generator mapping
PROMPT_GENERATORS = {
    'email_generation': generate_email_prompt,
    'summarization': generate_summarization_prompt,
    'rewrite_small': generate_rewrite_prompt,
    'rewrite_large': generate_rewrite_prompt,
    'code_generation': generate_code_prompt,
    'conversation': generate_conversation_prompt,
    'qa_long_context': generate_qa_long_context_prompt,
    'creative_writing': generate_creative_writing_prompt
}

class TPUBenchmarkSuite:
    def __init__(self, endpoint, use_dedicated_endpoint=True):
        self.endpoint = endpoint  # Single endpoint object, not a dictionary
        self.use_dedicated_endpoint = use_dedicated_endpoint
        self.metrics = {}
        self.metrics_lock = threading.Lock()
        
    def reset_metrics(self):
        """Reset metrics for new test"""
        self.metrics = {
            'ttft_times': [],
            'inter_token_latencies': [],
            'end_to_end_times': [],
            'input_tokens': [],
            'output_tokens': [],
            'request_errors': [],
            'timestamps': []
        }
    
    def make_request(self, request_id, prompt, config):
        """Single request function with detailed timing"""
        start_time = time.time()
        
        try:
            # Prepare request exactly like your working code
            instances = [{
                "prompt": prompt,
                "max_tokens": config['max_tokens'],
                "temperature": config['temperature'],
                "raw_response": config.get('raw_response', True)  # Match your structure
            }]
            
            # Debug: Print first few requests to see what's being sent
            if request_id < 3:
                print(f"DEBUG Request {request_id}: {instances[0]}")
            
            # Record request start
            request_start = time.time()
            
            # Make prediction using your exact method
            response = self.endpoint.predict(
                instances=instances, 
                use_dedicated_endpoint=self.use_dedicated_endpoint
            )
            
            request_end = time.time()
            
            # Debug: Print first few responses
            if request_id < 3:
                print(f"DEBUG Response {request_id}: {response}")
                print(f"DEBUG Predictions: {response.predictions if hasattr(response, 'predictions') else 'No predictions attr'}")
            
            # Parse response like your code
            if hasattr(response, 'predictions') and response.predictions:
                # Handle both string and dict responses
                prediction = response.predictions[0]
                if isinstance(prediction, dict):
                    output_text = prediction.get('generated_text', '') or prediction.get('output', '') or str(prediction)
                else:
                    output_text = str(prediction)
            else:
                output_text = ""
                if request_id < 3:
                    print(f"DEBUG: No predictions in response for request {request_id}")
            
            # Calculate metrics
            end_to_end_time = request_end - request_start
            
            # Estimate tokens (rough approximation - you can adjust this)
            input_tokens = len(prompt.split()) * 1.3
            output_tokens = len(output_text.split()) * 1.3
            
            # Simulate TTFT and inter-token timing (estimates)
            estimated_ttft = min(0.5, end_to_end_time * 0.02)
            estimated_inter_token = (end_to_end_time - estimated_ttft) / max(1, output_tokens)
            
            # Store metrics
            with self.metrics_lock:
                self.metrics['ttft_times'].append(estimated_ttft)
                self.metrics['inter_token_latencies'].append(estimated_inter_token)
                self.metrics['end_to_end_times'].append(end_to_end_time)
                self.metrics['input_tokens'].append(input_tokens)
                self.metrics['output_tokens'].append(output_tokens)
                self.metrics['timestamps'].append(request_start)
            
            return {
                'request_id': request_id,
                'success': True,
                'timestamp': request_start,
                'end_to_end_time': end_to_end_time,
                'ttft': estimated_ttft,
                'inter_token_latency': estimated_inter_token,
                'input_tokens': input_tokens,
                'output_tokens': output_tokens,
                'output_length': len(output_text),
                'prompt_length': len(prompt),
                'output_text': output_text[:200] + "..." if len(output_text) > 200 else output_text,
                'raw_prediction': str(prediction)[:100] + "..." if len(str(prediction)) > 100 else str(prediction)
            }
            
        except Exception as e:
            error_time = time.time() - start_time
            
            # Debug: Print first few errors
            if request_id < 5:
                print(f"DEBUG Error {request_id}: {str(e)}")
                import traceback
                print(f"DEBUG Traceback: {traceback.format_exc()}")
            
            with self.metrics_lock:
                self.metrics['request_errors'].append({
                    'request_id': request_id,
                    'error': str(e),
                    'time': error_time
                })
            
            return {
                'request_id': request_id,
                'success': False,
                'timestamp': start_time,
                'error': str(e),
                'time': error_time,
                'end_to_end_time': error_time,
                'ttft': 0,
                'inter_token_latency': 0,
                'input_tokens': 0,
                'output_tokens': 0,
                'output_length': 0,
                'prompt_length': len(prompt),
                'output_text': "",
                'raw_prediction': f"Error: {str(e)}"
            }
    
    def run_scenario_test(self, scenario_name):
        """Run a specific test scenario"""
        if scenario_name not in TEST_SCENARIOS:
            raise ValueError(f"Unknown scenario: {scenario_name}")
        
        config = TEST_SCENARIOS[scenario_name]
        prompt_generator = PROMPT_GENERATORS[scenario_name]
        
        # Reset metrics
        self.reset_metrics()
        
        # Create timestamped filenames
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = f"tpu_benchmark_results_{scenario_name}"
        os.makedirs(output_dir, exist_ok=True)
        
        csv_filename = f"{output_dir}/{scenario_name}_summary_{timestamp}.csv"
        detailed_csv_filename = f"{output_dir}/{scenario_name}_detailed_{timestamp}.csv"
        md_filename = f"{output_dir}/{scenario_name}_report_{timestamp}.md"
        
        print(f"\n{'='*80}")
        print(f"🚀 STARTING TEST: {config['name']}")
        print(f"{'='*80}")
        print(f"Description: {config['description']}")
        print(f"Test started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"Output directory: {output_dir}")
        
        # Generate test prompts
        print(f"\n📝 Generating test prompts...")
        test_prompts = []
        for i in range(config['total_requests']):
            prompt = prompt_generator(config['input_token_length'])
            test_prompts.append(prompt)
        
        print(f"✅ Generated {len(test_prompts)} test prompts")
        print(f"📊 Sample prompt length: {len(test_prompts[0].split())} words")
        print(f"🎯 Target input tokens: {config['input_token_length']}")
        print(f"🎯 Target output tokens: {config['output_tokens']}")
        
        # Run load test
        print(f"\n🔥 Starting load test:")
        print(f"   - Concurrent users: {config['concurrent_users']}")
        print(f"   - Total requests: {config['total_requests']}")
        print(f"   - Temperature: {config['temperature']}")
        print(f"   - Max tokens: {config['max_tokens']}")
        
        # Execute test
        test_start_time = time.time()
        results = []
        
        with ThreadPoolExecutor(max_workers=config['concurrent_users']) as executor:
            # Submit all requests
            future_to_id = {
                executor.submit(self.make_request, i, test_prompts[i % len(test_prompts)], config): i 
                for i in range(config['total_requests'])
            }
            
            # Collect results with progress tracking
            completed = 0
            for future in as_completed(future_to_id):
                request_id = future_to_id[future]
                try:
                    result = future.result()
                    results.append(result)
                except Exception as e:
                    results.append({
                        'request_id': request_id,
                        'success': False,
                        'timestamp': time.time(),
                        'error': str(e),
                        'end_to_end_time': 0,
                        'ttft': 0,
                        'inter_token_latency': 0,
                        'input_tokens': 0,
                        'output_tokens': 0,
                        'output_length': 0,
                        'prompt_length': 0,
                        'output_text': ""
                    })
                
                completed += 1
                if completed % 100 == 0:
                    print(f"   ⚡ Completed {completed}/{config['total_requests']} requests...")
        
        test_end_time = time.time()
        total_test_time = test_end_time - test_start_time
        
        # Calculate and display results
        self.calculate_and_display_results(results, total_test_time, config, scenario_name)
        
        # Save results
        self.save_results(results, config, scenario_name, timestamp, total_test_time,
                         csv_filename, detailed_csv_filename, md_filename)
        
        print(f"\n✅ Test completed successfully!")
        print(f"📁 Files saved in: {output_dir}")
        
        return results
    
    def calculate_and_display_results(self, results, total_test_time, config, scenario_name):
        """Calculate and display test results"""
        successful_requests = [r for r in results if r.get('success', False)]
        failed_requests = [r for r in results if not r.get('success', False)]
        
        print(f"\n{'='*80}")
        print(f"📊 TEST RESULTS: {config['name']}")
        print(f"{'='*80}")
        
        print(f"\n📈 Test Summary:")
        print(f"   - Total requests: {len(results)}")
        print(f"   - Successful requests: {len(successful_requests)}")
        print(f"   - Failed requests: {len(failed_requests)}")
        print(f"   - Success rate: {len(successful_requests)/len(results)*100:.1f}%")
        print(f"   - Total test time: {total_test_time:.1f} seconds")
        
        if successful_requests:
            # Calculate metrics
            ttft_times = [r['ttft'] for r in successful_requests]
            inter_token_times = [r['inter_token_latency'] for r in successful_requests]
            e2e_times = [r['end_to_end_time'] for r in successful_requests]
            input_tokens = [r['input_tokens'] for r in successful_requests]
            output_tokens = [r['output_tokens'] for r in successful_requests]
            
            def percentile(data, p):
                return np.percentile(data, p)
            
            # Latency metrics
            ttft_p50 = percentile(ttft_times, 50)
            ttft_p95 = percentile(ttft_times, 95)
            ttft_p99 = percentile(ttft_times, 99)
            inter_token_p50 = percentile(inter_token_times, 50)
            inter_token_p95 = percentile(inter_token_times, 95)
            e2e_p50 = percentile(e2e_times, 50)
            e2e_p95 = percentile(e2e_times, 95)
            e2e_p99 = percentile(e2e_times, 99)
            
            # Throughput calculations
            total_output_tokens = sum(output_tokens)
            total_input_tokens = sum(input_tokens)
            total_tokens = total_output_tokens + total_input_tokens
            
            token_output_throughput = total_output_tokens / total_test_time
            overall_token_throughput = total_tokens / total_test_time
            requests_per_second = len(successful_requests) / total_test_time
            
            print(f"\n⚡ Latency Metrics:")
            print(f"   - TTFT (p50): {ttft_p50:.3f}s")
            print(f"   - TTFT (p95): {ttft_p95:.3f}s")
            print(f"   - TTFT (p99): {ttft_p99:.3f}s")
            print(f"   - Inter-token (p50): {inter_token_p50:.3f}s")
            print(f"   - Inter-token (p95): {inter_token_p95:.3f}s")
            print(f"   - End-to-End (p50): {e2e_p50:.1f}s")
            print(f"   - End-to-End (p95): {e2e_p95:.1f}s")
            print(f"   - End-to-End (p99): {e2e_p99:.1f}s")
            
            print(f"\n🚀 Throughput Metrics:")
            print(f"   - Token Output Throughput: {token_output_throughput:.2f} tok/sec")
            print(f"   - Overall Token Throughput: {overall_token_throughput:.2f} tok/sec")
            print(f"   - Requests per second: {requests_per_second:.2f} req/sec")
            
            print(f"\n📊 Token Statistics:")
            print(f"   - Average input tokens: {statistics.mean(input_tokens):.1f}")
            print(f"   - Average output tokens: {statistics.mean(output_tokens):.1f}")
            print(f"   - Total input tokens: {int(total_input_tokens):,}")
            print(f"   - Total output tokens: {int(total_output_tokens):,}")
    
    def save_results(self, results, config, scenario_name, timestamp, total_test_time,
                    csv_filename, detailed_csv_filename, md_filename):
        """Save test results to files"""
        successful_requests = [r for r in results if r.get('success', False)]
        failed_requests = [r for r in results if not r.get('success', False)]
        
        # Save detailed results
        results_df = pd.DataFrame(results)
        results_df.to_csv(detailed_csv_filename, index=False)
        
        # Create summary data
        summary_data = {
            'scenario': [scenario_name],
            'timestamp': [timestamp],
            'test_duration_seconds': [total_test_time],
            'total_requests': [len(results)],
            'successful_requests': [len(successful_requests)],
            'failed_requests': [len(failed_requests)],
            'success_rate_percent': [len(successful_requests)/len(results)*100],
            'concurrent_users': [config['concurrent_users']],
            'target_input_tokens': [config['input_token_length']],
            'target_output_tokens': [config['output_tokens']],
            'temperature': [config['temperature']],
            'max_tokens': [config['max_tokens']]
        }
        
        if successful_requests:
            # Calculate metrics for summary
            ttft_times = [r['ttft'] for r in successful_requests]
            inter_token_times = [r['inter_token_latency'] for r in successful_requests]
            e2e_times = [r['end_to_end_time'] for r in successful_requests]
            input_tokens = [r['input_tokens'] for r in successful_requests]
            output_tokens = [r['output_tokens'] for r in successful_requests]
            
            summary_data.update({
                'ttft_p50_seconds': [np.percentile(ttft_times, 50)],
                'ttft_p95_seconds': [np.percentile(ttft_times, 95)],
                'ttft_p99_seconds': [np.percentile(ttft_times, 99)],
                'inter_token_p50_seconds': [np.percentile(inter_token_times, 50)],
                'inter_token_p95_seconds': [np.percentile(inter_token_times, 95)],
                'e2e_p50_seconds': [np.percentile(e2e_times, 50)],
                'e2e_p95_seconds': [np.percentile(e2e_times, 95)],
                'e2e_p99_seconds': [np.percentile(e2e_times, 99)],
                'token_output_throughput': [sum(output_tokens) / total_test_time],
                'overall_token_throughput': [(sum(output_tokens) + sum(input_tokens)) / total_test_time],
                'requests_per_second': [len(successful_requests) / total_test_time],
                'avg_input_tokens': [statistics.mean(input_tokens)],
                'avg_output_tokens': [statistics.mean(output_tokens)],
                'total_input_tokens': [sum(input_tokens)],
                'total_output_tokens': [sum(output_tokens)]
            })
        else:
            # Fill with zeros if no successful requests
            for key in ['ttft_p50_seconds', 'ttft_p95_seconds', 'ttft_p99_seconds', 
                        'inter_token_p50_seconds', 'inter_token_p95_seconds',
                        'e2e_p50_seconds', 'e2e_p95_seconds', 'e2e_p99_seconds',
                        'token_output_throughput', 'overall_token_throughput', 
                        'requests_per_second', 'avg_input_tokens', 'avg_output_tokens',
                        'total_input_tokens', 'total_output_tokens']:
                summary_data[key] = [0]
        
        # Save summary CSV
        summary_df = pd.DataFrame(summary_data)
        summary_df.to_csv(csv_filename, index=False)
        
        # Generate markdown report
        self.generate_markdown_report(results, config, scenario_name, timestamp, 
                                    total_test_time, md_filename)
    
    def generate_markdown_report(self, results, config, scenario_name, timestamp, 
                               total_test_time, md_filename):
        """Generate detailed markdown report"""
        successful_requests = [r for r in results if r.get('success', False)]
        failed_requests = [r for r in results if not r.get('success', False)]
        
        md_content = f"""# TPU Benchmark Report: {config['name']}

**Test Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}  
**Scenario:** {scenario_name}  
**Test Duration:** {total_test_time:.1f} seconds  
**Timestamp:** {timestamp}

## 📋 Test Scenario Description

{config['description']}

## ⚙️ Test Configuration

| Parameter | Value |
|-----------|-------|
| Scenario | {config['name']} |
| Concurrent Users | {config['concurrent_users']} |
| Total Requests | {config['total_requests']} |
| Target Input Tokens | {config['input_token_length']} |
| Target Output Tokens | {config['output_tokens']} |
| Temperature | {config['temperature']} |
| Max Tokens | {config['max_tokens']} |
| Raw Response | {config['raw_response']} |

## 📊 Test Results Summary

| Metric | Value |
|--------|-------|
| Total Requests | {len(results)} |
| Successful Requests | {len(successful_requests)} |
| Failed Requests | {len(failed_requests)} |
| Success Rate | {len(successful_requests)/len(results)*100:.1f}% |
| Test Duration | {total_test_time:.1f} seconds |

"""

        if successful_requests:
            # Calculate detailed metrics
            ttft_times = [r['ttft'] for r in successful_requests]
            inter_token_times = [r['inter_token_latency'] for r in successful_requests]
            e2e_times = [r['end_to_end_time'] for r in successful_requests]
            input_tokens = [r['input_tokens'] for r in successful_requests]
            output_tokens = [r['output_tokens'] for r in successful_requests]
            
            ttft_p50 = np.percentile(ttft_times, 50)
            ttft_p95 = np.percentile(ttft_times, 95)
            ttft_p99 = np.percentile(ttft_times, 99)
            inter_token_p50 = np.percentile(inter_token_times, 50)
            inter_token_p95 = np.percentile(inter_token_times, 95)
            e2e_p50 = np.percentile(e2e_times, 50)
            e2e_p95 = np.percentile(e2e_times, 95)
            e2e_p99 = np.percentile(e2e_times, 99)
            
            total_output_tokens = sum(output_tokens)
            total_input_tokens = sum(input_tokens)
            token_output_throughput = total_output_tokens / total_test_time
            overall_token_throughput = (total_output_tokens + total_input_tokens) / total_test_time
            requests_per_second = len(successful_requests) / total_test_time
            
            md_content += f"""
## ⚡ Latency Metrics

| Metric | p50 | p95 | p99 |
|--------|-----|-----|-----|
| Time to First Token (TTFT) | {ttft_p50:.3f}s | {ttft_p95:.3f}s | {ttft_p99:.3f}s |
| Inter-token Latency | {inter_token_p50:.3f}s | {inter_token_p95:.3f}s | - |
| End-to-End Latency | {e2e_p50:.1f}s | {e2e_p95:.1f}s | {e2e_p99:.1f}s |

## 🚀 Throughput Metrics

| Metric | Value |
|--------|-------|
| Token Output Throughput | {token_output_throughput:.2f} tok/sec |
| Overall Token Throughput | {overall_token_throughput:.2f} tok/sec |
| Requests per Second | {requests_per_second:.2f} req/sec |

## 📊 Token Statistics

| Metric | Value |
|--------|-------|
| Average Input Tokens | {statistics.mean(input_tokens):.1f} |
| Average Output Tokens | {statistics.mean(output_tokens):.1f} |
| Total Input Tokens | {int(total_input_tokens):,} |
| Total Output Tokens | {int(total_output_tokens):,} |
| Input/Output Ratio | {statistics.mean(input_tokens)/statistics.mean(output_tokens):.2f} |

## 🎯 Target vs Actual Comparison

| Metric | Target | Actual | Difference |
|--------|--------|--------|------------|
| Input Token Length | {config['input_token_length']} | {statistics.mean(input_tokens):.1f} | {((statistics.mean(input_tokens) - config['input_token_length']) / config['input_token_length'] * 100):+.1f}% |
| Output Tokens | {config['output_tokens']} | {statistics.mean(output_tokens):.1f} | {((statistics.mean(output_tokens) - config['output_tokens']) / config['output_tokens'] * 100):+.1f}% |

## 📈 Performance Analysis

"""
            
            # Performance analysis based on scenario
            if scenario_name == 'email_generation':
                md_content += f"""
### Email Generation Performance Analysis

This scenario tests the model's ability to generate long-form emails from short prompts, simulating common business use cases.

**Key Observations:**
- **Efficiency Ratio:** {statistics.mean(output_tokens)/statistics.mean(input_tokens):.1f}:1 (output:input tokens)
- **Content Generation Speed:** {token_output_throughput:.1f} tokens/sec for email content
- **Practical Throughput:** Can generate ~{requests_per_second*60:.0f} emails per minute

**Use Case Suitability:** {"✅ Excellent" if token_output_throughput > 50 else "⚠️ Moderate" if token_output_throughput > 20 else "❌ Poor"} for real-time email generation
"""
            
            elif scenario_name == 'summarization':
                md_content += f"""
### Summarization Performance Analysis

This scenario tests the model's ability to condense large amounts of text into concise summaries.

**Key Observations:**
- **Compression Ratio:** {statistics.mean(input_tokens)/statistics.mean(output_tokens):.1f}:1 (input:output tokens)
- **Processing Speed:** {overall_token_throughput:.1f} total tokens/sec
- **Summarization Efficiency:** {requests_per_second:.1f} documents/sec

**Use Case Suitability:** {"✅ Excellent" if requests_per_second > 2 else "⚠️ Moderate" if requests_per_second > 1 else "❌ Poor"} for batch document processing
"""
            
            elif scenario_name == 'rewrite_small':
                md_content += f"""
### Small Content Rewriting Performance Analysis

This scenario tests balanced input/output rewriting for content improvement.

**Key Observations:**
- **Balanced Processing:** {statistics.mean(output_tokens)/statistics.mean(input_tokens):.2f}:1 ratio
- **Rewriting Speed:** {token_output_throughput:.1f} tokens/sec output generation
- **Content Throughput:** {requests_per_second:.1f} documents/sec

**Use Case Suitability:** {"✅ Excellent" if requests_per_second > 1.5 else "⚠️ Moderate" if requests_per_second > 0.8 else "❌ Poor"} for content editing workflows
"""
            
            elif scenario_name == 'rewrite_large':
                md_content += f"""
### Large Content Rewriting Performance Analysis

This scenario tests the model's ability to handle substantial document rewriting tasks.

**Key Observations:**
- **Large Document Handling:** {statistics.mean(input_tokens):.0f} avg input tokens processed
- **Comprehensive Rewriting:** {statistics.mean(output_tokens):.0f} avg output tokens generated
- **Processing Efficiency:** {overall_token_throughput:.1f} total tokens/sec

**Use Case Suitability:** {"✅ Excellent" if overall_token_throughput > 800 else "⚠️ Moderate" if overall_token_throughput > 400 else "❌ Poor"} for enterprise document processing
"""
            
            elif scenario_name == 'code_generation':
                md_content += f"""
### Code Generation Performance Analysis

This scenario tests programming assistance and code generation capabilities.

**Key Observations:**
- **Code Generation Ratio:** {statistics.mean(output_tokens)/statistics.mean(input_tokens):.1f}:1
- **Code Output Speed:** {token_output_throughput:.1f} tokens/sec
- **Developer Assistance Rate:** {requests_per_second:.1f} code requests/sec

**Use Case Suitability:** {"✅ Excellent" if token_output_throughput > 40 else "⚠️ Moderate" if token_output_throughput > 20 else "❌ Poor"} for IDE integration
"""
            
            elif scenario_name == 'conversation':
                md_content += f"""
### Conversational AI Performance Analysis

This scenario tests interactive conversation capabilities for chatbot applications.

**Key Observations:**
- **Response Generation:** {statistics.mean(output_tokens):.0f} avg tokens per response
- **Conversation Speed:** {token_output_throughput:.1f} tokens/sec
- **User Interaction Rate:** {requests_per_second:.1f} conversations/sec

**Use Case Suitability:** {"✅ Excellent" if requests_per_second > 3 else "⚠️ Moderate" if requests_per_second > 1.5 else "❌ Poor"} for real-time chat applications
"""
            
            elif scenario_name == 'qa_long_context':
                md_content += f"""
### Long Context Q&A Performance Analysis

This scenario tests question answering with extensive context processing.

**Key Observations:**
- **Context Processing:** {statistics.mean(input_tokens):.0f} avg context tokens
- **Answer Generation:** {statistics.mean(output_tokens):.0f} avg answer tokens
- **Knowledge Processing:** {overall_token_throughput:.1f} total tokens/sec

**Use Case Suitability:** {"✅ Excellent" if overall_token_throughput > 600 else "⚠️ Moderate" if overall_token_throughput > 300 else "❌ Poor"} for knowledge base applications
"""
            
            elif scenario_name == 'creative_writing':
                md_content += f"""
### Creative Writing Performance Analysis

This scenario tests creative content generation for storytelling and narrative creation.

**Key Observations:**
- **Creative Expansion:** {statistics.mean(output_tokens)/statistics.mean(input_tokens):.1f}:1 expansion ratio
- **Story Generation Speed:** {token_output_throughput:.1f} tokens/sec
- **Creative Throughput:** {requests_per_second:.1f} stories/sec

**Use Case Suitability:** {"✅ Excellent" if token_output_throughput > 60 else "⚠️ Moderate" if token_output_throughput > 30 else "❌ Poor"} for content creation platforms
"""

        # Error analysis
        if failed_requests:
            md_content += f"""
## ❌ Error Analysis

**Total Failed Requests:** {len(failed_requests)}

"""
            error_types = defaultdict(int)
            for req in failed_requests:
                error_msg = req.get('error', 'Unknown error')
                error_types[error_msg] += 1
            
            md_content += "| Error Type | Count | Percentage |\n|------------|-------|------------|\n"
            for error, count in error_types.items():
                percentage = count / len(results) * 100
                md_content += f"| {error} | {count} | {percentage:.1f}% |\n"

        md_content += f"""

## 🔧 Technical Environment

- **Model Hosting:** TPU-based vLLM deployment
- **Test Framework:** Multi-scenario benchmarking suite
- **Concurrency Model:** ThreadPoolExecutor
- **Metrics Collection:** Real-time latency and throughput tracking

## 📁 Generated Files

- **Summary CSV:** `{os.path.basename(csv_filename)}`
- **Detailed CSV:** `{os.path.basename(detailed_csv_filename)}`
- **This Report:** `{os.path.basename(md_filename)}`

---
*Report generated automatically by TPU Benchmark Suite v2.0*
*Test scenario: {scenario_name} | Timestamp: {timestamp}*
"""

        # Save markdown report
        with open(md_filename, 'w', encoding='utf-8') as f:
            f.write(md_content)

def run_all_scenarios(endpoint, use_dedicated_endpoint=True, scenarios=None):
    """Run all test scenarios or specified scenarios"""
    if scenarios is None:
        scenarios = list(TEST_SCENARIOS.keys())
    
    suite = TPUBenchmarkSuite(endpoint, use_dedicated_endpoint)
    all_results = {}
    
    print(f"\n🚀 STARTING COMPREHENSIVE TPU BENCHMARK SUITE")
    print(f"{'='*80}")
    print(f"Scenarios to run: {', '.join(scenarios)}")
    print(f"Total scenarios: {len(scenarios)}")
    
    for i, scenario in enumerate(scenarios, 1):
        print(f"\n🔄 Running scenario {i}/{len(scenarios)}: {scenario}")
        try:
            results = suite.run_scenario_test(scenario)
            all_results[scenario] = results
            print(f"✅ Scenario {scenario} completed successfully")
        except Exception as e:
            print(f"❌ Scenario {scenario} failed: {e}")
            all_results[scenario] = None
    
    # Generate comparative report
    generate_comparative_report(all_results)
    
    print(f"\n🎉 ALL BENCHMARKS COMPLETED!")
    print(f"{'='*80}")
    
    return all_results

def generate_comparative_report(all_results):
    """Generate a comparative report across all scenarios"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = "tpu_benchmark_comparative"
    os.makedirs(output_dir, exist_ok=True)
    
    comparative_csv = f"{output_dir}/comparative_summary_{timestamp}.csv"
    comparative_md = f"{output_dir}/comparative_report_{timestamp}.md"
    
    # Collect summary data from all scenarios
    comparative_data = []
    
    for scenario_name, results in all_results.items():
        if results is None:
            continue
            
        successful_requests = [r for r in results if r.get('success', False)]
        if not successful_requests:
            continue
            
        config = TEST_SCENARIOS[scenario_name]
        
        # Calculate metrics
        ttft_times = [r['ttft'] for r in successful_requests]
        inter_token_times = [r['inter_token_latency'] for r in successful_requests]
        e2e_times = [r['end_to_end_time'] for r in successful_requests]
        input_tokens = [r['input_tokens'] for r in successful_requests]
        output_tokens = [r['output_tokens'] for r in successful_requests]
        
        total_test_time = max([r['timestamp'] for r in results]) - min([r['timestamp'] for r in results])
        if total_test_time == 0:
            total_test_time = 1  # Avoid division by zero
        
        comparative_data.append({
            'scenario': scenario_name,
            'scenario_name': config['name'],
            'description': config['description'],
            'concurrent_users': config['concurrent_users'],
            'total_requests': len(results),
            'successful_requests': len(successful_requests),
            'success_rate': len(successful_requests) / len(results) * 100,
            'ttft_p95': np.percentile(ttft_times, 95),
            'inter_token_p95': np.percentile(inter_token_times, 95),
            'e2e_p95': np.percentile(e2e_times, 95),
            'token_output_throughput': sum(output_tokens) / total_test_time,
            'overall_token_throughput': (sum(output_tokens) + sum(input_tokens)) / total_test_time,
            'requests_per_second': len(successful_requests) / total_test_time,
            'avg_input_tokens': statistics.mean(input_tokens),
            'avg_output_tokens': statistics.mean(output_tokens),
            'target_input_tokens': config['input_token_length'],
            'target_output_tokens': config['output_tokens']
        })
    
    # Save comparative CSV
    if comparative_data:
        comp_df = pd.DataFrame(comparative_data)
        comp_df.to_csv(comparative_csv, index=False)
        
        # Generate comparative markdown report
        md_content = f"""# TPU Benchmark Comparative Report

**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}  
**Scenarios Tested:** {len(comparative_data)}  
**Timestamp:** {timestamp}

## 🎯 Executive Summary

This report compares performance across different use case scenarios for TPU-hosted vLLM deployment.

## 📊 Scenario Comparison Overview

| Scenario | Use Case | Success Rate | Throughput (tok/sec) | TTFT p95 (s) | Requests/sec |
|----------|----------|--------------|---------------------|---------------|--------------|
"""
        
        for data in comparative_data:
            md_content += f"| {data['scenario_name']} | {data['description'][:30]}... | {data['success_rate']:.1f}% | {data['token_output_throughput']:.1f} | {data['ttft_p95']:.3f} | {data['requests_per_second']:.1f} |\n"
        
        md_content += f"""

## 🏆 Performance Rankings

### Best Token Output Throughput
"""
        sorted_by_throughput = sorted(comparative_data, key=lambda x: x['token_output_throughput'], reverse=True)
        for i, data in enumerate(sorted_by_throughput[:3], 1):
            md_content += f"{i}. **{data['scenario_name']}**: {data['token_output_throughput']:.1f} tok/sec\n"
        
        md_content += f"""

### Lowest Latency (TTFT p95)
"""
        sorted_by_ttft = sorted(comparative_data, key=lambda x: x['ttft_p95'])
        for i, data in enumerate(sorted_by_ttft[:3], 1):
            md_content += f"{i}. **{data['scenario_name']}**: {data['ttft_p95']:.3f}s\n"
        
        md_content += f"""

### Highest Request Rate
"""
        sorted_by_rps = sorted(comparative_data, key=lambda x: x['requests_per_second'], reverse=True)
        for i, data in enumerate(sorted_by_rps[:3], 1):
            md_content += f"{i}. **{data['scenario_name']}**: {data['requests_per_second']:.1f} req/sec\n"
        
        md_content += f"""

## 📈 Detailed Analysis by Use Case

"""
        
        for data in comparative_data:
            efficiency_ratio = data['avg_output_tokens'] / data['avg_input_tokens']
            md_content += f"""
### {data['scenario_name']}

**Description:** {data['description']}

| Metric | Value |
|--------|-------|
| Success Rate | {data['success_rate']:.1f}% |
| Token Output Throughput | {data['token_output_throughput']:.1f} tok/sec |
| Overall Token Throughput | {data['overall_token_throughput']:.1f} tok/sec |
| Requests per Second | {data['requests_per_second']:.1f} req/sec |
| TTFT p95 | {data['ttft_p95']:.3f}s |
| Inter-token Latency p95 | {data['inter_token_p95']:.3f}s |
| End-to-End p95 | {data['e2e_p95']:.1f}s |
| Average Input Tokens | {data['avg_input_tokens']:.0f} |
| Average Output Tokens | {data['avg_output_tokens']:.0f} |
| Efficiency Ratio | {efficiency_ratio:.2f}:1 |

"""
        
        md_content += f"""
## 🎯 Recommendations

### Production Deployment Suitability

"""
        
        for data in comparative_data:
            suitability = "🟢 Excellent" if data['success_rate'] > 95 and data['token_output_throughput'] > 30 else "🟡 Good" if data['success_rate'] > 90 and data['token_output_throughput'] > 15 else "🔴 Needs Optimization"
            md_content += f"- **{data['scenario_name']}**: {suitability}\n"
        
        md_content += f"""

### Optimization Priorities

1. **Latency Optimization**: Focus on scenarios with TTFT p95 > 1.0s
2. **Throughput Scaling**: Improve scenarios with < 20 tok/sec output throughput
3. **Reliability**: Address scenarios with < 95% success rate

## 📁 Files Generated

- **Comparative CSV**: `{os.path.basename(comparative_csv)}`
- **This Report**: `{os.path.basename(comparative_md)}`

---
*Comparative analysis generated by TPU Benchmark Suite v2.0*
"""
        
        with open(comparative_md, 'w', encoding='utf-8') as f:
            f.write(md_content)
        
        print(f"\n📊 Comparative report generated:")
        print(f"   - CSV: {comparative_csv}")
        print(f"   - Report: {comparative_md}")

def list_available_scenarios():
    """List all available test scenarios"""
    print("\n🎯 Available Test Scenarios:")
    print("="*60)
    for key, config in TEST_SCENARIOS.items():
        print(f"📌 {key}: {config['name']}")
        print(f"   Description: {config['description']}")
        print(f"   Input: {config['input_token_length']} tokens → Output: {config['output_tokens']} tokens")
        print(f"   Concurrency: {config['concurrent_users']} users, Requests: {config['total_requests']}")
        print()

def test_single_request(endpoint, use_dedicated_endpoint):
    """Test a single request to verify endpoint compatibility"""
    print("🔍 Testing single request with your endpoint...")
    
    # Test with a simple prompt like your working example
    test_prompt = "What is a car that can run on the wall?"
    
    instances = [{
        "prompt": test_prompt,
        "max_tokens": 50,
        "temperature": 1.0,
        "raw_response": True
    }]
    
    try:
        print(f"📤 Sending request: {instances[0]}")
        
        response = endpoint.predict(
            instances=instances, 
            use_dedicated_endpoint=use_dedicated_endpoint
        )
        
        print(f"📥 Response type: {type(response)}")
        print(f"📥 Response attributes: {dir(response)}")
        
        if hasattr(response, 'predictions'):
            print(f"📥 Predictions: {response.predictions}")
            if response.predictions:
                prediction = response.predictions[0]
                print(f"📥 First prediction type: {type(prediction)}")
                print(f"📥 First prediction: {prediction}")
                return True
        else:
            print("❌ No 'predictions' attribute found")
            return False
            
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        print(f"❌ Traceback: {traceback.format_exc()}")
        return False

def run_benchmark_example():
    """Example function showing how to use the benchmark suite"""
    print("🚀 TPU vLLM Multi-Scenario Benchmark Suite")
    print("="*60)
    print("\n📋 To use this benchmark suite, follow these steps:")
    print("\n1. First, test your endpoint compatibility:")
    print("   test_single_request(endpoint, use_dedicated_endpoint)")
    print("\n2. If that works, initialize the benchmark suite:")
    print("   suite = TPUBenchmarkSuite(endpoint, use_dedicated_endpoint)")
    print("\n3. Run individual scenarios:")
    print("   results = suite.run_scenario_test('email_generation')")
    print("   results = suite.run_scenario_test('summarization')")
    print("   results = suite.run_scenario_test('rewrite_small')")
    print("   results = suite.run_scenario_test('rewrite_large')")
    print("\n4. Or run all scenarios at once:")
    print("   all_results = run_all_scenarios(endpoint, use_dedicated_endpoint)")
    print("\n5. Or run specific scenarios:")
    print("   selected = run_all_scenarios(endpoint, use_dedicated_endpoint,")
    print("                              scenarios=['email_generation', 'summarization'])")
    print("\n💡 Available scenarios:")
    list_available_scenarios()

# Main execution - notebook friendly
def main():
    """Main function - notebook friendly version"""
    try:
        # Check if we're in a notebook environment
        get_ipython()
        # If we're in a notebook, just show the example
        run_benchmark_example()
    except NameError:
        # We're in a regular Python script, use argparse
        import sys
        parser = argparse.ArgumentParser(description='TPU vLLM Multi-Scenario Benchmark Suite')
        parser.add_argument('--scenarios', nargs='+', choices=list(TEST_SCENARIOS.keys()) + ['all'], 
                           default=['all'], help='Scenarios to run')
        parser.add_argument('--list-scenarios', action='store_true', help='List available scenarios')
        
        args = parser.parse_args()
        
        if args.list_scenarios:
            list_available_scenarios()
            return
        
        run_benchmark_example()

# Auto-run example when imported in notebook
try:
    get_ipython()
    print("📚 TPU Benchmark Suite loaded successfully!")
    print("💡 Run list_available_scenarios() to see all test scenarios")
    print("🚀 Run run_benchmark_example() to see usage instructions")
except NameError:
    # Not in notebook
    if __name__ == "__main__":
        main()

📚 TPU Benchmark Suite loaded successfully!
💡 Run list_available_scenarios() to see all test scenarios
🚀 Run run_benchmark_example() to see usage instructions


In [55]:
# Your existing working setup
endpoint_name = "1029620071644790784"
aip_endpoint_name = f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
endpoint = aiplatform.Endpoint(aip_endpoint_name)
# use_dedicated_endpoint is already defined in your environment

# Test the endpoint first
test_single_request(endpoint, use_dedicated_endpoint)

🔍 Testing single request with your endpoint...
📤 Sending request: {'prompt': 'What is a car that can run on the wall?', 'max_tokens': 50, 'temperature': 1.0, 'raw_response': True}
📥 Response type: <class 'google.cloud.aiplatform.models.Prediction'>
📥 Response attributes: ['__add__', '__annotations__', '__class__', '__class_getitem__', '__contains__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__match_args__', '__module__', '__mul__', '__ne__', '__new__', '__orig_bases__', '__reduce__', '__reduce_ex__', '__repr__', '__rmul__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '_asdict', '_field_defaults', '_fields', '_make', '_replace', 'count', 'deployed_model_id', 'explanations', 'index', 'metadata', 'model_resource_name', 'model_version_id', 'predictions']
📥 Predictions: [' Car Park\n

True

In [56]:
# Initialize the benchmark suite with your endpoint
suite = TPUBenchmarkSuite(endpoint, use_dedicated_endpoint)

# Test your specific use cases
email_results = suite.run_scenario_test('email_generation')      # 80→350 tokens
summary_results = suite.run_scenario_test('summarization')       # 500→80 tokens  
small_rewrite = suite.run_scenario_test('rewrite_small')         # 250→250 tokens
large_rewrite = suite.run_scenario_test('rewrite_large')         # 1000→1000 tokens


🚀 STARTING TEST: Email Generation
Description: Short prompts generating long emails
Test started at: 2025-07-08 01:34:40
Output directory: tpu_benchmark_results_email_generation

📝 Generating test prompts...
✅ Generated 2000 test prompts
📊 Sample prompt length: 15 words
🎯 Target input tokens: 80
🎯 Target output tokens: 350

🔥 Starting load test:
   - Concurrent users: 100
   - Total requests: 2000
   - Temperature: 0.7
   - Max tokens: 400
DEBUG Request 0: {'prompt': 'Create a re-engagement email for inactive users offering special incentives to return to our service.', 'max_tokens': 400, 'temperature': 0.7, 'raw_response': True}
DEBUG Request 1: {'prompt': 'Create a re-engagement email for inactive users offering special incentives to return to our service.', 'max_tokens': 400, 'temperature': 0.7, 'raw_response': True}
DEBUG Request 2: {'prompt': 'Create a re-engagement email for inactive users offering special incentives to return to our service.', 'max_tokens': 400, 'temperature': 

#### Run all the scenarios

In [None]:
my_scenarios = ['email_generation', 'summarization', 'rewrite_small', 'rewrite_large']
all_results = run_all_scenarios(endpoint, use_dedicated_endpoint, scenarios=my_scenarios)

### VLLM testing

In [60]:
#!/usr/bin/env python3
"""
vLLM-Style Benchmark Suite for TPU Endpoints
Adapted from vLLM's official benchmark_serving.py for Google Cloud AI Platform endpoints
"""

import argparse
import asyncio
import json
import os
import random
import time
import warnings
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import Any, Optional, List, Dict, Union, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import traceback

import numpy as np
import pandas as pd
from tqdm import tqdm

# Dataset classes adapted from vLLM
@dataclass
class BenchmarkRequest:
    """Request data structure for benchmarking"""
    prompt: str
    prompt_len: int
    expected_output_len: int
    request_id: int

class RandomDataset:
    """Generate random prompts for benchmarking"""
    
    def __init__(self, input_len: int, output_len: int, num_requests: int, range_ratio: float = 0.0):
        self.input_len = input_len
        self.output_len = output_len
        self.num_requests = num_requests
        self.range_ratio = range_ratio
        
    def _generate_random_prompt(self, length: int) -> str:
        """Generate a random prompt of specified length"""
        # Create realistic text patterns
        words = [
            "analyze", "consider", "evaluate", "examine", "investigate", "review", "assess", "study",
            "business", "technology", "strategy", "development", "innovation", "implementation", "solution",
            "market", "customer", "product", "service", "quality", "performance", "efficiency", "growth",
            "data", "information", "process", "system", "method", "approach", "framework", "model",
            "challenge", "opportunity", "risk", "benefit", "advantage", "improvement", "optimization",
            "research", "analysis", "report", "recommendation", "conclusion", "insight", "finding"
        ]
        
        prompt_words = []
        target_words = int(length * 0.75)  # Rough token-to-word conversion
        
        while len(prompt_words) < target_words:
            prompt_words.append(random.choice(words))
        
        return " ".join(prompt_words)
    
    def generate_requests(self) -> List[BenchmarkRequest]:
        """Generate benchmark requests"""
        requests = []
        
        for i in range(self.num_requests):
            # Add variance if range_ratio > 0
            if self.range_ratio > 0:
                input_variance = int(self.input_len * self.range_ratio)
                output_variance = int(self.output_len * self.range_ratio)
                
                actual_input_len = random.randint(
                    max(1, self.input_len - input_variance),
                    self.input_len + input_variance
                )
                actual_output_len = random.randint(
                    max(1, self.output_len - output_variance),
                    self.output_len + output_variance
                )
            else:
                actual_input_len = self.input_len
                actual_output_len = self.output_len
            
            prompt = self._generate_random_prompt(actual_input_len)
            
            requests.append(BenchmarkRequest(
                prompt=prompt,
                prompt_len=actual_input_len,
                expected_output_len=actual_output_len,
                request_id=i
            ))
        
        return requests

class ShareGPTDataset:
    """ShareGPT-style conversation dataset"""
    
    def __init__(self, output_len: int, num_requests: int):
        self.output_len = output_len
        self.num_requests = num_requests
    
    def _generate_conversation_prompt(self) -> str:
        """Generate conversation-style prompts"""
        conversation_starters = [
            "I need help with creating a business plan for my startup. Can you guide me through the key components?",
            "Explain the differences between machine learning and deep learning in simple terms.",
            "What are the best practices for managing a remote software development team?",
            "Help me understand the key principles of effective project management.",
            "I'm preparing for a technical interview. Can you explain common algorithms and data structures?",
            "What are the current trends in artificial intelligence and how might they impact businesses?",
            "Explain the process of building a scalable web application from scratch.",
            "I need advice on digital marketing strategies for a B2B software company.",
            "Help me understand blockchain technology and its potential applications.",
            "What are the key considerations when designing a user-friendly mobile application?"
        ]
        
        return random.choice(conversation_starters)
    
    def generate_requests(self) -> List[BenchmarkRequest]:
        """Generate conversation requests"""
        requests = []
        
        for i in range(self.num_requests):
            prompt = self._generate_conversation_prompt()
            # Estimate prompt length
            prompt_len = len(prompt.split()) * 1.3  # Rough token estimate
            
            requests.append(BenchmarkRequest(
                prompt=prompt,
                prompt_len=int(prompt_len),
                expected_output_len=self.output_len,
                request_id=i
            ))
        
        return requests

class SonnetDataset:
    """Generate prompts for creative writing (sonnets)"""
    
    def __init__(self, input_len: int, output_len: int, num_requests: int):
        self.input_len = input_len
        self.output_len = output_len
        self.num_requests = num_requests
    
    def _generate_creative_prompt(self, length: int) -> str:
        """Generate creative writing prompts"""
        themes = [
            "love and loss", "nature and seasons", "time and memory", "hope and dreams",
            "solitude and reflection", "journey and discovery", "friendship and loyalty",
            "courage and adversity", "beauty and art", "wisdom and growth"
        ]
        
        styles = [
            "in the style of Shakespeare", "as a modern poem", "with vivid imagery",
            "using metaphors and symbolism", "in free verse", "with a nostalgic tone",
            "incorporating natural elements", "with emotional depth"
        ]
        
        theme = random.choice(themes)
        style = random.choice(styles)
        
        base_prompt = f"Write a creative piece about {theme} {style}. "
        
        # Extend to target length
        extensions = [
            "Consider the deeper meaning and universal themes. ",
            "Include rich descriptions and sensory details. ",
            "Explore the emotional complexity of the subject. ",
            "Use literary devices to enhance the narrative. ",
            "Create a compelling and memorable conclusion. "
        ]
        
        prompt = base_prompt
        while len(prompt.split()) < length * 0.75:
            prompt += random.choice(extensions)
        
        return prompt
    
    def generate_requests(self) -> List[BenchmarkRequest]:
        """Generate creative writing requests"""
        requests = []
        
        for i in range(self.num_requests):
            prompt = self._generate_creative_prompt(self.input_len)
            
            requests.append(BenchmarkRequest(
                prompt=prompt,
                prompt_len=self.input_len,
                expected_output_len=self.output_len,
                request_id=i
            ))
        
        return requests

@dataclass
class BenchmarkResult:
    """Result of a single benchmark request"""
    request_id: int
    success: bool
    prompt_len: int
    output_len: int
    ttft: float  # Time to first token
    tpot: float  # Time per output token
    itl: float   # Inter-token latency
    e2e_latency: float  # End-to-end latency
    error_msg: str = ""
    timestamp: float = 0.0

class TPUBenchmarkEngine:
    """Benchmark engine for TPU endpoints"""
    
    def __init__(self, endpoint, use_dedicated_endpoint: bool = True):
        self.endpoint = endpoint
        self.use_dedicated_endpoint = use_dedicated_endpoint
        self.results_lock = threading.Lock()
        self.results: List[BenchmarkResult] = []
    
    def _make_single_request(self, request: BenchmarkRequest, 
                           temperature: float = 0.7, 
                           max_tokens: int = None) -> BenchmarkResult:
        """Make a single request to the TPU endpoint"""
        start_time = time.time()
        
        if max_tokens is None:
            max_tokens = request.expected_output_len + 50  # Add buffer
        
        try:
            # Prepare request in your endpoint format
            instances = [{
                "prompt": request.prompt,
                "max_tokens": max_tokens,
                "temperature": temperature,
                "raw_response": True
            }]
            
            request_start = time.time()
            
            # Make prediction using your endpoint
            response = self.endpoint.predict(
                instances=instances,
                use_dedicated_endpoint=self.use_dedicated_endpoint
            )
            
            request_end = time.time()
            
            # Parse response
            if hasattr(response, 'predictions') and response.predictions:
                prediction = response.predictions[0]
                if isinstance(prediction, dict):
                    output_text = prediction.get('generated_text', '') or prediction.get('output', '') or str(prediction)
                else:
                    output_text = str(prediction)
            else:
                output_text = ""
            
            # Calculate metrics
            e2e_latency = request_end - request_start
            output_tokens = len(output_text.split()) * 1.3  # Rough token estimate
            
            # Estimate TTFT and TPOT (since we don't have streaming)
            estimated_ttft = min(0.5, e2e_latency * 0.1)  # Estimate 10% for TTFT
            if output_tokens > 1:
                tpot = (e2e_latency - estimated_ttft) / output_tokens
                itl = tpot  # Approximation
            else:
                tpot = e2e_latency
                itl = e2e_latency
            
            return BenchmarkResult(
                request_id=request.request_id,
                success=True,
                prompt_len=request.prompt_len,
                output_len=int(output_tokens),
                ttft=estimated_ttft,
                tpot=tpot,
                itl=itl,
                e2e_latency=e2e_latency,
                timestamp=request_start
            )
            
        except Exception as e:
            error_time = time.time() - start_time
            return BenchmarkResult(
                request_id=request.request_id,
                success=False,
                prompt_len=request.prompt_len,
                output_len=0,
                ttft=0.0,
                tpot=0.0,
                itl=0.0,
                e2e_latency=error_time,
                error_msg=str(e),
                timestamp=start_time
            )
    
    def run_benchmark(self, 
                      requests: List[BenchmarkRequest],
                      max_concurrency: int = 100,
                      temperature: float = 0.7,
                      max_tokens: int = None,
                      request_rate: float = float('inf')) -> List[BenchmarkResult]:
        """Run benchmark with specified parameters"""
        
        print(f"Starting benchmark with {len(requests)} requests...")
        print(f"Max concurrency: {max_concurrency}")
        print(f"Temperature: {temperature}")
        print(f"Request rate: {request_rate}")
        
        self.results = []
        start_time = time.time()
        
        if request_rate == float('inf'):
            # Send all requests as fast as possible
            with ThreadPoolExecutor(max_workers=max_concurrency) as executor:
                # Submit all requests
                future_to_request = {
                    executor.submit(self._make_single_request, req, temperature, max_tokens): req
                    for req in requests
                }
                
                # Collect results with progress bar
                completed = 0
                with tqdm(total=len(requests), desc="Processing requests") as pbar:
                    for future in as_completed(future_to_request):
                        try:
                            result = future.result()
                            with self.results_lock:
                                self.results.append(result)
                        except Exception as e:
                            # Handle futures that failed to execute
                            request = future_to_request[future]
                            error_result = BenchmarkResult(
                                request_id=request.request_id,
                                success=False,
                                prompt_len=request.prompt_len,
                                output_len=0,
                                ttft=0.0,
                                tpot=0.0,
                                itl=0.0,
                                e2e_latency=0.0,
                                error_msg=f"Future execution failed: {str(e)}"
                            )
                            with self.results_lock:
                                self.results.append(error_result)
                        
                        completed += 1
                        pbar.update(1)
        else:
            # Rate-limited requests
            request_interval = 1.0 / request_rate
            with ThreadPoolExecutor(max_workers=max_concurrency) as executor:
                futures = []
                
                for req in requests:
                    future = executor.submit(self._make_single_request, req, temperature, max_tokens)
                    futures.append(future)
                    time.sleep(request_interval)
                
                # Collect results
                with tqdm(total=len(requests), desc="Processing requests") as pbar:
                    for future in as_completed(futures):
                        try:
                            result = future.result()
                            with self.results_lock:
                                self.results.append(result)
                        except Exception as e:
                            print(f"Request failed: {e}")
                        pbar.update(1)
        
        total_time = time.time() - start_time
        print(f"Benchmark completed in {total_time:.2f} seconds")
        
        return self.results

class BenchmarkAnalyzer:
    """Analyze and report benchmark results"""
    
    def __init__(self, results: List[BenchmarkResult]):
        self.results = results
        self.successful_results = [r for r in results if r.success]
        self.failed_results = [r for r in results if not r.success]
        # For backward compatibility
        self.failed_requests = self.failed_results
    
    def calculate_percentiles(self, values: List[float], percentiles: List[int]) -> Dict[int, float]:
        """Calculate percentiles for a list of values"""
        if not values:
            return {p: 0.0 for p in percentiles}
        return {p: np.percentile(values, p) for p in percentiles}
    
    def generate_summary(self) -> Dict[str, Any]:
        """Generate benchmark summary statistics"""
        if not self.successful_results:
            return {
                "error": "No successful requests",
                "total_requests": len(self.results),
                "failed_requests": len(self.failed_results)
            }
        
        # Extract metrics
        ttfts = [r.ttft * 1000 for r in self.successful_results]  # Convert to ms
        tpots = [r.tpot * 1000 for r in self.successful_results]
        itls = [r.itl * 1000 for r in self.successful_results]
        e2e_latencies = [r.e2e_latency for r in self.successful_results]
        
        # Calculate total tokens
        total_input_tokens = sum(r.prompt_len for r in self.successful_results)
        total_output_tokens = sum(r.output_len for r in self.successful_results)
        
        # Calculate benchmark duration
        if self.successful_results:
            timestamps = [r.timestamp for r in self.successful_results]
            benchmark_duration = max(timestamps) - min(timestamps) + max(e2e_latencies)
        else:
            benchmark_duration = 1.0  # Avoid division by zero
        
        # Throughput calculations
        request_throughput = len(self.successful_results) / benchmark_duration
        input_token_throughput = total_input_tokens / benchmark_duration
        output_token_throughput = total_output_tokens / benchmark_duration
        
        # Percentiles to calculate
        percentiles = [50, 90, 95, 99]
        
        summary = {
            "successful_requests": len(self.successful_results),
            "failed_requests": len(self.failed_requests),
            "total_requests": len(self.results),
            "benchmark_duration": benchmark_duration,
            "request_throughput": request_throughput,
            "input_token_throughput": input_token_throughput,
            "output_token_throughput": output_token_throughput,
            "total_input_tokens": total_input_tokens,
            "total_output_tokens": total_output_tokens,
            "ttft_percentiles": self.calculate_percentiles(ttfts, percentiles),
            "tpot_percentiles": self.calculate_percentiles(tpots, percentiles),
            "itl_percentiles": self.calculate_percentiles(itls, percentiles),
            "e2e_latency_percentiles": self.calculate_percentiles([l * 1000 for l in e2e_latencies], percentiles)
        }
        
        return summary
    
    def print_summary(self):
        """Print formatted benchmark summary"""
        summary = self.generate_summary()
        
        if "error" in summary:
            print(f"❌ Benchmark failed: {summary['error']}")
            print(f"Total requests: {summary['total_requests']}")
            print(f"Failed requests: {summary['failed_requests']}")
            return
        
        print("\n" + "=" * 50)
        print("📊 BENCHMARK RESULTS SUMMARY")
        print("=" * 50)
        
        print(f"\n📈 Request Statistics:")
        print(f"   Successful requests: {summary['successful_requests']}")
        print(f"   Failed requests: {summary['failed_requests']}")
        print(f"   Success rate: {summary['successful_requests'] / summary['total_requests'] * 100:.1f}%")
        print(f"   Benchmark duration: {summary['benchmark_duration']:.2f}s")
        
        print(f"\n🚀 Throughput:")
        print(f"   Request throughput: {summary['request_throughput']:.2f} req/s")
        print(f"   Input token throughput: {summary['input_token_throughput']:.2f} tok/s")
        print(f"   Output token throughput: {summary['output_token_throughput']:.2f} tok/s")
        
        print(f"\n⏱️ Latency Metrics (ms):")
        print("   Metric        p50     p90     p95     p99")
        print("   " + "-" * 45)
        
        ttft_p = summary['ttft_percentiles']
        print(f"   TTFT      {ttft_p[50]:7.1f} {ttft_p[90]:7.1f} {ttft_p[95]:7.1f} {ttft_p[99]:7.1f}")
        
        tpot_p = summary['tpot_percentiles']
        print(f"   TPOT      {tpot_p[50]:7.1f} {tpot_p[90]:7.1f} {tpot_p[95]:7.1f} {tpot_p[99]:7.1f}")
        
        itl_p = summary['itl_percentiles']
        print(f"   ITL       {itl_p[50]:7.1f} {itl_p[90]:7.1f} {itl_p[95]:7.1f} {itl_p[99]:7.1f}")
        
        e2e_p = summary['e2e_latency_percentiles']
        print(f"   E2E       {e2e_p[50]:7.1f} {e2e_p[90]:7.1f} {e2e_p[95]:7.1f} {e2e_p[99]:7.1f}")
        
        print(f"\n📊 Token Statistics:")
        print(f"   Total input tokens: {summary['total_input_tokens']:,}")
        print(f"   Total output tokens: {summary['total_output_tokens']:,}")
        print(f"   Avg input tokens/req: {summary['total_input_tokens'] / summary['successful_requests']:.1f}")
        print(f"   Avg output tokens/req: {summary['total_output_tokens'] / summary['successful_requests']:.1f}")
    
    def save_results(self, 
                     output_dir: str = "benchmark_results",
                     filename_prefix: str = "tpu_benchmark",
                     metadata: Dict[str, Any] = None):
        """Save results to JSON and CSV files"""
        
        os.makedirs(output_dir, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # Prepare data for saving
        summary = self.generate_summary()
        
        # Add metadata
        if metadata:
            summary.update(metadata)
        
        # Save summary JSON
        summary_file = os.path.join(output_dir, f"{filename_prefix}_summary_{timestamp}.json")
        with open(summary_file, 'w') as f:
            json.dump(summary, f, indent=2)
        
        # Save detailed results CSV
        if self.results:
            detailed_data = []
            for result in self.results:
                row = asdict(result)
                row['ttft_ms'] = result.ttft * 1000
                row['tpot_ms'] = result.tpot * 1000
                row['itl_ms'] = result.itl * 1000
                row['e2e_latency_ms'] = result.e2e_latency * 1000
                detailed_data.append(row)
            
            detailed_file = os.path.join(output_dir, f"{filename_prefix}_detailed_{timestamp}.csv")
            df = pd.DataFrame(detailed_data)
            df.to_csv(detailed_file, index=False)
        
        print(f"\n💾 Results saved:")
        print(f"   Summary: {summary_file}")
        if self.results:
            print(f"   Detailed: {detailed_file}")

def run_vllm_style_benchmark(endpoint,
                             use_dedicated_endpoint: bool = True,
                             dataset_name: str = "random",
                             model_name: str = "unknown",
                             num_prompts: int = 1000,
                             max_concurrency: int = 100,
                             request_rate: float = float('inf'),
                             temperature: float = 0.7,
                             random_input_len: int = 1024,
                             random_output_len: int = 128,
                             random_range_ratio: float = 0.0,
                             sharegpt_output_len: int = 128,
                             sonnet_input_len: int = 500,
                             sonnet_output_len: int = 300,
                             save_result: bool = True,
                             result_dir: str = "benchmark_results",
                             result_filename: str = None,
                             metadata: Dict[str, Any] = None) -> List[BenchmarkResult]:
    """
    Run vLLM-style benchmark on TPU endpoint
    
    Args:
        endpoint: TPU endpoint object
        use_dedicated_endpoint: Whether to use dedicated endpoint
        dataset_name: Dataset to use ("random", "sharegpt", "sonnet")
        model_name: Name of the model being benchmarked
        num_prompts: Number of prompts to process
        max_concurrency: Maximum concurrent requests
        request_rate: Request rate (req/s), use inf for maximum rate
        temperature: Sampling temperature
        random_input_len: Input length for random dataset
        random_output_len: Output length for random dataset
        random_range_ratio: Range ratio for random variance
        sharegpt_output_len: Output length for ShareGPT dataset
        sonnet_input_len: Input length for sonnet dataset
        sonnet_output_len: Output length for sonnet dataset
        save_result: Whether to save results to files
        result_dir: Directory to save results
        result_filename: Custom filename prefix
        metadata: Additional metadata to save
    
    Returns:
        List of benchmark results
    """
    
    print(f"🚀 Starting vLLM-style TPU Benchmark")
    print(f"📋 Configuration:")
    print(f"   Model: {model_name}")
    print(f"   Dataset: {dataset_name}")
    print(f"   Requests: {num_prompts}")
    print(f"   Concurrency: {max_concurrency}")
    print(f"   Temperature: {temperature}")
    
    # Generate dataset
    if dataset_name == "random":
        print(f"   Input length: {random_input_len}")
        print(f"   Output length: {random_output_len}")
        print(f"   Range ratio: {random_range_ratio}")
        
        dataset = RandomDataset(
            input_len=random_input_len,
            output_len=random_output_len,
            num_requests=num_prompts,
            range_ratio=random_range_ratio
        )
    elif dataset_name == "sharegpt":
        print(f"   Output length: {sharegpt_output_len}")
        
        dataset = ShareGPTDataset(
            output_len=sharegpt_output_len,
            num_requests=num_prompts
        )
    elif dataset_name == "sonnet":
        print(f"   Input length: {sonnet_input_len}")
        print(f"   Output length: {sonnet_output_len}")
        
        dataset = SonnetDataset(
            input_len=sonnet_input_len,
            output_len=sonnet_output_len,
            num_requests=num_prompts
        )
    else:
        raise ValueError(f"Unknown dataset: {dataset_name}")
    
    # Generate requests
    print(f"\n📝 Generating {num_prompts} requests...")
    requests = dataset.generate_requests()
    
    # Run benchmark
    engine = TPUBenchmarkEngine(endpoint, use_dedicated_endpoint)
    results = engine.run_benchmark(
        requests=requests,
        max_concurrency=max_concurrency,
        temperature=temperature,
        request_rate=request_rate
    )
    
    # Analyze results
    analyzer = BenchmarkAnalyzer(results)
    analyzer.print_summary()
    
    # Save results
    if save_result:
        filename_prefix = result_filename or f"vllm_tpu_{dataset_name}_{model_name.replace('/', '_')}"
        
        # Prepare metadata
        benchmark_metadata = {
            "model": model_name,
            "dataset": dataset_name,
            "num_prompts": num_prompts,
            "max_concurrency": max_concurrency,
            "request_rate": request_rate,
            "temperature": temperature,
            "timestamp": datetime.now().isoformat(),
        }
        
        if dataset_name == "random":
            benchmark_metadata.update({
                "random_input_len": random_input_len,
                "random_output_len": random_output_len,
                "random_range_ratio": random_range_ratio
            })
        elif dataset_name == "sharegpt":
            benchmark_metadata["sharegpt_output_len"] = sharegpt_output_len
        elif dataset_name == "sonnet":
            benchmark_metadata.update({
                "sonnet_input_len": sonnet_input_len,
                "sonnet_output_len": sonnet_output_len
            })
        
        if metadata:
            benchmark_metadata.update(metadata)
        
        analyzer.save_results(
            output_dir=result_dir,
            filename_prefix=filename_prefix,
            metadata=benchmark_metadata
        )
    
    return results

def main():
    """Main function with command line interface"""
    parser = argparse.ArgumentParser(description="vLLM-style benchmark for TPU endpoints")
    
    # Dataset options
    parser.add_argument("--dataset-name", choices=["random", "sharegpt", "sonnet"], 
                       default="random", help="Dataset to benchmark on")
    parser.add_argument("--model", required=True, help="Model name for identification")
    parser.add_argument("--num-prompts", type=int, default=1000, 
                       help="Number of prompts to process")
    
    # Request options
    parser.add_argument("--max-concurrency", type=int, default=100,
                       help="Maximum number of concurrent requests")
    parser.add_argument("--request-rate", type=float, default=float('inf'),
                       help="Request rate in requests per second")
    parser.add_argument("--temperature", type=float, default=0.7,
                       help="Sampling temperature")
    
    # Dataset-specific options
    parser.add_argument("--random-input-len", type=int, default=1024,
                       help="Input length for random dataset")
    parser.add_argument("--random-output-len", type=int, default=128,
                       help="Output length for random dataset")
    parser.add_argument("--random-range-ratio", type=float, default=0.0,
                       help="Range ratio for random variance")
    parser.add_argument("--sharegpt-output-len", type=int, default=128,
                       help="Output length for ShareGPT dataset")
    parser.add_argument("--sonnet-input-len", type=int, default=500,
                       help="Input length for sonnet dataset")
    parser.add_argument("--sonnet-output-len", type=int, default=300,
                       help="Output length for sonnet dataset")
    
    # Output options
    parser.add_argument("--save-result", action="store_true",
                       help="Save benchmark results")
    parser.add_argument("--result-dir", default="benchmark_results",
                       help="Directory to save results")
    parser.add_argument("--result-filename", default=None,
                       help="Custom filename prefix for results")
    parser.add_argument("--metadata", nargs="*", metavar="KEY=VALUE",
                       help="Additional metadata (e.g., --metadata version=0.3.3 tp=1)")
    
    args = parser.parse_args()
    
    # Parse metadata
    metadata = {}
    if args.metadata:
        for item in args.metadata:
            if "=" in item:
                key, value = item.split("=", 1)
                metadata[key] = value
    
    print("⚠️  This benchmark requires your TPU endpoint to be configured.")
    print("   Please ensure you have:")
    print("   1. endpoint = aiplatform.Endpoint(aip_endpoint_name)")
    print("   2. use_dedicated_endpoint = True/False")
    print("   3. Then call: run_vllm_style_benchmark(endpoint, use_dedicated_endpoint, ...)")
    
    return args

# Preset benchmark configurations
BENCHMARK_PRESETS = {
    "quick_test": {
        "num_prompts": 100,
        "max_concurrency": 10,
        "random_input_len": 512,
        "random_output_len": 64,
        "description": "Quick test with small load"
    },
    "latency_test": {
        "num_prompts": 500,
        "max_concurrency": 1,
        "random_input_len": 1024,
        "random_output_len": 128,
        "description": "Single-request latency measurement"
    },
    "throughput_test": {
        "num_prompts": 2000,
        "max_concurrency": 200,
        "random_input_len": 512,
        "random_output_len": 128,
        "description": "High-throughput test"
    },
    "long_context": {
        "num_prompts": 100,
        "max_concurrency": 20,
        "random_input_len": 4096,
        "random_output_len": 256,
        "description": "Long context processing test"
    },
    "conversation": {
        "dataset_name": "sharegpt",
        "num_prompts": 1000,
        "max_concurrency": 50,
        "sharegpt_output_len": 200,
        "description": "Conversational AI test"
    },
    "creative_writing": {
        "dataset_name": "sonnet",
        "num_prompts": 500,
        "max_concurrency": 30,
        "sonnet_input_len": 300,
        "sonnet_output_len": 500,
        "description": "Creative writing test"
    }
}

def run_preset_benchmark(endpoint, 
                        use_dedicated_endpoint: bool,
                        preset_name: str,
                        model_name: str = "unknown",
                        **kwargs) -> List[BenchmarkResult]:
    """Run a preset benchmark configuration"""
    
    if preset_name not in BENCHMARK_PRESETS:
        available = ", ".join(BENCHMARK_PRESETS.keys())
        raise ValueError(f"Unknown preset '{preset_name}'. Available: {available}")
    
    preset_config = BENCHMARK_PRESETS[preset_name].copy()
    description = preset_config.pop("description", "")
    
    # Override with any provided kwargs
    preset_config.update(kwargs)
    
    print(f"🎯 Running preset benchmark: {preset_name}")
    print(f"📝 Description: {description}")
    
    return run_vllm_style_benchmark(
        endpoint=endpoint,
        use_dedicated_endpoint=use_dedicated_endpoint,
        model_name=model_name,
        **preset_config
    )

def run_comprehensive_benchmark_suite(endpoint,
                                    use_dedicated_endpoint: bool,
                                    model_name: str = "unknown",
                                    save_results: bool = True) -> Dict[str, List[BenchmarkResult]]:
    """Run a comprehensive benchmark suite with multiple configurations"""
    
    print(f"🚀 Starting Comprehensive TPU Benchmark Suite")
    print(f"📋 Model: {model_name}")
    print(f"🎯 Running {len(BENCHMARK_PRESETS)} preset configurations")
    
    all_results = {}
    
    for preset_name in BENCHMARK_PRESETS.keys():
        print(f"\n{'='*60}")
        print(f"🔄 Running preset: {preset_name}")
        print(f"{'='*60}")
        
        try:
            results = run_preset_benchmark(
                endpoint=endpoint,
                use_dedicated_endpoint=use_dedicated_endpoint,
                preset_name=preset_name,
                model_name=model_name,
                save_result=save_results,
                result_filename=f"comprehensive_{preset_name}_{model_name.replace('/', '_')}"
            )
            all_results[preset_name] = results
            print(f"✅ Completed preset: {preset_name}")
            
        except Exception as e:
            print(f"❌ Failed preset {preset_name}: {e}")
            all_results[preset_name] = []
    
    # Generate comparative report
    if save_results:
        generate_comparative_suite_report(all_results, model_name)
    
    print(f"\n🎉 Comprehensive benchmark suite completed!")
    print(f"📊 Results available for {len([k for k, v in all_results.items() if v])} configurations")
    
    return all_results

def generate_comparative_suite_report(all_results: Dict[str, List[BenchmarkResult]], 
                                    model_name: str):
    """Generate a comparative report across all benchmark configurations"""
    
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = "benchmark_results"
    os.makedirs(output_dir, exist_ok=True)
    
    report_file = os.path.join(output_dir, f"comprehensive_report_{model_name.replace('/', '_')}_{timestamp}.md")
    
    # Analyze each configuration
    config_summaries = {}
    for preset_name, results in all_results.items():
        if results:
            analyzer = BenchmarkAnalyzer(results)
            config_summaries[preset_name] = analyzer.generate_summary()
    
    # Generate markdown report
    md_content = f"""# Comprehensive TPU Benchmark Report

**Model:** {model_name}  
**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}  
**Configurations Tested:** {len(config_summaries)}

## Executive Summary

This report presents comprehensive benchmarking results across multiple test configurations for the TPU-hosted model.

## Configuration Comparison

| Configuration | Requests/s | Output tok/s | TTFT p95 (ms) | TPOT p95 (ms) | Success Rate |
|---------------|------------|--------------|---------------|---------------|--------------|
"""
    
    # Add comparison table rows
    for preset_name, summary in config_summaries.items():
        if "error" not in summary:
            success_rate = summary['successful_requests'] / summary['total_requests'] * 100
            md_content += f"| {preset_name} | {summary['request_throughput']:.2f} | {summary['output_token_throughput']:.2f} | {summary['ttft_percentiles'][95]:.1f} | {summary['tpot_percentiles'][95]:.1f} | {success_rate:.1f}% |\n"
    
    md_content += f"""

## Detailed Results by Configuration

"""
    
    # Add detailed results for each configuration
    for preset_name, summary in config_summaries.items():
        preset_config = BENCHMARK_PRESETS[preset_name]
        md_content += f"""
### {preset_name.replace('_', ' ').title()}

**Description:** {preset_config.get('description', 'N/A')}

"""
        
        if "error" in summary:
            md_content += f"❌ **Status:** Failed - {summary['error']}\n\n"
            continue
        
        md_content += f"""**Status:** ✅ Successful

**Request Statistics:**
- Total requests: {summary['total_requests']}
- Successful requests: {summary['successful_requests']}
- Failed requests: {summary['failed_requests']}
- Success rate: {summary['successful_requests'] / summary['total_requests'] * 100:.1f}%
- Duration: {summary['benchmark_duration']:.2f}s

**Throughput:**
- Request throughput: {summary['request_throughput']:.2f} req/s
- Input token throughput: {summary['input_token_throughput']:.2f} tok/s
- Output token throughput: {summary['output_token_throughput']:.2f} tok/s

**Latency (ms):**
- TTFT p95: {summary['ttft_percentiles'][95]:.1f}ms
- TPOT p95: {summary['tpot_percentiles'][95]:.1f}ms
- ITL p95: {summary['itl_percentiles'][95]:.1f}ms
- E2E p95: {summary['e2e_latency_percentiles'][95]:.1f}ms

**Token Statistics:**
- Total input tokens: {summary['total_input_tokens']:,}
- Total output tokens: {summary['total_output_tokens']:,}
- Avg input tokens/req: {summary['total_input_tokens'] / summary['successful_requests']:.1f}
- Avg output tokens/req: {summary['total_output_tokens'] / summary['successful_requests']:.1f}

"""
    
    # Add recommendations
    md_content += f"""
## Performance Analysis & Recommendations

### Best Performing Configurations

"""
    
    # Find best configurations by different metrics
    if config_summaries:
        # Best throughput
        best_throughput = max(config_summaries.items(), 
                            key=lambda x: x[1].get('output_token_throughput', 0) if 'error' not in x[1] else 0)
        
        # Best latency
        best_latency = min(config_summaries.items(),
                          key=lambda x: x[1].get('ttft_percentiles', {}).get(95, float('inf')) if 'error' not in x[1] else float('inf'))
        
        md_content += f"""
**🚀 Highest Throughput:** {best_throughput[0]} ({best_throughput[1].get('output_token_throughput', 0):.2f} tok/s)  
**⚡ Lowest Latency:** {best_latency[0]} ({best_latency[1].get('ttft_percentiles', {}).get(95, 0):.1f}ms TTFT p95)

### Use Case Recommendations

"""
        
        # Provide recommendations based on results
        for preset_name, summary in config_summaries.items():
            if "error" in summary:
                continue
                
            preset_config = BENCHMARK_PRESETS[preset_name]
            use_case = ""
            
            if preset_name == "quick_test":
                use_case = "Development and testing"
            elif preset_name == "latency_test":
                use_case = "Low-latency applications requiring fast response times"
            elif preset_name == "throughput_test":
                use_case = "High-volume batch processing"
            elif preset_name == "long_context":
                use_case = "Document analysis and long-form content processing"
            elif preset_name == "conversation":
                use_case = "Interactive chatbots and conversational AI"
            elif preset_name == "creative_writing":
                use_case = "Content generation and creative applications"
            
            success_rate = summary['successful_requests'] / summary['total_requests'] * 100
            suitability = "✅ Excellent" if success_rate > 95 and summary['output_token_throughput'] > 10 else "⚠️ Good" if success_rate > 90 else "❌ Needs optimization"
            
            md_content += f"""
**{preset_name.replace('_', ' ').title()}:** {suitability}  
*Use case:* {use_case}  
*Performance:* {summary['output_token_throughput']:.1f} tok/s, {summary['ttft_percentiles'][95]:.1f}ms TTFT p95  
"""
    
    md_content += f"""

## Technical Details

**Model:** {model_name}  
**Test Framework:** vLLM-style TPU Benchmark Suite  
**Timestamp:** {timestamp}  

---
*Report generated automatically by TPU Benchmark Suite*
"""
    
    # Save the report
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(md_content)
    
    print(f"\n📊 Comprehensive report saved: {report_file}")

# Example usage functions
def benchmark_examples():
    """Show example usage patterns"""
    print("""
🚀 vLLM-Style TPU Benchmark Examples

# 1. Quick test (your existing endpoint setup)
endpoint_name = "1029620071644790784"
aip_endpoint_name = f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
endpoint = aiplatform.Endpoint(aip_endpoint_name)

# 2. Run quick benchmark
results = run_vllm_style_benchmark(
    endpoint=endpoint,
    use_dedicated_endpoint=use_dedicated_endpoint,
    model_name="llama3.3_tpuv6e_base",
    dataset_name="random",
    num_prompts=100,
    max_concurrency=10,
    random_input_len=512,
    random_output_len=128
)

# 3. Run preset benchmarks
results = run_preset_benchmark(
    endpoint=endpoint,
    use_dedicated_endpoint=use_dedicated_endpoint,
    preset_name="throughput_test",
    model_name="llama3.3_tpuv6e_base"
)

# 4. Run comprehensive suite
all_results = run_comprehensive_benchmark_suite(
    endpoint=endpoint,
    use_dedicated_endpoint=use_dedicated_endpoint,
    model_name="llama3.3_tpuv6e_base"
)

# 5. Custom configuration
results = run_vllm_style_benchmark(
    endpoint=endpoint,
    use_dedicated_endpoint=use_dedicated_endpoint,
    model_name="your-model-name",
    dataset_name="sharegpt",  # or "sonnet"
    num_prompts=1000,
    max_concurrency=50,
    request_rate=10.0,  # 10 req/s instead of unlimited
    temperature=0.7,
    save_result=True,
    metadata={"version": "1.0", "notes": "production test"}
)

Available presets: """ + ", ".join(BENCHMARK_PRESETS.keys()) + """
Available datasets: random, sharegpt, sonnet
""")

if __name__ == "__main__":
    try:
        # Check if we're in a notebook environment
        get_ipython()
        # If in notebook, show examples
        benchmark_examples()
    except NameError:
        # If not in notebook, run CLI
        main()


🚀 vLLM-Style TPU Benchmark Examples

# 1. Quick test (your existing endpoint setup)
endpoint_name = "1029620071644790784"
aip_endpoint_name = f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
endpoint = aiplatform.Endpoint(aip_endpoint_name)

# 2. Run quick benchmark
results = run_vllm_style_benchmark(
    endpoint=endpoint,
    use_dedicated_endpoint=use_dedicated_endpoint,
    model_name="your-model-name",
    dataset_name="random",
    num_prompts=100,
    max_concurrency=10,
    random_input_len=512,
    random_output_len=128
)

# 3. Run preset benchmarks
results = run_preset_benchmark(
    endpoint=endpoint,
    use_dedicated_endpoint=use_dedicated_endpoint,
    preset_name="throughput_test",
    model_name="your-model-name"
)

# 4. Run comprehensive suite
all_results = run_comprehensive_benchmark_suite(
    endpoint=endpoint,
    use_dedicated_endpoint=use_dedicated_endpoint,
    model_name="your-model-name"
)

# 5. Custom configuration
results = run_vllm_sty

In [None]:
# Your existing endpoint setup
endpoint_name = "1029620071644790784"
aip_endpoint_name = f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
endpoint = aiplatform.Endpoint(aip_endpoint_name)

# 1. Quick test (like your use cases)
results = run_vllm_style_benchmark(
    endpoint=endpoint,
    use_dedicated_endpoint=use_dedicated_endpoint,
    model_name="llama3.3_tpuv6e",
    dataset_name="random",
    num_prompts=500,
    max_concurrency=50,
    random_input_len=80,      # Email use case
    random_output_len=350,
    save_result=True
)

# 2. Run preset configurations
results = run_preset_benchmark(
    endpoint=endpoint,
    use_dedicated_endpoint=use_dedicated_endpoint,
    preset_name="throughput_test",
    model_name="llama3.3_tpuv6e"
)

# 3. Comprehensive suite (all configurations)
all_results = run_comprehensive_benchmark_suite(
    endpoint=endpoint,
    use_dedicated_endpoint=use_dedicated_endpoint,
    model_name="llama3.3_tpuv6e"
)

🚀 Starting vLLM-style TPU Benchmark
📋 Configuration:
   Model: your-model-name
   Dataset: random
   Requests: 500
   Concurrency: 50
   Temperature: 0.7
   Input length: 80
   Output length: 350
   Range ratio: 0.0

📝 Generating 500 requests...
Starting benchmark with 500 requests...
Max concurrency: 50
Temperature: 0.7
Request rate: inf


Processing requests: 100%|██████████| 500/500 [03:03<00:00,  2.72it/s]


Benchmark completed in 183.88 seconds

📊 BENCHMARK RESULTS SUMMARY

📈 Request Statistics:
   Successful requests: 500
   Failed requests: 0
   Success rate: 100.0%
   Benchmark duration: 185.19s

🚀 Throughput:
   Request throughput: 2.70 req/s
   Input token throughput: 215.99 tok/s
   Output token throughput: 1308.82 tok/s

⏱️ Latency Metrics (ms):
   Metric        p50     p90     p95     p99
   ---------------------------------------------
   TTFT        500.0   500.0   500.0   500.0
   TPOT         34.5    43.1    48.8    73.6
   ITL          34.5    43.1    48.8    73.6
   E2E       18394.7 18547.7 18986.9 19043.9

📊 Token Statistics:
   Total input tokens: 40,000
   Total output tokens: 242,385
   Avg input tokens/req: 80.0
   Avg output tokens/req: 484.8

💾 Results saved:
   Summary: benchmark_results/vllm_tpu_random_your-model-name_summary_20250708_024154.json
   Detailed: benchmark_results/vllm_tpu_random_your-model-name_detailed_20250708_024154.csv
🎯 Running preset benchmark: 

Processing requests: 100%|██████████| 2000/2000 [04:36<00:00,  7.24it/s] 


Benchmark completed in 276.89 seconds

📊 BENCHMARK RESULTS SUMMARY

📈 Request Statistics:
   Successful requests: 2000
   Failed requests: 0
   Success rate: 100.0%
   Benchmark duration: 290.44s

🚀 Throughput:
   Request throughput: 6.89 req/s
   Input token throughput: 3525.64 tok/s
   Output token throughput: 1583.21 tok/s

⏱️ Latency Metrics (ms):
   Metric        p50     p90     p95     p99
   ---------------------------------------------
   TTFT        500.0   500.0   500.0   500.0
   TPOT        115.4   128.9   130.0   142.5
   ITL         115.4   128.9   130.0   142.5
   E2E       27177.6 30329.2 30343.4 32854.7

📊 Token Statistics:
   Total input tokens: 1,024,000
   Total output tokens: 459,834
   Avg input tokens/req: 512.0
   Avg output tokens/req: 229.9

💾 Results saved:
   Summary: benchmark_results/vllm_tpu_random_your-model-name_summary_20250708_024631.json
   Detailed: benchmark_results/vllm_tpu_random_your-model-name_detailed_20250708_024631.csv
🚀 Starting Comprehensi

Processing requests: 100%|██████████| 100/100 [00:36<00:00,  2.74it/s]


Benchmark completed in 36.49 seconds

📊 BENCHMARK RESULTS SUMMARY

📈 Request Statistics:
   Successful requests: 100
   Failed requests: 0
   Success rate: 100.0%
   Benchmark duration: 36.52s

🚀 Throughput:
   Request throughput: 2.74 req/s
   Input token throughput: 1401.82 tok/s
   Output token throughput: 404.53 tok/s

⏱️ Latency Metrics (ms):
   Metric        p50     p90     p95     p99
   ---------------------------------------------
   TTFT        364.4   366.0   366.7   367.6
   TPOT         22.1    22.2    22.3    23.0
   ITL          22.1    22.2    22.3    23.0
   E2E        3644.2  3659.6  3667.1  3676.4

📊 Token Statistics:
   Total input tokens: 51,200
   Total output tokens: 14,775
   Avg input tokens/req: 512.0
   Avg output tokens/req: 147.8

💾 Results saved:
   Summary: benchmark_results/comprehensive_quick_test_your-model-name_summary_20250708_024708.json
   Detailed: benchmark_results/comprehensive_quick_test_your-model-name_detailed_20250708_024708.csv
✅ Completed 

Processing requests: 100%|██████████| 500/500 [40:13<00:00,  4.83s/it]


Benchmark completed in 2413.36 seconds

📊 BENCHMARK RESULTS SUMMARY

📈 Request Statistics:
   Successful requests: 500
   Failed requests: 0
   Success rate: 100.0%
   Benchmark duration: 2413.53s

🚀 Throughput:
   Request throughput: 0.21 req/s
   Input token throughput: 212.14 tok/s
   Output token throughput: 47.46 tok/s

⏱️ Latency Metrics (ms):
   Metric        p50     p90     p95     p99
   ---------------------------------------------
   TTFT        484.6   489.1   490.3   492.8
   TPOT         18.9    19.1    19.1    22.1
   ITL          18.9    19.1    19.1    22.1
   E2E        4846.5  4890.8  4902.7  4928.5

📊 Token Statistics:
   Total input tokens: 512,000
   Total output tokens: 114,536
   Avg input tokens/req: 1024.0
   Avg output tokens/req: 229.1

💾 Results saved:
   Summary: benchmark_results/comprehensive_latency_test_your-model-name_summary_20250708_032721.json
   Detailed: benchmark_results/comprehensive_latency_test_your-model-name_detailed_20250708_032721.csv
✅ C

Processing requests: 100%|██████████| 2000/2000 [04:31<00:00,  7.36it/s] 


Benchmark completed in 272.09 seconds

📊 BENCHMARK RESULTS SUMMARY

📈 Request Statistics:
   Successful requests: 2000
   Failed requests: 0
   Success rate: 100.0%
   Benchmark duration: 286.17s

🚀 Throughput:
   Request throughput: 6.99 req/s
   Input token throughput: 3578.31 tok/s
   Output token throughput: 1604.27 tok/s

⏱️ Latency Metrics (ms):
   Metric        p50     p90     p95     p99
   ---------------------------------------------
   TTFT        500.0   500.0   500.0   500.0
   TPOT        115.3   117.2   131.2   143.5
   ITL         115.3   117.2   131.2   143.5
   E2E       27188.6 27552.0 29420.4 33229.0

📊 Token Statistics:
   Total input tokens: 1,024,000
   Total output tokens: 459,092
   Avg input tokens/req: 512.0
   Avg output tokens/req: 229.5

💾 Results saved:
   Summary: benchmark_results/comprehensive_throughput_test_your-model-name_summary_20250708_033154.json
   Detailed: benchmark_results/comprehensive_throughput_test_your-model-name_detailed_20250708_03315

Processing requests: 100%|██████████| 100/100 [00:01<00:00, 83.67it/s]


Benchmark completed in 1.23 seconds
❌ Benchmark failed: No successful requests
Total requests: 100
Failed requests: 100

💾 Results saved:
   Summary: benchmark_results/comprehensive_long_context_your-model-name_summary_20250708_033155.json
   Detailed: benchmark_results/comprehensive_long_context_your-model-name_detailed_20250708_033155.csv
✅ Completed preset: long_context

🔄 Running preset: conversation
🎯 Running preset benchmark: conversation
📝 Description: Conversational AI test
🚀 Starting vLLM-style TPU Benchmark
📋 Configuration:
   Model: your-model-name
   Dataset: sharegpt
   Requests: 1000
   Concurrency: 50
   Temperature: 0.7
   Output length: 200

📝 Generating 1000 requests...
Starting benchmark with 1000 requests...
Max concurrency: 50
Temperature: 0.7
Request rate: inf


Processing requests: 100%|██████████| 1000/1000 [03:47<00:00,  4.39it/s]


Benchmark completed in 228.05 seconds

📊 BENCHMARK RESULTS SUMMARY

📈 Request Statistics:
   Successful requests: 1000
   Failed requests: 0
   Success rate: 100.0%
   Benchmark duration: 228.40s

🚀 Throughput:
   Request throughput: 4.38 req/s
   Input token throughput: 69.09 tok/s
   Output token throughput: 1074.59 tok/s

⏱️ Latency Metrics (ms):
   Metric        p50     p90     p95     p99
   ---------------------------------------------
   TTFT        500.0   500.0   500.0   500.0
   TPOT         44.1    48.6    50.6    55.4
   ITL          44.1    48.6    50.6    55.4
   E2E       11391.3 11451.7 11470.6 11484.8

📊 Token Statistics:
   Total input tokens: 15,781
   Total output tokens: 245,437
   Avg input tokens/req: 15.8
   Avg output tokens/req: 245.4

💾 Results saved:
   Summary: benchmark_results/comprehensive_conversation_your-model-name_summary_20250708_033543.json
   Detailed: benchmark_results/comprehensive_conversation_your-model-name_detailed_20250708_033543.csv
✅ Comp

Processing requests: 100%|██████████| 500/500 [05:37<00:00,  1.48it/s]

Benchmark completed in 337.76 seconds

📊 BENCHMARK RESULTS SUMMARY

📈 Request Statistics:
   Successful requests: 500
   Failed requests: 0
   Success rate: 100.0%
   Benchmark duration: 339.94s

🚀 Throughput:
   Request throughput: 1.47 req/s
   Input token throughput: 441.25 tok/s
   Output token throughput: 895.52 tok/s

⏱️ Latency Metrics (ms):
   Metric        p50     p90     p95     p99
   ---------------------------------------------
   TTFT        500.0   500.0   500.0   500.0
   TPOT         31.4    33.7    34.3    36.3
   ITL          31.4    33.7    34.3    36.3
   E2E       19995.0 20089.9 20115.1 20153.7

📊 Token Statistics:
   Total input tokens: 150,000
   Total output tokens: 304,426
   Avg input tokens/req: 300.0
   Avg output tokens/req: 608.9

💾 Results saved:
   Summary: benchmark_results/comprehensive_creative_writing_your-model-name_summary_20250708_034121.json
   Detailed: benchmark_results/comprehensive_creative_writing_your-model-name_detailed_20250708_034121.c




### 250 concurrency 

In [62]:
# Your existing endpoint setup
endpoint_name = "1029620071644790784"
aip_endpoint_name = f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
endpoint = aiplatform.Endpoint(aip_endpoint_name)

# 1. Quick test (like your use cases)
results = run_vllm_style_benchmark(
    endpoint=endpoint,
    use_dedicated_endpoint=use_dedicated_endpoint,
    model_name="llama3.3_tpuv6e",
    dataset_name="random",
    num_prompts=500,
    max_concurrency=250,
    random_input_len=80,      # Email use case
    random_output_len=350,
    save_result=True
)

# 2. Run preset configurations
results = run_preset_benchmark(
    endpoint=endpoint,
    use_dedicated_endpoint=use_dedicated_endpoint,
    preset_name="throughput_test",
    model_name="llama3.3_tpuv6e"
)

# 3. Comprehensive suite (all configurations)
all_results = run_comprehensive_benchmark_suite(
    endpoint=endpoint,
    use_dedicated_endpoint=use_dedicated_endpoint,
    model_name="llama3.3_tpuv6e"
)

🚀 Starting vLLM-style TPU Benchmark
📋 Configuration:
   Model: your-model-name
   Dataset: random
   Requests: 500
   Concurrency: 250
   Temperature: 0.7
   Input length: 80
   Output length: 350
   Range ratio: 0.0

📝 Generating 500 requests...
Starting benchmark with 500 requests...
Max concurrency: 250
Temperature: 0.7
Request rate: inf


Processing requests: 100%|██████████| 500/500 [01:58<00:00,  4.21it/s]


Benchmark completed in 119.84 seconds

📊 BENCHMARK RESULTS SUMMARY

📈 Request Statistics:
   Successful requests: 500
   Failed requests: 0
   Success rate: 100.0%
   Benchmark duration: 121.14s

🚀 Throughput:
   Request throughput: 4.13 req/s
   Input token throughput: 330.20 tok/s
   Output token throughput: 2003.58 tok/s

⏱️ Latency Metrics (ms):
   Metric        p50     p90     p95     p99
   ---------------------------------------------
   TTFT        500.0   500.0   500.0   500.0
   TPOT        114.8   142.6   185.0   251.0
   ITL         114.8   142.6   185.0   251.0
   E2E       59790.7 60596.3 60615.0 60643.6

📊 Token Statistics:
   Total input tokens: 40,000
   Total output tokens: 242,713
   Avg input tokens/req: 80.0
   Avg output tokens/req: 485.4

💾 Results saved:
   Summary: benchmark_results/vllm_tpu_random_your-model-name_summary_20250708_034321.json
   Detailed: benchmark_results/vllm_tpu_random_your-model-name_detailed_20250708_034321.csv
🎯 Running preset benchmark: 

Processing requests: 100%|██████████| 2000/2000 [04:32<00:00,  7.34it/s] 


Benchmark completed in 272.79 seconds

📊 BENCHMARK RESULTS SUMMARY

📈 Request Statistics:
   Successful requests: 2000
   Failed requests: 0
   Success rate: 100.0%
   Benchmark duration: 288.80s

🚀 Throughput:
   Request throughput: 6.93 req/s
   Input token throughput: 3545.69 tok/s
   Output token throughput: 1591.11 tok/s

⏱️ Latency Metrics (ms):
   Metric        p50     p90     p95     p99
   ---------------------------------------------
   TTFT        500.0   500.0   500.0   500.0
   TPOT        114.6   123.7   136.5   151.0
   ITL         114.6   123.7   136.5   151.0
   E2E       27014.2 27453.9 31341.4 35173.8

📊 Token Statistics:
   Total input tokens: 1,024,000
   Total output tokens: 459,515
   Avg input tokens/req: 512.0
   Avg output tokens/req: 229.8

💾 Results saved:
   Summary: benchmark_results/vllm_tpu_random_your-model-name_summary_20250708_034754.json
   Detailed: benchmark_results/vllm_tpu_random_your-model-name_detailed_20250708_034754.csv
🚀 Starting Comprehensi

Processing requests: 100%|██████████| 100/100 [00:36<00:00,  2.75it/s]


Benchmark completed in 36.34 seconds

📊 BENCHMARK RESULTS SUMMARY

📈 Request Statistics:
   Successful requests: 100
   Failed requests: 0
   Success rate: 100.0%
   Benchmark duration: 36.37s

🚀 Throughput:
   Request throughput: 2.75 req/s
   Input token throughput: 1407.64 tok/s
   Output token throughput: 405.71 tok/s

⏱️ Latency Metrics (ms):
   Metric        p50     p90     p95     p99
   ---------------------------------------------
   TTFT        362.9   365.5   366.2   366.6
   TPOT         22.0    22.2    22.3    24.2
   ITL          22.0    22.2    22.3    24.2
   E2E        3629.0  3654.8  3661.8  3665.9

📊 Token Statistics:
   Total input tokens: 51,200
   Total output tokens: 14,757
   Avg input tokens/req: 512.0
   Avg output tokens/req: 147.6

💾 Results saved:
   Summary: benchmark_results/comprehensive_quick_test_your-model-name_summary_20250708_034831.json
   Detailed: benchmark_results/comprehensive_quick_test_your-model-name_detailed_20250708_034831.csv
✅ Completed 

Processing requests: 100%|██████████| 500/500 [40:14<00:00,  4.83s/it]


Benchmark completed in 2414.23 seconds

📊 BENCHMARK RESULTS SUMMARY

📈 Request Statistics:
   Successful requests: 500
   Failed requests: 0
   Success rate: 100.0%
   Benchmark duration: 2414.41s

🚀 Throughput:
   Request throughput: 0.21 req/s
   Input token throughput: 212.06 tok/s
   Output token throughput: 47.48 tok/s

⏱️ Latency Metrics (ms):
   Metric        p50     p90     p95     p99
   ---------------------------------------------
   TTFT        484.7   488.8   490.1   492.2
   TPOT         18.9    19.1    19.1    21.7
   ITL          18.9    19.1    19.1    21.7
   E2E        4846.7  4887.6  4900.8  4922.1

📊 Token Statistics:
   Total input tokens: 512,000
   Total output tokens: 114,628
   Avg input tokens/req: 1024.0
   Avg output tokens/req: 229.3

💾 Results saved:
   Summary: benchmark_results/comprehensive_latency_test_your-model-name_summary_20250708_042845.json
   Detailed: benchmark_results/comprehensive_latency_test_your-model-name_detailed_20250708_042845.csv
✅ C

Processing requests: 100%|██████████| 2000/2000 [04:31<00:00,  7.37it/s] 


Benchmark completed in 271.91 seconds

📊 BENCHMARK RESULTS SUMMARY

📈 Request Statistics:
   Successful requests: 1991
   Failed requests: 9
   Success rate: 99.6%
   Benchmark duration: 285.10s

🚀 Throughput:
   Request throughput: 6.98 req/s
   Input token throughput: 3575.52 tok/s
   Output token throughput: 1605.87 tok/s

⏱️ Latency Metrics (ms):
   Metric        p50     p90     p95     p99
   ---------------------------------------------
   TTFT        500.0   500.0   500.0   500.0
   TPOT        115.0   117.2   129.8   141.8
   ITL         115.0   117.2   129.8   141.8
   E2E       27105.1 27474.1 29262.5 33069.4

📊 Token Statistics:
   Total input tokens: 1,019,392
   Total output tokens: 457,839
   Avg input tokens/req: 512.0
   Avg output tokens/req: 230.0

💾 Results saved:
   Summary: benchmark_results/comprehensive_throughput_test_your-model-name_summary_20250708_043317.json
   Detailed: benchmark_results/comprehensive_throughput_test_your-model-name_detailed_20250708_043317

Processing requests: 100%|██████████| 100/100 [00:01<00:00, 74.57it/s]


Benchmark completed in 1.37 seconds
❌ Benchmark failed: No successful requests
Total requests: 100
Failed requests: 100

💾 Results saved:
   Summary: benchmark_results/comprehensive_long_context_your-model-name_summary_20250708_043319.json
   Detailed: benchmark_results/comprehensive_long_context_your-model-name_detailed_20250708_043319.csv
✅ Completed preset: long_context

🔄 Running preset: conversation
🎯 Running preset benchmark: conversation
📝 Description: Conversational AI test
🚀 Starting vLLM-style TPU Benchmark
📋 Configuration:
   Model: your-model-name
   Dataset: sharegpt
   Requests: 1000
   Concurrency: 50
   Temperature: 0.7
   Output length: 200

📝 Generating 1000 requests...
Starting benchmark with 1000 requests...
Max concurrency: 50
Temperature: 0.7
Request rate: inf


Processing requests: 100%|██████████| 1000/1000 [03:47<00:00,  4.40it/s]


Benchmark completed in 227.42 seconds

📊 BENCHMARK RESULTS SUMMARY

📈 Request Statistics:
   Successful requests: 1000
   Failed requests: 0
   Success rate: 100.0%
   Benchmark duration: 227.99s

🚀 Throughput:
   Request throughput: 4.39 req/s
   Input token throughput: 68.95 tok/s
   Output token throughput: 1074.08 tok/s

⏱️ Latency Metrics (ms):
   Metric        p50     p90     p95     p99
   ---------------------------------------------
   TTFT        500.0   500.0   500.0   500.0
   TPOT         43.7    48.7    50.4    59.9
   ITL          43.7    48.7    50.4    59.9
   E2E       11344.9 11441.4 11493.3 11535.7

📊 Token Statistics:
   Total input tokens: 15,719
   Total output tokens: 244,875
   Avg input tokens/req: 15.7
   Avg output tokens/req: 244.9

💾 Results saved:
   Summary: benchmark_results/comprehensive_conversation_your-model-name_summary_20250708_043707.json
   Detailed: benchmark_results/comprehensive_conversation_your-model-name_detailed_20250708_043707.csv
✅ Comp

Processing requests: 100%|██████████| 500/500 [05:38<00:00,  1.48it/s]

Benchmark completed in 338.62 seconds

📊 BENCHMARK RESULTS SUMMARY

📈 Request Statistics:
   Successful requests: 500
   Failed requests: 0
   Success rate: 100.0%
   Benchmark duration: 340.31s

🚀 Throughput:
   Request throughput: 1.47 req/s
   Input token throughput: 440.78 tok/s
   Output token throughput: 894.93 tok/s

⏱️ Latency Metrics (ms):
   Metric        p50     p90     p95     p99
   ---------------------------------------------
   TTFT        500.0   500.0   500.0   500.0
   TPOT         31.4    34.1    34.9    38.4
   ITL          31.4    34.1    34.9    38.4
   E2E       20024.0 20086.8 20117.2 20127.6

📊 Token Statistics:
   Total input tokens: 150,000
   Total output tokens: 304,550
   Avg input tokens/req: 300.0
   Avg output tokens/req: 609.1

💾 Results saved:
   Summary: benchmark_results/comprehensive_creative_writing_your-model-name_summary_20250708_044245.json
   Detailed: benchmark_results/comprehensive_creative_writing_your-model-name_detailed_20250708_044245.c




## Clean up resources


In [None]:
# # @title Delete the models and endpoints
# # @markdown  Delete the experiment models and endpoints to recycle the resources
# # @markdown  and avoid unnecessary continuous charges that may incur.

# # Undeploy model and delete endpoint.
# for endpoint in endpoints.values():
#     endpoint.delete(force=True)

# # Delete models.
# for model in models.values():
#     model.delete()

# delete_bucket = F   # @param {type:"boolean"}
# if delete_bucket:
#     ! gsutil -m rm -r $BUCKET_NAME