In [1]:
from prompting.llms.utils import GPUInfo
# Display GPUInfo's total, free and used memory figures as the cell output.
# The MemoryError message in ModelManager.load_model labels free_memory as GB.
GPUInfo.total_memory, GPUInfo.free_memory, GPUInfo.used_memory

[32m2024-08-14 12:20:24.357[0m | [1mINFO    [0m | [36mprompting[0m:[36m<module>[0m:[36m38[0m - [1mPrompting version: 2.7.0[0m
[32m2024-08-14 12:20:27.393[0m | [1mINFO    [0m | [36mprompting.utils.config[0m:[36mconfig[0m:[36m27[0m - [1mRUNNING WITH ARGS: netuid=None wallet.name=None wallet.hotkey=None subtensor.network=None axon.port=None[0m
[32m2024-08-14 12:20:27.397[0m | [1mINFO    [0m | [36mprompting.settings[0m:[36m<module>[0m:[36m12[0m - [1mConfig: 
netuid: null
wallet:
  name: null
  hotkey: null
subtensor:
  network: null
axon:
  port: null
no_prompt: false
config: null
strict: false
no_version_checking: false
[0m
[32m2024-08-14 12:20:27.516[0m | [1mINFO    [0m | [36mprompting.utils.config[0m:[36mconfig[0m:[36m27[0m - [1mRUNNING WITH ARGS: netuid=None wallet.name=None wallet.hotkey=None subtensor.network=None axon.port=None[0m
[32m2024-08-14 12:20:27.518[0m | [1mINFO    [0m | [36mprompting.settings[0m:[36mload_env[0m:[36m1

mode='validator' MOCK=False NO_BACKGROUND_THREAD=True WANDB_ON=True WANDB_ENTITY='felix-quinque-macrocosmos-ai' WANDB_PROJECT_NAME='validator' WANDB_RUN_STEP_LENGTH=100 WANDB_API_KEY='<REDACTED>' WANDB_OFFLINE=False WANDB_NOTES='' SAVE_PATH='./storage' NEURON_EPOCH_LENGTH=1 NEURON_DEVICE='cuda' NEURON_GPUS=1 LOGGING_DONT_SAVE_EVENTS=False NEURON_TIMEOUT=15 NEURON_DISABLE_SET_WEIGHTS=False NEURON_MOVING_AVERAGE_ALPHA=0.1 NEURON_DECAY_ALPHA=0.001 NEURON_AXON_OFF=False NEURON_VPERMIT_TAO_LIMIT=4096 NEURON_QUERY_UNIQUE_COLDKEYS=False NEURON_QUERY_UNIQUE_IPS=False NEURON_FORWARD_MAX_TIME=120 ORGANIC_TIMEOUT=15 ORGANIC_SAMPLE_SIZE=10 ORGANIC_REUSE_RESPONSE_DISABLED=False ORGANIC_REFERENCE_MAX_TOKENS=256 ORGANIC_SYNTH_REWARD_SCALE=1.0 ORGANIC_SET_WEIGHTS_ENABLED=True ORGANIC_DISABLED=False ORGANIC_TRIGGER_FREQUENCY=120 ORGANIC_TRIGGER_FREQUENCY_MIN=5 ORGANIC_TRIGGER='seconds' ORGANIC_SCALING_FACTOR=1 LOG_FULL=False NETUID=61 TEST=True OPENAI_API_KEY='<REDACTED>'

(44.3516845703125, 44.089599609375, 0.2620849609375)

In [2]:
from typing import ClassVar
from loguru import logger
import numpy as np
from pydantic import BaseModel, ConfigDict, model_validator
import torch
import vllm


import vllm
import numpy as np
from prompting.llms.utils import GPUInfo
from vllm.distributed.parallel_state import destroy_model_parallel
from prompting.settings import settings


class ModelConfig(BaseModel):
    """Immutable description of a single loadable model.

    Fields:
        model_id: Identifier handed to ``vllm.LLM`` when the model is loaded.
        reward: Float associated with the model (not interpreted in this file).
        min_ram: RAM budget reserved for the model; reported as GB in the
            loader's error message.
    """

    # Frozen instances cannot be mutated, which keeps them safe as dict keys.
    model_config = ConfigDict(frozen=True)

    model_id: str
    reward: float
    min_ram: float

    def __hash__(self):
        """Return a hash built from all three fields."""
        identity = (self.model_id, self.reward, self.min_ram)
        return hash(identity)


class ModelZoo:
    """Static registry of the model configurations available for loading."""

    # Every model that may be picked; min_ram is compared against RAM budgets by the callers.
    models_configs: ClassVar[list[ModelConfig]] = [
        ModelConfig(model_id="casperhansen/mistral-nemo-instruct-2407-awq", reward=0.1, min_ram=24),
        ModelConfig(model_id="casperhansen/llama-3-8b-instruct-awq", reward=0.1, min_ram=24),
        ModelConfig(model_id="casperhansen/llama-3-70b-instruct-awq", reward=0.8, min_ram=70),
    ]

    @classmethod
    def get_all_models(cls) -> list[str]:
        """Return the ``model_id`` of every registered model, in registry order."""
        return [model.model_id for model in cls.models_configs]

    @classmethod
    def get_random(cls, max_ram: float = np.inf) -> ModelConfig:
        """Pick a uniformly random model whose ``min_ram`` does not exceed ``max_ram``.

        Args:
            max_ram: Upper bound on the ``min_ram`` of eligible models. Defaults to
                infinity, i.e. every registered model is eligible.

        Returns:
            One of the eligible model configurations.

        Raises:
            ValueError: If no registered model fits into ``max_ram``.
        """
        models = [model for model in cls.models_configs if model.min_ram <= max_ram]
        if not models:
            # np.random.choice would also raise ValueError here, but with a message
            # that does not mention the RAM limit that caused it.
            raise ValueError(f"No model in the zoo fits into max_ram={max_ram}.")
        return np.random.choice(models)

    @classmethod
    def get_model_by_id(cls, model_id: str) -> ModelConfig:
        """Return the first registered configuration whose ``model_id`` matches.

        Args:
            model_id: Identifier to look up.

        Returns:
            The matching model configuration.

        Raises:
            IndexError: If ``model_id`` is not registered. The exception type is kept
                from the original list-indexing implementation so existing handlers
                continue to work; only the message is more descriptive.
        """
        for model in cls.models_configs:
            if model.model_id == model_id:
                return model
        raise IndexError(f"Unknown model id {model_id!r}. Known ids: {cls.get_all_models()}")


class ModelManager(BaseModel):
    """Loads and unloads vLLM models while tracking a RAM budget.

    Fields:
        always_active_models: Configurations loaded on construction and never evicted
            by ``load_model``.
        total_ram: Budget that the summed ``min_ram`` of loaded models may not exceed.
        active_models: Currently loaded models, keyed by their configuration.
        used_ram: Sum of ``min_ram`` over the models in ``active_models``.
    """

    always_active_models: list[ModelConfig] = []
    total_ram: float = 40.0
    active_models: dict[ModelConfig, vllm.LLM] = {}
    used_ram: float = 0.0
    model_config = ConfigDict(arbitrary_types_allowed=True)

    @model_validator(mode="after")
    def load_always_active_models(self) -> "ModelManager":
        """Load every always-active model once the instance has been validated."""
        for model_config in self.always_active_models:
            self.load_model(model_config)
        return self

    def _lacks_room_for(self, model_config: ModelConfig) -> bool:
        """Return True if either the manager's budget or the GPU's free memory is too small."""
        return self.used_ram + model_config.min_ram > self.total_ram or GPUInfo.free_memory < model_config.min_ram

    def load_model(self, model_config: ModelConfig, force: bool = True):
        """Load ``model_config`` into ``active_models``.

        Args:
            model_config: Configuration of the model to load.
            force: When True, other loaded models (except always-active ones) are
                unloaded one by one until the new model fits.

        Returns:
            The newly created ``vllm.LLM`` on success. ``None`` if the model was already
            loaded, or if constructing it failed (the failure is logged, not raised).

        Raises:
            MemoryError: If there is still not enough room after any forced eviction.
        """
        # Check this first: an already-loaded model must neither trigger eviction
        # (which could evict the requested model itself) nor a MemoryError.
        if model_config in self.active_models:
            logger.info(f"Model {model_config.model_id} is already loaded.")
            return

        # if force loading is enabled, unload models until there is enough RAM
        if force:
            # Iterate over a snapshot: unload_model removes entries from active_models,
            # and mutating a dict while iterating over it raises RuntimeError.
            for active_model in list(self.active_models):
                if active_model in self.always_active_models:
                    continue
                if not self._lacks_room_for(model_config):
                    break
                logger.debug(f"Unloading {active_model.model_id} to make room for {model_config.model_id}")
                self.unload_model(active_model)

        if self._lacks_room_for(model_config):
            raise MemoryError(
                f"""Not enough RAM to load model {model_config.model_id}. 
                    Required: {model_config.min_ram} GB
                    Available in Model Manager: {self.total_ram - self.used_ram} GB
                    Available in GPU: {GPUInfo.free_memory} GB"""
            )

        try:
            model = vllm.LLM(model_config.model_id, max_model_len=8_000)
            self.active_models[model_config] = model
            self.used_ram += model_config.min_ram
            logger.info(f"Model {model_config.model_id} loaded. Current used RAM: {self.used_ram} GB")

            return model
        except Exception as e:
            # Best effort by design: the failure is logged and None is returned.
            logger.exception(f"Failed to load model {model_config.model_id}. Error: {str(e)}")

    def unload_model(self, model_config: ModelConfig):
        """Remove ``model_config`` from ``active_models`` and release its resources.

        Logs a warning and returns if the model is not currently loaded.
        """
        if model_config not in self.active_models:
            logger.warning("Couldn't find model to unload.")
            return
        import gc

        destroy_model_parallel()
        # Remove the registry entry unconditionally so that active_models and used_ram
        # always stay in sync, even if tearing down the worker below fails.
        model = self.active_models.pop(model_config)
        try:
            del model.llm_engine.model_executor.driver_worker
        except Exception as e:
            # Still best effort, but no longer silent.
            logger.warning(f"Could not delete driver worker of {model_config.model_id}: {e}")
        del model
        gc.collect()
        self.used_ram -= model_config.min_ram
        torch.cuda.empty_cache()

    def get_or_load(self, model_id: str) -> vllm.LLM:
        """Return the loaded model for ``model_id``, loading it first if necessary.

        Raises:
            IndexError: If ``model_id`` is not registered in ``ModelZoo``.
            MemoryError: If the model does not fit (see ``load_model``).
            KeyError: If loading failed; ``load_model`` logs loader errors without
                re-raising, so the model is then missing from ``active_models``.
        """
        model_config = ModelZoo.get_model_by_id(model_id)
        if model_config not in self.active_models:
            self.load_model(model_config)
        return self.active_models[model_config]


# keep model used for validation always active


import asyncio
import random
from pydantic import BaseModel
from loguru import logger

class AsyncModelScheduler(BaseModel):
    """Periodically swaps a randomly chosen model in and out of a ``ModelManager``.

    Fields:
        model_manager: Manager that performs the actual loading and unloading.
        interval: Seconds a selected model stays active before it is unloaded.
        running: Loop flag; set by ``start`` and cleared by ``stop``.
    """

    model_manager: ModelManager
    interval: int = 10  # Minimum time in seconds for a model to stay active
    running: bool = False

    async def start(self):
        """Mark the scheduler as running and run the loop until it is stopped."""
        self.running = True
        await self.run_scheduler()

    async def stop(self):
        """Ask the loop to exit; it does so at the start of its next iteration."""
        self.running = False

    async def run_scheduler(self):
        """Repeatedly pick a random model that fits the manager's RAM budget,
        keep it loaded for ``interval`` seconds, then unload it."""
        while self.running:
            selected_model = ModelZoo.get_random(max_ram=self.model_manager.total_ram)
            logger.info(f"Loading model {selected_model.model_id} for {self.interval} seconds.")

            if selected_model in self.model_manager.active_models:
                logger.info(f"Model {selected_model.model_id} is already loaded.")
                # Previously this branch returned, which silently ended the scheduler
                # while `running` stayed True. Wait out the interval instead (this also
                # yields to the event loop) and leave the model loaded, since this
                # iteration did not load it.
                await asyncio.sleep(self.interval)
                continue

            # Load the selected model
            await self.load_model_async(selected_model)

            # Keep the model loaded for the specified time interval
            await asyncio.sleep(self.interval)

            # After the interval, unload the model if it is not in always_active_models
            if selected_model not in self.model_manager.always_active_models:
                logger.info(f"Unloading model {selected_model.model_id} after {self.interval} seconds.")
                self.model_manager.unload_model(selected_model)

        logger.info("Model scheduler stopped.")

    async def load_model_async(self, model_config: ModelConfig):
        """Run the blocking ``ModelManager.load_model`` in the default executor."""
        # get_running_loop() is the supported way to obtain the loop inside a coroutine.
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, self.model_manager.load_model, model_config)


  from .autonotebook import tqdm as notebook_tqdm
2024-08-14 12:20:32,206	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.




In [3]:
# model_manager = ModelManager(always_active_models=[ModelZoo.get_model_by_id(model_id=ModelZoo.models_configs[0].model_id)])

In [4]:
from pydantic import BaseModel

# Manager with its defaults: no always-active models, so nothing is loaded here.
model_manager = ModelManager()

# Scheduler bound to that manager, using the default interval.
model_scheduler = AsyncModelScheduler(model_manager=model_manager)


In [14]:
import numpy as np

a = np.random.normal(size=10)
# NOTE(review): min_bound and max_bound are not defined in any visible cell; they must
# be set before this cell runs on a fresh kernel.
# Combine the two masks element-wise with `&`. The former `[...] and [...]` expression
# evaluated to the second list only (a non-empty list is truthy), so the upper bound
# was silently ignored.
successes = np.sum((a > min_bound) & (a < max_bound))

In [5]:

# Start the scheduler asynchronously
# start() awaits the scheduler loop, so this cell blocks until that loop exits.
# NOTE(review): asyncio.run() raises RuntimeError if an event loop is already running in
# this thread; it presumably works here because the loop has been patched (e.g. by
# nest_asyncio) — confirm, or use `await model_scheduler.start()` in the notebook.
asyncio.run(model_scheduler.start())


[32m2024-08-14 12:20:32.275[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/llama-3-8b-instruct-awq for 10 seconds.[0m


INFO 08-14 12:20:32 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:20:32 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/llama-3-8b-instruct-awq', speculative_config=None, tokenizer='casperhansen/llama-3-8b-instruct-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/llama-3-8b-instruct-awq, use_v2_block_manager=False, en

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  5.47it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.50it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.68it/s]



INFO 08-14 12:20:36 model_runner.py:692] Loading model weights took 5.3453 GB
INFO 08-14 12:20:38 gpu_executor.py:102] # GPU blocks: 16735, # CPU blocks: 2048
INFO 08-14 12:20:40 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:20:40 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:20:56 model_runner.py:1181] Graph capturing finished in 16 secs.


[32m2024-08-14 12:20:56.461[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/llama-3-8b-instruct-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:21:06.472[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/llama-3-8b-instruct-awq after 10 seconds.[0m
[32m2024-08-14 12:21:06.933[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:21:07 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:21:07 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.35it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.08it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.11it/s]



INFO 08-14 12:21:11 model_runner.py:692] Loading model weights took 8.0802 GB
INFO 08-14 12:21:13 gpu_executor.py:102] # GPU blocks: 12277, # CPU blocks: 1638
INFO 08-14 12:21:15 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:21:15 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:21:30 model_runner.py:1181] Graph capturing finished in 14 secs.


[32m2024-08-14 12:21:30.210[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:21:40.222[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:21:40.787[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:21:40 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:21:40 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.72it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.29it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.34it/s]



INFO 08-14 12:21:44 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:21:46 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:21:47 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:21:47 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:22:00 model_runner.py:1181] Graph capturing finished in 13 secs.


[32m2024-08-14 12:22:00.571[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:22:10.580[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:22:11.108[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:22:11 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:22:11 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.61it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.21it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.25it/s]



INFO 08-14 12:22:14 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:22:17 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:22:17 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:22:17 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:22:31 model_runner.py:1181] Graph capturing finished in 13 secs.


[32m2024-08-14 12:22:31.126[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:22:41.137[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:22:41.678[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:22:41 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:22:41 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.77it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.30it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.35it/s]



INFO 08-14 12:22:45 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:22:47 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:22:48 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:22:48 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:23:03 model_runner.py:1181] Graph capturing finished in 15 secs.


[32m2024-08-14 12:23:03.696[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:23:13.708[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:23:14.400[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:23:14 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:23:14 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.54it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.14it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.19it/s]



INFO 08-14 12:23:18 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:23:20 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:23:21 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:23:21 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:23:35 model_runner.py:1181] Graph capturing finished in 14 secs.


[32m2024-08-14 12:23:35.618[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:23:45.631[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:23:46.252[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:23:46 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:23:46 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.60it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.18it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.23it/s]



INFO 08-14 12:23:50 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:23:52 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:23:52 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:23:52 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:24:06 model_runner.py:1181] Graph capturing finished in 14 secs.


[32m2024-08-14 12:24:06.709[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:24:16.722[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:24:17.280[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:24:17 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:24:17 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.67it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.21it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.26it/s]



INFO 08-14 12:24:21 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:24:23 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:24:24 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:24:24 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:24:38 model_runner.py:1181] Graph capturing finished in 15 secs.


[32m2024-08-14 12:24:38.874[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:24:48.885[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:24:49.561[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:24:49 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:24:49 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.58it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.15it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.19it/s]



INFO 08-14 12:24:53 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:24:55 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:24:56 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:24:56 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:25:10 model_runner.py:1181] Graph capturing finished in 14 secs.


[32m2024-08-14 12:25:10.501[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:25:20.512[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:25:21.205[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:25:21 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:25:21 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.66it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.20it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.25it/s]



INFO 08-14 12:25:24 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:25:27 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:25:27 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:25:27 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:25:41 model_runner.py:1181] Graph capturing finished in 14 secs.


[32m2024-08-14 12:25:41.463[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:25:51.475[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:25:52.017[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:25:52 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:25:52 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.61it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.18it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.23it/s]



INFO 08-14 12:25:55 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:25:58 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:25:58 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:25:58 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:26:12 model_runner.py:1181] Graph capturing finished in 14 secs.


[32m2024-08-14 12:26:12.441[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:26:22.452[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:26:23.235[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:26:23 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:26:23 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.58it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.18it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.23it/s]



INFO 08-14 12:26:26 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:26:29 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:26:29 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:26:29 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:26:43 model_runner.py:1181] Graph capturing finished in 14 secs.


[32m2024-08-14 12:26:43.513[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:26:53.524[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:26:54.218[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:26:54 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:26:54 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.57it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.17it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.21it/s]



INFO 08-14 12:26:58 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:27:00 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:27:01 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:27:01 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:27:14 model_runner.py:1181] Graph capturing finished in 14 secs.


[32m2024-08-14 12:27:14.629[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:27:24.643[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:27:25.315[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:27:25 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:27:25 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.60it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.15it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.20it/s]



INFO 08-14 12:27:29 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:27:31 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:27:32 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:27:32 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:27:46 model_runner.py:1181] Graph capturing finished in 14 secs.


[32m2024-08-14 12:27:46.544[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:27:56.555[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:27:57.098[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:27:57 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:27:57 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.62it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.21it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.26it/s]



INFO 08-14 12:28:00 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:28:03 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:28:03 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:28:03 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:28:17 model_runner.py:1181] Graph capturing finished in 14 secs.


[32m2024-08-14 12:28:17.372[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:28:27.384[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:28:27.885[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:28:28 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:28:28 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.68it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.26it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.31it/s]



INFO 08-14 12:28:31 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:28:34 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:28:34 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:28:34 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:28:47 model_runner.py:1181] Graph capturing finished in 13 secs.


[32m2024-08-14 12:28:47.527[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:28:57.538[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:28:58.063[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:28:58 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:28:58 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.80it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.32it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.37it/s]



INFO 08-14 12:29:01 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:29:04 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:29:04 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:29:04 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:29:17 model_runner.py:1181] Graph capturing finished in 13 secs.


[32m2024-08-14 12:29:17.706[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:29:27.719[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:29:28.268[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:29:28 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:29:28 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.60it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.18it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.23it/s]



INFO 08-14 12:29:32 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:29:34 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:29:35 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:29:35 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:29:49 model_runner.py:1181] Graph capturing finished in 14 secs.


[32m2024-08-14 12:29:49.183[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:29:59.195[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:29:59.729[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:29:59 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:29:59 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.69it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.23it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.29it/s]



INFO 08-14 12:30:03 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:30:05 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:30:07 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:30:07 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:30:22 model_runner.py:1181] Graph capturing finished in 15 secs.


[32m2024-08-14 12:30:22.251[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:30:32.260[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:30:33.042[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:30:33 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:30:33 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.62it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.18it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.23it/s]



INFO 08-14 12:30:36 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:30:39 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:30:40 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:30:40 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:30:53 model_runner.py:1181] Graph capturing finished in 13 secs.


[32m2024-08-14 12:30:53.482[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:31:03.495[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:31:04.175[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:31:04 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:31:04 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.67it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.22it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.27it/s]



INFO 08-14 12:31:07 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:31:10 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:31:10 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:31:10 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:31:26 model_runner.py:1181] Graph capturing finished in 15 secs.


[32m2024-08-14 12:31:26.333[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:31:36.345[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:31:37.098[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:31:37 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:31:37 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.56it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.18it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.22it/s]



INFO 08-14 12:31:40 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:31:43 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:31:44 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:31:44 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:31:58 model_runner.py:1181] Graph capturing finished in 14 secs.


[32m2024-08-14 12:31:58.202[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:32:08.213[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:32:08.837[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:32:08 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:32:08 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.63it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.21it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.26it/s]



INFO 08-14 12:32:12 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:32:15 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:32:16 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:32:16 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:32:30 model_runner.py:1181] Graph capturing finished in 14 secs.


[32m2024-08-14 12:32:30.222[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:32:40.233[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:32:41.014[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:32:41 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:32:41 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.63it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.20it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.24it/s]



INFO 08-14 12:32:44 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:32:47 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:32:47 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:32:47 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:33:02 model_runner.py:1181] Graph capturing finished in 14 secs.


[32m2024-08-14 12:33:02.403[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:33:12.415[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:33:13.061[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:33:13 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:33:13 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.38it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.21it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.23it/s]



INFO 08-14 12:33:16 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:33:19 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:33:19 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:33:19 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:33:34 model_runner.py:1181] Graph capturing finished in 14 secs.


[32m2024-08-14 12:33:34.036[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:33:44.048[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:33:44.878[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:33:45 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:33:45 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.57it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.20it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.24it/s]



INFO 08-14 12:33:48 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:33:51 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:33:51 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:33:51 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:34:04 model_runner.py:1181] Graph capturing finished in 13 secs.


[32m2024-08-14 12:34:04.825[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:34:14.838[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:34:15.603[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:34:15 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:34:15 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.61it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.15it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.20it/s]



INFO 08-14 12:34:19 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:34:22 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:34:22 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:34:22 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:34:35 model_runner.py:1181] Graph capturing finished in 13 secs.


[32m2024-08-14 12:34:35.858[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:34:45.870[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:34:46.612[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:34:46 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:34:46 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.60it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.20it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.24it/s]



INFO 08-14 12:34:50 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:34:52 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:34:53 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:34:53 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:35:06 model_runner.py:1181] Graph capturing finished in 13 secs.


[32m2024-08-14 12:35:06.346[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m
[32m2024-08-14 12:35:16.359[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m153[0m - [1mUnloading model casperhansen/mistral-nemo-instruct-2407-awq after 10 seconds.[0m
[32m2024-08-14 12:35:16.986[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_scheduler[0m:[36m140[0m - [1mLoading model casperhansen/mistral-nemo-instruct-2407-awq for 10 seconds.[0m


INFO 08-14 12:35:17 awq_marlin.py:77] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-14 12:35:17 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='casperhansen/mistral-nemo-instruct-2407-awq', speculative_config=None, tokenizer='casperhansen/mistral-nemo-instruct-2407-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=casperhansen/mistral-nemo-instruct-2407-awq, use_v2_bl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.47it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.17it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.20it/s]



INFO 08-14 12:35:21 model_runner.py:692] Loading model weights took 7.8469 GB
INFO 08-14 12:35:23 gpu_executor.py:102] # GPU blocks: 12365, # CPU blocks: 1638
INFO 08-14 12:35:24 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-14 12:35:24 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-14 12:35:36 model_runner.py:1181] Graph capturing finished in 12 secs.


[32m2024-08-14 12:35:36.736[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m88[0m - [1mModel casperhansen/mistral-nemo-instruct-2407-awq loaded. Current used RAM: 24.0 GB[0m


In [28]:
from vllm import SamplingParams

# Sampling settings for a short test generation with a fixed seed.
sampling_params = SamplingParams(
    seed=43,
    max_tokens=100,
    temperature=0.5,
    top_p=0.9,
    top_k=50,
)

# Run one prompt through the first entry of `model_manager.active_models`.
list(model_manager.active_models.values())[0].generate(
    "Hello, my name is",
    sampling_params=sampling_params,
)

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.60s/it, est. speed input: 3.75 toks/s, output: 62.46 toks/s]


[RequestOutput(request_id=2, prompt='Hello, my name is', prompt_token_ids=[1, 22177, 1044, 2036, 2564, 1395], prompt_logprobs=None, outputs=[CompletionOutput(index=0, text=" [Your Name], and I am a [Your Age]-year-old [Your Gender] from [Your City/State/Country]. I am here to learn about [Your Interest or Goal]. I would like to ask you some questions about [Your Topic of Interest] if you don't mind.\n\n في: Hi, my name is [Your Name], and I am a [Your Age]-year-old [Your Gender] from [Your City/State/Country]. I am here to learn about", token_ids=(1766, 16994, 9878, 3605, 1321, 1362, 1855, 1261, 1766, 16994, 21191, 26118, 26098, 15962, 1766, 16994, 59944, 1093, 1562, 1766, 16994, 6308, 1047, 3906, 14787, 15543, 3077, 1362, 1855, 3226, 1317, 8178, 2314, 1766, 16994, 24789, 1505, 57148, 3077, 1362, 2168, 2479, 1317, 4237, 1636, 2269, 8352, 2314, 1766, 16994, 96620, 1307, 24789, 1093, 1693, 1636, 2607, 2405, 5759, 1338, 1819, 1058, 24665, 1044, 2036, 2564, 1395, 1766, 16994, 9878, 3605, 1

In [20]:
# Ask the model manager to load whatever model config ModelZoo.get_random() returns.
# NOTE(review): `model_manager` and `ModelZoo` are not defined in this cell — presumably
# they come from earlier cells; confirm the notebook survives Restart & Run All.
model_manager.load_model(ModelZoo.get_random())

In [8]:
import torch

# mem_get_info reports (free, total) bytes for CUDA device 0; show both in GiB.
free, total = torch.cuda.mem_get_info(0)
free / 2**30, total / 2**30

(42.999755859375, 44.3516845703125)

In [None]:
# Remove the `llm_engine` attribute from the active model stored under `model_config`.
# NOTE(review): `model_config` is not defined in this cell — presumably left over from
# an earlier cell; verify before relying on this.
del model_manager.active_models[model_config].llm_engine

In [None]:
# Tear down `llm` (a vllm.LLM object) so that its GPU memory can be reclaimed.
# NOTE(review): `llm` is not defined in this cell — it must already exist in the kernel.
import gc

import torch

try:
    # Newer vLLM releases (the engine log in this notebook reports v0.5.3.post1)
    # expose the helper from `vllm.distributed`.
    from vllm.distributed.parallel_state import destroy_model_parallel
except ImportError:
    # Older vLLM releases kept it under `model_executor.parallel_utils`.
    from vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel

try:
    import ray
except ImportError:
    # Best-effort cleanup: without ray installed there is nothing to shut down.
    ray = None

destroy_model_parallel()
# Drop the executor first (e.g. a vllm.executor.ray_gpu_executor.RayGPUExecutor object),
# then the LLM wrapper itself, so no reference keeps the model weights alive.
del llm.llm_engine.model_executor
del llm
gc.collect()
# Return the now-unreferenced cached CUDA blocks to the driver.
torch.cuda.empty_cache()
if ray is not None:
    ray.shutdown()

In [None]:
import getpass
import os

import wandb

# SECURITY: never hardcode the W&B API key in the notebook. Read it from the
# environment, or prompt for it interactively when the variable is not set.
# Any key that was previously committed in this cell should be revoked and rotated.
wandb_api_key = os.environ.get("WANDB_API_KEY") or getpass.getpass("W&B API key: ")

wandb.login(relogin=True, key=wandb_api_key)
wandb.init(
    project="validators",
    entity="felix-quinque-macrocosmos-ai",
)

In [None]:
import wandb

# Start a W&B run and keep the returned run handle in WANDB.
# NOTE(review): other places in this notebook use the entity
# "felix-quinque-macrocosmos-ai" — confirm which spelling is intended.
WANDB = wandb.init(project="validator", entity="felix-quinque-macrocosmos")

In [None]:
import bittensor as bt

# Create a wallet handle by name (no hotkey given) and a metagraph object for netuid 1.
# NOTE(review): "dalkfjsl" looks like a placeholder wallet name — confirm.
wallet = bt.wallet(name="dalkfjsl", hotkey=None)
metagraph = bt.metagraph(netuid=1)

In [None]:
import re


class Version:
    """Same as packaging.version, but also supports comparison to strings

    Versions are compared component by component in lexicographic order, with each
    dot-separated component compared numerically (so "1.10.0" > "1.9.0").
    Trailing zero components are ignored, i.e. "1.2" equals "1.2.0". A non-numeric
    suffix inside a component (e.g. "3rc1") is compared as text after its numeric
    prefix.

    Comparing with anything other than a ``Version`` or a ``str`` returns
    ``NotImplemented``: equality then evaluates to False and ordering raises
    ``TypeError``. Instances are not hashable because ``__eq__`` is overridden
    without a matching ``__hash__``.
    """

    def __init__(self, version: str = "1.2.3"):
        self.version: str = version

    def __str__(self):
        return f"{self.version}"

    def __repr__(self):
        return f"{self.version}"

    @staticmethod
    def _key(value):
        """Return a comparable tuple for a Version or version string.

        Returns None when ``value`` is neither a Version nor a str.
        """
        if isinstance(value, Version):
            value = value.version
        if not isinstance(value, str):
            return None
        key = []
        for part in value.strip().split("."):
            # Split each component into its leading digits and whatever follows.
            digits, suffix = re.match(r"(\d*)(.*)", part, re.DOTALL).groups()
            key.append((int(digits) if digits else 0, suffix))
        # Drop trailing zero components so that "1.2" and "1.2.0" compare equal.
        while key and key[-1] == (0, ""):
            key.pop()
        return tuple(key)

    def __eq__(self, other):
        other_key = self._key(other)
        return NotImplemented if other_key is None else self._key(self) == other_key

    def __le__(self, other):
        other_key = self._key(other)
        return NotImplemented if other_key is None else self._key(self) <= other_key

    def __lt__(self, other):
        other_key = self._key(other)
        return NotImplemented if other_key is None else self._key(self) < other_key

    def __ge__(self, other):
        other_key = self._key(other)
        return NotImplemented if other_key is None else self._key(self) >= other_key

    def __gt__(self, other):
        other_key = self._key(other)
        return NotImplemented if other_key is None else self._key(self) > other_key



Version("1.2.3") < "1.2.2.4"

In [None]:
# Collapse "major.minor.patch" into one integer using the weights 1000 / 10 / 1.
version_split = "1.2.3".split(".")
sum(weight * int(part) for weight, part in zip((1000, 10, 1), version_split))

In [None]:
import time
i = 0  # NOTE(review): never read in this cell; looks like a leftover.

start_time = time.time()
for _ in range(10):
    time.sleep(0.1)
    # Prints the time elapsed since the previous iteration and resets the reference
    # point in the same expression: operands are evaluated left to right, so
    # `-start_time` uses the old value before the walrus rebinds `start_time`.
    print(f"{-start_time+(start_time := time.time())} seconds have passed")
    # The reversed form below rebinds `start_time` *before* subtracting it, so it
    # would always print 0.0:
    # print(f"{(start_time := time.time())-start_time} seconds have passed")


In [None]:
class Foo:
    """Minimal class used to show how double-underscore attributes are stored."""

    def __init__(self):
        # Inside a class body `__bar` is name-mangled to `_Foo__bar`.
        self.__bar = 42


foo = Foo()
# The instance dict shows the mangled attribute name.
vars(foo)

In [None]:
import bittensor as bt

In [None]:
# Build the metagraph for netuid 61 on the "test" network, requesting a sync and
# the non-lite variant. This rebinds the `metagraph` name used by an earlier cell.
metagraph = bt.metagraph(netuid=61, network="test", sync=True, lite=False)

In [None]:
# Display the `axons` attribute of the current `metagraph` object.
metagraph.axons

In [None]:
import numpy as np


# Row vector of shape (1, 10) and column vector of shape (10, 1), filled with
# random floats from NumPy's global generator (unseeded, so values differ per run).
x = np.reshape(np.random.random(size=10), (1, -1))
y = np.reshape(np.random.random(size=10), (-1, 1))

# `x @ y` is a (1, 1) array. `.item()` extracts its single element as a Python
# float; calling `float()` on an array with ndim > 0 is deprecated since NumPy 1.25.
(x @ y).item(), x @ y

In [None]:
# from prompting.tasks.date_qa import DateQuestionAnsweringTask
from prompting.tasks.base_task import BaseTask
from prompting.rewards.reward import BaseRewardModel
from prompting.tasks.summarization import SummarizationTask, SummarizationRewardConfig
from prompting.tasks.qa import QuestionAnsweringTask, QARewardConfig

from prompting.datasets.wiki import WikiDataset
from prompting.datasets.base import BaseDataset
from pydantic import BaseModel, ConfigDict
import random
from typing import ClassVar
import bittensor as bt


class TaskConfig(BaseModel):
    """Configuration entry tying a task class to its sampling weight, datasets and reward model.

    NOTE(review): the ``X.__class__`` annotations name the *metaclass* of ``X``, so
    they presumably accept any class object built by that metaclass rather than
    only subclasses of ``X`` — confirm whether ``type[X]`` was intended.
    """

    # Task class; TaskRegistry calls it with llm_pipeline, context and reward_config.
    task: BaseTask.__class__
    # Relative weight used when TaskRegistry samples a task with random.choices.
    probability: float
    # Dataset classes from which TaskRegistry picks one for this task.
    datasets: list[BaseDataset.__class__]
    # Reward configuration class; TaskRegistry instantiates it without arguments.
    reward_model: BaseRewardModel.__class__

    # Lets pydantic accept arbitrary (non-pydantic) types in the annotations above.
    model_config = ConfigDict(arbitrary_types_allowed=True)


class TaskRegistry(BaseModel):
    """Registry of the available task types and their sampling weights.

    Every method is a classmethod operating on the class-level ``tasks`` list, so
    the registry can be used without creating an instance.
    """

    # ClassVar: shared class data, not a pydantic model field.
    tasks: ClassVar[list[TaskConfig]] = [
        TaskConfig(task=QuestionAnsweringTask, probability=0.6, datasets=[WikiDataset], reward_model=QARewardConfig),
        TaskConfig(
            task=SummarizationTask, probability=0.4, datasets=[WikiDataset], reward_model=SummarizationRewardConfig
        ),
        # TaskConfig(task=DateQuestionAnsweringTask, probability=0.2, datasets=[WikiDateDataset])
    ]

    @classmethod
    def random(cls) -> TaskConfig:
        """Return one registered TaskConfig, sampled with weights given by ``probability``."""
        probabilities = [task.probability for task in cls.tasks]
        return random.choices(cls.tasks, probabilities)[0]

    @classmethod
    def get_task_datasets(cls, task: BaseTask.__class__):
        """Return the dataset classes registered for the task class ``task``.

        Logs an error and returns an empty list when ``task`` is not registered.
        """
        # Explicit lookup instead of `[...][0]` inside a blanket `except Exception`,
        # so unrelated errors are no longer reported as "non-registered task".
        task_config = next((t for t in cls.tasks if task is t.task), None)
        if task_config is None:
            bt.logging.error("Tried accessing non-registered task")
            return []
        return task_config.datasets

    @classmethod
    def get_random_task_dataset(cls, task: BaseTask.__class__) -> BaseDataset.__class__:
        """Return one randomly chosen dataset class registered for ``task``.

        Raises:
            IndexError: if no dataset is available (e.g. ``task`` is not registered),
                because ``random.choice`` is then called on an empty list.
        """
        return random.choice(cls.get_task_datasets(task))

    @classmethod
    def get_task_reward(cls, task: BaseTask.__class__) -> BaseRewardModel.__class__:
        """Return the reward model class registered for the task class ``task``.

        Logs an error and returns an empty list when ``task`` is not registered.
        NOTE(review): the empty-list fallback is kept for backward compatibility;
        ``None`` would be a more natural "not found" value — confirm with callers.
        """
        task_config = next((t for t in cls.tasks if task is t.task), None)
        if task_config is None:
            bt.logging.error("Tried accessing non-registered task")
            return []
        return task_config.reward_model

    @classmethod
    def create_random_task(cls, llm_pipeline) -> BaseTask:
        """Sample a task type and build an instance of it.

        The context comes from ``.next()`` on a fresh instance of a randomly chosen
        dataset class; the reward config is a fresh instance of the registered
        reward model class.
        """
        task_config = cls.random()
        dataset = cls.get_random_task_dataset(task_config.task)
        return task_config.task(
            llm_pipeline=llm_pipeline, context=dataset().next(), reward_config=task_config.reward_model()
        )


In [None]:
# Look up the reward model registered for QuestionAnsweringTask.
# get_task_reward is a classmethod, so the TaskRegistry() instance is not strictly needed.
TaskRegistry().get_task_reward(QuestionAnsweringTask)