In [1]:
!nvidia-smi

Sat Feb 10 04:12:02 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA RTX A4500               On  | 00000000:82:00.0 Off |                  Off |
| 30%   31C    P8              18W / 200W |      2MiB / 20470MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
#!pip install vllm

In [3]:
#!pip install pandas

In [4]:
#!pip install tqdm

In [5]:
from vllm import LLM, SamplingParams
import torch
import pandas as pd
import time
import os
import tqdm
#from prompts import questions
#from rich.progress import track

llm = LLM(
    "TheBloke/Mistral-7B-v0.1-AWQ",
    dtype=torch.float16,
    quantization="AWQ",    
    download_dir="/workspace/" 
    )

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


INFO 02-10 04:12:10 llm_engine.py:72] Initializing an LLM engine with config: model='TheBloke/Mistral-7B-v0.1-AWQ', tokenizer='TheBloke/Mistral-7B-v0.1-AWQ', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir='/workspace/', load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto, seed=0)
INFO 02-10 04:12:12 weight_utils.py:164] Using model weights format ['*.safetensors']


model.safetensors:   0%|          | 0.00/4.15G [00:00<?, ?B/s]

INFO 02-10 04:13:03 llm_engine.py:322] # GPU blocks: 4816, # CPU blocks: 2048
INFO 02-10 04:13:05 model_runner.py:632] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 02-10 04:13:05 model_runner.py:636] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 02-10 04:13:11 model_runner.py:698] Graph capturing finished in 6 secs.


In [6]:
sampling_params = SamplingParams(max_tokens=200)

In [7]:
out = llm.generate("This is me warming up the model", sampling_params=sampling_params)

Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.11s/it]


In [8]:
#out[0].prompt,
out[0].outputs[0].text

", I took some quite decent pictures of the model for one of my flight simulations a while back, so I started putting them together into a cute little thing (and title sequence...?)\n\nIf anyone has heard of or is even a part of the VTOL/SIM1000SANE area, you can use me as proof that not all the models/creations are much use and that you can be better than the rest of us.\n\nAnyway, a few comments:\n\n1) I have no idea how to contain the title screen within a single file, so go figure.\n\n2) My VCR was kinda acting up (see the blue and purple visual)\n\n3) My computer wasn't reacting very well because it seems like it wants an NTSC professor to Fren anim and I'm using NTSC. Obviously I don't know about video signals so if you're a little smarter than me"

In [9]:
import tqdm

questions = ["This is me warming up the model"]*100


print(f"running inference through {len(questions)} prompt.")

results = []
for q in tqdm.tqdm(questions):
    t0 = time.perf_counter()
    output = llm.generate(q, sampling_params=sampling_params)[0]
    t1 = time.perf_counter()
    results.append(
        {"time": t1 - t0, "tokens_generated": len(output.outputs[0].token_ids)}
    )

df = pd.DataFrame(results)
df["tokens_per_sec"] = df.tokens_generated / df.time
print(f"Average tokens/sec: {df.tokens_per_sec.mean(): .3f}")
df.to_csv(f"vllm-benchmark-{tensor_parallel_size}GPUs.csv", index=False)

running inference through 100 prompt.


  0%|          | 0/100 [00:00<?, ?it/s]
Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s][A
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it][A
  1%|          | 1/100 [00:02<03:30,  2.13s/it]
Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s][A
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.09s/it][A
  2%|▏         | 2/100 [00:04<03:26,  2.11s/it]
Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s][A
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.65s/it][A
  3%|▎         | 3/100 [00:05<03:04,  1.90s/it]
Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s][A
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.10s/it][A
  4%|▍         | 4/100 [00:07<03:10,  1.98s/it]
Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s][A
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.10s/it][A
  5%|▌         | 5/100 [00:10<03:12,  2.03s/it]
Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s][A
Processed prompts: 1

Average tokens/sec:  93.677


NameError: name 'tensor_parallel_size' is not defined

In [10]:
df

Unnamed: 0,time,tokens_generated,tokens_per_sec
0,2.127778,200,93.994775
1,2.096799,200,95.383477
2,1.657019,158,95.351982
3,2.101688,200,95.161592
4,2.101913,200,95.151417
...,...,...,...
95,2.149532,200,93.043512
96,0.263813,24,90.973508
97,2.149723,200,93.035231
98,1.658141,154,92.875069


In [12]:
tensor_parallel_size = 1
df.to_csv(f"vllm-benchmark-{tensor_parallel_size}GPUs.csv", index=False)