In [None]:
# Connect Google Drive to this Colab runtime (an authorization prompt will appear).
# Prerequisite: the "Systems for Gen AI W24" directory has to sit directly under
# "My Drive" -- add a shortcut to it in Drive if it lives somewhere else.
from google.colab import drive

# Root of the project's colab folder inside the mounted Drive
colab_path = '/content/drive/MyDrive/Systems for Gen AI W24/colab'

# Mount (and remount, if a mount already exists) Drive at /content/drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Adding our custom python module directory to the path
import sys

module_dir = f"{colab_path}/code/python"
# Only append when missing, so re-running this cell does not pile up duplicate
# entries in sys.path.
if module_dir not in sys.path:
    sys.path.append(module_dir)

In [None]:
# Installing dependencies
!pip install --upgrade pip
!pip uninstall -y torchaudio torchdata torchtext torchvision
!pip install vllm==0.3.3 torchaudio torchdata torchtext torchvision

Collecting pip
  Downloading pip-24.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-24.0
Found existing installation: torchaudio 2.1.0+cu121
Uninstalling torchaudio-2.1.0+cu121:
  Successfully uninstalled torchaudio-2.1.0+cu121
Found existing installation: torchdata 0.7.0
Uninstalling torchdata-0.7.0:
  Successfully uninstalled torchdata-0.7.0
Found existing installation: torchtext 0.16.0
Uninstalling torchtext-0.16.0:
  Successfully uninstalled torchtext-0.16.0
Found existing installation: torchvision 0.16.0+cu121
Uninstalling torchvision-0.16.0+cu121:
  Successfully uninstalled torchvision-0.16.0+cu121
[0mCollecting vllm
  Downloading vllm-0.3.3-cp310-cp310-manylinux1_x86_64.whl

In [None]:
# Imports
import os
from datetime import datetime

import pandas as pd
from vllm import LLM, SamplingParams

from llama_chat_helpers import format_prompt

In [None]:
# Read the prompt dataset from the project's data folder into a pandas DataFrame
prompts_file = f"{colab_path}/data/prompts.json"
data = pd.read_json(prompts_file)
print(data)

         id                                             prompt
0         1               Give three tips for staying healthy.
1         2                 What are the three primary colors?
2        12                     What is the capital of France?
3        14         Discuss the causes of the Great Depression
4        18  Reverse engineer this code to create a new ver...
...     ...                                                ...
9995  51948  Find out if a given sentence is a metaphor.\n\...
9996  51951  List 5 strategies for better organization and ...
9997  51957  Compute the average age in the following list ...
9998  51965  Create five descriptors that describe the smel...
9999  51989  Given an input object, create an algorithm to ...

[10000 rows x 2 columns]


In [None]:
# Build the vLLM engine for the Llama 2 7B Chat checkpoint
model_name = "meta-llama/Llama-2-7b-chat-hf"
llm = LLM(model=model_name)

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

INFO 03-08 19:19:26 llm_engine.py:87] Initializing an LLM engine with config: model='meta-llama/Llama-2-7b-chat-hf', tokenizer='meta-llama/Llama-2-7b-chat-hf', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

INFO 03-08 19:19:34 weight_utils.py:163] Using model weights format ['*.safetensors']


model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

INFO 03-08 19:20:52 llm_engine.py:357] # GPU blocks: 2868, # CPU blocks: 512
INFO 03-08 19:20:54 model_runner.py:684] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 03-08 19:20:54 model_runner.py:688] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 03-08 19:21:01 model_runner.py:756] Graph capturing finished in 7 secs.


In [None]:
# Wrap every raw prompt in the Llama 2 Chat fine-tuned prompt format
raw_prompts = data['prompt']
prompts = raw_prompts.map(format_prompt)
print(prompts)

0       <s>[INST] <<SYS>>\nYou are a helpful, respectf...
1       <s>[INST] <<SYS>>\nYou are a helpful, respectf...
2       <s>[INST] <<SYS>>\nYou are a helpful, respectf...
3       <s>[INST] <<SYS>>\nYou are a helpful, respectf...
4       <s>[INST] <<SYS>>\nYou are a helpful, respectf...
                              ...                        
9995    <s>[INST] <<SYS>>\nYou are a helpful, respectf...
9996    <s>[INST] <<SYS>>\nYou are a helpful, respectf...
9997    <s>[INST] <<SYS>>\nYou are a helpful, respectf...
9998    <s>[INST] <<SYS>>\nYou are a helpful, respectf...
9999    <s>[INST] <<SYS>>\nYou are a helpful, respectf...
Name: prompt, Length: 10000, dtype: object


In [12]:
# Create sampling params
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=4096, seed=598)

# Generating outputs
# Hand vLLM a plain list of strings: `prompts` is a pandas Series, and a list
# removes any dependence on the Series index when items are looked up by position.
outputs = llm.generate(list(prompts), sampling_params)

# Add output lengths and response text to a new dataframe
# (only the first completion of each request is recorded)
completions = [output.outputs[0] for output in outputs]
output_lengths = [len(completion.token_ids) for completion in completions]
response_texts = [completion.text for completion in completions]
response_data = data.assign(response_text=response_texts, output_length=output_lengths)

# Saving the response data
# Make sure the output directory exists first, so that a missing folder cannot
# throw away the results of the generation run above.
output_dir = f"{colab_path}/output"
os.makedirs(output_dir, exist_ok=True)
time_str = datetime.now().strftime("%m-%d-%Y-%H:%M")
response_data.to_json(f"{output_dir}/prompts-with-output-{time_str}.json", orient='records', indent=2)

The code above does everything needed to generate the outputs and record their token lengths. The cells below are optional and are included for reference.

In [None]:
# Print out the outputs
# Pair prompts and outputs by position instead of looking prompts up with the
# loop counter as an index label, which only works for a default 0..n-1 index.
# Assumes `outputs` is in the same order as the rows of `data`.
for prompt, output in zip(data["prompt"], outputs):
    generated_text = output.outputs[0].text
    print("")
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

In [None]:
# Run this to get information about the runtime
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')