# vLLM Iteration-Level Schedule Optimization
## Ari Singer, Jack Holland and Vishwa Ramanakumar

This notebook evaluates the performance of our custom vLLM scheduling algorithm. It is designed to be run in Google Colab on an A100 instance, because the other available instances do not have enough memory to fit the LLM used in our experiments.

In [None]:
# This block mounts your google drive and sets up the ssh key for pulling from git
# Will need to set up ssh key yourself
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
! mkdir -p /root/.ssh/
%cd /content/drive/MyDrive/eecs598_genai
! cp ./.ssh/* /root/.ssh/
! ssh-keyscan -t rsa github.com >> ~/.ssh/known_hosts
! chmod go-rwx /root/.ssh/id_rsa"""

In [None]:
# Central path configuration used by the remaining cells.

# Locations under the mounted Google Drive
base_drive_path = "/content/drive/MyDrive/eecs598_genai"
output_path = base_drive_path + "/output"
requirements_path = base_drive_path + "/colab_requirements/on_disk_requirements.txt"

# Locations inside the cloned GitHub repository
base_github_path = "/content/github_repo"
python_path = base_github_path + "/python"
vllm_path = python_path + "/vllm_custom"
data_path = base_github_path + "/data"

In [None]:
# Initial clone of github repo
%cd /content
! git clone --depth 1 -b main git@github.com:ajsinger1/eecs598-genai-project.git github_repo

In [None]:
# Installing dependencies (note that we are using the custom vllm), this should take ~10+ min
!pip install --upgrade pip
!pip uninstall -y torchaudio torchdata torchtext torchvision
!pip install -e {vllm_path} torchaudio torchdata torchtext torchvision

In [None]:
# Make our custom python module directory and the custom vLLM directory importable
import sys
for module_dir in (python_path, vllm_path):
  sys.path.append(f"{module_dir}")

In [None]:
# Imports 
import vllm
import pandas as pd
from llama_chat_helpers import format_prompt
from vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel
import gc
import torch

In [None]:
# Read prompts.json from the repository's data directory into a pandas DataFrame
prompts_file = f"{data_path}/prompts.json"
data = pd.read_json(prompts_file)
print(data)

In [None]:
# Build a vLLM engine for Llama 2 7B Chat with one fixed pair of preemption thresholds
# NOTE(review): this engine presumably keeps GPU memory allocated until it is released -
# confirm it is freed before another vllm.LLM is constructed.
model = "meta-llama/Llama-2-7b-chat-hf"
llm = vllm.LLM(
  model=model,
  preemption_threshold=600,
  preemption_mode_upper_threshold=1000,
)

In [None]:
# Run every raw prompt through format_prompt (imported from llama_chat_helpers)
raw_prompts = data["prompt"]
prompts = raw_prompts.map(format_prompt)
print(prompts)

In [None]:
import os

model = "meta-llama/Llama-2-7b-chat-hf"

# Values to sweep; every (threshold, upper threshold) pair below is one experiment.
preemption_thresholds = [200, 400, 600, 800, 1000]
preemption_upper_thresholds = [300, 1000, 30000]

sampling_params = vllm.SamplingParams(temperature=0.8, top_p=0.95, max_tokens=4096, seed=598)

# Cartesian product of both lists, executed in reverse order
configs = [(preemption_threshold, preemption_upper_threshold) for preemption_threshold in preemption_thresholds for preemption_upper_threshold in preemption_upper_thresholds]
configs = configs[::-1]

# SLICE CONFIGS TO TELL IT WHERE TO START
#configs = configs[1:]

# Upper bound on how often engine construction is retried for a single experiment.
# Without a bound, a persistent failure (e.g. out of memory) would retry forever.
MAX_LLM_INIT_ATTEMPTS = 3


def _release_llm():
  """Drop the notebook-level `llm` (if any) and free the GPU memory it held.

  Safe to call when no `llm` exists or when no torch.distributed process group
  has been initialised.
  """
  destroy_model_parallel()
  # Remove the global reference; memory cannot be reclaimed while `llm` is still bound
  globals().pop("llm", None)
  gc.collect()
  # destroy_process_group() raises if no process group was ever initialised
  if torch.distributed.is_initialized():
    torch.distributed.destroy_process_group()
  torch.cuda.empty_cache()


for index, (preemption_threshold, preemption_upper_threshold) in enumerate(configs):
  print(f"Beginning experiment {index + 1}: {preemption_threshold}-{preemption_upper_threshold}")
  # Skip the experiment when its result file is already in output_path.
  # NOTE(review): assumes our custom vLLM writes its results under exactly this file name - confirm
  file_name = f"preemption-threshold{preemption_threshold}_preemption-mode-upper-threshold{preemption_upper_threshold}.txt"
  file_path = os.path.join(output_path, file_name)

  if os.path.exists(file_path):
    print(f"Experiment {index + 1} already exists. Skipping.")
    continue

  # An engine left over from an earlier cell or an aborted run would still occupy the GPU
  if "llm" in globals():
    _release_llm()

  llm_ready = False
  for attempt in range(1, MAX_LLM_INIT_ATTEMPTS + 1):
    try:
      llm = vllm.LLM(model=model, preemption_threshold=preemption_threshold, preemption_mode_upper_threshold=preemption_upper_threshold)
      llm_ready = True
      break
    except Exception as e:
      # `except Exception` (not a bare except) so that interrupting the kernel still works
      print(f"LLM creation failed (attempt {attempt}/{MAX_LLM_INIT_ATTEMPTS}): {e}. Trying to free memory.")
      _release_llm()

  if not llm_ready:
    print(f"FAILURE FOR EXPERIMENT {index+1}: {preemption_threshold}-{preemption_upper_threshold} (could not create LLM)")
    continue

  # RUN EXPERIMENT HERE
  # Note: our vLLM implementation does our timing/saving for us
  try:
    outputs = llm.generate(prompts, sampling_params, filedir=output_path)
  except Exception as e:
    print(f"FAILURE FOR EXPERIMENT {index+1}: {preemption_threshold}-{preemption_upper_threshold} ({e})")
  finally:
    # Cleanup always runs, even when generation fails or is interrupted
    _release_llm()
    print("Successfully delete the llm pipeline and free the GPU memory!")

In [None]:
# Final cleanup: release any LLM that is still alive and free its GPU memory.
# This cell is safe to run even if `llm` has already been deleted by an earlier cell.
destroy_model_parallel()
# pop() instead of `del llm`, which raises NameError when `llm` no longer exists
globals().pop("llm", None)
gc.collect()
# destroy_process_group() raises if the process group has already been torn down
if torch.distributed.is_initialized():
  torch.distributed.destroy_process_group()
torch.cuda.empty_cache()
print("Successfully delete the llm pipeline and free the GPU memory!")