In [None]:
# Clone the NVIDIA TensorRT-LLM repository and switch into its Llama example
# directory. %cd changes the kernel's working directory, so every relative path
# used by the following cells resolves against TensorRT-LLM/examples/llama.
# NOTE(review): no branch or tag is checked out after cloning — presumably the
# example scripts should match the installed tensorrt_llm version; confirm.
!git clone https://github.com/NVIDIA/TensorRT-LLM.git
%cd TensorRT-LLM/examples/llama

In [None]:
# Check that tensorrt_llm is correctly installed: the import below raises
# ImportError if the package is missing from the kernel's environment.
import tensorrt_llm

In [None]:
# Download the Meta-Llama-3-8B-Instruct weights from the Hugging Face Hub into
# tmp/hf_models/ (relative to the current working directory).
# No token is passed explicitly: tokens for gated models will be fetched
# through modal secrets.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
    max_workers=4,  # number of parallel download workers
    local_dir="tmp/hf_models/Meta-Llama-3-8B-Instruct",
)

In [None]:
# Convert the raw Hugging Face model weights into TensorRT-LLM checkpoint format.
# Input:  ./tmp/hf_models/Meta-Llama-3-8B-Instruct (the downloaded model directory)
# Output: ./tmp/trt_engines/1-gpu/ (checkpoint requested in float16 via --dtype)
# Both paths are relative to the current working directory.

!python convert_checkpoint.py --model_dir ./tmp/hf_models/Meta-Llama-3-8B-Instruct \
                             --output_dir ./tmp/trt_engines/1-gpu/ \
                             --dtype float16

In [None]:
# Compile the converted checkpoint into a TensorRT-LLM engine; refer to the
# NVIDIA TensorRT-LLM repository for information about the arguments.
# Input:  ./tmp/trt_engines/1-gpu/ (checkpoint written by convert_checkpoint.py)
# Output: ./tmp/trt_engines/compiled-model/
# NOTE(review): --max_input_len 32256 looks larger than the model's native
# context window — confirm this value is intended.

!trtllm-build --checkpoint_dir ./tmp/trt_engines/1-gpu/ \
            --output_dir ./tmp/trt_engines/compiled-model/ \
            --gpt_attention_plugin float16 \
            --gemm_plugin float16 \
            --max_input_len 32256

In [None]:
# Upload the compiled model to the Hugging Face Hub

import os
from huggingface_hub import HfApi

# Directory holding the compiled engine. Repo paths are computed relative to its
# parent so every uploaded file lands under "compiled-model/..." in the repo,
# including files in nested sub-directories.
engine_dir = os.path.join("tmp", "trt_engines", "compiled-model")
upload_root = os.path.dirname(engine_dir)

# Create the client once instead of once per file; the write token is read from
# the HF_WRITE_TOKEN environment variable (KeyError if it is not set).
api = HfApi(token=os.environ["HF_WRITE_TOKEN"])

for root, dirs, files in os.walk(engine_dir, topdown=False):
    for name in files:
        filepath = os.path.join(root, name)
        # Keep the full sub-path below upload_root and normalise the separator
        # to "/" so the repo path does not depend on the local OS.
        filename = os.path.relpath(filepath, upload_root).replace(os.sep, "/")
        print("uploading file: ", filename)
        api.upload_file(
            path_or_fileobj=filepath,
            path_in_repo=filename,
            repo_id="agyaatcoder/llama3-8b-instruct-A100-trtllm",
        )

In [None]:
# Test the compiled model
#
# The working directory is still TensorRT-LLM/examples/llama (set by the %cd in
# the first cell), so all paths below are relative to it:
#   - the tokenizer is read from the Hugging Face model directory downloaded
#     earlier (tmp/hf_models/Meta-Llama-3-8B-Instruct);
#   - the engine is read from the trtllm-build output directory.
# NOTE(review): run.py is expected one directory up, in TensorRT-LLM/examples/
# (the previous "./llama/..." paths were relative to that directory) — confirm
# for the checked-out revision.

!python3 ../run.py --max_output_len=256 \
               --tokenizer_dir ./tmp/hf_models/Meta-Llama-3-8B-Instruct/ \
               --engine_dir=./tmp/trt_engines/compiled-model \
               --max_attention_window_size=4096