In [1]:
from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Tokenizer, ExLlamaV2Cache_Q6
# from exllamav2 import ExLlamaV2_HF
# from transformers import AutoTokenizer
import torch
from safetensors.torch import save_file
from datasets import load_dataset
from typing import List
import gc
import tqdm

# Pick the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(device)

# Build the per-GPU device names from the number of GPUs torch actually reports,
# so the list never names a device that does not exist on this machine
# (a 4-GPU host still yields "cuda:0" .. "cuda:3").
num_gpus = torch.cuda.device_count()
devices = [f"cuda:{gpu_index}" for gpu_index in range(num_gpus)]

cuda


In [2]:
# Report, for every ordered pair of distinct GPU indices, whether the first
# device can directly access the second one's memory.
for i, j in ((a, b) for a in range(num_gpus) for b in range(num_gpus) if a != b):
    can_access = torch.cuda.can_device_access_peer(i, j)
    print(f"GPU {i} can access GPU {j}: {can_access}")

GPU 0 can access GPU 1: True
GPU 0 can access GPU 2: True
GPU 0 can access GPU 3: True
GPU 1 can access GPU 0: True
GPU 1 can access GPU 2: True
GPU 1 can access GPU 3: True
GPU 2 can access GPU 0: True
GPU 2 can access GPU 1: True
GPU 2 can access GPU 3: True
GPU 3 can access GPU 0: True
GPU 3 can access GPU 1: True
GPU 3 can access GPU 2: True


In [3]:
# Local snapshot directory of the EXL2-quantized model, named once so the
# path is easy to spot and change.
model_dir = "./llama-405b/models--ek826--Meta-Llama-3.1-405B-Instruct-6.0bpw-exl2/snapshots/c3aafe6600c3c081f514e33ea325da7a19cb1822"

config = ExLlamaV2Config()
config.model_dir = model_dir
config.prepare()

# The model and the tokenizer are both built from the same prepared config.
model = ExLlamaV2(config)
tokenizer = ExLlamaV2Tokenizer(config)

# The cache is created with lazy=True and then handed to load_autosplit,
# which performs the actual model load.
cache = ExLlamaV2Cache_Q6(model, lazy=True)
model.load_autosplit(cache)

In [4]:
# GSM8K, "main" configuration, training split; only the question column is
# used by the cells below.
ds = load_dataset(
    "openai/gsm8k",
    "main",
    split="train",
)
questions = ds["question"]

Downloading readme:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [6]:
def get_logits(inputs: str) -> torch.Tensor:
    """Run a single forward pass of the loaded model over one prompt.

    Args:
        inputs: Prompt text. It is tokenized with the notebook-level
            ``tokenizer`` before being passed to the model.

    Returns:
        Whatever ``model.forward`` returns for the token ids (presumably the
        logits tensor for the whole sequence -- TODO confirm shape and device).
    """
    # Keep the token ids under their own name instead of rebinding the
    # string parameter to a different kind of object.
    input_ids = tokenizer.encode(inputs)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model.forward(input_ids)

    # Free Python garbage and cached CUDA blocks after every prompt.
    gc.collect()
    torch.cuda.empty_cache()

    return outputs

In [7]:
def save_logits(
    logits: List[torch.Tensor],
    file_path: str = "llama-3.1-405b-gsm8k-base-tensors.safetensors",
) -> None:
    """Write a list of tensors to a single safetensors file.

    Each tensor is stored under the key ``"Question <n>"`` where ``n`` is its
    1-based position in ``logits``.

    Args:
        logits: Tensors to store, one per question, in question order.
        file_path: Destination file. Defaults to the file name this notebook
            has always written, so existing calls behave as before.
    """
    data = {
        # Move to CPU and make contiguous: safetensors rejects
        # non-contiguous tensors when saving.
        f"Question {i+1}": logit.cpu().contiguous()
        for i, logit in enumerate(logits)
    }

    save_file(data, file_path)

In [None]:
# to be used later on to load back the logits

from safetensors.torch import safe_open

def load_list_of_logits_safetensor(file_path):
    """Load every tensor from a safetensors file as a list in question order.

    Keys that end in a number (for example ``"Question 12"``) are returned in
    increasing numeric order, so index ``k`` of the result holds the tensor
    stored under ``"Question k+1"``. The order in which the file reports its
    keys is not relied upon, since a lexicographic order would place
    ``"Question 10"`` before ``"Question 2"``. Keys without a numeric suffix
    come after the numbered ones, in lexical order.

    Args:
        file_path: Path of the safetensors file to read.

    Returns:
        A list with one tensor per key in the file.
    """
    def _question_order(key):
        # Sort on the trailing integer when there is one; fall back to the
        # raw key so unexpected names still get a deterministic position.
        suffix = key.rsplit(" ", 1)[-1]
        if suffix.isdecimal():
            return (0, int(suffix), key)
        return (1, 0, key)

    # Open the safetensor file
    with safe_open(file_path, framework="pt") as f:
        ordered_keys = sorted(f.keys(), key=_question_order)
        logits_list = [f.get_tensor(key) for key in ordered_keys]

    return logits_list


In [None]:
# Run the model over every GSM8K question, keeping each result on the CPU,
# then write them all to disk.
# `tqdm` is imported as a module in this notebook, so the progress-bar class
# must be reached as `tqdm.tqdm`; calling the module itself raises TypeError.
logits = [get_logits(question).to('cpu') for question in tqdm.tqdm(questions)]
save_logits(logits)