In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from safetensors.torch import save_file
from datasets import load_dataset
from dotenv import load_dotenv
import os
from typing import List
load_dotenv()


# Choose the compute backend in priority order: CUDA GPU, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Single source of truth for the checkpoint so the tokenizer and the model
# are always loaded from the same Hub repository.
MODEL_ID = "meta-llama/Meta-Llama-3.1-405B-FP8"
# Hub access token, read from the HUGGINGFACE environment variable
# (None if the variable is not set).
HF_TOKEN = os.getenv("HUGGINGFACE")

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
# device_map="auto" lets Accelerate spread the weights over all visible
# devices; device_map="cuda" would pin every weight to a single GPU.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, token=HF_TOKEN, device_map="auto")
# Inference only: switch off dropout and other training-time behaviour.
model.eval()

In [None]:
# Load the "train" split of the "main" configuration of openai/gsm8k.
ds = load_dataset(path="openai/gsm8k", name="main", split="train")
# Keep only the question column; it is what gets fed to the model below.
questions = ds["question"]

In [None]:
def get_logits(inputs : str) -> torch.Tensor:
    """Run one forward pass of the global ``model`` over ``inputs``.

    Args:
        inputs: Text to encode with the global ``tokenizer``.

    Returns:
        The ``logits`` attribute of the model output, left on whatever
        device the model produced it on.
    """
    # Send the encoded batch to the device the model reports for itself
    # rather than the notebook-level `device`, which can disagree with the
    # placement chosen when the model was loaded.
    encoded = tokenizer(inputs, return_tensors="pt").to(model.device)

    # No gradients are needed for pure inference.
    with torch.no_grad():
        return model(**encoded).logits

In [None]:
def save_logits(
    logits : List[torch.Tensor],
    file_path : str = "llama-3.1-405b-gsm8k-base-tensors.safetensors",
) -> None:
    """Write a list of logits tensors to a single safetensors file.

    Each tensor is stored under the key ``"Question <n>"``, where ``n`` is
    its 1-based position in ``logits``.

    Args:
        logits: Tensors to persist, in question order.
        file_path: Destination file. Defaults to the file name this notebook
            has always written to, so existing callers are unaffected.
    """
    data = {
        # Copy to host memory and force a contiguous layout before handing
        # the tensor to save_file; both calls are no-ops when the tensor
        # already satisfies them.
        f"Question {position}": logit.cpu().contiguous()
        for position, logit in enumerate(logits, start=1)
    }

    save_file(data, file_path)

In [None]:
# Helper for later sessions: reads the logits written by save_logits back
# into a list.

def _question_sort_key(key):
    """Order "Question <n>" keys by the integer n; other keys sort last, by name."""
    suffix = key.rsplit(" ", 1)[-1]
    if suffix.isdecimal():
        return (0, int(suffix), key)
    return (1, 0, key)


def load_list_of_logits_safetensor(file_path):
    """Load every tensor stored in a safetensors file as a list.

    Args:
        file_path: Path of the safetensors file to read.

    Returns:
        A list of tensors ordered by the numeric suffix of their key
        ("Question 1", "Question 2", ..., "Question 10", ...), so that index
        ``i`` holds the tensor saved for question ``i + 1``.
    """
    # Imported here so this cell can be lifted into another notebook on its own.
    from safetensors import safe_open

    with safe_open(file_path, framework="pt") as f:
        # Do not rely on the order in which the file reports its keys: a
        # plain string ordering would place "Question 10" before "Question 2".
        ordered_keys = sorted(f.keys(), key=_question_sort_key)
        return [f.get_tensor(key) for key in ordered_keys]


In [None]:
# Copy each result to host memory as soon as it is produced, so the logits
# for the whole dataset do not pile up on the accelerator before being saved.
logits = [get_logits(question).cpu() for question in questions]
save_logits(logits)

RuntimeError: Failed to import transformers.integrations.fbgemm_fp8 because of the following error (look up to see its traceback):
libcudart.so.12: cannot open shared object file: No such file or directory

linux-vdso.so.1 (0x00007ffe239fe000)
        libtorch.so => not found (unexpected; needs investigation)
        libc10.so => not found   
        libcudart.so.12 => /usr/local/cuda/targets/x86_64-linux/lib/libcudart.so.12 (0x000073e204e00000) (resolves here, yet the RuntimeError above reports it as missing)
        libc10_cuda.so => not found (issue)
        libnvidia-ml.so.1 => /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1 (0x000073e203c00000)
        libtorch_cpu.so => not found
        libtorch_cuda.so => not found (massive issue)
        librt.so.1 => /usr/lib/x86_64-linux-gnu/librt.so.1 (0x000073e20513a000)
        libpthread.so.0 => /usr/lib/x86_64-linux-gnu/libpthread.so.0 (0x000073e205135000)
        libstdc++.so.6 => /usr/lib/x86_64-linux-gnu/libstdc++.so.6 (0x000073e2039d4000)
        libm.so.6 => /usr/lib/x86_64-linux-gnu/libm.so.6 (0x000073e204d19000)
        libgcc_s.so.1 => /usr/lib/x86_64-linux-gnu/libgcc_s.so.1 (0x000073e205113000)
        libc.so.6 => /usr/lib/x86_64-linux-gnu/libc.so.6 (0x000073e2037ab000)
        /lib64/ld-linux-x86-64.so.2 (0x000073e25e59b000)
        libdl.so.2 => /usr/lib/x86_64-linux-gnu/libdl.so.2 (0x000073e20510e000)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[3], line 2
      1 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-405B-FP8", token=os.getenv("HUGGINGFACE"))
----> 2 model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-405B-FP8", token=os.getenv("HUGGINGFACE"), device_map='cuda').to(device)
      3 model.eval()

File /usr/local/lib/python3.10/dist-packages/transformers/models/auto/auto_factory.py:564, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
    562 elif type(config) in cls._model_mapping.keys():
    563     model_class = _get_model_class(config, cls._model_mapping)
--> 564     return model_class.from_pretrained(
    565         pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
    566     )
    567 raise ValueError(
    568     f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
    569     f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
    570 )

File /usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:3941, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)
   3931     if dtype_orig is not None:
   3932         torch.set_default_dtype(dtype_orig)
   3934     (
   3935         model,
   3936         missing_keys,
   3937         unexpected_keys,
   3938         mismatched_keys,
   3939         offload_index,
   3940         error_msgs,
-> 3941     ) = cls._load_pretrained_model(
   3942         model,
   3943         state_dict,
   3944         loaded_state_dict_keys,  # XXX: rename?
   3945         resolved_archive_file,
   3946         pretrained_model_name_or_path,
   3947         ignore_mismatched_sizes=ignore_mismatched_sizes,
   3948         sharded_metadata=sharded_metadata,
   3949         _fast_init=_fast_init,
   3950         low_cpu_mem_usage=low_cpu_mem_usage,
   3951         device_map=device_map,
   3952         offload_folder=offload_folder,
   3953         offload_state_dict=offload_state_dict,
   3954         dtype=torch_dtype,
   3955         hf_quantizer=hf_quantizer,
   3956         keep_in_fp32_modules=keep_in_fp32_modules,
   3957         gguf_path=gguf_path,
   3958     )
   3960 # make sure token embedding weights are still tied if needed
   3961 model.tie_weights()

File /usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:4415, in PreTrainedModel._load_pretrained_model(cls, model, state_dict, loaded_keys, resolved_archive_file, pretrained_model_name_or_path, ignore_mismatched_sizes, sharded_metadata, _fast_init, low_cpu_mem_usage, device_map, offload_folder, offload_state_dict, dtype, hf_quantizer, keep_in_fp32_modules, gguf_path)
   4411                 set_module_tensor_to_device(
   4412                     model_to_load, key, "cpu", torch.empty(*param.size(), dtype=dtype)
   4413                 )
   4414     else:
-> 4415         new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model(
   4416             model_to_load,
   4417             state_dict,
   4418             loaded_keys,
   4419             start_prefix,
   4420             expected_keys,
   4421             device_map=device_map,
   4422             offload_folder=offload_folder,
   4423             offload_index=offload_index,
   4424             state_dict_folder=state_dict_folder,
   4425             state_dict_index=state_dict_index,
   4426             dtype=dtype,
   4427             hf_quantizer=hf_quantizer,
   4428             is_safetensors=is_safetensors,
   4429             keep_in_fp32_modules=keep_in_fp32_modules,
   4430             unexpected_keys=unexpected_keys,
   4431         )
   4432         error_msgs += new_error_msgs
   4433 else:
   4434     # Sharded checkpoint or whole but low_cpu_mem_usage==True

File /usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:936, in _load_state_dict_into_meta_model(model, state_dict, loaded_state_dict_keys, start_prefix, expected_keys, device_map, offload_folder, offload_index, state_dict_folder, state_dict_index, dtype, hf_quantizer, is_safetensors, keep_in_fp32_modules, unexpected_keys)
    925     state_dict_index = offload_weight(param, param_name, state_dict_folder, state_dict_index)
    926 elif (
    927     not is_quantized
    928     or (not hf_quantizer.requires_parameters_quantization)
   (...)
    934 ):
    935     # For backward compatibility with older versions of `accelerate` and for non-quantized params
--> 936     set_module_tensor_to_device(model, param_name, param_device, **set_module_kwargs)
    937 else:
    938     hf_quantizer.create_quantized_param(model, param, param_name, param_device, state_dict, unexpected_keys)

File /usr/local/lib/python3.10/dist-packages/accelerate/utils/modeling.py:373, in set_module_tensor_to_device(module, tensor_name, device, value, dtype, fp16_statistics, tied_params_map)
    369 if value is not None:
    370     # We can expect mismatches when using bnb 4bit since Params4bit will reshape and pack the weights.
    371     # In other cases, we want to make sure we're not loading checkpoints that do not match the config.
    372     if old_value.shape != value.shape and param_cls.__name__ != "Params4bit":
--> 373         raise ValueError(
    374             f'Trying to set a tensor of shape {value.shape} in "{tensor_name}" (which has shape {old_value.shape}), this looks incorrect.'
    375         )
    377     if dtype is None:
    378         # For compatibility with PyTorch load_state_dict which converts state dict dtype to existing dtype in model
    379         value = value.to(old_value.dtype)

ValueError: Trying to set a tensor of shape torch.Size([1024, 16384]) in "weight" (which has shape torch.Size([2048, 16384])), this looks incorrect.