In [1]:
import os
import datetime
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
import torch

In [2]:
sf_filename = hf_hub_download("gpt2", filename="model.safetensors")
pt_filename = hf_hub_download("gpt2", filename="pytorch_model.bin")

In [3]:
start_st = datetime.datetime.now()
weights = load_file(sf_filename, device="cpu")
load_time_st = datetime.datetime.now() - start_st
print(f"Loaded safetensors {load_time_st}")

start_pt = datetime.datetime.now()
weights = torch.load(pt_filename, map_location="cpu")
load_time_pt = datetime.datetime.now() - start_pt
print(f"Loaded pytorch {load_time_pt}")

print(f"on CPU, safetensors is faster than pytorch by: {load_time_pt/load_time_st:.1f} X")

Loaded safetensors 0:00:00.008188


  weights = torch.load(pt_filename, map_location="cpu")


Loaded pytorch 0:00:00.321877
on CPU, safetensors is faster than pytorch by: 39.3 X


In [4]:
# This is required because this feature hasn't been fully verified yet, but 
# it's been tested on many different environments
os.environ["SAFETENSORS_FAST_GPU"] = "1"

# CUDA startup out of the measurement
torch.zeros((2, 2)).cuda()

start_st = datetime.datetime.now()
weights = load_file(sf_filename, device="cuda:0")
load_time_st = datetime.datetime.now() - start_st
print(f"Loaded safetensors {load_time_st}")

start_pt = datetime.datetime.now()
weights = torch.load(pt_filename, map_location="cuda:0")
load_time_pt = datetime.datetime.now() - start_pt
print(f"Loaded pytorch {load_time_pt}")

print(f"on GPU, safetensors is faster than pytorch by: {load_time_pt/load_time_st:.1f} X")

Loaded safetensors 0:00:00.146794


  weights = torch.load(pt_filename, map_location="cuda:0")


Loaded pytorch 0:00:00.439156
on GPU, safetensors is faster than pytorch by: 3.0 X


In [5]:
from transformers import PreTrainedTokenizer
import hashlib

class CustomHashTokenizer(PreTrainedTokenizer):
    vocab = {}
    def __init__(self, vocab=None, **kwargs):
        super().__init__(**kwargs)
        self.vocab = vocab or {}
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

    def _hash_function(self, text):
        # Example hash function (SHA-256)
        return hashlib.sha256(text.encode('utf-8')).hexdigest()

    def _tokenize(self, text):
        # Tokenize the text using the hash function
        tokens = [self._hash_function(word) for word in text.split()]
        return tokens

    def _convert_token_to_id(self, token):
        # Convert token to ID (using the hash as the ID)
        return int(token, 16) % 10000  # Example: modulo to limit the ID range

    def _convert_id_to_token(self, index):
        # Convert ID back to token (not reversible in this example)
        return self.inv_vocab.get(index, "[UNK]")

    def encode(self, text, **kwargs):
        tokens = self._tokenize(text)
        token_ids = [self._convert_token_to_id(token) for token in tokens]
        return token_ids

    def decode(self, token_ids, **kwargs):
        tokens = [self._convert_id_to_token(token_id) for token_id in token_ids]
        return " ".join(tokens)

    def get_vocab(self):
        return self.vocab

    def save_vocabulary(self, save_directory, filename_prefix=None):
        # Save the vocabulary to a file
        vocab_file = f"{save_directory}/{filename_prefix}-vocab.json" if filename_prefix else f"{save_directory}/vocab.json"
        with open(vocab_file, 'w') as f:
            json.dump(self.vocab, f)
        return (vocab_file,)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
        # Load the vocabulary from a file
        vocab_file = f"{pretrained_model_name_or_path}/vocab.json"
        with open(vocab_file, 'r') as f:
            vocab = json.load(f)
        return cls(vocab=vocab, **kwargs)

In [6]:
import torch

# Example vocabulary (optional)
vocab = {
    "the": 0,
    "quick": 1,
    "brown": 2,
    "fox": 3,
    "jumps": 4,
    "over": 5,
    "lazy": 6,
    "dog": 7
}

# Create an instance of the custom tokenizer
tokenizer = CustomHashTokenizer(vocab=vocab)

# Example input text
input_text = "the quick brown fox jumps over the lazy dog"

# Encode the input text
encoded = tokenizer.encode(input_text)
print(f"Encoded: {encoded}")

# Decode the encoded text
decoded = tokenizer.decode(encoded)
print(f"Decoded: {decoded}")


Encoded: [8288, 6186, 1756, 144, 2703, 3407, 8288, 7956, 5332]
Decoded: [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]




In [7]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [8]:
encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.", return_tensors='tf')

2024-09-17 02:26:56.020554: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-17 02:26:56.036433: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-17 02:26:56.041223: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-17 02:26:56.053110: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
I0000 00:00:1726554418.474730  802151 cuda_executor.c

In [9]:
tokenizer.decode(encoded_input["input_ids"][0])    

'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]'

In [10]:
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]
encoded_inputs = tokenizer(batch_sentences)
print(encoded_inputs)

{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], [101, 1327, 1164, 5450, 23434, 136, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}


In [11]:
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]
encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf")
print(encoded_input)

{'input_ids': <tf.Tensor: shape=(3, 15), dtype=int32, numpy=
array([[  101,  1252,  1184,  1164,  1248,  6462,   136,   102,     0,
            0,     0,     0,     0,     0,     0],
       [  101,  1790,   112,   189,  1341,  1119,  3520,  1164,  1248,
         6462,   117, 21902,  1643,   119,   102],
       [  101,  1327,  1164,  5450, 23434,   136,   102,     0,     0,
            0,     0,     0,     0,     0,     0]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(3, 15), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(3, 15), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>}


In [12]:
tokenizer.decode(encoded_input["input_ids"][0]) 

'[CLS] But what about second breakfast? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'