In [26]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from quanto import quantize, freeze
import torch

In [2]:
model_name = "EleutherAI/pythia-410m"

In [3]:
model = AutoModelForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=True)



In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
text = "Hello, my name is"

In [6]:
inputs = tokenizer(text, return_tensors="pt")

In [7]:
outputs = model.generate(**inputs, max_new_tokens=10)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


In [8]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

'Hello, my name is John." "I\'m a lawyer." "I'

In [9]:
def compute_model_sizes(model):
    param_size = 0
    for p in model.parameters():
        param_size += p.nelement() * p.element_size()
        
    buffer_size = 0
    for b in model.buffers():
        buffer_size += b.nelement() * b.element_size()
        
    all_size = (param_size + buffer_size) / 1024**3
    return all_size

In [10]:
model_size = compute_model_sizes(model)
print(f"The model size is {model_size} GB")

The model size is 1.6095970571041107 GB


In [11]:
quantize(model, weights=torch.int8, activations=None)

In [12]:
model

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): QLinear(in_features=1024, out_features=3072, bias=True)
          (dense): QLinear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): QLinear(in_features=1024, out_features=4096, bias=True)
          (dense_4h_to_h): QLinear(in_features=4096, out_features=1024, bias=True

In [13]:
model.gpt_neox

GPTNeoXModel(
  (embed_in): Embedding(50304, 1024)
  (emb_dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-23): 24 x GPTNeoXLayer(
      (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (post_attention_dropout): Dropout(p=0.0, inplace=False)
      (post_mlp_dropout): Dropout(p=0.0, inplace=False)
      (attention): GPTNeoXAttention(
        (rotary_emb): GPTNeoXRotaryEmbedding()
        (query_key_value): QLinear(in_features=1024, out_features=3072, bias=True)
        (dense): QLinear(in_features=1024, out_features=1024, bias=True)
        (attention_dropout): Dropout(p=0.0, inplace=False)
      )
      (mlp): GPTNeoXMLP(
        (dense_h_to_4h): QLinear(in_features=1024, out_features=4096, bias=True)
        (dense_4h_to_h): QLinear(in_features=4096, out_features=1024, bias=True)
        (act): GELUActivation()
      )
    )
  )
  (final_layer_n

In [24]:
model.gpt_neox.layers[0].attention.dense.weight

Parameter containing:
tensor([[ 0.0061, -0.0016, -0.0068,  ..., -0.0062,  0.0138,  0.0222],
        [ 0.0077,  0.0157, -0.0090,  ...,  0.0013, -0.0132,  0.0109],
        [-0.0330,  0.0008,  0.0281,  ...,  0.0026,  0.0456, -0.0077],
        ...,
        [-0.0105,  0.0091, -0.0137,  ..., -0.0046,  0.0371, -0.0077],
        [-0.0063,  0.0035,  0.0147,  ...,  0.0220,  0.0158,  0.0224],
        [-0.0299,  0.0129,  0.0208,  ..., -0.0040, -0.0065,  0.0122]],
       requires_grad=True)

In [27]:
freeze(model)

In [28]:
model.gpt_neox.layers[0].attention.dense.weight

QTensor(tensor([[ 12,  -3, -14,  ..., -12,  28,  45],
        [ 18,  37, -21,  ...,   3, -31,  26],
        [-75,   2,  64,  ...,   6, 104, -18],
        ...,
        [-25,  22, -33,  ..., -11,  89, -19],
        [-14,   8,  33,  ...,  49,  35,  50],
        [-56,  24,  39,  ...,  -8, -12,  23]], dtype=torch.int8), scale=tensor([[0.0005],
        [0.0004],
        [0.0004],
        ...,
        [0.0004],
        [0.0004],
        [0.0005]]), public_dtype=torch.float32, requires_grad=True)

In [29]:
compute_model_sizes(model)

1.6095977798104286

In [31]:
model

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): QLinear(in_features=1024, out_features=3072, bias=True)
          (dense): QLinear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): QLinear(in_features=1024, out_features=4096, bias=True)
          (dense_4h_to_h): QLinear(in_features=4096, out_features=1024, bias=True