#Quantization using bitsandbytes

In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m4.7 MB/

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from transformers import pipeline


In [3]:
# Define a function to load the Galactica model
def load_model(model_id, quantization_config=None, device_map="auto"):
    """
    Load a language model with optional quantization configuration and device mapping.

    Args:
        model_id (str): The ID of the model to load.
        quantization_config (BitsAndBytesConfig, optional): Quantization configuration. Default is None.
        device_map (str, optional): Device mapping for model loading. Default is "auto".

    Returns:
        model (AutoModelForCausalLM): Loaded Galactica model.
        tokenizer (AutoTokenizer): Loaded tokenizer for the model.
    """
    model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, device_map=device_map)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    return model, tokenizer



In [4]:
# Define a function to generate text using a loaded model and tokenizer
def generate_text(model, tokenizer, input_text, max_length=50, num_return_sequences=1):
    """
    Generate text using a loaded language model and tokenizer.

    Args:
        model (AutoModelForCausalLM): Loaded language model.
        tokenizer (AutoTokenizer): Loaded tokenizer for the model.
        input_text (str): Input text for text generation.
        max_length (int, optional): Maximum length of generated text. Default is 50.
        num_return_sequences (int, optional): Number of generated sequences to return. Default is 1.

    Returns:
        output_text (str): Generated text.
    """
    text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
    output_text = text_generator(input_text, max_length=max_length, num_return_sequences=num_return_sequences)[0]["generated_text"]
    return output_text


In [6]:
def main():
    """
    Main function to demonstrate loading and generating text using Galactica models with different quantization configurations.
    """

    model_id = "facebook/galactica-125m"


    # Load the Galactica model with 4-bit quantization and automatic device mapping
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    galactica_model, galactica_tokenizer = load_model(model_id, quantization_config=quantization_config)

    # Define input text for generation
    input_text = "Generative AI is an exciting area of technology [START_REF]"



    # Generate text using the loaded model
    generated_text = generate_text(galactica_model, galactica_tokenizer, input_text)
    print(f"Generated Text: {generated_text}")

    # Load another model with different quantization configurations
    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
    )
    model_nf4, tokenizer_nf4 = load_model(model_id, quantization_config=nf4_config)

    # Generate text using the nf4 quantization model
    generated_nf4_text = generate_text(model_nf4, tokenizer_nf4, input_text)
    print(f"Generated Text (nf4): {generated_nf4_text}")

    # Load another model with double quantization and bfloat16 compute dtype
    double_quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
    )
    model_double_quant, tokenizer_double_quant = load_model(model_id, quantization_config=double_quant_config)

    # Generate text using the double quantization model
    generated_double_quant_text = generate_text(model_double_quant, tokenizer_double_quant, input_text)
    print(f"Generated Text (double quant): {generated_double_quant_text}")


    # Load a comprehensive quantization configuration model
    comprehensive_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    model_comprehensive, tokenizer_comprehensive = load_model(model_id, quantization_config=comprehensive_config)

    # Generate text using the comprehensive quantization model
    generated_comprehensive_text = generate_text(model_comprehensive, tokenizer_comprehensive, input_text)
    print(f"Generated Text (comprehensive): {generated_comprehensive_text}")

    # Example of loading and generating text with another model
    model_id_neox = "EleutherAI/gpt-neox-20b"
    neox_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    neox_model, neox_tokenizer = load_model(model_id_neox, quantization_config=neox_config)

    input_text_neox = "Once upon a time, there was a"
    output_text_neox = generate_text(neox_model, neox_tokenizer, input_text_neox, max_length=50)
    print(f"Generated Text (NEOX): {output_text_neox}")

    # Display model size in MB
    num_params = sum(p.numel() for p in neox_model.parameters() if p.requires_grad)
    size_mb = num_params * 32 / 8 / (1024 * 1024)
    print(f"Model size: {size_mb:.2f} MB")


In [7]:
# Execute the main function when this script is run
if __name__ == "__main__":
    main()

Downloading (…)lve/main/config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/250M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.14M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/3.00 [00:00<?, ?B/s]

Generated Text: Generative AI is an exciting area of technology [START_REF] A Survey of Generative Adversarial Networks, Zhang.

In this paper, we propose a novel generative adversarial network (GAN) for image generation. The GAN is a generative model that can




Generated Text (nf4): Generative AI is an exciting area of technology [START_REF] Generative Adversarial Nets, Goodfellow.

# 2.2.2. Generative Adversarial Networks

Generative Adversarial Networks (GANs)  Generative Adversarial Nets, Goodfellow
Generated Text (double quant): Generative AI is an exciting area of technology [START_REF] A Survey of Generative Adversarial Networks, Zhang.

In this paper, we propose a novel generative adversarial network (GAN) for image generation. The GAN is a generative model that can
Generated Text (comprehensive): Generative AI is an exciting area of technology [START_REF] Generative Adversarial Nets, Goodfellow.

# 2.2.2. Generative Adversarial Networks

Generative Adversarial Networks (GANs)  Generative Adversarial Nets, Goodfellow


Downloading (…)lve/main/config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/60.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/46 [00:00<?, ?it/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/926M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/604M [00:00<?, ?B/s]

Downloading (…)of-00046.safetensors:   0%|          | 0.00/620M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/46 [00:00<?, ?it/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/457k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Generated Text (NEOX): Once upon a time, there was a little girl who had a very special gift. She could see the future. She could see the past. She could see the present. She could see the future. She could see the past. She could see
Model size: 2368.17 MB


#Quantization using GPTQ

In [1]:
# Install the AutoGPTQ and transformers libraries
!BUILD_CUDA_EXT=0 pip install -q auto-gptq transformers


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.8 MB[0m [31m3.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m109.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m78.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━

In [2]:
# Importing necessary libraries
import random  # For random number generation
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig  # For GPTQ implementation
from datasets import load_dataset  # For loading datasets
import torch  # For tensor operations
from transformers import AutoTokenizer  # For tokenization


In [3]:
# Define the base model (GPT-2 in this case) and the output directory for the quantized model
model_id = "gpt2"  # We are using GPT-2 as the base model
out_dir = model_id + "-GPTQ"  # Output directory for the quantized model



In [4]:
# Define the quantization configuration using BaseQuantizeConfig
# bits=4 means we are quantizing to 4 bits
# group_size=128 specifies the size of the lazy batch for quantization
# damp_percent=0.01 is a parameter for the Cholesky reformulation
# desc_act=False means we are not sorting rows by decreasing activation

quantize_config = BaseQuantizeConfig(
    bits=4,  # 4-bit quantization
    group_size=128,  # Group size for lazy batch
    damp_percent=0.01,  # Cholesky reformulation parameter
    desc_act=False  # No sorting by decreasing activation
)

# Load the model and tokenizer
model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)


Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
# Load and tokenize dataset
n_samples = 1024  # Number of samples for quantization
data = load_dataset("allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", split=f"train[:{n_samples*5}]")
tokenized_data = tokenizer("\n\n".join(data['text']), return_tensors='pt')


Downloading readme:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2441065 > 1024). Running this sequence through the model will result in indexing errors


In [6]:
# Randomly select and format tokenized examples
examples_ids = []
for _ in range(n_samples):
    i = random.randint(0, tokenized_data.input_ids.shape[1] - tokenizer.model_max_length - 1)
    j = i + tokenizer.model_max_length
    input_ids = tokenized_data.input_ids[:, i:j]
    attention_mask = torch.ones_like(input_ids)
    examples_ids.append({'input_ids': input_ids, 'attention_mask': attention_mask})


In [7]:
# Perform the quantization using the model's quantize method
# examples_ids contains the tokenized text
# batch_size=1 means we are using a batch size of 1 for quantization
# use_triton=True enables the use of OpenAI Triton for GPU acceleration

model.quantize(
    examples_ids,  # Tokenized text
    batch_size=1,  # Batch size for quantization
    use_triton=True  # Use OpenAI Triton for GPU acceleration
)


In [8]:
# Save the quantized model and tokenizer to the specified output directory
# use_safetensors=True saves the model in a format compatible with SafeTensors
model.save_quantized(out_dir, use_safetensors=True)
tokenizer.save_pretrained(out_dir)

# Reload the quantized model and tokenizer from the output directory
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = AutoGPTQForCausalLM.from_quantized(
    out_dir,
    device=device,
    use_triton=True,
    use_safetensors=True,
)
tokenizer = AutoTokenizer.from_pretrained(out_dir)




In [9]:
# Test the quantized model using Hugging Face's pipeline for text generation
from transformers import pipeline

generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
result = generator("I have a dream", do_sample=True, max_length=50)[0]['generated_text']
print(result)


The model 'GPT2GPTQForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 

I have a dream," he said.

"I dream myself on to this next city. I wish I could do something special. With these guys, it's not over. It's just for heaven's sake."
