## **Quantization of Ministral 8B**

In [None]:
# Install/upgrade bitsandbytes. `%pip` installs into the environment of the
# running kernel, whereas `!pip` uses whichever pip is first on the shell PATH.
%pip install -U bitsandbytes



In [None]:
# Install transformers. `%pip` installs into the environment of the running
# kernel, whereas `!pip` uses whichever pip is first on the shell PATH.
%pip install transformers



In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from accelerate import init_empty_weights
from transformers import BitsAndBytesConfig

In [None]:
import os
import sys
import psutil

# Function to measure memory usage (for CPU or GPU)
def get_memory_usage() -> str:
    """Report current memory usage as a human-readable string.

    When CUDA is available, the memory allocated by PyTorch is summed over
    every visible GPU, so a model that ``device_map="auto"`` spreads across
    several devices is fully accounted for (on a single GPU the result is the
    same as querying the current device). Otherwise the resident set size of
    the current process is reported.

    Returns:
        The usage in units of 1024**3 bytes, formatted with two decimals and
        suffixed with the source, e.g. ``"5.34 GB (GPU)"`` or
        ``"1.20 GB (CPU)"``.
    """
    bytes_per_unit = 1024 ** 3

    # GPU memory usage
    if torch.cuda.is_available():
        allocated = sum(
            torch.cuda.memory_allocated(index)
            for index in range(torch.cuda.device_count())
        )
        return f"{allocated / bytes_per_unit:.2f} GB (GPU)"

    # CPU memory usage
    resident = psutil.Process(os.getpid()).memory_info().rss
    return f"{resident / bytes_per_unit:.2f} GB (CPU)"

In [None]:
# Model identifier handed to `from_pretrained` for both the model and tokenizer.
MODEL_NAME = "mistralai/Ministral-8B-Instruct-2410"

# bitsandbytes settings: 4-bit loading, "nf4" quantization type,
# double quantization switched on.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [None]:
# Baseline reading: this cell runs before `from_pretrained`, so label it as such.
print("Memory before loading model:", get_memory_usage())

Memory after loading model: 0.00 GB (GPU)


In [None]:
# Load the checkpoint with the 4-bit quantization settings; `device_map="auto"`
# leaves weight placement to accelerate.
# `trust_remote_code` is deliberately not enabled: the Mistral architecture is
# implemented in transformers itself, so there is no need to allow code from
# the model repository to be executed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Reading taken right after the quantized weights have been loaded.
print(f"Memory after loading model: {get_memory_usage()}")

Memory after loading model: 5.34 GB (GPU)


In [None]:
# Load the tokenizer from the same checkpoint identifier used for the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


In [None]:
from transformers import pipeline

In [None]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
# `torch_dtype` and `device_map` are load-time options that only matter when
# the pipeline has to instantiate the model itself (i.e. `model` is a name or
# path). The model object passed here was already loaded and placed by
# `from_pretrained`, so repeating them is redundant and they are omitted.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# Take another reading now that the pipeline object has been created.
print(f"Memory after loading model: {get_memory_usage()}")

Memory after loading model: 5.34 GB (GPU)


In [None]:
# Section markers and introduction used to lay out the generation prompt.
INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."

# One prompt line per entry. "{instruction}" is kept as a placeholder to be
# filled in later with `str.format`; the trailing empty entry produces the
# newline that ends the prompt after the response marker.
PROMPT_FOR_GENERATION_FORMAT = "\n".join(
    (
        INTRO_BLURB,
        INSTRUCTION_KEY,
        "{instruction}",
        END_KEY,
        RESPONSE_KEY,
        "",
    )
)

In [None]:
def process_stream(instruction, temperature, top_p, top_k, max_new_tokens):
    """Generate one sampled completion for ``instruction`` with the global ``pipe``.

    The instruction is lower-cased and substituted into
    ``PROMPT_FOR_GENERATION_FORMAT`` before being passed to the pipeline.

    Args:
        instruction: Text to place in the instruction slot of the prompt.
        temperature: Forwarded unchanged to ``pipe`` as ``temperature``.
        top_p: Forwarded unchanged to ``pipe`` as ``top_p``.
        top_k: Forwarded unchanged to ``pipe`` as ``top_k``.
        max_new_tokens: Forwarded unchanged to ``pipe`` as ``max_new_tokens``.

    Returns:
        The ``'generated_text'`` entry of the first sequence returned by ``pipe``.
    """
    # NOTE(review): lower-casing discards the capitalisation of the
    # instruction — confirm this is intended.
    filled_prompt = PROMPT_FOR_GENERATION_FORMAT.format(instruction=instruction.lower())

    sampling_options = {
        "do_sample": True,
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "top_k": top_k,
        "top_p": top_p,
        "num_return_sequences": 1,
    }
    outputs = pipe(filled_prompt, **sampling_options)
    first_output = outputs[0]
    return first_output['generated_text']

In [None]:
# `top_p` is a cumulative probability and must lie in (0, 1]; the previous
# value of 50 was out of range. 1.0 leaves nucleus filtering switched off,
# and `top_k=1` keeps only the single most likely token at each step.
response = process_stream(
    instruction="What is the capital of India",
    top_p=1.0,
    top_k=1,
    temperature=1,
    max_new_tokens=100,
)
# Keep only the text that follows the last response marker.
response = response.split('### Response:')[-1]
wrapped_text = response

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [None]:
# Show the text that followed the '### Response:' marker.
print(wrapped_text)


The capital of India is New Delhi.


In [None]:
# `wrapped_text` was assigned from `response`, so this prints the same text.
print(response)


The capital of India is New Delhi.


## **Quantization of Mistral 7B**

In [None]:
# Install/upgrade bitsandbytes. `%pip` installs into the environment of the
# running kernel, whereas `!pip` uses whichever pip is first on the shell PATH.
%pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1


In [None]:
# Install transformers. `%pip` installs into the environment of the running
# kernel, whereas `!pip` uses whichever pip is first on the shell PATH.
%pip install transformers



In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from accelerate import init_empty_weights
from transformers import BitsAndBytesConfig

In [None]:
import os
import sys
import psutil

# Function to measure memory usage (for CPU or GPU)
def get_memory_usage() -> str:
    """Report current memory usage as a human-readable string.

    When CUDA is available, the memory allocated by PyTorch is summed over
    every visible GPU, so a model that ``device_map="auto"`` spreads across
    several devices is fully accounted for (on a single GPU the result is the
    same as querying the current device). Otherwise the resident set size of
    the current process is reported.

    Returns:
        The usage in units of 1024**3 bytes, formatted with two decimals and
        suffixed with the source, e.g. ``"3.84 GB (GPU)"`` or
        ``"1.20 GB (CPU)"``.
    """
    bytes_per_unit = 1024 ** 3

    # GPU memory usage
    if torch.cuda.is_available():
        allocated = sum(
            torch.cuda.memory_allocated(index)
            for index in range(torch.cuda.device_count())
        )
        return f"{allocated / bytes_per_unit:.2f} GB (GPU)"

    # CPU memory usage
    resident = psutil.Process(os.getpid()).memory_info().rss
    return f"{resident / bytes_per_unit:.2f} GB (CPU)"

In [None]:
# Model identifier handed to `from_pretrained` for both the model and tokenizer.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

# bitsandbytes settings: 4-bit loading, "nf4" quantization type,
# double quantization switched on.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [None]:
# Baseline reading: this cell runs before `from_pretrained`, so label it as such.
print("Memory before loading model:", get_memory_usage())

Memory after loading model: 0.00 GB (GPU)


In [None]:
# Load the checkpoint with the 4-bit quantization settings; `device_map="auto"`
# leaves weight placement to accelerate.
# `trust_remote_code` is deliberately not enabled: the Mistral architecture is
# implemented in transformers itself, so there is no need to allow code from
# the model repository to be executed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [None]:
# Reading taken right after the quantized weights have been loaded.
print(f"Memory after loading model: {get_memory_usage()}")

Memory after loading model: 3.84 GB (GPU)


In [None]:
# Load the tokenizer from the same checkpoint identifier used for the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)



tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
from transformers import pipeline

In [None]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
# `torch_dtype` and `device_map` are load-time options that only matter when
# the pipeline has to instantiate the model itself (i.e. `model` is a name or
# path). The model object passed here was already loaded and placed by
# `from_pretrained`, so repeating them is redundant and they are omitted.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# Take another reading now that the pipeline object has been created.
print(f"Memory after loading model: {get_memory_usage()}")

Memory after loading model: 3.85 GB (GPU)


In [None]:
# Section markers and introduction used to lay out the generation prompt.
INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."

# One prompt line per entry. "{instruction}" is kept as a placeholder to be
# filled in later with `str.format`; the trailing empty entry produces the
# newline that ends the prompt after the response marker.
PROMPT_FOR_GENERATION_FORMAT = "\n".join(
    (
        INTRO_BLURB,
        INSTRUCTION_KEY,
        "{instruction}",
        END_KEY,
        RESPONSE_KEY,
        "",
    )
)

In [None]:
def process_stream(instruction, temperature, top_p, top_k, max_new_tokens):
    """Generate one sampled completion for ``instruction`` with the global ``pipe``.

    The instruction is lower-cased and substituted into
    ``PROMPT_FOR_GENERATION_FORMAT`` before being passed to the pipeline.

    Args:
        instruction: Text to place in the instruction slot of the prompt.
        temperature: Forwarded unchanged to ``pipe`` as ``temperature``.
        top_p: Forwarded unchanged to ``pipe`` as ``top_p``.
        top_k: Forwarded unchanged to ``pipe`` as ``top_k``.
        max_new_tokens: Forwarded unchanged to ``pipe`` as ``max_new_tokens``.

    Returns:
        The ``'generated_text'`` entry of the first sequence returned by ``pipe``.
    """
    # NOTE(review): lower-casing discards the capitalisation of the
    # instruction — confirm this is intended.
    filled_prompt = PROMPT_FOR_GENERATION_FORMAT.format(instruction=instruction.lower())

    sampling_options = {
        "do_sample": True,
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "top_k": top_k,
        "top_p": top_p,
        "num_return_sequences": 1,
    }
    outputs = pipe(filled_prompt, **sampling_options)
    first_output = outputs[0]
    return first_output['generated_text']

In [None]:
# `top_p` is a cumulative probability and must lie in (0, 1]; the previous
# value of 50 was out of range. 1.0 leaves nucleus filtering switched off,
# and `top_k=1` keeps only the single most likely token at each step.
response = process_stream(
    instruction="What is the capital of India",
    top_p=1.0,
    top_k=1,
    temperature=1,
    max_new_tokens=100,
)
# Keep only the text that follows the last response marker.
response = response.split('### Response:')[-1]
wrapped_text = response

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [None]:
# Show the text that followed the '### Response:' marker.
print(wrapped_text)


The capital city of India is New Delhi. It is a city that holds great historical significance and is home to several iconic landmarks such as the Red Fort, Qutub Minar, and India Gate. New Delhi serves as the political and administrative hub of the country.


In [None]:
# `wrapped_text` was assigned from `response`, so this prints the same text.
print(response)


The capital city of India is New Delhi. It is a city that holds great historical significance and is home to several iconic landmarks such as the Red Fort, Qutub Minar, and India Gate. New Delhi serves as the political and administrative hub of the country.
