## **Quantized LLM Inferencing**

In [1]:
import torch
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, GenerationConfig, TextIteratorStreamer

In [2]:
def get_model(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    tokenizer.pad_token = tokenizer.unk_token
    tokenizer.pad_token_id = tokenizer.unk_token_id
    tokenizer.padding_side = "left"

    streamer = TextIteratorStreamer(tokenizer)

    compute_dtype = getattr(torch, "bfloat16")
    
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True
    )
    
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map={"": 0},
        attn_implementation="flash_attention_2",
        cache_dir=f"../models",
        local_files_only=True
    )
    
    return model, tokenizer, streamer

In [3]:
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
model, tokenizer, streamer = get_model(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
def generate(instruction):
    prompt = "[INST]" + instruction + "[/INST]"
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].cuda()

    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=GenerationConfig(pad_token_id=tokenizer.pad_token_id, temperature=1.0, top_p=1.0, top_k=50, num_beams=1),
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=256
    )

    response = ""

    for seq in generation_output.sequences:
        output = tokenizer.decode(seq)
        response += output.split("[/INST]")[1].strip()

    return response

In [5]:
def generate_stream(instruction):
    prompt = "[INST]" + instruction + "[/INST]"
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].cuda()

    generation_kwargs = dict(
        input_ids=input_ids,
        generation_config=GenerationConfig(pad_token_id=tokenizer.pad_token_id, temperature=1.0, top_p=1.0, top_k=50, num_beams=1),
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=256,
        streamer=streamer
    )

    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    for new_text in streamer:
        yield new_text

In [20]:
inputs = tokenizer("Tell me about gravitation.", return_tensors="pt")


32768

In [11]:
iter = generate_stream("Tell me about gravitation.")

for i in iter:
    print(i)

<s> [INST]Tell me about 
gravitation.[/INST] 


Gravitation 
is 
a 
fundamental 
force 
of 
nature 
that 

attracts 
two 
bodies 
towards 
each 

other. 
It 
is 
a 
force 
that 
exists 
between 
any 
two 
objects 
with 

mass, 
and 
the 
strength 
of 
the 
force 
is 
directly 

proportional 
to 
the 
product 
of 
their 
masses 
and 


inversely 

proportional 
to 
the 
square 
of 
the 
distance 
between 

them.




The 
force 
of 
gravity 
is 
what 
keeps 
planets 
in 
orbit 
around 
the 

sun, 
and 
it 
is 
also 
what 
causes 
objects 
to 
fall 
towards 
the 
ground 
when 

dropped. 

Gravity 
is 
a 
force 
that 
acts 
over 
very 
large 

distances, 
and 
its 
effects 
can 
be 
observed 
on 
a 

cosmic 

scale, 
as 
well 
as 
on 
a 
very 
small 

scale.





Gravity 
is 
described 
by 
Isaac 


Newton's 
law 
of 
universal 


gravitation, 
which 
states 
that 
every 
point 
mass 

attracts 
every 
other 
point 
mass 
by 
a 
force 
acting 
along 
the 
line 


intersecting 
both 

point