<a href="https://colab.research.google.com/github/atadria/LLMs/blob/main/mistral_code_samples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code samples to run Mistral model

In [None]:
# Workaround for: NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968
# Reading the current locale does not repair anything, so pin the preferred
# encoding to UTF-8 instead. The default argument mirrors the stdlib signature
# so existing callers that pass `do_setlocale` keep working.
import locale

locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

# Show the value that later cells will observe.
locale.getpreferredencoding()

## GGUF
- CPU and GPU inference.

GGUF (GPT-Generated Unified Format)

In [None]:
!pip install -q ctransformers

In [None]:
# fix: OSError: libcudart.so.12: cannot open shared object file: No such file or directory
!pip uninstall -q torch
!pip install -q torch

In [None]:
from ctransformers import AutoModelForCausalLM

# Options for loading the Q4_K_M GGUF file of Mistral-7B-Instruct.
# `gpu_layers` is the number of layers to offload to the GPU; use 0 when no
# GPU acceleration is available on your system.
gguf_options = dict(
    model_file="mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    model_type="mistral",
    gpu_layers=128,
)

llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
    **gguf_options,
)

In [None]:
# The instruction prompt must open with the beginning-of-sequence marker `<s>`
# (as every other prompt in this notebook does); `</s>` is the end-of-sequence
# marker and does not belong at the start of a prompt.
llm("<s>[INST] Jaką mamy dzisiaj pogodę w Krakowie? [/INST]",
    temperature=0.3,
    max_new_tokens=50)

## GPTQ
- GPU only

https://arxiv.org/abs/2210.17323

In [None]:
# restart runtime after runing this cell
!pip install -q accelerate
!pip install -q optimum
!pip install -q auto-gptq

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"

# Load the GPTQ checkpoint from the "main" revision without running any code
# from the repository; device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="auto",
    trust_remote_code=False,
    revision="main",
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

# Wrap the user message in the [INST] ... [/INST] instruction markers.
prompt = "Tell me about AI"
prompt_template = f'''<s>[INST] {prompt} [/INST]
'''

# Sampling settings shared by the direct generate() call and the pipeline.
sampling_kwargs = dict(
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    max_new_tokens=512,
)

# --- Option 1: tokenize manually and call generate() ---
print("\n\n*** Generate:")

encoded_prompt = tokenizer(prompt_template, return_tensors='pt')
input_ids = encoded_prompt.input_ids.cuda()
output = model.generate(inputs=input_ids, **sampling_kwargs)
print(tokenizer.decode(output[0]))

# --- Option 2: the same model and tokenizer through transformers' pipeline ---
print("*** Pipeline:")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    repetition_penalty=1.1,
    **sampling_kwargs,
)

print(pipe(prompt_template)[0]['generated_text'])

## AutoAWQ
- GPU only

AutoAWQ is an easy-to-use package for 4-bit quantized models. AutoAWQ speeds up models by 2x while reducing memory requirements by 3x compared to FP16. AutoAWQ implements the Activation-aware Weight Quantization (AWQ) algorithm for quantizing LLMs.

https://github.com/casper-hansen/AutoAWQ

https://arxiv.org/abs/2306.00978

In [None]:
!pip install -q autoawq

In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, TextStreamer

quant_path = "TheBloke/Mistral-7B-Instruct-v0.1-AWQ"

# Load the AWQ-quantized weights (safetensors format) with layer fusing enabled.
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True, safetensors=True)

# Do not execute code from the model repository when loading the tokenizer:
# it is not needed here, and this matches the GPTQ example in this notebook,
# which also loads with trust_remote_code=False.
tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=False)

# Streams decoded text during generation, omitting the prompt and special tokens.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

In [None]:
prompt_template = """<s>[INST] {prompt} [/INST]"""
prompt = "What is the weather today in Krakow?"

# Fill in the template, tokenize to PyTorch tensors and move the ids to the GPU.
formatted_prompt = prompt_template.format(prompt=prompt)
tokens = tokenizer(formatted_prompt, return_tensors='pt').input_ids.cuda()

# Generate at most 512 new tokens, handing the text to the streamer created
# in the loading cell.
generation_output = model.generate(tokens, streamer=streamer, max_new_tokens=512)

## vLLM + AWQ
- GPU only

vLLM is a **FAST** and easy-to-use library for LLM inference and serving.

https://github.com/vllm-project/vllm

https://arxiv.org/abs/2309.06180

In [None]:
!pip install -q vllm

In [None]:
from vllm import LLM, SamplingParams
import torch

# Engine settings for the AWQ-quantized checkpoint, run in half precision.
# max_model_len is capped at 512 to fix an out-of-memory error.
engine_options = dict(
    quantization='awq',
    dtype='half',
    max_model_len=512,
)
llm = LLM(model="TheBloke/Mistral-7B-Instruct-v0.1-AWQ", **engine_options)

# Sampling settings reused for every prompt passed to llm.generate().
sampling_params = SamplingParams(temperature=0.3, top_p=0.95, max_tokens=50)

In [None]:
# The same question in Polish and in English, submitted as one batch.
prompts = [
    "<s>[INST] Jaką mamy dzisiaj pogodę w Krakowie? [/INST]",
    "<s>[INST] What is the weather today in Krakow? [/INST]",
]

outputs = llm.generate(prompts, sampling_params)

# Report each prompt next to the text of the first entry in its `outputs` list.
for output in outputs:
    prompt, generated_text = output.prompt, output.outputs[0].text
    print(f"\nPrompt: {prompt!r}, \nGenerated text: {generated_text!r}")

## bitsandbytes
- GPU only
- **Note:** this example needs more RAM than the free Colab tier provides; a T4 runtime with the high-RAM option is sufficient.


In [None]:
!pip install -q transformers accelerate bitsandbytes

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

device = "cuda"  # target device; the generation cell moves its inputs here
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

# Quantize on load: 4-bit NF4 weights with double quantization, bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer and model come from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

In [None]:
prompt_template = """<s>[INST] {prompt} [/INST]"""

prompt = "Do you have mayonnaise recipes?"

# Encode the filled-in template to a tensor of token ids and move it to `device`.
filled_prompt = prompt_template.format(prompt=prompt)
model_inputs = tokenizer.encode(filled_prompt, return_tensors="pt").to(device)

# Sample up to 1000 new tokens, decode the batch and print its first sequence.
generated_ids = model.generate(
    model_inputs,
    max_new_tokens=1000,
    do_sample=True,
)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])