## AWQ Mistral


In [1]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
import torch
import os

In [3]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
weights_dir = "./weights"
model_name_or_path = "TheBloke/Mistral-7B-v0.1-AWQ"

# Load the AWQ-quantized model; downloaded files are cached under `weights_dir`.
# NOTE(review): the keyword arguments accepted by `from_quantized` (in particular
# `device` and `cache_dir`) differ between AutoAWQ releases — confirm against the
# installed version.
model = AutoAWQForCausalLM.from_quantized(
    model_name_or_path,
    fuse_layers=True,
    trust_remote_code=False,
    safetensors=True,
    device=device,
    cache_dir=weights_dir,
)

# No `device` argument here: it is not a tokenizer option. The encoded tensors
# are moved to the target device explicitly at generation time instead.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path, trust_remote_code=False, cache_dir=weights_dir
)

### Test model


In [None]:
prompt = "Tell me about AI"
# The base model gets the raw prompt followed by a blank line (no chat template).
prompt_template = f"""{prompt}

"""

print("\n\n*** Generate:")

# Move the prompt ids to the device chosen earlier in the notebook instead of
# hard-coding CUDA, so this cell agrees with the CPU fallback in `device`.
tokens = tokenizer(prompt_template, return_tensors="pt").input_ids.to(device)

# Generate output: low-temperature sampling, capped at 512 new tokens.
generation_output = model.generate(
    tokens, do_sample=True, temperature=0.1, top_p=0.95, top_k=40, max_new_tokens=512
)

# Only the first (and only) sequence in the batch is decoded.
print("Output: ", tokenizer.decode(generation_output[0]))

### Test pipeline


In [None]:
# The transformers pipeline is expected to work for inference in the future,
# but AutoAWQ does not support it yet (correct as of September 25th 2023).
from transformers import pipeline

print("*** Pipeline:")

# Generation settings handed to the pipeline as keyword arguments.
generation_settings = dict(
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.1,
)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    **generation_settings,
)

# The pipeline result is indexed at [0] to reach its "generated_text" entry.
pipeline_outputs = pipe(prompt_template)
print(pipeline_outputs[0]["generated_text"])

## GPTQ Mistral


In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [2]:
# Select CUDA if it is available to PyTorch, else the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
weights_dir = "./weights"
model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"

# Load the GPTQ-quantized model; `device_map="auto"` leaves device placement to
# the library, and downloaded files are cached under `weights_dir`.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="auto",
    trust_remote_code=False,
    cache_dir=weights_dir,
)

# No `device` argument here: it is not a tokenizer option. The encoded tensors
# are moved to the model's device explicitly at generation time instead.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path, trust_remote_code=False, cache_dir=weights_dir
)

### Test model


In [None]:
prompt = "Tell me about AI"
# The raw prompt followed by a blank line is used as the model input.
prompt_template = f"""{prompt}

"""

print("\n\n*** Generate:")

# The model was placed with device_map="auto", so send the prompt ids to
# wherever the model actually lives rather than hard-coding CUDA.
tokens = tokenizer(prompt_template, return_tensors="pt").input_ids.to(model.device)

# Generate output: low-temperature sampling, capped at 512 new tokens.
generation_output = model.generate(
    tokens, do_sample=True, temperature=0.1, top_p=0.95, top_k=40, max_new_tokens=512
)

# Only the first (and only) sequence in the batch is decoded.
print("Output: ", tokenizer.decode(generation_output[0]))

## Jsonformer


In [4]:
from jsonformer import Jsonformer

In [5]:
# Fields of the "person" object that Jsonformer is asked to fill in.
person_properties = {
    "name": {"type": "string"},
    "age": {"type": "number"},
    "is_student": {"type": "boolean"},
    "courses": {"type": "array", "items": {"type": "string"}},
}
json_schema = {"type": "object", "properties": person_properties}

prompt = "Generate a person's information based on the following schema:"

# Bind the loaded model and tokenizer to the schema and prompt, then call the
# resulting object to produce the generated data.
jsonformer = Jsonformer(model, tokenizer, json_schema, prompt)
generated_data = jsonformer()

print(generated_data)



{'name': 'John Doe', 'age': 25.567, 'is_student': True, 'courses': ['Math', 'English', 'Science']}
