In [1]:
# !pip3 install -q bitsandbytes
# !pip3 install -q datasets sentencepiece
# !pip3 install -q git+https://github.com/huggingface/transformers
# !pip3 install -q accelerate
# !pip3 install -q safetensors
# !git clone https://github.com/oobabooga/GPTQ-for-LLaMa
# !cd GPTQ-for-LLaMa && python3 setup_cuda.py build && ln -s build/lib.*/*.so ./

In [2]:
# NOTE: use the oobabooga fork of GPTQ-for-LLaMa (https://github.com/oobabooga/GPTQ-for-LLaMa), not another version of that repository.

In [1]:
import os
import sys 
from pathlib import Path
import torch
import transformers
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import accelerate
import ipywidgets as widgets
sys.path.insert(0, "./GPTQ-for-LLaMa")

import llama_inference_offload
from modelutils import find_layers
from quant import make_quant


def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exclude_layers=['lm_head'], kernel_switch_threshold=128):
    """Build a causal-LM skeleton, quantize its layers, and load a safetensors checkpoint.

    The model is instantiated from its config only (no pretrained weights), the
    layers reported by ``find_layers`` are replaced via ``make_quant``, and the
    tensors stored in ``checkpoint`` are loaded into the result.

    Args:
        model: Name or path handed to ``AutoConfig.from_pretrained``.
        checkpoint: Path of the ``.safetensors`` file to load.
        wbits: Forwarded to ``make_quant`` (presumably the quantization bit
            width -- confirm against GPTQ-for-LLaMa).
        groupsize: Forwarded to ``make_quant``.
        faster_kernel: Accepted for signature compatibility; not used here.
        exclude_layers: Layer names removed from the ``find_layers`` result
            before quantization. The default list is never mutated.
        kernel_switch_threshold: Accepted for signature compatibility; not
            used here.

    Returns:
        The model in eval mode with ``seqlen`` set to 1024.
    """
    # Imported up front so a missing dependency fails before the model is built.
    from safetensors.torch import load_file as safe_load

    config = AutoConfig.from_pretrained(model)

    def noop(*args, **kwargs):
        pass

    # Weight initialisation is wasted work because every tensor is overwritten
    # by the checkpoint. The patches below are process-wide, so they are undone
    # in ``finally``; otherwise every module created later in this kernel would
    # silently keep uninitialised weights.
    init_names = ('kaiming_uniform_', 'uniform_', 'normal_')
    saved_inits = {name: getattr(torch.nn.init, name) for name in init_names}
    saved_dtype = torch.get_default_dtype()
    saved_init_flag = getattr(transformers.modeling_utils, '_init_weights', True)
    try:
        for name in init_names:
            setattr(torch.nn.init, name, noop)
        transformers.modeling_utils._init_weights = False
        # Create the parameters directly in half precision.
        torch.set_default_dtype(torch.half)
        net = AutoModelForCausalLM.from_config(config)
    finally:
        torch.set_default_dtype(saved_dtype)
        transformers.modeling_utils._init_weights = saved_init_flag
        for name, init_fn in saved_inits.items():
            setattr(torch.nn.init, name, init_fn)

    net = net.eval()

    # Quantize everything find_layers reports except the excluded names.
    layers = find_layers(net)
    for name in exclude_layers:
        layers.pop(name, None)
    make_quant(net, layers, wbits, groupsize)
    del layers

    print('Loading model ...')
    net.load_state_dict(safe_load(checkpoint))
    net.seqlen = 1024
    print('Done.')

    return net



In [2]:
# model_path is the directory used for the config and the tokenizer;
# pt_path is the safetensors checkpoint that _load_quant reads.
model_path = 'vicuna-13b-GPTQ-4bit-128g'
pt_path = 'vicuna-13b-GPTQ-4bit-128g/vicuna-13b-4bit-128g.safetensors'

# Build the quantized model, then move all of it onto the first GPU.
# NOTE(review): the accelerate dispatch cell later applies a device map with a
# GPU memory cap -- confirm that moving the whole model to cuda:0 first is intended.
model = _load_quant(model_path, pt_path, wbits=4, groupsize=128)
model = model.to('cuda:0')

tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    use_fast=False,
)

Loading model ...
Done.


In [5]:
# Memory budget per device handed to accelerate: GPU 0 and the CPU.
max_memory = {
    0: "9GiB",
    "cpu": "30GiB",
}

# Let accelerate choose a placement within that budget, keeping each
# LlamaDecoderLayer on a single device.
device_map = accelerate.infer_auto_device_map(
    model,
    max_memory=max_memory,
    no_split_module_classes=["LlamaDecoderLayer"],
)

# Re-dispatch the model according to the computed placement.
model = accelerate.dispatch_model(
    model,
    device_map=device_map,
    offload_buffers=True,
)


In [6]:
# Default generation settings used by inference() when no config is passed.
# do_sample=True is required for top_k / temperature / top_p to take effect:
# with the library default (do_sample=False) decoding is greedy and these
# sampling settings are ignored.
default_generation_conf = GenerationConfig(
    do_sample=True,
    top_k=40,
    temperature=0.1,
    top_p=0.75,
    max_new_tokens=20
)

def inference (prompt, conf = default_generation_conf):
    """Generate text for ``prompt`` with the notebook's global model and tokenizer.

    Args:
        prompt: Text passed to the tokenizer.
        conf: ``GenerationConfig`` forwarded to ``model.generate``; defaults to
            ``default_generation_conf``.

    Returns:
        A list with one decoded string per sequence returned by ``generate``,
        decoded with the tokenizer's default settings.
    """
    inputs = tokenizer(prompt, return_tensors="pt")

    # Forward the attention mask the tokenizer produced instead of letting
    # generate() infer one. Only the needed keys are passed explicitly, so any
    # extra tokenizer outputs are not handed to generate().
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.cuda()

    sequences = model.generate(
        input_ids=inputs["input_ids"].cuda(),
        attention_mask=attention_mask,
        generation_config=conf
    )
    return [tokenizer.decode(seq) for seq in sequences]


In [7]:
# Single-prompt demo: a text box, a button, and an output area. Clicking the
# button runs inference() on the text box contents and prints the result.
text = widgets.Text(
    value='Once upon a time',
    placeholder='Type something',
    description='Prompt:',
    disabled=False
)

button = widgets.Button(
    description='Click me',
    disabled=False,
    button_style='',  # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Click me',
    icon='check'  # (FontAwesome names without the `fa-` prefix)
)
output = widgets.Output()
display(text, button, output)

# do_sample=True is required for top_k / temperature / top_p to take effect;
# with the default (do_sample=False) decoding is greedy and they are ignored.
conf = GenerationConfig(
    do_sample=True,
    top_k=40,
    temperature=0.1,
    top_p=0.75,
    max_new_tokens=20,
    num_return_sequences=1
)


def on_button_clicked (b, prompt_box=text, out_area=output, gen_conf=conf):
    """Run inference on the prompt box text and print each result.

    The widgets and the config are bound as defaults at definition time so this
    callback keeps using this cell's objects even if a later cell rebinds the
    global names ``text``, ``output`` or ``conf``. ``Button.on_click`` passes
    only the button, so the defaults are always used.
    """
    # Everything runs inside the Output context so that errors and warnings
    # raised during generation are shown in the notebook.
    with out_area:
        generated = inference(prompt_box.value, gen_conf)
        print('--------')
        for line in generated:
            print(line)


button.on_click(on_button_clicked)

Text(value='Once upon a time', description='Prompt:', placeholder='Type something')

Button(description='Click me', icon='check', style=ButtonStyle(), tooltip='Click me')

Output()

In [21]:
# Parameter sweep: pick one GenerationConfig attribute, enter several values
# separated by ';', and compare the generations produced for each value.

# Selectable parameters are the public, non-callable attributes of the default
# generation config (methods are not generation parameters).
params = [
    key for key in dir(default_generation_conf)
    if not key.startswith('_')
    and '__' not in key
    and not callable(getattr(default_generation_conf, key))
]

text = widgets.Text(value='Once upon a time',description='Prompt:',disabled=False)
dropdown = widgets.Dropdown(options=params,value=params[0],disabled=False)
options = widgets.Text(value='',placeholder='',description='',disabled=False)
button = widgets.Button(description='Click me',disabled=False,icon='check')
output = widgets.Output()
display(text, dropdown, options, button, output)


def _parse_value(raw):
    """Turn the text of one sweep value into a Python value.

    Python literals are evaluated ('40' -> 40, '0.5' -> 0.5, 'None' -> None,
    'True' -> True, '[1, 2]' -> [1, 2]); anything else is kept as a string.
    Integers stay integers, which parameters such as top_k need.
    """
    import ast
    try:
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        return raw


def on_button_clicked2 (b, prompt_box=text, param_box=dropdown, values_box=options, out_area=output):
    """Generate once per entered value of the selected parameter and print the results.

    The widgets are bound as defaults at definition time so the callback keeps
    using this cell's objects even if the global names are rebound later.
    """
    param = param_box.value
    # Empty pieces (blank box, trailing ';') are skipped.
    pieces = [piece.strip() for piece in values_box.value.split(';')]
    values = [_parse_value(piece) for piece in pieces if piece]

    out_area.clear_output()
    with out_area:
        if not values:
            print("No values given; enter one or more values separated by ';'.")
            return
        for value in values:
            # Baseline settings; do_sample=True is required for the sampling
            # settings (top_k / temperature / top_p) to affect the output.
            args = dict(do_sample=True,top_k=40,temperature=0.1,top_p=0.75,max_new_tokens=10,num_return_sequences=1)
            args[param] = value
            sweep_conf = GenerationConfig(**args)
            generated = inference(prompt_box.value, sweep_conf)
            # %s rather than %g so non-numeric values (None, bool, list) print too.
            print('----%s = %s ----' % (param, getattr(sweep_conf, param)))
            for line in generated:
                print(line)


def on_change (change, values_box=options):
    """Show the default value of the newly selected parameter in the values box."""
    values_box.value = str(getattr(default_generation_conf, change['new']))


button.on_click(on_button_clicked2)
# Only react to changes of the selected value, not to every trait change.
dropdown.observe(on_change, names='value')

Text(value='Once upon a time', description='Prompt:')

Dropdown(options=('bad_words_ids', 'begin_suppress_tokens', 'bos_token_id', 'constraints', 'decoder_start_toke…

Text(value='', placeholder='')

Button(description='Click me', icon='check', style=ButtonStyle())

Output()