# core

> The main module. Contains importer code as well as a simple inference test.
> Tested on a GeForce RTX 2080 Ti graphics card.

## Implementation

In [1]:
#| default_exp core

In [2]:
#| export
import time
import torch
import os
from flags import Flags

In [3]:
#| export
class Matmul4BitOptions(Flags):
    """Combinable bit flags that `_set_matmul_options` translates into
    `alpaca_lora_4bit.matmul_utils_4bit` settings.

    Flags are joined with `|`, e.g.
    `Matmul4BitOptions.NO_ACT_ORDER | Matmul4BitOptions.ALGORYTHM_DEFAULT`.
    """
    # Activation-order switch -> `matmul_utils_4bit.act_order`
    ACT_ORDER = 1 << 0            # act_order = True
    NO_ACT_ORDER = 1 << 1         # act_order = False
    # Algorithm selection -> `matmul_utils_4bit.faster_mode`
    ALGORYTHM_DEFAULT = 1 << 2    # faster_mode = "disabled"
    ALGORYTHM_OLDFASTER = 1 << 3  # faster_mode = "old_faster" (rejected together with ACT_ORDER)
    ALGORYTHM_FASTER = 1 << 4     # faster_mode = "faster" (rejected together with ACT_ORDER)

In [4]:
#| export
def _apply_peft_tuners_monkeypatch():
    """Run alpaca_lora_4bit's PEFT LoRA monkeypatch (`replace_peft_model_with_int4_lora_model`).

    The patch module is imported inside the function, so it is only loaded
    when the patch is actually requested.
    """
    import alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch as peft_patch

    peft_patch.replace_peft_model_with_int4_lora_model()

In [5]:
#| export
def _apply_flash_attention_monkeypatch():
    """Run alpaca_lora_4bit's flash attention monkeypatch (`replace_llama_attn_with_flash_attn`).

    The patch module is imported inside the function, so it is only loaded
    when the patch is actually requested.
    """
    import alpaca_lora_4bit.monkeypatch.llama_flash_attn_monkey_patch as flash_attn_patch

    flash_attn_patch.replace_llama_attn_with_flash_attn()

In [6]:
#| export
def _apply_xformers_monkeypatch():
    """Run alpaca_lora_4bit's xformers attention monkeypatch (`hijack_llama_attention`).

    The patch module is imported inside the function, so it is only loaded
    when the patch is actually requested.
    """
    import alpaca_lora_4bit.monkeypatch.llama_attn_hijack_xformers as xformers_patch

    xformers_patch.hijack_llama_attention()

In [7]:
#| export
def _set_matmul_options(options: Matmul4BitOptions):
    """Apply `options` to the module-level switches of `alpaca_lora_4bit.matmul_utils_4bit`.

    * `ACT_ORDER` / `NO_ACT_ORDER` set `matmul_utils_4bit.act_order` to True / False.
      If both are present, `ACT_ORDER` wins.
    * `ALGORYTHM_DEFAULT` / `ALGORYTHM_OLDFASTER` / `ALGORYTHM_FASTER` set
      `matmul_utils_4bit.faster_mode` to "disabled" / "old_faster" / "faster"
      (first match in that order). With no algorithm flag, `faster_mode` is left untouched.

    The whole request is validated before anything is written, so a rejected call
    leaves the library configuration exactly as it was.

    Raises:
        ValueError: neither `ACT_ORDER` nor `NO_ACT_ORDER` was given, or a faster
            algorithm was requested together with `ACT_ORDER`.
    """
    from alpaca_lora_4bit import matmul_utils_4bit

    # Step 1: resolve the requested settings without touching the library yet.
    if Matmul4BitOptions.ACT_ORDER in options:
        act_order = True
    elif Matmul4BitOptions.NO_ACT_ORDER in options:
        act_order = False
    else:
        raise ValueError("Need ACT_ORDER or NO_ACT_ORDER options")

    if Matmul4BitOptions.ALGORYTHM_DEFAULT in options:
        faster_mode = "disabled"
    elif Matmul4BitOptions.ALGORYTHM_OLDFASTER in options:
        faster_mode = "old_faster"
    elif Matmul4BitOptions.ALGORYTHM_FASTER in options:
        faster_mode = "faster"
    else:
        faster_mode = None  # no algorithm flag: keep whatever the library currently uses

    # Step 2: reject incompatible combinations with a real exception. A bare
    # `assert` would be stripped under `python -O` and let the combination through.
    if act_order and faster_mode in ("old_faster", "faster"):
        raise ValueError(
            "ALGORYTHM_OLDFASTER and ALGORYTHM_FASTER cannot be combined with ACT_ORDER"
        )

    # Step 3: commit.
    matmul_utils_4bit.act_order = act_order
    if faster_mode is not None:
        matmul_utils_4bit.faster_mode = faster_mode

In [8]:
#| export
def import_llama(
        use_flash_attention: bool, # Use flash attention monkeypatch or not (shouldn't be used with use_xformers)
        use_xformers: bool, # Use xformers monkeypatch or not (shouldn't be used with use_flash_attention)
        autograd_4bit_cuda: bool, # Use CUDA backend for 4bit stuff (exactly one of CUDA / Triton must be chosen)
        autograd_4bit_triton: bool, # Use Triton backend for 4bit stuff (exactly one of CUDA / Triton must be chosen)
        matmul4bit_options: Matmul4BitOptions = Matmul4BitOptions.NO_ACT_ORDER | Matmul4BitOptions.ALGORYTHM_DEFAULT, # Flags forwarded to `_set_matmul_options`
    ): # arg_parser / train_data / load_llama_model_4bit_low_ram / load_llama_model_4bit_low_ram_and_offload / model_to_half / model_to_float / apply_gradient_checkpointing / Autograd4bitQuantLinear / AMPWrapper
    """
    Do all the monkeypatching, then return the important objects of the alpaca_lora_4bit library as a 9-tuple, in this order:
    (arg_parser, train_data, load_llama_model_4bit_low_ram, load_llama_model_4bit_low_ram_and_offload, model_to_half, model_to_float, apply_gradient_checkpointing, Autograd4bitQuantLinear, AMPWrapper)

    The boolean arguments are checked before any monkeypatch is applied, so an invalid call does not leave the process half-patched.
    Raises ValueError if both attention monkeypatches are requested, or if not exactly one of the CUDA / Triton backends is selected.
    """
    # Validate first: the monkeypatches below are process-wide.
    # Real exceptions are used because `assert` is stripped under `python -O`.
    if use_flash_attention and use_xformers:
        raise ValueError("use_flash_attention and use_xformers are mutually exclusive")
    # Compare truthiness rather than using `^`, so non-bool truthy values such as (2, 1) are rejected too.
    if bool(autograd_4bit_cuda) == bool(autograd_4bit_triton):
        raise ValueError("Exactly one of autograd_4bit_cuda / autograd_4bit_triton must be enabled")

    _apply_peft_tuners_monkeypatch()
    if use_flash_attention:
        _apply_flash_attention_monkeypatch()
    elif use_xformers:
        _apply_xformers_monkeypatch()

    from alpaca_lora_4bit import autograd_4bit
    autograd_4bit.switch_backend_to("cuda" if autograd_4bit_cuda else "triton")
    _set_matmul_options(matmul4bit_options)

    # These imports are kept after the patching and configuration above, as in the original ordering.
    from alpaca_lora_4bit.autograd_4bit import load_llama_model_4bit_low_ram, load_llama_model_4bit_low_ram_and_offload, Autograd4bitQuantLinear, \
        model_to_half, model_to_float
    from alpaca_lora_4bit.amp_wrapper import AMPWrapper
    from alpaca_lora_4bit import train_data, arg_parser
    from alpaca_lora_4bit.gradient_checkpointing import apply_gradient_checkpointing

    return arg_parser, train_data, load_llama_model_4bit_low_ram, load_llama_model_4bit_low_ram_and_offload, model_to_half, model_to_float, apply_gradient_checkpointing, Autograd4bitQuantLinear, AMPWrapper

## Test

Here I will:

- download model from [https://huggingface.co/anon8231489123/vicuna-13b-GPTQ-4bit-128g](https://huggingface.co/anon8231489123/vicuna-13b-GPTQ-4bit-128g)
- initialize the alpaca_lora_4bit monkeypatches with:
    - the CUDA backend (Triton is not installed in this environment)
    - flash attention and xformers disabled
- then initialize the model I have just downloaded
- and do some simple generation

In [9]:
# One-time setup: skip the download when the model directory already exists in the parent directory.
# `git clone` checks the repository out into the current directory, so it is moved one level up
# afterwards to match the "../vicuna-13b-GPTQ-4bit-128g" path tested here.
if not os.path.exists("../vicuna-13b-GPTQ-4bit-128g"):
    !git clone "https://huggingface.co/anon8231489123/vicuna-13b-GPTQ-4bit-128g"
    !mv "vicuna-13b-GPTQ-4bit-128g" ..

In [10]:
# Patch alpaca_lora_4bit for the CUDA backend with neither attention monkeypatch enabled.
llama_exports = import_llama(
    use_flash_attention=False,
    use_xformers=False,
    autograd_4bit_cuda=True,
    autograd_4bit_triton=False,
    matmul4bit_options=Matmul4BitOptions.NO_ACT_ORDER | Matmul4BitOptions.ALGORYTHM_DEFAULT,
)
# Only three of the nine returned objects are needed here:
# position 2 = loader, position 4 = half-precision converter, position 8 = AMP wrapper.
load_llama_model_4bit_low_ram, model_to_half, AMPWrapper = (llama_exports[i] for i in (2, 4, 8))

# Load the downloaded 4-bit checkpoint (group size 128) and convert the model with `model_to_half`.
checkpoint_args = dict(
    config_path="../vicuna-13b-GPTQ-4bit-128g/",
    model_path="../vicuna-13b-GPTQ-4bit-128g/vicuna-13b-4bit-128g.safetensors",
    groupsize=128,
    is_v1_model=False,
)
model, tokenizer = load_llama_model_4bit_low_ram(**checkpoint_args)
model_to_half(model)

# Wrap the model and let the wrapper hook into generation.
wrapper = AMPWrapper(model)
wrapper.apply_generate()

  from .autonotebook import tqdm as notebook_tqdm


Triton not found. Please run "pip install triton".
Using CUDA implementation.
Loading Model ...


The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
The safetensors archive passed at ../vicuna-13b-GPTQ-4bit-128g/vicuna-13b-4bit-128g.safetensors does not contain metadata. Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata.


Loaded the model in 26.82 seconds.
Converted as Half.


In [11]:
prompt = '''I think the meaning of life is'''
# Tokenize to PyTorch tensors without special tokens, then move every tensor to the GPU.
encoded = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
batch = {name: tensor.cuda() for name, tensor in encoded.items()}

In [12]:
# Sampling settings for the generation test; only the token sequences are requested back.
generation_kwargs = dict(
    do_sample=True,
    use_cache=False,
    repetition_penalty=1.1,
    max_new_tokens=128,
    temperature=0.9,
    top_p=0.95,
    top_k=40,
    return_dict_in_generate=True,
    output_attentions=False,
    output_hidden_states=False,
    output_scores=False,
)

# Time generation plus decoding. `perf_counter` is a monotonic, high-resolution clock,
# so the measured interval cannot be skewed by system clock adjustments the way `time.time()` can.
start = time.perf_counter()
with torch.no_grad():
    generated = model.generate(inputs=batch["input_ids"], **generation_kwargs)
result_text = tokenizer.decode(generated['sequences'].cpu().tolist()[0])
end = time.perf_counter()

In [13]:
# Show the generated text, then the elapsed time in seconds on its own line.
elapsed_seconds = end - start
print(result_text, elapsed_seconds, sep="\n")

I think the meaning of life is to love and be loved.”

“But how about all the terrible things in the world?” I asked. “The wars, the cruelty, the injustice? How can you say that loving has anything to do with all that?”

He looked at me very seriously. “I know it seems like a paradox,” he said.
“But if people really loved each other, there would be no wars, no fighting, no cruelty, no injustice. Love is the only force that can change a person’s heart and make them want to do what’s right. If enough
34.80921292304993


In [14]:
# Run nbdev's export step for this project (kept as the last cell of the notebook).
import nbdev
nbdev.nbdev_export()