# core

> The main module. Contains importer code as well as a simple inference test.
> Tested on a GeForce RTX 2080 Ti graphics card.

## Implementation

In [None]:
#| default_exp core

In [None]:
# These imports are only used for testing purposes
import time
import torch
import os

In [None]:
#| export
def _apply_peft_tuners_monkeypatch():
    """Apply alpaca_lora_4bit's PEFT LoRA tuner monkeypatch.

    Delegates to ``replace_peft_model_with_int4_lora_model``. The import is
    deferred to call time, so the patch module is only loaded when this
    function actually runs.
    """
    from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import (
        replace_peft_model_with_int4_lora_model as patch_peft_model,
    )

    patch_peft_model()

In [None]:
#| export
def _apply_flash_attention_monkeypatch():
    """Apply alpaca_lora_4bit's flash-attention monkeypatch for LLaMA.

    Delegates to ``replace_llama_attn_with_flash_attn``. The import is
    deferred to call time, so the patch module is only loaded when this
    function actually runs.
    """
    from alpaca_lora_4bit.monkeypatch.llama_flash_attn_monkey_patch import (
        replace_llama_attn_with_flash_attn as patch_attention,
    )

    patch_attention()

In [None]:
#| export
def _apply_xformers_monkeypatch():
    """Apply alpaca_lora_4bit's xformers attention hijack for LLaMA.

    Delegates to ``hijack_llama_attention``. The import is deferred to call
    time, so the patch module is only loaded when this function actually
    runs.
    """
    from alpaca_lora_4bit.monkeypatch.llama_attn_hijack_xformers import (
        hijack_llama_attention as patch_attention,
    )

    patch_attention()

In [None]:
#| export
def import_alpaca(use_flash_attention: bool, use_xformers: bool, autograd_4bit_cuda: bool, autograd_4bit_triton: bool):
    """Apply the alpaca_lora_4bit monkeypatches, select a backend, and return its entry points.

    All flags are validated before any patch is applied, so an invalid
    combination leaves the process untouched.

    Args:
        use_flash_attention: Apply the flash-attention monkeypatch.
            Mutually exclusive with ``use_xformers``.
        use_xformers: Apply the xformers attention hijack.
            Mutually exclusive with ``use_flash_attention``.
        autograd_4bit_cuda: Switch ``autograd_4bit`` to the ``"cuda"`` backend.
        autograd_4bit_triton: Switch ``autograd_4bit`` to the ``"triton"``
            backend. Exactly one of the two backend flags must be set.

    Returns:
        The tuple ``(load_llama_model_4bit_low_ram,
        load_llama_model_4bit_low_ram_and_offload, model_to_half,
        model_to_float, Autograd4bitQuantLinear, AMPWrapper)``.

    Raises:
        AssertionError: If both attention patches are requested, or if the
            backend flags do not select exactly one backend.
    """
    # Raised explicitly instead of via ``assert`` so the checks still run
    # under ``python -O``; AssertionError is kept so existing callers see the
    # same exception type as before.
    if use_flash_attention and use_xformers:
        raise AssertionError(
            "use_flash_attention and use_xformers are mutually exclusive"
        )
    if bool(autograd_4bit_cuda) == bool(autograd_4bit_triton):
        raise AssertionError(
            "exactly one of autograd_4bit_cuda and autograd_4bit_triton must be set"
        )

    _apply_peft_tuners_monkeypatch()
    if use_flash_attention:
        _apply_flash_attention_monkeypatch()
    elif use_xformers:
        _apply_xformers_monkeypatch()

    # Imported only after the monkeypatches above have been applied,
    # preserving the original ordering.
    from alpaca_lora_4bit import autograd_4bit

    autograd_4bit.switch_backend_to("cuda" if autograd_4bit_cuda else "triton")

    from alpaca_lora_4bit.autograd_4bit import (
        Autograd4bitQuantLinear,
        load_llama_model_4bit_low_ram,
        load_llama_model_4bit_low_ram_and_offload,
        model_to_float,
        model_to_half,
    )
    from alpaca_lora_4bit.amp_wrapper import AMPWrapper

    return (
        load_llama_model_4bit_low_ram,
        load_llama_model_4bit_low_ram_and_offload,
        model_to_half,
        model_to_float,
        Autograd4bitQuantLinear,
        AMPWrapper,
    )

## Test

In [None]:
# Fetch the quantized Vicuna checkpoint once: the download is skipped when
# the checkpoint directory already exists in the parent directory.
if not os.path.exists("../vicuna-13b-GPTQ-4bit-128g"):
    # IPython shell escapes: clone into the current directory, then move the
    # checkout one level up so it matches the path checked above.
    !git clone "https://huggingface.co/anon8231489123/vicuna-13b-GPTQ-4bit-128g"
    !mv "vicuna-13b-GPTQ-4bit-128g" ..

In [None]:
# Request the flash-attention patch and the Triton backend; only the loader,
# the half-precision converter and the AMP wrapper are used in this test.
(
    load_llama_model_4bit_low_ram,
    _,
    model_to_half,
    _,
    _,
    AMPWrapper,
) = import_alpaca(use_flash_attention=True, use_xformers=False, autograd_4bit_cuda=False, autograd_4bit_triton=True)

# Loader arguments for the checkpoint expected in the parent directory.
load_kwargs = {
    "config_path": "../vicuna-13b-GPTQ-4bit-128g/",
    "model_path": "../vicuna-13b-GPTQ-4bit-128g/vicuna-13b-4bit-128g.safetensors",
    "groupsize": 128,
    "is_v1_model": False,
}
model, tokenizer = load_llama_model_4bit_low_ram(**load_kwargs)
model_to_half(model)

# Wrap the model and let the wrapper hook itself into generation.
wrapper = AMPWrapper(model)
wrapper.apply_generate()

  from .autonotebook import tqdm as notebook_tqdm


Using Triton implementation.
Loading Model ...


The safetensors archive passed at ../vicuna-13b-GPTQ-4bit-128g/vicuna-13b-4bit-128g.safetensors does not contain metadata. Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata.


Loaded the model in 3.26 seconds.
Converted as Half.


In [None]:
# Tokenize the prompt without special tokens and move every returned tensor
# onto the GPU.
prompt = "I think the meaning of life is"
encoded = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
batch = {name: tensor.cuda() for name, tensor in encoded.items()}

In [None]:
# Time one sampled generation end to end, including decoding back to text.
start = time.time()
sampling_options = dict(
    do_sample=True,
    use_cache=False,
    repetition_penalty=1.1,
    max_new_tokens=128,
    temperature=0.9,
    top_p=0.95,
    top_k=40,
    return_dict_in_generate=True,
    output_attentions=False,
    output_hidden_states=False,
    output_scores=False,
)
with torch.no_grad():
    generated = model.generate(inputs=batch["input_ids"], **sampling_options)
# Only the first sequence of the returned batch is decoded.
first_sequence = generated["sequences"].cpu().tolist()[0]
result_text = tokenizer.decode(first_sequence)
end = time.time()

In [None]:
# Show the generated text, then the elapsed wall-clock time in seconds.
print(result_text, end - start, sep="\n")

I think the meaning of life is to be happy, and that it is our birthright. That we are supposed to feel good as human beings, and that every moment of our lives should be a celebration of that.
“I’m not saying that we always have to be happy, or that life always has to be perfect. But I do believe that we should strive to cultivate happiness and positivity in all areas of our lives, and that we should surround ourselves with people and things that bring us joy.
“I believe that when we are surrounded by negativity, fear, and judgment, it can dull our light and keep
27.77907705307007
