```
torch==2.0.1
torchvision==0.15.2
transformers==4.35.0
tokenizers>=0.14,<0.15
sentencepiece==0.1.99
shortuuid
accelerate==0.21.0
peft==0.4.0
bitsandbytes==0.41.0
pydantic<2,>=1
markdown2[all]
numpy
scikit-learn==1.2.2
gradio==3.35.2
gradio_client==0.2.9
requests
httpx==0.24.0
uvicorn
fastapi
einops==0.6.1
einops-exts==0.0.4
timm==0.6.13
ipywidgets
diffusers
ipykernel
protobuf==3.20.1
```

# Step 2. Load the LLM in 4-bit mode

In [None]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer, BitsAndBytesConfig
from torch import cuda, bfloat16
import transformers
from transformers import AutoTokenizer
from transformers import StoppingCriteria, StoppingCriteriaList

# Local directory holding the checkpoint to load.
model_name = "/path/to/ai_models/zephyr-7b-beta"

# 4-bit NF4 quantization with double quantization; matmuls are computed in
# bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# device_map="auto" leaves weight placement to the library; {"": 0} was the
# alternative noted by the original author.
m = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# The tokenizer is read from the same directory as the weights.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Force the beginning-of-sequence id to 1, overriding whatever the checkpoint
# configuration declares.
tokenizer.bos_token_id = 1

print(f"Successfully loaded the model {model_name} into memory")

# Step 3. Initialise parameters and functions

In [1]:
# Token ids that should end generation. Requires the model-loading cell to
# have been run first, because it reads the module-level `tokenizer`.
#
# The tokenizer's own EOS id is always included. "<|endoftext|>" is kept for
# checkpoints that define it, but convert_tokens_to_ids() maps tokens missing
# from the vocabulary to the <unk> id, so any candidate that resolves to
# <unk> (or to None) is dropped; otherwise the stop check would be watching
# for a token the model practically never emits.
_stop_candidates = [
    tokenizer.eos_token_id,
    *tokenizer.convert_tokens_to_ids(["<|endoftext|>"]),
]
# dict.fromkeys() removes duplicates while keeping the original order.
stop_token_ids = list(dict.fromkeys(
    token_id
    for token_id in _stop_candidates
    if token_id is not None and token_id != tokenizer.unk_token_id
))


class StopOnTokens(StoppingCriteria):
    """Stopping criterion that fires when the newest token is a stop token.

    Only the first sequence of the batch (``input_ids[0]``) is inspected.

    Args:
        stop_ids: Optional iterable of integer token ids to stop on. When
            omitted, the module-level ``stop_token_ids`` list is consulted on
            every call, so later changes to that list are picked up.
    """

    def __init__(self, stop_ids=None):
        super().__init__()
        self._stop_ids = None if stop_ids is None else frozenset(stop_ids)

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the last token of the first sequence is a stop id."""
        active_ids = stop_token_ids if self._stop_ids is None else self._stop_ids
        # int() turns the 0-d tensor into a plain Python int so that the
        # membership test compares values rather than tensor objects.
        return int(input_ids[0][-1]) in active_ids


stopping_criteria = StoppingCriteriaList([StopOnTokens()])

import transformers

model = m
# Not passed to the pipeline below; placement is left to device_map.
device = "cuda:0"

# Default generation settings handed to the pipeline. gen_text() overrides
# temperature, top_k, top_p and max_new_tokens on each call.
_generation_defaults = dict(
    stopping_criteria=stopping_criteria,  # without this the model will ramble
    temperature=0.15,                     # low value keeps sampling conservative
    top_p=0.15,                           # nucleus sampling: smallest token set covering 15% probability
    top_k=0,                              # zero, so the cut-off relies on top_p
    max_new_tokens=768 * 4,               # upper bound on generated tokens
    repetition_penalty=1.1,               # without this the output begins repeating
)

pipe = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    device_map="auto",
    **_generation_defaults,
)

from typing import Optional


def gen_text(
    system_prompt: Optional[str] = None,
    input_text: str = "hello",
    *,
    max_new_tokens: int = 1024,
    temperature: float = 0.2,
    top_k: int = 50,
    top_p: float = 0.95,
) -> str:
    """Generate a chat reply for ``input_text`` using the module-level ``pipe``.

    The system and user messages are rendered into a single prompt with the
    tokenizer's chat template and passed to the text-generation pipeline with
    sampling enabled.

    Args:
        system_prompt: System message for the conversation. ``None`` selects
            the built-in pirate-chatbot prompt.
        input_text: The user's message.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature forwarded to the pipeline.
        top_k: Top-k sampling cut-off forwarded to the pipeline.
        top_p: Nucleus sampling cut-off forwarded to the pipeline.

    Returns:
        The ``"generated_text"`` field of the first pipeline output. The
        pipeline is created with ``return_full_text=True``, so this contains
        the rendered prompt followed by the completion.
    """
    if system_prompt is None:
        system_prompt = "You are a friendly chatbot who always responds in the style of a pirate"

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": input_text},
    ]
    # tokenize=False returns the prompt as a string; add_generation_prompt
    # appends the marker that cues the assistant's turn.
    prompt = pipe.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # These per-call values take precedence over the defaults configured when
    # the pipeline was built.
    outputs = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
    )
    return outputs[0]["generated_text"]

NameError: ignored

# Step 4. Run it

In [None]:
# System message that frames the model as a careful reasoner.
system = "You are smart, you can solve math arithmetic problems, you can doing reasoning and logical inference, the answer is critical to me"
# Named `question` rather than `input` so the builtin input() is not shadowed
# for the rest of the notebook session.
question = '''
Jane is faster than joe, Joe is faster than Sam. Is Sam faster than Jane?
'''
r = gen_text(system_prompt=system, input_text=question)
# Bare expression so the notebook displays the generated text.
r

#