In [1]:
import ctypes
import llama_cpp

In [2]:
llama_cpp.llama_backend_init(numa=False)

In [3]:
params = llama_cpp.llama_model_default_params()
params.n_gpu_layers = 35
model = llama_cpp.llama_load_model_from_file(
    b"/workspaces/llama-cpp-python/mistral-7b-v0.1.Q2_K.gguf", params
)  # Update this to whatever

llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /workspaces/llama-cpp-python/mistral-7b-v0.1.Q2_K.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 l

llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32000]   = ["<unk>", "<s>", "</s>", "<0x00>", "<...
llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32000]   = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32000]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
llama_model_loader: - kv  19:               general.quantization_version u32              = 2
llama_model_loader: - type  f32:   65 tensors
llama_model_loader: - type q2_K:   65 tensors
llama_model_loader: - type q3_K:  160 tensors
llama_model_loader: - type q6_K:    1 tensors
llm_load_vocab: special_eos_id is not in 

In [4]:
n_ctx = 512
n_len = 32
n_parallel = 2
prompt = b"The quick brown fox"

tokens = (llama_cpp.llama_token * n_ctx)()
tokens_len = llama_cpp.llama_tokenize(
    model, prompt, len(prompt), tokens, len(tokens), True, True
)
print(tokens[:tokens_len])

n_kv_req = tokens_len + (n_len - tokens_len) * n_parallel
print(n_kv_req)

[1, 415, 2936, 9060, 285, 1142]
58


In [5]:
ctx_params = llama_cpp.llama_context_default_params()
ctx_params.seed = 1234
ctx_params.n_ctx = n_kv_req
ctx_params.n_batch = max(n_len, n_parallel)
ctx_params.n_threads = 1
ctx_params.n_threads_batch = 1
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)

llama_new_context_with_model: n_ctx      = 64
llama_new_context_with_model: n_batch    = 32
llama_new_context_with_model: n_ubatch   = 32
llama_new_context_with_model: flash_attn = 0
llama_new_context_with_model: freq_base  = 10000.0
llama_new_context_with_model: freq_scale = 1
llama_kv_cache_init:        CPU KV buffer size =     8.00 MiB
llama_new_context_with_model: KV self size  =    8.00 MiB, K (f16):    4.00 MiB, V (f16):    4.00 MiB
llama_new_context_with_model:        CPU  output buffer size =     0.12 MiB
llama_new_context_with_model:        CPU compute buffer size =     5.01 MiB
llama_new_context_with_model: graph nodes  = 1030
llama_new_context_with_model: graph splits = 1


In [6]:
n_ctx = llama_cpp.llama_n_ctx(ctx)
batch = llama_cpp.llama_batch_init(max(tokens_len, n_parallel), 0, 1)

In [7]:


batch.n_tokens = tokens_len
for i in range(tokens_len):
    batch.token[i] = tokens[i]
    batch.pos[i] = i
    batch.seq_id[i][0] = 0
    batch.n_seq_id[i] = 1
    batch.logits[i] = False

batch.logits[batch.n_tokens - 1] = True

if llama_cpp.llama_decode(ctx, batch) != 0:
    print("Error decoding")

In [8]:
for i in range(n_parallel):
    llama_cpp.llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens)

In [9]:

# Initialize sampler chain with default parameters
sparams = llama_cpp.llama_sampler_chain_default_params()
sampler_chain = llama_cpp.llama_sampler_chain_init(sparams)

# Add top_k, top_p, temperature, and final distribution-based sampler
llama_cpp.llama_sampler_chain_add(sampler_chain, llama_cpp.llama_sampler_init_top_k(40))
llama_cpp.llama_sampler_chain_add(sampler_chain, llama_cpp.llama_sampler_init_top_p(0.9, 1))
llama_cpp.llama_sampler_chain_add(sampler_chain, llama_cpp.llama_sampler_init_temp(0.4))
llama_cpp.llama_sampler_chain_add(sampler_chain, llama_cpp.llama_sampler_init_dist(1234))  # Final "dist" sampler

In [10]:
streams = [""] * n_parallel
i_batch = [batch.n_tokens - 1] * n_parallel

n_cur = batch.n_tokens
n_decode = 0

while n_cur <= n_len:
    batch.n_tokens = 0
    for i in range(n_parallel):
        if i_batch[i] < 0:
            continue

        # Sample the next token using the sampler chain
        new_token_id = llama_cpp.llama_sampler_sample(sampler_chain, ctx, -1)

        if new_token_id == llama_cpp.llama_token_eos(ctx) or n_cur == n_len:
            i_batch[i] = -1
            continue

        buf = (ctypes.c_char * 32)()
        
        # Convert token ID to text
        outlen = llama_cpp.llama_token_to_piece(model, new_token_id, buf, len(buf), 0, False)
        streams[i] += bytes(buf[:outlen]).decode("utf-8")

        batch.token[batch.n_tokens] = new_token_id
        batch.pos[batch.n_tokens] = n_cur
        batch.seq_id[batch.n_tokens][0] = i
        batch.n_seq_id[batch.n_tokens] = 1
        batch.logits[batch.n_tokens] = True

        i_batch[i] = batch.n_tokens
        batch.n_tokens += 1
        n_decode += 1

    if batch.n_tokens == 0:
        break

    n_cur += 1

    if llama_cpp.llama_decode(ctx, batch) != 0:
        print("Error decoding", flush=True)
        break
    print(n_cur)
    print(streams)

7
[' j', ' jumped']
8
[' j over', ' jumped over']
9
[' j over the', ' jumped over the']
10
[' j over the lazy', ' jumped over the lazy']
11
[' j over the lazy dog', ' jumped over the lazy dog']
12
[' j over the lazy dog.', ' jumped over the lazy dog\n']
13
[' j over the lazy dog. También', ' jumped over the lazy dog\nGroupLayout']
14
[' j over the lazy dog. También:', ' jumped over the lazy dog\nGroupLayouting']
15
[' j over the lazy dog. También: is', ' jumped over the lazy dog\nGroupLayouting is']
16
[' j over the lazy dog. También: is a', ' jumped over the lazy dog\nGroupLayouting is a']
17
[' j over the lazy dog. También: is a technique', ' jumped over the lazy dog\nGroupLayouting is a common']
18
[' j over the lazy dog. También: is a technique practice', ' jumped over the lazy dog\nGroupLayouting is a common practice']
19
[' j over the lazy dog. También: is a technique practice in', ' jumped over the lazy dog\nGroupLayouting is a common practice in']
20
[' j over the lazy dog. Tam

In [11]:
print(streams)

[' j over the lazy dog. También: is a technique practice in the real-. We, when is been little researchirical research', ' jumped over the lazy dog\nGroupLayouting is a common practice in the media industry. However, there has been little empirical research']


In [12]:
llama_cpp.llama_batch_free(batch)

In [13]:
llama_cpp.llama_free(ctx)

In [14]:
llama_cpp.llama_free_model(model)

In [15]:
llama_cpp.llama_backend_free()