In [1]:
import time
import torch
from transformers import AutoTokenizer
from utils.utils import load_json

  from .autonotebook import tqdm as notebook_tqdm


## Prepare text and embeddings

In [2]:
# Pick the compute device: prefer Apple's Metal backend (the original hard-coded choice),
# then CUDA, and fall back to the CPU so the notebook still runs on other machines.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

base_model_dir = "./granite-3b-code-instruct/"
# base_model_dir already carries the "./" prefix and a trailing slash, so strip the
# slash before appending the file name instead of prefixing "./" a second time.
config = load_json(f"{base_model_dir.rstrip('/')}/config.json")

In [3]:
tokenizer = AutoTokenizer.from_pretrained(base_model_dir, clean_up_tokenization_spaces=False)

# Candidate stop tokens. "<|eot_id|>" may not exist in this tokenizer's vocabulary, in which
# case the lookup can come back as None or repeat another id (TODO confirm for this model),
# so drop None entries and duplicates while keeping the original order.
candidate_stop_ids = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]
terminators = list(dict.fromkeys(tok_id for tok_id in candidate_stop_ids if tok_id is not None))

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [4]:
def text_to_ids(text, tokenizer):
    """Tokenize one text or a list of texts and return the token ids.

    Args:
        text: A single text, or a list (including list subclasses) of texts.
            Any non-list value is wrapped in a one-element list first.
        tokenizer: Callable that accepts the list of texts and returns a
            mapping with an ``"input_ids"`` entry.

    Returns:
        Whatever the tokenizer stores under ``"input_ids"`` for the batch.
    """
    # isinstance (rather than an exact type comparison) so that list
    # subclasses are treated as batches instead of being wrapped again.
    batch = text if isinstance(text, list) else [text]
    return tokenizer(batch)["input_ids"]

## Forward passes

In [5]:
from ops.transformer_ops import Transformer
from ops.generation import generate_text, generate_text_stream

In [6]:
# Set the model type entry of the loaded config before it is handed to Transformer.
config.update(model_type="granite-small")

In [7]:
# Show the configuration dictionary as it currently stands.
config

{'architectures': ['LlamaForCausalLM'],
 'attention_bias': True,
 'attention_dropout': 0.1,
 'bos_token_id': 0,
 'eos_token_id': 0,
 'hidden_act': 'silu',
 'hidden_size': 2560,
 'initializer_range': 0.02,
 'intermediate_size': 10240,
 'max_position_embeddings': 2048,
 'mlp_bias': True,
 'model_type': 'granite-small',
 'num_attention_heads': 32,
 'num_hidden_layers': 32,
 'num_key_value_heads': 32,
 'pad_token_id': 0,
 'pretraining_tp': 1,
 'rms_norm_eps': 1e-05,
 'rope_scaling': None,
 'rope_theta': 10000,
 'tie_word_embeddings': True,
 'torch_dtype': 'bfloat16',
 'transformers_version': '4.41.0.dev0',
 'use_cache': True,
 'vocab_size': 49152}

In [8]:
# Presumably the directory holding the converted model weights — verify against Transformer.
# NOTE(review): the "-PKL" suffix suggests pickled files; if so, only load them from a trusted source.
model_dir = "GRANITE-3B-CODE-INSTRUCT-PKL"
# Build the model from that directory and the config, targeting the selected device.
model = Transformer(model_dir, config, device=device)

In [9]:
# Chat-style prompt consisting of a single user turn.
messages = [
    {"role": "user", "content": "Write a code to find the maximum value in a list of numbers."},
]
# Render the conversation to plain text (not token ids), ending with the generation prompt.
chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# Tokenize the rendered prompt and wrap it in a list to form a batch of one.
input_ids = [tokenizer(chat)["input_ids"]]

In [10]:
# Show the rendered prompt text that gets tokenized.
chat

'Question:\nWrite a code to find the maximum value in a list of numbers.\n\nAnswer:\n'

In [11]:
# Toggle: True prints the generated text piece by piece; False generates it in a single call.
streaming = True

In [12]:
# Generate a completion for the prompt, either streamed piece by piece or in one call,
# then report the elapsed time and throughput. perf_counter() is used because it is a
# monotonic clock intended for measuring intervals.
if streaming:
    pieces = []
    total_tokens_count = 0
    start_time = time.perf_counter()
    for word, n_tokens in generate_text_stream(model, tokenizer, input_ids, max_gen_len=128, stop_tokens_ids=terminators):
        # Echo each piece as soon as it arrives.
        print(word, end="", flush=True)
        pieces.append(word)
        total_tokens_count += n_tokens
    delta_time = time.perf_counter() - start_time
    output_text = "".join(pieces)
    print()
else:
    start_time = time.perf_counter()
    outputs, total_tokens_count = generate_text(model, tokenizer, input_ids, max_gen_len=128, stop_tokens_ids=terminators)
    delta_time = time.perf_counter() - start_time
    # Pair each output with its prompt text; the only prompt in input_ids is `chat`.
    print([text + output for text, output in zip([chat], outputs)])
print(f"Generation took {delta_time} seconds, {total_tokens_count/delta_time} tokens/s.")

```python
def find_max_value(nums):
    max_value = nums[0]
    for num in nums:
        if num > max_value:
            max_value = num
    return max_value

nums = [2, 3, 5, 7, 11]
max_value = find_max_value(nums)
print("The maximum value in the list is:", max_value)
```
Generation took 10.620662689208984 seconds, 9.038983988968958 tokens/s.
