In [2]:
from pprint import pprint
from parsers import ModelParser

import torch
from transformers import AutoTokenizer
from utils.utils import load_json

# Hand both safetensors shards of the checkpoint to the parser in one list.
model_parser = ModelParser(
    [
        "./Phi-3-mini-4k-instruct/model-00001-of-00002.safetensors",
        "./Phi-3-mini-4k-instruct/model-00002-of-00002.safetensors",
    ]
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Show the parser's `tensor_names` attribute; as the last expression in the
# cell it is rendered as the cell output.
model_parser.tensor_names

['model.embed_tokens.weight',
 'model.layers.0.input_layernorm.weight',
 'model.layers.0.mlp.down_proj.weight',
 'model.layers.0.mlp.gate_up_proj.weight',
 'model.layers.0.post_attention_layernorm.weight',
 'model.layers.0.self_attn.o_proj.weight',
 'model.layers.0.self_attn.qkv_proj.weight',
 'model.layers.1.input_layernorm.weight',
 'model.layers.1.mlp.down_proj.weight',
 'model.layers.1.mlp.gate_up_proj.weight',
 'model.layers.1.post_attention_layernorm.weight',
 'model.layers.1.self_attn.o_proj.weight',
 'model.layers.1.self_attn.qkv_proj.weight',
 'model.layers.10.input_layernorm.weight',
 'model.layers.10.mlp.down_proj.weight',
 'model.layers.10.mlp.gate_up_proj.weight',
 'model.layers.10.post_attention_layernorm.weight',
 'model.layers.10.self_attn.o_proj.weight',
 'model.layers.10.self_attn.qkv_proj.weight',
 'model.layers.11.input_layernorm.weight',
 'model.layers.11.mlp.down_proj.weight',
 'model.layers.11.mlp.gate_up_proj.weight',
 'model.layers.11.post_attention_layernorm.w

## Prepare text and embeddings

In [4]:
# Device string passed to the Transformer constructor further down
# ("mps" is PyTorch's Apple Metal backend).
device = "mps"
# Directory of the base checkpoint; the tokenizer and config.json are read from it.
base_model_dir = "./Phi-3-mini-4k-instruct"
# base_model_dir already carries its own "./" prefix, so append to it directly.
# Prepending another "./" would also turn an absolute directory into a
# relative one if base_model_dir were ever changed.
config = load_json(f"{base_model_dir}/config.json")

In [5]:
tokenizer = AutoTokenizer.from_pretrained(base_model_dir)
# Token ids that end generation: the tokenizer's end-of-sequence token plus the
# end-of-turn marker. The prompt in this notebook delimits turns with "<|end|>"
# (Phi-3 chat format), so that is the marker to stop on; "<|eot_id|>" is the
# Llama-3 equivalent and does not identify the end of a Phi-3 turn.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|end|>"),
]
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
def text_to_ids(text, tokenizer):
    """Tokenize one prompt or a list of prompts into token ids.

    Args:
        text: A single prompt, or a list of prompts. Anything that is not a
            list (or list subclass) is wrapped into a one-element list.
        tokenizer: Callable that takes the batch and returns a mapping with an
            "input_ids" entry.

    Returns:
        Whatever the tokenizer stores under "input_ids" for the batch. No
        padding or tensor conversion is requested from the tokenizer.
    """
    # isinstance (rather than an exact type comparison) keeps list subclasses
    # from being wrapped a second time.
    if not isinstance(text, list):
        text = [text]
    return tokenizer(text)["input_ids"]

## Forward passes

In [7]:
from ops.transformer_ops import Transformer
from ops.generation import generate_text, generate_text_stream

In [8]:
import time

In [9]:
# Directory of the converted weights (presumably int8-quantized pickles,
# judging by the directory name -- confirm against the conversion script).
model_dir = "PHI3-MINI-4K-PKL-int8"
# Build the model from the converted weights, the base config and the chosen device.
model = Transformer(
    model_dir,
    config,
    device=device,
)

In [11]:
# One-prompt batch: a user turn closed by <|end|>, followed by the assistant tag.
texts = [
    "<|user|>I am going to Paris, what should I see?<|end|><|assistant|>",
]
# Token ids for every prompt in the batch.
input_ids = text_to_ids(texts, tokenizer)

In [12]:
# Time the whole generate_text call. perf_counter() is a monotonic,
# high-resolution clock meant for measuring intervals; time.time() follows the
# wall clock, which the system may adjust while the cell is running.
start_time = time.perf_counter()
outputs, total_tokens_count = generate_text(
    model,
    tokenizer,
    input_ids,
    max_gen_len=128,
    stop_tokens_ids=terminators,
)
delta_time = time.perf_counter() - start_time
# Print each prompt with its generated continuation appended.
print([text+output for text, output in zip(texts, outputs)])
# Throughput is the token count reported by generate_text over the elapsed time.
print(f"Generation took {delta_time} seconds, {total_tokens_count/delta_time} tokens/s.")

['<|user|>I am going to Paris, what should I see?<|end|><|assistant|>Paris, known as the "City of Light," offers a plethora of attractions that cater to a wide range of interests. Here\'s a list of must-see sights and experiences:\n\n1. Eiffel Tower: Visit the iconic symbol of Paris, offering panoramic views of the city from its observation decks.\n2. Louvre Museum: Explore the world-renowned museum and see famous works such as the Mona Lisa and Venus de Milo.\n3. Notre-Dame Cathedral: Admire the stunning Gothic architecture and']
Generation took 33.3645179271698 seconds, 3.8364108925357945 tokens/s.
