In [11]:
!pip install -q transformers autoawq intel_extension_for_pytorch

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
from datetime import datetime, UTC
from ipywidgets import widgets
from IPython.display import display
import torch
import re
# --- load the model once (expensive; later code reuses `model` / `tokenizer`) ---
model_name_or_path = "TheBloke/Mistral-7B-v0.1-AWQ"
t0 = datetime.now(UTC)  # start of the load-time measurement

print("Loading model...")
# Keyword options forwarded unchanged to the AutoAWQ loader.
_quantized_load_options = {
    "fuse_layers": True,
    "trust_remote_code": False,
    "safetensors": True,
    "device_map": "auto",
    "dtype": "auto",
}
model = AutoAWQForCausalLM.from_quantized(
    model_name_or_path, **_quantized_load_options
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    trust_remote_code=False,
)
# Reuse the end-of-sequence token for padding when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Report how long model + tokenizer loading took.
load_done = datetime.now(UTC)
elapsed_load = (load_done - t0).total_seconds()
print(f"✅ Model loaded in {elapsed_load:.2f} seconds")

# --- create the text box that collects the user's prompt ---
prompt_box = widgets.Text(
    description='Prompt:',
    placeholder='Tulis prompt di sini...',
    value='',
    disabled=False,
)
# Render the widget in the notebook output area.
display(prompt_box)

# --- handler ketika user pencet Enter ---
def handle_submit(change):
    prompt = prompt_box.value
    prompt_template = f"""{prompt}\n\n"""

    print("\n\n*** Generate:")
    gen_start = datetime.now(UTC)

    tokens = tokenizer(
        prompt_template,
        return_tensors='pt'
    ).input_ids.cuda()

    generation_output = model.generate(
        tokens,
        do_sample=True,
        temperature=0.3,
        top_p=0.95,
        top_k=40,
        max_new_tokens=60,
        repetition_penalty=1.5
    )

    gen_done = datetime.now(UTC)
    elapsed_gen = (gen_done - gen_start).total_seconds()

     # --- decode & cleanup ---
    output_text = tokenizer.decode(generation_output[0], skip_special_tokens=True)
    if prompt in output_text:
        output_text = output_text.split(prompt, 1)[-1].strip()

    # ambil kalimat utuh (biar gak nyangkut)
    sentences = re.split(r'(?<=[.!?]) +', output_text)
    clean_text = " ".join(sentences[:-1]) if sentences and not sentences[-1].endswith((".", "!", "?")) else output_text

    # --- print hasil ---
    print("Cleaned output:", clean_text)


    print(f"⚡ Model finished generating in {elapsed_gen:.2f} seconds")
    n_prompt_tokens = tokens.shape[-1]
    n_total_tokens = generation_output.shape[-1]
    n_new_tokens = n_total_tokens - n_prompt_tokens

    tok_per_sec = n_new_tokens / elapsed_gen

    print(f"⏱ Generated {n_new_tokens} tokens in {elapsed_gen:.2f} seconds "
      f"≈ {tok_per_sec:.2f} tok/s")

# Run handle_submit whenever the user presses Enter in the prompt box.
# NOTE(review): `on_submit` appears to be deprecated in recent ipywidgets
# releases in favour of `observe` — confirm before upgrading ipywidgets.
prompt_box.on_submit(handle_submit)


Loading model...


Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

Replacing layers...: 100%|██████████| 32/32 [00:13<00:00,  2.46it/s]


✅ Model loaded in 16.14 seconds


Text(value='', description='Prompt:', placeholder='Tulis prompt di sini...')





*** Generate:
Cleaned output: Americanos are a type of coffee drink that originated in Italy. They’re made with espresso and hot water, which gives them their characteristic dark coloring.
⚡ Model finished generating in 12.76 seconds
⏱ Generated 60 tokens in 12.76 seconds ≈ 4.70 tok/s
