In [5]:
import time

import torch
torch.cuda.is_available()

True

In [6]:
torch.__version__

'2.8.0+cu128'

In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-7B-Instruct"

model = AutoModelForCausalLM.from_pretrained(model_name)
model.to(device)

Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.13it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.03 GiB. GPU 0 has a total capacity of 31.37 GiB of which 185.44 MiB is free. Process 382874 has 17.11 GiB memory in use. Including non-PyTorch memory, this process has 14.06 GiB memory in use. Of the allocated memory 13.49 GiB is allocated by PyTorch, and 85.40 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer

Qwen2TokenizerFast(name_or_path='Qwen/Qwen2.5-7B-Instruct', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, n

In [6]:
prompt = "Help me prepare for my Senior AI developer interview"

conversation = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]

text = tokenizer.apply_chat_template(
    conversation,
    tokenize=False,
    add_generation_prompt=True,
)

text

'<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nHelp me prepare for my Senior AI developer interview<|im_end|>\n<|im_start|>assistant\n'

In [7]:
model_inputs = tokenizer([text], return_tensors="pt").to(device)
model_inputs

{'input_ids': tensor([[151644,   8948,    198,   2610,    525,   1207,  16948,     11,   3465,
            553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
             13, 151645,    198, 151644,    872,    198,  12689,    752,  10549,
            369,    847,  19342,  15235,  15754,   7128, 151645,    198, 151644,
          77091,    198]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [8]:
tic = time.time()
generated_ids = model.generate(**model_inputs, max_new_tokens=512)
toc = time.time()



In [9]:
model_inputs.input_ids

tensor([[151644,   8948,    198,   2610,    525,   1207,  16948,     11,   3465,
            553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
             13, 151645,    198, 151644,    872,    198,  12689,    752,  10549,
            369,    847,  19342,  15235,  15754,   7128, 151645,    198, 151644,
          77091,    198]], device='cuda:0')

In [10]:
generated_ids[0]

tensor([151644,   8948,    198,   2610,    525,   1207,  16948,     11,   3465,
           553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
            13, 151645,    198, 151644,    872,    198,  12689,    752,  10549,
           369,    847,  19342,  15235,  15754,   7128, 151645,    198, 151644,
         77091,    198,  97191,    369,    264,   9990,  15235,  15754,   7128,
         17601,   8660,   2176,    279,  10916,  13566,    315,  15235,    323,
           279,  26829,   2266,    304,    892,   1493,  14310,    525,   9251,
            13,   5692,    748,    264,  32930,   5486,    311,   1492,    498,
         10549,   1447,  14374,    220,     16,     13,  26668,  31925,    271,
           820,   9518,  75772,    510,     12,   3070,  21605,  20909,  95518,
         70894,   5257,  19614,  25185,    320,     68,   1302,   2572,  13482,
         30549,     11,   5480,  12408,     11,  90009,     82,     11,  29728,
         14155,    701,    862,  35386, 

In [11]:
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

generated_ids

[tensor([97191,   369,   264,  9990, 15235, 15754,  7128, 17601,  8660,  2176,
           279, 10916, 13566,   315, 15235,   323,   279, 26829,  2266,   304,
           892,  1493, 14310,   525,  9251,    13,  5692,   748,   264, 32930,
          5486,   311,  1492,   498, 10549,  1447, 14374,   220,    16,    13,
         26668, 31925,   271,   820,  9518, 75772,   510,    12,  3070, 21605,
         20909, 95518, 70894,  5257, 19614, 25185,   320,    68,  1302,  2572,
         13482, 30549,    11,  5480, 12408,    11, 90009,    82,    11, 29728,
         14155,   701,   862, 35386,    11, 43567,    11,   323,   979,   311,
           990,  1105,   624,    12,  3070, 33464, 20909, 95518,  2823, 11285,
           448,  5538,  6832, 77235,  1075, 19769,    82,    11,   431,  9745,
            82,    11,   444,   784, 21634,    11,   323, 86870,   624,    12,
          3070, 54281, 11434, 28125,   320,    45, 12567, 32295,    25, 31925,
           315,   451, 12567, 12538,    11,  4119,  

In [12]:
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

Preparing for a senior AI developer interview involves understanding both the technical aspects of AI and the broader context in which these technologies are applied. Here’s a structured approach to help you prepare:

### 1. Technical Knowledge

#### Core Concepts:
- **Machine Learning**: Understand various ML algorithms (e.g., linear regression, decision trees, SVMs, neural networks), their strengths, weaknesses, and when to use them.
- **Deep Learning**: Be familiar with deep learning architectures like CNNs, RNNs, LSTMs, and transformers.
- **Natural Language Processing (NLP)**: Knowledge of NLP techniques, models (BERT, GPT), and applications.
- **Computer Vision**: Understanding of image processing, object detection, segmentation, and face recognition.

#### Frameworks and Tools:
- **TensorFlow** and **PyTorch**: Proficiency in using these frameworks for building and training models.
- **Scikit-Learn**: Useful for basic machine learning tasks.
- **Keras**: A high-level API for Ten


# Metricsь

In [1]:
from typing import Any
import time
import numpy as np
import torch
from threading import Thread
from transformers import StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer

class TokenTimingCriteria(StoppingCriteria):
    """Records a timestamp at each decoding step (one per generated token)."""
    def __init__(self, device_type="cpu", sync_each_step=True):
        super().__init__()
        self.timestamps = []
        self.device_type = device_type
        self.sync_each_step = sync_each_step

    def _now(self):
        # Ensure GPU kernels are finished before timing (accurate per-token)
        if self.device_type.startswith("cuda") and self.sync_each_step:
            torch.cuda.synchronize()
        return time.perf_counter()

    def __call__(self, input_ids, scores, **kwargs):
        self.timestamps.append(self._now())
        return False  # never stop on our own


def generate_with_streaming(model, tokenizer, model_inputs, max_new_tokens=128, print_stream=True):
    # Resolve device type robustly
    device_type = model.device.type

    # Stopping criteria for accurate per-token timing
    timer = TokenTimingCriteria(device_type=device_type, sync_each_step=True)
    stopping = StoppingCriteriaList([timer])

    # Text streamer for user-visible incremental text
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_special_tokens=True,
        skip_prompt=True
    )

    # Kick off generation in a background thread
    gen_kwargs = dict(
        **model_inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.8,
        stopping_criteria=stopping,
        streamer=streamer,
    )

    start_time = time.perf_counter()
    thread = Thread(target=model.generate, kwargs=gen_kwargs, daemon=True)
    thread.start()

    # Consume stream while generation runs (prints as it arrives)
    pieces = []
    for chunk in streamer:
        if print_stream:
            print(chunk, end="", flush=True)
        pieces.append(chunk)

    thread.join()
    end_time = time.perf_counter()

    # Build metrics from true per-token timestamps
    metrics = get_metrics(end_time, start_time, timer)

    return "".join(pieces), metrics


def get_metrics(end_time: float, start_time: float, timer: TokenTimingCriteria) -> dict[Any, Any]:
    ts = np.asarray(timer.timestamps, dtype=np.float64)
    metrics = {}

    if ts.size >= 1:
        ttft_ms = (ts[0] - start_time) * 1000.0
        metrics["ttft_ms"] = float(ttft_ms)
        metrics["num_generated_tokens"] = int(ts.size)
        metrics["total_time_s"] = float(end_time - start_time)
        metrics["throughput_tokens_per_sec"] = float(
            (ts.size / (end_time - start_time)) if end_time > start_time else 0.0
        )

        if ts.size >= 2:
            inter_ms = np.diff(ts) * 1000.0
            metrics["tpot_mean_ms"] = float(np.mean(inter_ms))
            metrics["tpot_p50_ms"] = float(np.percentile(inter_ms, 50))
            metrics["tpot_p95_ms"] = float(np.percentile(inter_ms, 95))
    else:
        # No tokens generated
        metrics = {
            "ttft_ms": None,
            "num_generated_tokens": 0,
            "total_time_s": float(end_time - start_time),
            "throughput_tokens_per_sec": 0.0,
        }
    return metrics


# ===== Example usage =====


generated_text, metrics = generate_with_streaming(model, tokenizer, model_inputs, max_new_tokens=1024, print_stream=True)
print("\n", "="*20, "METRICS", "="*20)
for key, value in metrics.items():
    if isinstance(value, float):
        print(f"{key}: {value:.2f}")
    else:
        print(f"{key}: {value}")


  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'model' is not defined