In [10]:
# Packages
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
import torch
from tqdm import tqdm
import os
from typing import Literal


In [11]:
# Load Models

# Single source of truth for the checkpoint so tokenizer and model always match.
# (A larger alternative that was tried earlier: "NousResearch/Llama-2-7b-hf".)
MODEL_ID = "TinyLlama/TinyLlama-1.1B-step-50K-105b"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Mixed precision is only switched on for CUDA devices reporting compute capability >= 7.0.
use_amp = amp_supported = torch.cuda.is_available() and torch.cuda.get_device_capability(0) >= (7, 0)

# Half-precision weights are kept on GPU; on the CPU fallback float32 is used because
# many CPU kernels do not support (or are very slow with) float16.
model_dtype = torch.float16 if device.type == "cuda" else torch.float32

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=model_dtype).to(device)

## Inference: Memory and Max Memory Monitor

In [12]:
# Display the loaded model's module tree (the notebook renders the repr of the last expression).
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): 

In [13]:
# Read architecture settings straight from the checkpoint's config.
# The repo id uses the same spelling as the model-loading cell so that both
# cells resolve to the same Hub entry / local cache directory.
config = AutoConfig.from_pretrained("TinyLlama/TinyLlama-1.1B-step-50K-105b")

hidden_size = config.hidden_size
eos_token_id = config.eos_token_id
max_position_embeddings = config.max_position_embeddings

print("Hidden Size:", hidden_size)
print("EOS Token ID:", eos_token_id)
print("Max Position Embeddings:", max_position_embeddings)

Hidden Size: 2048
EOS Token ID: 2
Max Position Embeddings: 2048


In [14]:
# Hyperparameters
b, s = 1, 1   # batch size, sequence length
max_s = 4096  # max sequence length

In [None]:

class ModelMemoryMonitor:
    """
    Measure accelerator memory used by a causal language model during inference.

    Two measurements are offered:
    * ``estimate_forward_memory`` - one forward pass over a prepared batch.
    * ``estimate_inference_memory`` - greedy token-by-token generation from a prompt.

    Memory figures come from ``torch.cuda`` allocator statistics. When the monitor's
    device is not a usable CUDA device, every reading is reported as 0 instead of
    raising, so the class can still be exercised on CPU.
    """

    def __init__(self, model, tokenizer, batch_size=1, max_seq_len=4096, use_amp=False, device="cuda"):
        """
        Initialize the memory monitor for model inference.

        Args:
        model (torch.nn.Module): The preloaded model to monitor.
        tokenizer (Tokenizer): Tokenizer to process input text.
        batch_size (int): Number of samples in a batch.
        max_seq_len (int): Maximum sequence length.
        use_amp (bool): If True, enables mixed precision inference.
        device (str): Device for inference (e.g., 'cuda' or 'cpu').
        """
        self.model = model.to(device)
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_seq_len = max_seq_len
        self.device = device
        self.use_amp = use_amp

        # Set pad_token to eos_token if no padding token is available
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    # ------------------------------------------------------------------
    # Device / allocator helpers
    # ------------------------------------------------------------------
    def _device_type(self) -> str:
        """Return the bare device type ('cuda', 'cpu', ...) without any index suffix."""
        return torch.device(self.device).type

    def _cuda_active(self) -> bool:
        """True when the monitored device is CUDA and CUDA is actually usable."""
        return self._device_type() == "cuda" and torch.cuda.is_available()

    def _allocated(self) -> int:
        """Bytes currently allocated on the monitored device (0 when not on CUDA)."""
        return torch.cuda.memory_allocated(self.device) if self._cuda_active() else 0

    def _peak(self) -> int:
        """Peak bytes allocated since the last reset (0 when not on CUDA)."""
        return torch.cuda.max_memory_allocated(self.device) if self._cuda_active() else 0

    def _reset_peak(self) -> None:
        """Reset the peak-memory counter of the monitored device (no-op off CUDA)."""
        if self._cuda_active():
            torch.cuda.reset_peak_memory_stats(self.device)

    def _autocast(self):
        """Autocast context for the monitored device; a no-op when ``use_amp`` is False."""
        # autocast expects a plain device-type string, not a torch.device or "cuda:0".
        return torch.amp.autocast(device_type=self._device_type(), enabled=bool(self.use_amp))

    # ------------------------------------------------------------------
    # Unit conversion
    # ------------------------------------------------------------------
    def bytes_to_mb(self, bytes: int) -> float:
        """Convert bytes to megabytes."""
        return bytes / 1048576

    def bytes_to_gb(self, bytes: int) -> float:
        """Convert bytes to gigabytes."""
        return bytes / 1073741824

    def convert_memory(self, memory, memory_unit: Literal['byte', 'mb', 'gb']):
        """Convert a byte count to ``memory_unit``; any other unit value leaves it in bytes."""
        if memory_unit == 'mb':
            return self.bytes_to_mb(memory)
        elif memory_unit == 'gb':
            return self.bytes_to_gb(memory)
        return memory  # default to bytes

    # ------------------------------------------------------------------
    # Input simulation
    # ------------------------------------------------------------------
    def simulate_input_ids(self, sequence_length: int, only_padding=False):
        """
        Generate dummy input IDs based on a specified sequence length.

        The dummy text deliberately contains more words than needed (1.5x) and is then
        truncated by the tokenizer to ``sequence_length`` (capped at ``max_seq_len``),
        so the resulting batch has the requested length.

        Args:
        sequence_length (int): The sequence length for the dummy input.
        only_padding (bool): If True, an empty string is tokenized instead of dummy
            words; the result is whatever the tokenizer emits for empty text and is
            not padded up to ``sequence_length``.

        Returns:
        dict: A dictionary with input IDs and attention masks on the specified device.
        """
        # Truncate at the requested length, never beyond the monitor's maximum.
        target_len = max(1, min(int(sequence_length), self.max_seq_len))

        dummy_text = "" if only_padding else " ".join(["token"] * int(sequence_length * 1.5))
        inputs = self.tokenizer(dummy_text, return_tensors="pt", padding=True, truncation=True, max_length=target_len)
        inputs = {key: value.to(self.device) for key, value in inputs.items()}

        # Validate actual input length against expected sequence length
        actual_length = inputs["input_ids"].shape[1]
        if actual_length != sequence_length:
            print(f"Warning: Expected sequence length ({sequence_length}) does not match actual input length ({actual_length}).")

        # Check attention mask sum
        attention_mask_sum = inputs["attention_mask"].sum().item()
        if attention_mask_sum != sequence_length:
            print(f"Warning: Attention mask sum ({attention_mask_sum}) does not match expected sequence length ({sequence_length}).")

        return inputs

    # ------------------------------------------------------------------
    # Measurements
    # ------------------------------------------------------------------
    def estimate_forward_memory(self, sample_inputs, memory_unit: Literal['byte', 'mb', 'gb'] = 'byte'):
        """
        Estimate memory usage during inference.

        Args:
        sample_inputs (dict): Input data for the model.
        memory_unit (Literal['byte', 'mb', 'gb']): Unit of memory measurement. Options are 'byte', 'mb', or 'gb'.

        Returns:
        tuple: Previous memory, peak memory, and current memory in the specified unit.
        """
        # Round-trip the model through the CPU before placing it on the target device.
        # NOTE(review): presumably done to start from a freshly placed set of weights - confirm.
        self.model.cpu()
        self.model.to(self.device)
        self.model.eval()

        # Baseline before the forward pass; the peak counter is reset on the same device.
        prev_memory = self._allocated()
        self._reset_peak()

        # Run inference with optional AMP
        with torch.no_grad(), self._autocast():
            outputs = self.model(**sample_inputs)

        # Read statistics while `outputs` is still alive so it counts towards current memory.
        peak_memory = self._peak()
        cur_memory = self._allocated()

        # Convert memory usage to the specified unit
        prev_memory, peak_memory, cur_memory = (
            self.convert_memory(value, memory_unit) for value in (prev_memory, peak_memory, cur_memory)
        )

        # Print memory usage summary
        print(f"Previous Memory: {prev_memory} {memory_unit.upper()}; Peak Memory: {peak_memory} {memory_unit.upper()}; Current Memory: {cur_memory} {memory_unit.upper()}")
        print(f"Peak Memory Difference: {peak_memory - prev_memory} {memory_unit.upper()}")
        print(f"Total Memory Consumption: {cur_memory - prev_memory} {memory_unit.upper()}")

        return prev_memory, peak_memory, cur_memory

    def estimate_inference_memory(self, prompt, max_iters = 100, memory_unit: Literal['byte', 'mb', 'gb'] = 'byte'):
        """
        Track memory while greedily generating one token at a time from ``prompt``.

        Each iteration resets the peak counter, asks the model for exactly one new
        token given everything generated so far, appends that token to the running
        sequence, and records peak and current memory. Generation stops after
        ``max_iters`` tokens or as soon as the EOS token is produced (the EOS
        iteration is recorded too).

        Args:
        prompt (str): Text to start generation from.
        max_iters (int): Maximum number of tokens to generate.
        memory_unit (Literal['byte', 'mb', 'gb']): Unit for every returned figure.

        Returns:
        tuple: (memory before generation, list of per-token peak memory,
        list of per-token current memory), all in ``memory_unit``.
        """
        self.model.cpu()
        self.model.to(self.device)
        self.model.eval()

        prev_memory = self._allocated()

        # Use the monitor's own tokenizer/device rather than notebook globals.
        generated_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.device)
        eos_id = self.tokenizer.eos_token_id

        peak_memory_lst = []
        cur_memory_lst = []

        for i in range(max_iters):
            # Reset peak memory stats for each token generation
            self._reset_peak()

            with torch.no_grad(), self._autocast():
                outputs = self.model.generate(
                    generated_ids,  # feed the growing sequence so every step advances
                    max_length=generated_ids.shape[1] + 1,
                    eos_token_id=eos_id,
                    pad_token_id=eos_id,
                    do_sample=False  # Disable sampling for deterministic output
                )

            next_token_id = outputs[:, -1:]
            generated_ids = torch.cat([generated_ids, next_token_id], dim=1)

            # list.append mutates in place and returns None, so never re-assign its result.
            peak_memory_lst.append(self.convert_memory(self._peak(), memory_unit))
            cur_memory_lst.append(self.convert_memory(self._allocated(), memory_unit))

            if bool((next_token_id == eos_id).all()):
                print(f"EOS token generated at iteration {i+1}")
                break

        return self.convert_memory(prev_memory, memory_unit), peak_memory_lst, cur_memory_lst

                





In [39]:
# Measure one forward pass over a single-token dummy batch.
# `sample_inputs` is a tokenized batch (dict of tensors), which is what
# estimate_forward_memory expects; estimate_inference_memory takes a text prompt instead.
monitor = ModelMemoryMonitor(model, tokenizer, use_amp=use_amp, device=device)
sample_inputs = monitor.simulate_input_ids(1)
monitor.estimate_forward_memory(sample_inputs, memory_unit='mb')

Previous Memory: 4208.833984375 MB; Peak Memory: 4209.0068359375 MB; Current Memory: 4208.9990234375 MB
Peak Memory Difference: 0.1728515625 MB
Total Memory Consumption: 0.1650390625 MB


(4208.833984375, 4209.0068359375, 4208.9990234375)

In [40]:
# Plain forward pass over the dummy batch, outside the monitor.
# autocast is toggled via `enabled` so the model runs (and `outputs` is defined)
# whether or not AMP is available; `device.type` strips any ":index" suffix,
# which autocast does not accept.
with torch.no_grad():
    with torch.amp.autocast(device_type=device.type, enabled=bool(use_amp)):
        outputs = model(**sample_inputs)

In [48]:
# Prepare the input prompt
prompt = (
    'The TinyLlama project aims to pretrain '
)

# Tokenize the prompt; keep the attention mask so generate() does not have to guess it
encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
input_ids = encoded.input_ids

# Generation settings
seed = 0  # fixed seed so the sampled continuation is reproducible on re-run
max_length = 500
top_k = 10
repetition_penalty = 1.5

# Run inference with top-k sampling and repetition penalty
torch.manual_seed(seed)
with torch.no_grad():
    generated_ids = model.generate(
        input_ids,
        attention_mask=encoded.attention_mask,
        do_sample=True,
        max_length=max_length,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id  # explicit pad id; reuses EOS as padding
    )

# Decode the generated text
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(f"Result: {generated_text}")

Result: The TinyLlama project aims to pretrain 10,425 volunteer dog walkers and the public in how to use their dogs effectively so that they can be an integral part of communities.
In total over three years with more than 768 miles covered, there are now approximately one million animals walking safely through these villages – which is about as many for every person who walks on average each day. They have seen improvements such as reduced traffic accidents resulting from people using them instead of cars or bicycles by making it easier for other road users to find ways around busy roads; improved healthcare due to increased numbers being able access veterinary services without leaving local hospitals when seeking treatment; improved social interaction between residents living at home versus driving themselves because drivers tend not take into consideration all those pesky pedestrians passing near where you're standing! These positive changes also mean we will see even higher usage ra

In [49]:
# Show the raw string repr of the generation (escape sequences such as \n stay visible).
generated_text

'The TinyLlama project aims to pretrain 10,425 volunteer dog walkers and the public in how to use their dogs effectively so that they can be an integral part of communities.\nIn total over three years with more than 768 miles covered, there are now approximately one million animals walking safely through these villages – which is about as many for every person who walks on average each day. They have seen improvements such as reduced traffic accidents resulting from people using them instead of cars or bicycles by making it easier for other road users to find ways around busy roads; improved healthcare due to increased numbers being able access veterinary services without leaving local hospitals when seeking treatment; improved social interaction between residents living at home versus driving themselves because drivers tend not take into consideration all those pesky pedestrians passing near where you\'re standing! These positive changes also mean we will see even higher usage rates d