In [1]:
import os
import resource
import psutil

# Cap the address space (RLIMIT_AS) of this process at 16 GiB; change the factor to adjust.
memory_limit = 16 * 1024 ** 3  # bytes
# The same value is used for both the soft and the hard limit.
resource.setrlimit(resource.RLIMIT_AS, (memory_limit, memory_limit))

# Pin the current process to a fixed set of CPU cores
p = psutil.Process(os.getpid())  # handle for the process running this notebook
p.cpu_affinity(list(range(4)))  # cores 0, 1, 2 and 3
#p.cpu_affinity([0, 1])  # alternative: restrict the process to cores 0 and 1 only

In [None]:
import tensorflow as tf
import torch
import psutil
#from transformers import GPT2LMHeadModel, GPT2Tokenizer
#from transformers import T5Tokenizer, T5ForConditionalGeneration
#from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import AutoModelForCausalLM, AutoTokenizer
import time
import matplotlib.pyplot as plt

# Export the XLA flag before this cell makes any TensorFlow device call.
# NOTE(review): TensorFlow appears to read TF_XLA_FLAGS once, when devices are first
# enumerated, so assigning it after set_visible_devices() risks it being ignored.
# Setting it before `import tensorflow` would be safer still -- confirm.
os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices=false'

# Hide every GPU from TensorFlow (this is a TensorFlow-only setting).
tf.config.set_visible_devices([], 'GPU')

# Show the physical devices TensorFlow reports
print("Available devices:", tf.config.list_physical_devices())

# Logical CPU count of the machine (os.cpu_count() does not reflect CPU affinity)
print("CPU Count:", os.cpu_count())

In [None]:
#model_name = "t5-large"  # Other T5 sizes include "t5-small", "t5-base", and "t5-3b"
#tokenizer = T5Tokenizer.from_pretrained("t5-base")
#model = T5ForConditionalGeneration.from_pretrained("t5-base")

#model_name = "gpt2-medium"  # You can use "gpt2-medium", "gpt2-large", or "gpt2-xl" for larger versions
#tokenizer = GPT2Tokenizer.from_pretrained(model_name)
#model = GPT2LMHeadModel.from_pretrained(model_name)

#model_name = "facebook/bart-large"
#tokenizer = BartTokenizer.from_pretrained(model_name)
#model = BartForConditionalGeneration.from_pretrained(model_name)

# Checkpoint under test; tokenizer and weights are both loaded from this id.
model_name = "microsoft/phi-1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the causal-LM weights with an explicit float32 dtype.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
)

In [None]:
# Cell 3: switch to evaluation mode, guarantee a pad token, tokenize a sample prompt

# Put the model into evaluation mode
model.eval()

# Reuse the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Sample prompt
prompt = "What is the capital of Denmark?"

# Tokenize the prompt into PyTorch tensors (padding requested, as for batched input)
inputs = tokenizer(prompt, padding=True, return_tensors="pt")

# Keep the attention mask under its own name
attention_mask = inputs["attention_mask"]

In [None]:
from datasets import load_dataset

# Load GSM8K dataset (test split)
gsm8k = load_dataset("gsm8k", "main", split="test")
questions = gsm8k["question"][:3]  # Take the first 3 questions for benchmarking

# Measure Latency
def measure_latency(tokenizer, model, prompt, iterations=2):
    """Time repeated ``model.generate`` calls for a single prompt.

    The prompt is tokenized once. Every iteration then times one ``generate``
    call (at most 50 new tokens) run under ``torch.no_grad()``, decodes the
    first returned sequence and prints it next to the prompt.

    Args:
        tokenizer: Callable tokenizer that also provides ``decode`` and
            ``pad_token_id``.
        model: Object exposing ``generate``.
        prompt: Text to generate from.
        iterations: Number of timed ``generate`` calls.

    Returns:
        ``(mean_latency, latencies)`` in seconds; ``latencies`` holds one
        wall-clock duration per iteration.

    Raises:
        ZeroDivisionError: If ``iterations`` is 0 (nothing to average).
    """
    encoded = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    timings = []

    for run in range(iterations):
        began = time.time()
        with torch.no_grad():
            generated = model.generate(
                encoded["input_ids"],
                attention_mask=encoded["attention_mask"],
                max_new_tokens=50,
                pad_token_id=tokenizer.pad_token_id,
            )
        # Only the generate() call (and the no_grad context) is inside the timed span.
        timings.append(time.time() - began)

        decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
        print(f"Iteration {run + 1}: {prompt} \n  → Model Output: {decoded}\n")

    mean_latency = sum(timings) / len(timings)
    return mean_latency, timings

# Benchmark each sampled GSM8K question; keep (question, mean latency, raw latencies).
# `avg_latency` and `latencies` remain bound to the last question after this loop.
results = []
for idx, question in enumerate(questions):
    print(f"\n🔹 Question {idx+1}: {question}")
    avg_latency, latencies = measure_latency(tokenizer, model, question)
    results.append((question, avg_latency, latencies))

# One line per question: latency of every timed generate() call
fig, ax = plt.subplots(figsize=(10, 6))
for idx, (_, _, per_iteration) in enumerate(results):
    ax.plot(per_iteration, label=f"Q{idx+1}")

ax.set_title("Latency Over Iterations (GSM8K)")
ax.set_xlabel("Iteration")
ax.set_ylabel("Latency (seconds)")
ax.legend()
plt.show()


In [None]:
# `avg_latency` was last assigned inside the benchmarking loop over `questions`, so this
# prints the mean latency of the final question only, not an average over all questions.
print(avg_latency)

In [None]:
from datasets import load_dataset

# Load GSM8K dataset (test split)
gsm8k = load_dataset("gsm8k", "main", split="test")
questions = gsm8k["question"][:3]  # Take the first 3 questions for benchmarking

def measure_throughput_gsm(tokenizer, model, questions, batch_size=8, iterations=3):
    """Measure batched generation throughput in samples per second.

    For iteration ``i`` the batch is ``questions[s:s + batch_size]`` with
    ``s = i % len(questions)``, so a batch holds fewer than ``batch_size``
    prompts whenever fewer remain after ``s``. Throughput is computed from
    the number of prompts actually in the batch, not from ``batch_size``.

    Args:
        tokenizer: Callable tokenizer exposing ``pad_token``, ``eos_token``
            and ``pad_token_id``. Its ``pad_token`` is set to ``eos_token``
            when it is ``None``.
        model: Object exposing ``generate``.
        questions: Sliceable, non-empty sequence of prompt strings.
        batch_size: Maximum number of prompts per batch.
        iterations: Number of timed batches.

    Returns:
        ``(throughputs, average_throughput)``: one samples/second value per
        iteration and their arithmetic mean.

    Raises:
        ValueError: If ``questions`` is empty or ``batch_size`` or
            ``iterations`` is smaller than 1.
    """
    if len(questions) == 0:
        raise ValueError("questions must contain at least one prompt")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    # Padding requires a pad token, so make sure one exists BEFORE tokenizing.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # NOTE(review): decoder-only models generally expect left padding for batched
    # generation; the tokenizer's padding side is not changed here -- confirm it is
    # configured as intended.
    throughputs = []
    for i in range(iterations):
        start = i % len(questions)
        batch = questions[start:start + batch_size]  # may be shorter than batch_size
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, return_attention_mask=True)

        # perf_counter is monotonic, so clock adjustments cannot distort the interval.
        start_time = time.perf_counter()
        model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=50,  # Ensures output length is controlled
            pad_token_id=tokenizer.pad_token_id,
        )
        elapsed = time.perf_counter() - start_time

        # Count the prompts that were really processed.
        throughputs.append(len(batch) / elapsed)

    average_throughput = sum(throughputs) / len(throughputs)
    return throughputs, average_throughput

# Benchmark batched generation on the sampled GSM8K questions
throughputs, average_throughput = measure_throughput_gsm(tokenizer, model, questions)

print(f"Average Throughput: {average_throughput:.2f} samples/second")

# One point per timed batch
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(throughputs)
ax.set_title('Throughput Over Iterations')
ax.set_xlabel('Iteration')
ax.set_ylabel('Throughput (samples/second)')
plt.show()


In [None]:
# Cell 6: Track Memory Usage
def get_memory_usage():
    """Return the ``used`` figure from ``psutil.virtual_memory()`` in bytes.

    ``virtual_memory()`` describes the whole system, so the value is not
    limited to the memory held by this notebook's process.
    """
    return psutil.virtual_memory().used

def track_memory_usage(model, inputs, attention_mask, iterations=25, pad_token_id=None):
    """Sample memory usage after each of several sampled ``generate`` calls.

    Each iteration runs one ``model.generate`` call with sampling enabled and
    then records ``get_memory_usage()``. The reading is taken after the call
    returns, so it is not the peak reached during generation.

    Args:
        model: Object exposing ``generate``.
        inputs: Mapping with an ``'input_ids'`` entry.
        attention_mask: Attention mask forwarded to ``generate``.
        iterations: Number of generate-then-sample rounds.
        pad_token_id: Pad token id forwarded to ``generate``. When ``None``
            (the default) the notebook-level ``tokenizer.pad_token_id`` is
            used, which matches the previous behaviour.

    Returns:
        ``(memory_usages, average_memory_usage)``: one reading per iteration
        and their arithmetic mean.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")
    if pad_token_id is None:
        # Backward-compatible fallback to the global tokenizer defined in the notebook.
        pad_token_id = tokenizer.pad_token_id

    memory_usages = []
    for _ in range(iterations):
        # The generated tokens are not needed; only the memory reading afterwards is kept.
        model.generate(
            inputs['input_ids'],
            attention_mask=attention_mask,  # Provide attention mask
            max_length=100,  # Upper bound passed to generate as max_length
            num_return_sequences=1,  # Number of output sequences
            no_repeat_ngram_size=2,  # Prevent repetition
            top_p=0.92,  # Nucleus sampling (controls randomness)
            top_k=50,  # Top-k sampling
            temperature=0.85,  # Lower temperature makes text less random
            do_sample=True,  # Enable sampling mode to use top_p, top_k, and temperature
            pad_token_id=pad_token_id,  # Set pad token explicitly
        )
        memory_usages.append(get_memory_usage())

    average_memory_usage = sum(memory_usages) / len(memory_usages)
    return memory_usages, average_memory_usage

# Prompt and tokenized inputs used for the memory benchmark
prompt = "What is the capital of Denmark and what is the capital of India?"
inputs = tokenizer(prompt, padding=True, return_tensors="pt")
attention_mask = inputs["attention_mask"]

# Run the generate-then-sample loop
memory_usages, average_memory_usage = track_memory_usage(model, inputs, attention_mask)

# Report the mean reading, converted from bytes to MB
print(f"Average Memory Usage: {average_memory_usage / (1024 * 1024):.2f} MB")

# One point per iteration, converted from bytes to MB
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot([usage / (1024 * 1024) for usage in memory_usages])
ax.set_title('Memory Usage Over Iterations')
ax.set_xlabel('Iteration')
ax.set_ylabel('Memory Usage (MB)')
plt.show()


In [None]:
# Dump the raw series collected by the benchmark cells.
# NOTE: `latencies` is whatever the latency cell's loops bound last, i.e. the timings of
# the final benchmarked question only; the data for every question is kept in `results`.
print("Latencies: \n", latencies)
print("\n\nThroughputs: \n", throughputs)
print("\n\nMemory Usages:\n", memory_usages)