In [None]:
import os
import resource
import psutil

# Cap the virtual address space of this process at 16 GiB; the same value is
# used for the soft and the hard limit.
memory_limit = 16 * 2**30
resource.setrlimit(resource.RLIMIT_AS, (memory_limit,) * 2)

# Pin the current process to CPU cores 0 through 3.
p = psutil.Process()
p.cpu_affinity(list(range(4)))

In [None]:
import tensorflow as tf
def limit_virtual_memory(max_mb):
    """Set the soft ``RLIMIT_AS`` (virtual address space) limit of this process.

    Args:
        max_mb: Requested soft limit in mebibytes.

    The hard limit is read and passed back unchanged. A request larger than a
    finite hard limit is clamped to that hard limit, because ``setrlimit``
    raises ``ValueError`` when the soft limit would exceed the hard one.
    """
    _, hard = resource.getrlimit(resource.RLIMIT_AS)
    requested = max_mb * 1024 * 1024
    # RLIM_INFINITY means "no hard limit", so only clamp against finite values.
    if hard != resource.RLIM_INFINITY and requested > hard:
        requested = hard
    resource.setrlimit(resource.RLIMIT_AS, (requested, hard))

# NOTE(review): limit_virtual_memory() takes mebibytes and multiplies by
# 1024 * 1024 itself, but this argument is 4 GiB already expressed in bytes,
# so the requested soft limit is 2**52 bytes rather than 4 GiB. The value is
# left as-is; confirm the intended cap (4 GiB would be `4 * 1024`).
limit_virtual_memory(4 * 1024**3)

# Show the GPUs TensorFlow can see, first as physical and then as logical devices.
physical_gpus = tf.config.list_physical_devices("GPU")
print("Physical GPU Devices:", physical_gpus)

logical_gpus = tf.config.list_logical_devices("GPU")
print("Logical GPU Devices:", logical_gpus)

In [None]:
import tensorflow as tf
import torch
import psutil
#from transformers import GPT2LMHeadModel, GPT2Tokenizer
#from transformers import T5Tokenizer, T5ForConditionalGeneration
#from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import AutoModelForCausalLM, AutoTokenizer
import time
import matplotlib.pyplot as plt

# Hide every GPU from TensorFlow so it only uses the CPU.
# set_visible_devices() raises RuntimeError once the TensorFlow runtime has
# been initialized (the earlier tf.config.list_logical_devices() call does
# that), so report the failure instead of aborting the notebook.
try:
    tf.config.set_visible_devices([], 'GPU')
except RuntimeError as err:
    print("Could not hide GPUs from TensorFlow (runtime already initialized):", err)

# Ask TensorFlow not to register XLA devices.
os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices=false'

print("Available devices:", tf.config.list_physical_devices())
print("CPU Count:", os.cpu_count())

In [None]:
#model_name = "t5-large"  # You can use "gpt2-medium", "gpt2-large", or "gpt2-xl" for larger versions
#tokenizer = T5Tokenizer.from_pretrained("t5-base")
#model = T5ForConditionalGeneration.from_pretrained("t5-base")

#model_name = "gpt2-medium"  # You can use "gpt2-medium", "gpt2-large", or "gpt2-xl" for larger versions
#tokenizer = GPT2Tokenizer.from_pretrained(model_name)
#model = GPT2LMHeadModel.from_pretrained(model_name)

#model_name = "facebook/bart-large"
#tokenizer = BartTokenizer.from_pretrained(model_name)
#model = BartForConditionalGeneration.from_pretrained(model_name)

# Checkpoint used by the benchmarks below; the weights are loaded as float32.
model_name = "Qwen/Qwen2-Math-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
)

In [None]:

# Put the model into evaluation (inference) mode.
model.eval()

# Reuse the end-of-sequence token for padding when no pad token is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize one smoke-test prompt into PyTorch tensors and keep its mask.
prompt = "What is the capital of Denmark?"
inputs = tokenizer(prompt, padding=True, return_tensors="pt")
attention_mask = inputs["attention_mask"]

In [None]:
from datasets import load_dataset


# Load the "test" split of the "main" GSM8K configuration.
gsm8k = load_dataset("gsm8k", "main", split="test")
# Keep only the first three questions for the latency benchmark.
questions = gsm8k["question"][:3] 


def measure_latency(tokenizer, model, prompt, iterations=2):
    """Time repeated ``model.generate`` calls for a single prompt.

    The prompt is wrapped in ``<|im_start|>user ... <|im_end|>`` /
    ``<|im_start|>assistant`` markers (with a "Let's think step by step."
    suffix) before it is tokenized. Each iteration generates at most 50 new
    tokens and prints the decoded output.

    Args:
        tokenizer: Callable returning a mapping with ``input_ids`` and
            ``attention_mask``; must also provide ``decode`` and
            ``pad_token_id``.
        model: Object exposing ``generate``.
        prompt: Question text to send to the model.
        iterations: Number of timed generations; must be at least 1.

    Returns:
        Tuple ``(average_latency, latencies)``, both in seconds.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    # Build the chat-formatted model input from the prompt that was passed in.
    chat_templated = (
        f"<|im_start|>user\n{prompt}\nLet's think step by step.<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    encoded = tokenizer(chat_templated, return_tensors="pt", padding=True, truncation=True)

    latencies = []
    for i in range(iterations):
        # perf_counter is monotonic and high resolution, unlike wall-clock time.time().
        start_time = time.perf_counter()
        with torch.no_grad():
            output_ids = model.generate(
                encoded["input_ids"],
                attention_mask=encoded["attention_mask"],
                max_new_tokens=50,
                pad_token_id=tokenizer.pad_token_id,
            )
        latencies.append(time.perf_counter() - start_time)

        # Decoding happens outside the timed region so it does not skew latency.
        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        print(f"Iteration {i+1}: {prompt} \n  → Model Output: {output_text}\n")

    return sum(latencies) / len(latencies), latencies


# Benchmark every question and keep (question, average latency, per-run latencies).
results = []
for idx, question in enumerate(questions):
    print(f"\n🔹 Question {idx+1}: {question}")
    avg_latency, latencies = measure_latency(tokenizer, model, question)
    results.append((question, avg_latency, latencies))

# Draw one line per question showing the latency of each timed iteration.
fig, ax = plt.subplots(figsize=(10, 6))
for idx, (_, _, latencies) in enumerate(results):
    ax.plot(latencies, label=f"Q{idx+1}")

ax.set_title("Latency Over Iterations (GSM8K)")
ax.set_xlabel("Iteration")
ax.set_ylabel("Latency (seconds)")
ax.legend()
plt.show()


In [None]:
# `avg_latency` only holds the value left over from the last benchmark loop
# iteration, so report the average of every question stored in `results`
# (tuples of question, average latency, per-run latencies) plus the overall mean.
for q_idx, entry in enumerate(results, start=1):
    print(f"Q{q_idx} average latency: {entry[1]:.3f} s")
if results:
    overall_mean = sum(entry[1] for entry in results) / len(results)
    print(f"Mean over all questions: {overall_mean:.3f} s")

In [None]:
from datasets import load_dataset


# Load the "test" split of the "main" GSM8K configuration.
gsm8k = load_dataset("gsm8k", "main", split="test")
# Keep the first eight questions for the throughput benchmark.
questions = gsm8k["question"][:8] 

def measure_throughput_gsm(tokenizer, model, questions, batch_size=8, iterations=3):
    """Measure batched generation throughput in samples per second.

    Each iteration tokenizes a batch of ``batch_size`` questions, times one
    ``model.generate`` call (at most 50 new tokens) and records
    ``samples / elapsed_seconds``.

    Args:
        tokenizer: Callable returning a mapping with ``input_ids`` and
            ``attention_mask``; must expose ``pad_token``, ``eos_token`` and
            ``pad_token_id``.
        model: Object exposing ``generate``.
        questions: Non-empty indexable sequence of question strings.
        batch_size: Number of prompts per batch; must be at least 1.
        iterations: Number of timed batches; must be at least 1.

    Returns:
        Tuple ``(throughputs, average_throughput)``.

    Raises:
        ValueError: If ``questions`` is empty or ``batch_size`` /
            ``iterations`` is smaller than 1.
    """
    total = len(questions)
    if total == 0:
        raise ValueError("questions must not be empty")
    if batch_size < 1 or iterations < 1:
        raise ValueError("batch_size and iterations must be at least 1")

    # Padding requires a pad token, so install the fallback before tokenizing.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    throughputs = []
    for i in range(iterations):
        # Wrap around the question list so every batch holds exactly
        # batch_size prompts; a plain slice shrinks as the start index advances.
        first = i % total
        batch = [questions[(first + offset) % total] for offset in range(batch_size)]
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            return_attention_mask=True,
        )

        # perf_counter is monotonic and high resolution, unlike wall-clock time.time().
        start_time = time.perf_counter()
        model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=50,  # Ensures output length is controlled
            pad_token_id=tokenizer.pad_token_id,
        )
        elapsed = time.perf_counter() - start_time

        # Divide by the number of prompts actually sent to the model.
        throughputs.append(len(batch) / elapsed)

    average_throughput = sum(throughputs) / len(throughputs)
    return throughputs, average_throughput


# Run the throughput benchmark on the selected questions and report the mean.
throughputs, average_throughput = measure_throughput_gsm(tokenizer, model, questions)
print(f"Average Throughput: {average_throughput:.2f} samples/second")

# Plot the throughput recorded for each timed iteration.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(throughputs)
ax.set_title("Throughput Over Iterations")
ax.set_xlabel("Iteration")
ax.set_ylabel("Throughput (samples/second)")
plt.show()


In [None]:
pip install -U transformers bitsandbytes accelerate torch

In [None]:
pip install -U bitsandbytes

In [None]:
pip install datasets

In [None]:
pip install openai