In [10]:
import os
import tensorflow as tf
# Disable GPU usage
tf.config.set_visible_devices([], 'GPU')

# Check available devices
print("Available devices:", tf.config.list_physical_devices())


Available devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


In [11]:
!ulimit -Sv 16000000

import multiprocessing

num_cores = 3  # Set the number of CPUs you want to use
multiprocessing.set_start_method("fork", force=True)

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the GPT-2 tokenizer and model
model_name = "gpt2-med"  # You can use "gpt2-medium", "gpt2-large", or "gpt2-xl" for larger versions
# Load model directly

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2-medium")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2-medium")

In [13]:
# Set the pad_token to eos_token (or add a custom pad_token if necessary)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Set EOS token as the pad_token

# Set the model to evaluation mode
model.eval()

# Define a prompt
prompt = "What is the capital of Denmark?"

# Encode the prompt (tokenize it) with padding
inputs = tokenizer(prompt, return_tensors="pt", padding=True)  # Ensure padding is applied if needed

# Add attention mask to the inputs
attention_mask = inputs['attention_mask']

# Generate text from the model with do_sample set to True for sampling-based generation
outputs = model.generate(
    inputs['input_ids'],
    attention_mask=attention_mask,  # Provide attention mask
    max_length=100,                  # Maximum number of tokens to generate
    num_return_sequences=1,         # Number of output sequences
    no_repeat_ngram_size=2,         # Prevent repetition
    top_p=0.92,                     # Nucleus sampling (controls randomness)
    top_k=50,                       # Top-k sampling
    temperature=0.85,               # Lower temperature makes text less random
    do_sample=True,                 # Enable sampling mode to use top_p, top_k, and temperature
    pad_token_id=tokenizer.pad_token_id  # Set pad token explicitly
)

# Decode the generated tokens back into text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(generated_text)


What is the capital of Denmark?

The capital city of Copenhagen is Denmark, which is located on the Danish west coast. Denmark is a country with a population of approximately 30 million. The capital is Copenhagen and the city is known as the European Capital of Culture and Technology.
,



[ ]

,

 . .

 (


In [14]:
import psutil
import time

# Function to get memory usage
def get_memory_usage():
    memory = psutil.virtual_memory()
    return memory.percent

# Set the pad_token to eos_token (or add a custom pad_token if necessary)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Set EOS token as the pad_token

# Set the model to evaluation mode
model.eval()

# Define a prompt
prompt = "What is the capital of Denmark?"

# Encode the prompt (tokenize it) with padding
inputs = tokenizer(prompt, return_tensors="pt", padding=True)  # Ensure padding is applied if needed

# Add attention mask to the inputs
attention_mask = inputs['attention_mask']

# Track memory usage before running your model
before_memory = get_memory_usage()
print(f"Memory Usage Before Model: {before_memory}%")

# Generate text from the model with do_sample set to True for sampling-based generation
outputs = model.generate(
    inputs['input_ids'],
    attention_mask=attention_mask,  # Provide attention mask
    max_length=100,                  # Maximum number of tokens to generate
    num_return_sequences=1,         # Number of output sequences
    no_repeat_ngram_size=2,         # Prevent repetition
    top_p=0.92,                     # Nucleus sampling (controls randomness)
    top_k=50,                       # Top-k sampling
    temperature=0.85,               # Lower temperature makes text less random
    do_sample=True,                 # Enable sampling mode to use top_p, top_k, and temperature
    pad_token_id=tokenizer.pad_token_id  # Set pad token explicitly
)

# Decode the generated tokens back into text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

after_memory = get_memory_usage()
print(f"Memory Usage After Model: {after_memory}%")

print(generated_text)


Memory Usage Before Model: 41.4%
Memory Usage After Model: 41.6%
What is the capital of Denmark?

Danes have a capital called Kristiansand.
: There is a city called Danket, and then we have another capital, Kristianstad, with the same name.


What did Denmark lose when Sweden was divided?

 A lot of what the Swedish people lost was the fact that the Swedes were better at sports and at everything else. That's why they won the World Cup. It's also why the Finns lost the


In [15]:
import time

# Set pad_token_id to eos_token_id
tokenizer.pad_token = tokenizer.eos_token

# Encode the prompt (tokenize it)
inputs = tokenizer("Mary had a little lamb", return_tensors="pt", padding=True, truncation=True)

# Add the attention mask to the inputs
attention_mask = inputs['attention_mask']

# Measure the time taken for text generation
start_time = time.time()
outputs = model.generate(
    inputs['input_ids'],
    attention_mask=attention_mask,  # Pass the attention mask
    max_length=50,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    top_p=0.92,
    top_k=50,
    temperature=0.85
)
end_time = time.time()

# Calculate latency
latency = end_time - start_time
print(f"Latency: {latency * 1000:.2f} ms")

# Decode the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Latency: 3684.45 ms
Mary had a little lamb, and she brought it to the Lord, saying, "I have brought you a lamb to eat with you." And the angel of the LORD said to her, "'I will bring you another lamb for you, but this


In [16]:
import time

batch_size = 8
prompt = "Mary had a little lamb"  # Ensure prompt is defined

# Set pad_token to eos_token as GPT-2 does not have a pad_token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the inputs with padding, truncation, and attention mask
inputs = tokenizer([prompt] * batch_size, return_tensors="pt", padding=True, truncation=True, return_attention_mask=True)

# Check if the attention mask is included (it should be automatically if padding=True)
print(inputs.keys())  # This should include 'attention_mask' and 'input_ids'

start_time = time.time()
for _ in range(25):  # Run 100 iterations
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=50,
        pad_token_id=tokenizer.pad_token_id  # Explicitly set the pad_token_id
    )
end_time = time.time()

# Calculate throughput
throughput = (batch_size * 100) / (end_time - start_time)
print(f"Throughput: {throughput:.2f} samples/second")

dict_keys(['input_ids', 'attention_mask'])
Throughput: 2.74 samples/second


In [17]:
import time
!pip install datasets
from datasets import load_dataset

# Define batch size
batch_size = 8

# Load dataset (choose config appropriately)
gsm8k_dataset = load_dataset("gsm8k", name="main", split="test")
math_dataset = load_dataset("math_dataset", name="algebra__linear_1d", split="test")

# Choose the dataset to use, e.g., GSM8K
dataset = gsm8k_dataset  # Or replace with math_dataset

# Example preprocessing: Extract the 'question' field for prompts (adjust field name if needed)
prompts = dataset['question'][:batch_size]  # Get a batch of prompts

# Load tokenizer and model (GPT-2 example)
ttokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2-medium")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2-medium")

# Set padding_side to 'left' when initializing the tokenizer
tokenizer.padding_side = 'left'

# Set pad_token to eos_token (since GPT-2 doesn't have a pad_token)
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the inputs
inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, return_attention_mask=True)

# Check if the attention mask is included
print(inputs.keys())  # This should include 'attention_mask' and 'input_ids'

# Run the model and measure latency
start_time = time.time()
for _ in range(10):  # Run 10 iterations (adjust as needed)
    outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_new_tokens=50)  # Only generate 50 new tokens
end_time = time.time()

# Calculate throughput
throughput = (batch_size * 10) / (end_time - start_time)
print(f"Throughput: {throughput:.2f} samples/second")

# Optionally: Print model outputs
for i, output in enumerate(outputs):
    print(f"Prompt: {prompts[i]}")
    print(f"Generated Output: {tokenizer.decode(output, skip_special_tokens=True)}")



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


dict_keys(['input_ids', 'attention_mask'])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Throughput: 0.36 samples/second
Prompt: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
Generated Output: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?

"I make $2.50 a day," she says. "I don't make much money. I don't make much money. I don't make much money."

She says she's not sure how much she makes from
Prompt: A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?
Generated Output: A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in tot