# distilbert inference

In [10]:
import torch
print("is mps available?",torch.backends.mps.is_available())
# device=torch.device("mps")
# torch.set_default_device('cpu')
torch.set_default_device('mps')

is mps available? True


In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
print(type(model))
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
output = model(**tokens)
print(output.logits)

<class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'>
tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], device='mps:0', grad_fn=<LinearBackward0>)


# Apple Llama2 inference

In [3]:
#locking seeds
import torch
import random
random.seed(42)
torch.manual_seed(42)  #locking seeds

<torch._C.Generator at 0x11704e8d0>

In [2]:
!pip install tqdm



## in Mac mps backend
hangs

In [None]:
import os
# need to restart the kernel
# Set environment variable to disable upper limit for memory allocations
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
# Increase pool size: extra memory pool equal to 50% of the main memory pool size will be allocated.
os.environ["PYTORCH_MPS_EXTRA_POOL_SIZE_RATIO"] = "0.5"  
import torch

#setting the device
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print("device is: ",device)

#loading the model
from transformers import LlamaTokenizer, LlamaForCausalLM
model_path='openlm-research/open_llama_3b_v2'
tokenizer=LlamaTokenizer.from_pretrained(model_path,legacy=True);
print("tokenizer is loaded")
model=LlamaForCausalLM.from_pretrained(model_path).to(device)
print("model is loaded")

#memory-saving technique by not storing the intermediate activations of all layers
model.gradient_checkpointing_enable()


# Example usage: move a tensor to the selected device
sequences = ["Example sequence 1", "Example sequence 2"]
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
tokens=tokens.to(device)
print("get tokens")

# Perform inference
with torch.no_grad():
    output = model(**tokens)
print(output)


device is:  mps
tokenizer is loaded


In [None]:
#lora for fine-tuning the model
from peft import LoraConfig, PeftModel
lora_config=LoraConfig(
    r=64,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CASUAL_LM",
)
model=PeftModel(model,lora_config,adapter_name="Shakespeare")
device=torch.device("mps")
model.to(device)

In [None]:
#loading data for fine-tuning
import os
import requests
file_name="shakespeare.txt"
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
if not os.path.exists(file_name):
    data = requests.get(url)
    with open(file_name,'w') as f: 
        f.write(data.text)
from transformers import TextDataset
train_dataset = TextDataset (tokenizer=tokenizer, file_path=file_name, block_size=128) [:256]

In [None]:
#set training parameters
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    output_dir="output",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    evaluation_strategy='no',
)

from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) #form batches of data
trainer = Trainer( 
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

In [5]:
#check the model before training by prompt, tokenization, and generation
def generate_response(prompt_text,model,tokenizer,max_length=30,num_return_sequences=1):
    input_ids=tokenizer.encode(prompt_text,return_tensors="pt").to(device)
    print(input_ids)
    
    output_sequences=model.generate(
        input_ids=input_ids,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=2,
    )
    
    responses=[]
    for id,response_id in enumerate(output_sequences):
        response=tokenizer.decode(response_id,skip_special_tokens=True)
        responses.append(response)
        print("reponse ",id," is generated")
    return responses

prompt_text="To be or not to be"
responses=generate_response(prompt_text,model,tokenizer)
for response in responses:
    print(response)

tensor([[   1, 1240,  339,  408,  437,  290,  339]], device='mps:0')


Traceback (most recent call last):
  File "/opt/anaconda3/envs/LLMPower/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/d_/_lmsrk1d2h5cwkq89xphqfwm0000gn/T/ipykernel_3570/2695243141.py", line 21, in <module>
    responses=generate_response(prompt_text,model,tokenizer)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/d_/_lmsrk1d2h5cwkq89xphqfwm0000gn/T/ipykernel_3570/2695243141.py", line 6, in generate_response
    output_sequences=model.generate(
                     ^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/LLMPower/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/LLMPower/lib/python3.12/site-packages/transformers/generation/utils.py", line 2024, in generate
    result = self._sample(
             ^^^^^^^^^^

KeyboardInterrupt: 

In [None]:
trainer.train()

In [None]:
save_path = "merged_fine_tuned_openllama2_3b_shakespeare"
tokenizer.save_pretrained(save_path)
merged_model = model.merge_and_unload() #merge model and adapter into a single model
merged_model.save_pretrained(save_path)

## In Mac, cpu backend

In [4]:
import os
# need to restart the kernel
# Set environment variable to disable upper limit for memory allocations
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
# Increase pool size: extra memory pool equal to 50% of the main memory pool size will be allocated.
os.environ["PYTORCH_MPS_EXTRA_POOL_SIZE_RATIO"] = "0.5"  
import torch

#setting the device
torch.set_default_device('cpu')
print("set default device to cpu")
device = torch.device("cpu")
print("device(var) is: ",device)

#loading the model
from transformers import LlamaTokenizer, LlamaForCausalLM
model_path='openlm-research/open_llama_3b_v2'
tokenizer=LlamaTokenizer.from_pretrained(model_path,legacy=True);
print("tokenizer is loaded")
model=LlamaForCausalLM.from_pretrained(model_path)
print("model is loaded")

#memory-saving technique by not storing the intermediate activations of all layers
model.gradient_checkpointing_enable()


set default device to cpu
device(var) is:  cpu
tokenizer is loaded
model is loaded


In [5]:
#e.g.
sequences = ["Example sequence 1", "Example sequence 2"]
#Llama don't include a 'pad_token' by default, so we need to add eos_token
tokenizer.pad_token = tokenizer.eos_token
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
# tokens=tokens.to(device)
print("get tokens")

# Perform inference
with torch.no_grad():
    output = model(**tokens)
print(output)

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


get tokens
CausalLMOutputWithPast(loss=None, logits=tensor([[[-86.4790, -82.7401, -75.2648,  ..., -83.2248, -83.6001, -84.0480],
         [-69.1984, -64.9035, -58.1323,  ..., -69.1593, -68.4532, -68.7843],
         [-72.0738, -68.7730, -58.9978,  ..., -71.1235, -70.0178, -69.7289],
         [-13.0008,  -9.2679,  -0.0981,  ...,  -6.8345,  -9.4190,  -7.7957],
         [-70.1116, -66.3901, -57.9048,  ..., -71.1395, -70.7951, -69.7212]],

        [[-86.4790, -82.7401, -75.2648,  ..., -83.2248, -83.6001, -84.0480],
         [-69.1984, -64.9035, -58.1323,  ..., -69.1593, -68.4532, -68.7843],
         [-72.0738, -68.7730, -58.9978,  ..., -71.1235, -70.0178, -69.7289],
         [-13.0008,  -9.2679,  -0.0981,  ...,  -6.8345,  -9.4190,  -7.7957],
         [-72.2647, -68.1028, -59.5370,  ..., -72.6572, -72.5540, -70.7121]]]), past_key_values=((tensor([[[[ 0.9957,  0.5055, -0.6987,  ..., -0.2885, -0.8605, -0.0536],
          [ 0.7225, -0.2219, -0.1725,  ..., -0.5091,  0.4595,  0.2045],
          [

In [2]:
#check the model before training by prompt, tokenization, and generation
def generate_response(prompt_text,model,tokenizer,max_length=30,num_return_sequences=1):
    print("the device is: ",device)
    input_ids=tokenizer.encode(prompt_text,return_tensors="pt").to(device)
    print("input_ids is: ",input_ids)
    
    output_sequences=model.generate(
        input_ids=input_ids,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=2,
    )
    
    responses=[]
    for id,response_id in enumerate(output_sequences):
        response=tokenizer.decode(response_id,skip_special_tokens=True)
        responses.append(response)
        print("reponse ",id," is generated")
    return responses

prompt_text="To be or not to be"
responses=generate_response(prompt_text,model,tokenizer)
for response in responses:
    print(response)

the device is:  cpu
tensor([[   1, 1240,  339,  408,  437,  290,  339]])


KeyboardInterrupt: 

# tqdm: add progress bars to loops

In [3]:
#test tqdm: add progress bars to loops
from tqdm import tqdm
import time

total_iterations = 100

# Wrap your loop with tqdm for a progress bar
for i in tqdm(range(total_iterations), desc="Processing"):
    # Simulate work with a sleep
    time.sleep(0.1)  # Replace with your actual code

print("Done!")

Processing: 100%|██████████| 100/100 [00:10<00:00,  9.50it/s]

Done!





# energyusage
    doesn't support Mac

In [4]:
!pip install energyusage

Collecting energyusage
  Downloading energyusage-0.0.14-py3-none-any.whl.metadata (14 kB)
Collecting reportlab (from energyusage)
  Downloading reportlab-4.2.2-py3-none-any.whl.metadata (1.4 kB)
Collecting pillow>=9.0.0 (from reportlab->energyusage)
  Downloading pillow-10.4.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (9.2 kB)
Collecting chardet (from reportlab->energyusage)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading energyusage-0.0.14-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading reportlab-4.2.2-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading pillow-10.4.0-cp312-cp312-macosx_11_0_arm64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m24.5 MB/s[0m eta 

In [8]:
import energyusage

# user function to be evaluated
def recursive_fib(n):
    if (n <= 2): return 1
    else: return recursive_fib(n-1) + recursive_fib(n-2)

energyusage.evaluate(recursive_fib, 40, pdf=True)
# returns 102,334,155

IndexError: tuple index out of range

# powermetrics

In [9]:
!sudo powermetrics --samplers cpu_power,gpu_power --show-process-energy --interval 1

Password:
sudo: a password is required


In [4]:
!sudo -S powermetrics --samplers cpu_power,gpu_power 

Password:powermetrics: unrecognized sampler: --show-process-energy


In [15]:
!pip install asitop

Collecting asitop
  Downloading asitop-0.0.24-py3-none-any.whl.metadata (530 bytes)
Collecting dashing (from asitop)
  Downloading dashing-0.1.0.tar.gz (9.6 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting blessed (from dashing->asitop)
  Downloading blessed-1.20.0-py2.py3-none-any.whl.metadata (13 kB)
Downloading asitop-0.0.24-py3-none-any.whl (8.6 kB)
Downloading blessed-1.20.0-py2.py3-none-any.whl (58 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.4/58.4 kB[0m [31m721.7 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hBuilding wheels for collected packages: dashing
  Building wheel for dashing (setup.py) ... [?25ldone
[?25h  Created wheel for dashing: filename=dashing-0.1.0-py3-none-any.whl size=7288 sha256=8dcc53e5b9956cc647aa97d48e44fbfb7f85fceb3ddca2f13744863aa4ae9b7e
  Stored in directory: /Users/adrianhwang/Library/Caches/pip/wheels/b6/bc/0f/53e7d908ed4225cdfe70693df23e062a5f6d3ffacf201117ea
Successfully built dashing
I

In [2]:
!sudo -S asitop --interval 1 < pwd

Password:
ASITOP - Performance monitoring CLI tool for Apple Silicon
You can update ASITOP by running `pip install asitop --upgrade`
Get help at `https://github.com/tlkh/asitop`
P.S. You are recommended to run ASITOP with `sudo asitop`


[1/3] Loading ASITOP

[?25l

[2/3] Starting powermetrics process


[3/3] Waiting for first reading...

[H[2J[32m[1;1H┌──────────────────────────────────────────────────────────────────────────────┐
[2;1H│
[2;80H│
[3;1H│
[3;80H│
[4;1H│
[4;80H│
[5;1H│
[5;80H│
[6;1H│
[6;80H│
[7;1H└──────────────────────────────────────────────────────────────────────────────┘
[1;3H[32m  Apple M3 (cores: 4E+4P+10GPU)  
[2;2HE-CPU Usage: 0% @ 0 MHz                
[32m[3;3H
[3;2H▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏
[2;41HP-CPU Usage: 0% @ 0 MHz                
[32m[3;42H
[3;41H▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏
[4;2HGPU Usage: 7% @ 502 MHz                
[32m[5;3H
[5;2H▉▉▋▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏▏