# Set-up the Environment

In [None]:
# Let's ensure we're using the correct Python environment

import sys
print(sys.executable)

# Or in linux-based systems:
# !which python

In [3]:
!pip install pandas numpy python-dotenv torch transformers
# !pip install -r requirements.txt

# Import requirements

In [24]:
import os
import time
import torch

from dotenv import load_dotenv
from transformers import pipeline

In [23]:
# Load the environment variables
load_dotenv()

# Check the environment variables
print(os.getenv('SOME_SECRET_KEY'))
print(os.getenv('ANOTHER_SECRET_KEY'))

here_is_my_secret_key
None


# Let's go load the model

In [3]:
# Check if CUDA is available
print(torch.cuda.is_available())
print(torch.backends.mps.is_available())  # apple silicon


False
True


In [13]:
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    # Apple Silicon
    device = 'mps'
else:
    device = 'cpu'

# device = 'cpu'  # force to cpu

print(device)

mps


## LLama 1B

In [4]:
model_name = "meta-llama/Llama-3.2-1B-Instruct"

llama_1b_pipe = pipeline(
    "text-generation",
    model=model_name,
    torch_dtype=torch.bfloat16,
    device=device
)

Device set to use mps


In [5]:
print(llama_1b_pipe.device)
print(llama_1b_pipe.torch_dtype)

start_time = time.time()

try:
    result = llama_1b_pipe("Toktok, who is there?", max_new_tokens=128)
except KeyboardInterrupt:
    result = None

end_time = time.time()

print(f"Time taken: {end_time - start_time} seconds")
print(result)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


cpu
torch.bfloat16
Time taken: 496.57046031951904 seconds
[{'generated_text': "Toktok, who is there? What do they do?\nToktok is a popular American rapper, singer, songwriter, and record producer. He is the leader of the hip-hop group 21 Savage, and has also released solo music.\n\nI'm not sure if you're asking about Toktok or 21 Savage. Both are known for their unique styles and contributions to hip-hop.\n\nIf you could provide more context or clarify which Toktok you are referring to, I'd be happy to help further!"}]


In [5]:
print(llama_1b_pipe.device)
print(llama_1b_pipe.torch_dtype)

start_time = time.time()

try:
    result = llama_1b_pipe("Toktok, who is there?", max_new_tokens=128)
except KeyboardInterrupt:
    result = None

end_time = time.time()

print(f"Time taken: {end_time - start_time} seconds")
print(result)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


mps
torch.bfloat16
Time taken: 4.574230909347534 seconds
[{'generated_text': 'Toktok, who is there? I don’t know\nI’m not aware of anyone named Toktok being in the news. Can you provide more context or clarify what you are referring to? I’m here to help with any questions you may have.'}]


## TinyLlama 110M

In [25]:
model_name = "nickypro/tinyllama-110M"

tinyllama_pipe = pipeline(
    "text-generation",
    model=model_name,
)

tinyllama_pipe("Toktok, who is there?", max_new_tokens=128)

Device set to use mps:0
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'generated_text': 'Toktok, who is there? That means he was there.\nHe had been walking on the beach when he heard a sound. It was a whistling. He looked around and saw a little bird. It was perched on a rock, whistling.\nBrying was curious. He wanted to know why the bird was whistling.\nSo he asked the bird, "Why are you whistling?"\nThe bird said, "I was just feeling a bit tired. I was flying all day and I needed a break."\nBrying was surprised. He thought the bird was being selfish by not sharing the moment with'}]

In [None]:
model_name = "nickypro/tinyllama-110M"

tinyllama_pipe = pipeline(
    "text-generation",
    model=model_name,
    torch_dtype=torch.bfloat16,
    device=device
)

Device set to use mps:0


In [19]:
print(tinyllama_pipe.device)
print(tinyllama_pipe.torch_dtype)

start_time = time.time()

try:
    result = tinyllama_pipe("Toktok, who is there?", max_new_tokens=128)
except KeyboardInterrupt:
    result = None

end_time = time.time()

print(f"Time taken: {end_time - start_time} seconds")
print(result)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


mps:0
torch.bfloat16
Time taken: 4.106837034225464 seconds
[{'generated_text': 'Toktok, who is there? He was at the park with his mom and dad. He had a big smile on his face because he was so excited to go on the slide.\nHis mom said, "Come on, let\'s go get a ticket! We will go down the slide and then we will go get ice cream."\nBaby was so excited and quickly ran over to the ticket booth. He asked the nice lady for a ticket, and she gave it to him.\nBaby\'s mom said, "Hold on tight. That\'s the slide."\nBaby was so happy and he quickly ran over to'}]


---

# Bonus: LMStudio

In [12]:
import requests
import json

url = "http://127.0.0.1:1234/v1/chat/completions"
headers = {"Content-Type": "application/json"}
data = {
    "model": "llama-3.2-1b-instruct",
    "messages": [
        {
            "role": "system",
            "content": "You are a helpful jokester."
        },
        {
            "role": "user",
            "content": "Tell me a joke."
        }
    ],
    "response_format": {
        "type": "json_schema",
        "json_schema": {
            "name": "joke_response",
            "strict": "true",
            "schema": {
                "type": "object",
                "properties": {
                    "joke": {
                        "type": "string"
                    }
                },
                "required": ["joke"]
            }
        }
    },
    "temperature": 0.7,
    "max_tokens": 50,
    "stream": False
}

response = requests.post(url, headers=headers, data=json.dumps(data))
print(response.text)


{
  "id": "chatcmpl-5stil8thtwn5r565z2vodk",
  "object": "chat.completion",
  "created": 1748583675,
  "model": "llama-3.2-1b-instruct",
  "choices": [
    {
      "index": 0,
      "logprobs": null,
      "finish_reason": "stop",
      "message": {
        "role": "assistant",
        "content": "{\n   \"joke\": \"What do you call a fake noodle? An impasta.\"\n}"
      }
    }
  ],
  "usage": {
    "prompt_tokens": 47,
    "completion_tokens": 21,
    "total_tokens": 68
  },
  "stats": {},
  "system_fingerprint": "llama-3.2-1b-instruct"
}
