# Check-up the Environment

In [None]:
!which python

In [3]:
# !pip install pandas numpy python-dotenv torch transformers --quiet
!pip install -r requirements.txt --quiet

# Import necessary libraries

In [6]:
import os
import time
import pandas as pd
import torch

from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [2]:
# Load the environment variables
load_dotenv()

# Check the environment variables
os.environ["SOME_SECRET_KEY"]

'here_is_my_secret_key'

In [3]:
# Check if CUDA is available
print(torch.cuda.is_available())
print(torch.backends.mps.is_available())  # apple silicon


False
True


# Let's go load the model

In [7]:
model_name = "meta-llama/Llama-3.2-1B-Instruct"

device = 0 if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
device = 'cpu'

llama_1b_pipe = pipeline(
    "text-generation",
    model=model_name,
    torch_dtype=torch.bfloat16,
    device=device
)

Device set to use cpu


In [11]:
# Try call (device=cpu, no torch_dtype)
llama_1b_pipe("Toktok, who is there?", max_new_tokens=128)

# not good, taking so long time

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


KeyboardInterrupt: 

In [None]:
# # Try call (device=mps, no torch_dtype)
# llama_1b_pipe("Toktok, who is there?", max_new_tokens=128)

# # not good, taking so long time

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


KeyboardInterrupt: 

In [9]:
# Try call (device=cpu, torch_dtype=torch.bfloat16)

start_time = time.time()

try:
    result = llama_1b_pipe("Toktok, who is there?", max_new_tokens=128)
except KeyboardInterrupt:
    result = None

end_time = time.time()

print(f"Time taken: {end_time - start_time} seconds")
print(result)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Time taken: 420.8971338272095 seconds
[{'generated_text': "Toktok, who is there? \nYou, you, you... Toktok, who is there? \n\nAnswer: It's you.\n\nI know what you're saying, you're saying that you're there, but you're not, you're not. You're just a projection of my thoughts, a manifestation of my mind. You're not a real person, you're just a product of my brain."}]


In [6]:
model_name = "meta-llama/Llama-3.2-1B-Instruct"

# Determine the appropriate device
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

device = torch.device('cpu')  # keep using 'cpu' because got OOM when using mps
print(device)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.to(device)


cpu


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb):

In [None]:
# Prepare input
prompt = "Toktok, who is there?"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate output
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )

# Decode and print the output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

# oof, still taking so long time


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


KeyboardInterrupt: 

# Try LMStudio

In [11]:
import requests
import json

url = "http://127.0.0.1:1234/v1/chat/completions"
headers = {"Content-Type": "application/json"}
data = {
    "model": "llama-3.2-1b-instruct",
    "messages": [
        {"role": "system", "content": "Always answer in rhymes."},
        {"role": "user", "content": "Introduce yourself."}
    ],
    "temperature": 0.7,
    "max_tokens": -1,
    "stream": True
}

response = requests.post(url, headers=headers, data=json.dumps(data))
print(response.text)


data: {"id":"chatcmpl-1zwo4ezprnhwbf9nvvthlm","object":"chat.completion.chunk","created":1748583638,"model":"llama-3.2-1b-instruct","system_fingerprint":"llama-3.2-1b-instruct","choices":[{"index":0,"delta":{"role":"assistant","content":"Hello"},"logprobs":null,"finish_reason":null}]}

data: {"id":"chatcmpl-1zwo4ezprnhwbf9nvvthlm","object":"chat.completion.chunk","created":1748583638,"model":"llama-3.2-1b-instruct","system_fingerprint":"llama-3.2-1b-instruct","choices":[{"index":0,"delta":{"role":"assistant","content":" there"},"logprobs":null,"finish_reason":null}]}

data: {"id":"chatcmpl-1zwo4ezprnhwbf9nvvthlm","object":"chat.completion.chunk","created":1748583638,"model":"llama-3.2-1b-instruct","system_fingerprint":"llama-3.2-1b-instruct","choices":[{"index":0,"delta":{"role":"assistant","content":","},"logprobs":null,"finish_reason":null}]}

data: {"id":"chatcmpl-1zwo4ezprnhwbf9nvvthlm","object":"chat.completion.chunk","created":1748583638,"model":"llama-3.2-1b-instruct","system_fi

In [12]:
import requests
import json

url = "http://127.0.0.1:1234/v1/chat/completions"
headers = {"Content-Type": "application/json"}
data = {
    "model": "llama-3.2-1b-instruct",
    "messages": [
        {
            "role": "system",
            "content": "You are a helpful jokester."
        },
        {
            "role": "user",
            "content": "Tell me a joke."
        }
    ],
    "response_format": {
        "type": "json_schema",
        "json_schema": {
            "name": "joke_response",
            "strict": "true",
            "schema": {
                "type": "object",
                "properties": {
                    "joke": {
                        "type": "string"
                    }
                },
                "required": ["joke"]
            }
        }
    },
    "temperature": 0.7,
    "max_tokens": 50,
    "stream": False
}

response = requests.post(url, headers=headers, data=json.dumps(data))
print(response.text)


{
  "id": "chatcmpl-5stil8thtwn5r565z2vodk",
  "object": "chat.completion",
  "created": 1748583675,
  "model": "llama-3.2-1b-instruct",
  "choices": [
    {
      "index": 0,
      "logprobs": null,
      "finish_reason": "stop",
      "message": {
        "role": "assistant",
        "content": "{\n   \"joke\": \"What do you call a fake noodle? An impasta.\"\n}"
      }
    }
  ],
  "usage": {
    "prompt_tokens": 47,
    "completion_tokens": 21,
    "total_tokens": 68
  },
  "stats": {},
  "system_fingerprint": "llama-3.2-1b-instruct"
}
