In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from dotenv import load_dotenv
import os

load_dotenv()
assert os.getenv("HF_ACCESS_TOKEN") is not None, "Please set the HF_TOKEN environment variable."

Load a CausalLM tokenizer and model. Be sure to set the model to your device and set it to eval

In [None]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
model = model.to("mps")
model.eval()

In [None]:

inputs = tokenizer("Tell me about yourself!\n\n", return_tensors="pt").to("mps")
with torch.no_grad():
    outputs = model.generate(**inputs, num_beams=2, max_new_tokens=1024)

decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_output)

Try to use a chat interface

In [None]:
messages = [{"role": "user", "content": "Explain LLMs to me"}]
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("mps")
with torch.no_grad():
    outputs = model.generate(**inputs, num_beams=4, max_new_tokens=100)
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_output)

In [10]:
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# First instantiate the tokenizer and model
model_id = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
).eval()

print(model.device)

# Set the pad token on the model
print(f"Model pad_token_id: {model.config.pad_token_id}")
if tokenizer.pad_token is None:
    print("Setting pad_token to eos_token...")
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id
print(f"Model pad_token_id: {model.config.pad_token_id}")

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512, do_sample=False)
llm_pipeline = HuggingFacePipeline(pipeline=pipe)
chat_model = ChatHuggingFace(llm=llm_pipeline)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use mps


mps:0
Model pad_token_id: None
Setting pad_token to eos_token...
Model pad_token_id: 128009


In [11]:
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content="Explain LLMs to me"),
]
response = chat_model.invoke(messages)



In [16]:
response.content

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 17 Apr 2025\n\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nExplain LLMs to me<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'd be happy to explain Large Language Models (LLMs) to you.\n\n**What are Large Language Models (LLMs)?**\n\nLarge Language Models (LLMs) are a type of artificial intelligence (AI) model that are designed to process and understand human language. They are a subset of a broader category of machine learning models known as neural networks.\n\n**How do LLMs work?**\n\nLLMs are trained on vast amounts of text data, which allows them to learn patterns and relationships within language. This training data can come from various sources, such as books, articles, conversations, and even social media posts. The model's architecture is typically based on a transformer encoder-decoder structure, which is inspired

In [None]:
# from transformers import BitsAndBytesConfig

# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype="float16",
#     bnb_4bit_use_double_quant=True,
# )

# llm = HuggingFacePipeline.from_model_id(
#     model_id="HuggingFaceH4/zephyr-7b-beta",
#     task="text-generation",
#     pipeline_kwargs=dict(
#         max_new_tokens=512,
#         do_sample=False,
#         repetition_penalty=1.03,
#         return_full_text=False,
#     ),
#     model_kwargs={"quantization_config": quantization_config},
# )

# chat_model = ChatHuggingFace(llm=llm)