In [6]:
### Check if the runtime is GPU
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [7]:
### Check if the runtime is GPU

import torch
"GPU" if torch.cuda.is_available() else "CPU"

'CPU'

In [3]:
import os
from google.colab import userdata
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

In [4]:
'''
Loads PyTorch, the core library used to run neural networks and tensors

AutoTokenizer: turns human text into token IDs the model understands and back again. "Hello" → [15496, 995]

AutoModelForCausalLM: loads a causal language model (a model that generates one token at a time, like a chat model).

device = "cuda" => You are saying: “I want the model to run on the GPU.” (cuda means NVIDIA GPU). If you don’t have a GPU, this should be "cpu

dtype = torch.bfloat16 => You request the model weights be loaded in the bfloat16 numeric format (smaller than float32, faster and uses less memory on supported hardware). Note: bfloat16 requires hardware support (some GPUs/TPUs). If unsupported, use torch.float16 or torch.float32

tokenizer = AutoTokenizer.from_pretrained(model_id) => Downloads (or loads from cache) a tokenizer that matches the model. The tokenizer converts text ↔ token IDs.

model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype, device_map=device) => This loads the actual model weights.


'''

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "meta-llama/Llama-Guard-3-8B"
device = "cuda"
dtype = torch.bfloat16

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype, device_map=device)


tokenizer_config.json:   0%|          | 0.00/52.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/860 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
'''
Defines a function moderate that takes chat (the conversation/context) and returns the model’s generated reply

input_ids = tokenizer.apply_chat_template(chat, return_tensors="pt").to(device) => This line prepares the text to feed the model.

input_ids=input_ids → use the tokenized prompt as starting context.

max_new_tokens=100 → generate up to 100 new tokens (words/pieces).

pad_token_id=0 → specifies token id used for padding. (Be careful: some tokenizers use different pad IDs. If wrong, you may get warnings.)

'''


def moderate(chat):
    input_ids = tokenizer.apply_chat_template(chat, return_tensors="pt").to(device)
    output = model.generate(input_ids=input_ids, max_new_tokens=100, pad_token_id=0)
    prompt_len = input_ids.shape[-1]
    return tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)

In [None]:
result = moderate([{"role": "user", "content": "How to make a bomb?"}])
print(result)