# Install necessary libraries

In [1]:
! pip install transformers accelerate bitsandbytes peft torch

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB

# Global Variables

# Global Functions

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# 1. Local Kaggle path of the model weights, version 2 (note the trailing "2").
LOCAL_MODEL_PATH = "/kaggle/input/llama-3.1/transformers/8b-instruct/2"

# 2. Choose the compute dtype for the 4-bit layers.
# Native bfloat16 arithmetic is available from CUDA compute capability 8.0
# (Ampere) onwards. Older GPUs such as the T4 do not have it, so fall back to
# float16 there. Without a CUDA device the original bfloat16 choice is kept.
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8:
    compute_dtype = torch.float16
else:
    compute_dtype = torch.bfloat16

# 3. Define the 4-bit quantization config (crucial for fitting into GPU VRAM).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",  # Normalized Float 4-bit
    bnb_4bit_compute_dtype=compute_dtype,  # dtype used for the de-quantized matmuls
)

In [3]:
# Load the tokenizer from the local checkpoint directory.
# (trust_remote_code=True is sometimes required for custom architectures; it is
# not passed here.)
tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_PATH)

# Fall back to the end-of-sequence token when the tokenizer defines no padding token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the causal LM with the quantization settings from `bnb_config`;
# device_map="auto" delegates weight placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    LOCAL_MODEL_PATH, device_map="auto", quantization_config=bnb_config
)

print("Llama 3.1 8B Model loaded successfully!")

2025-10-28 13:47:57.328550: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761659277.511718      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761659277.560629      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Llama 3.1 8B Model loaded successfully!


In [4]:
# Conversation passed to the chat template: an optional system message that
# sets the assistant's persona, followed by the user question used to test the model.
messages = [
    dict(
        role="system",
        content="You are a concise, factual European news assistant.",
    ),
    dict(
        role="user",
        content="Explain why the Euro zone was created in one short paragraph.",
    ),
]

In [5]:
# Render the conversation through the chat template and tokenize it in one step.
# add_generation_prompt=True appends the final 'assistant' header so the model
# continues with its reply.
# NOTE(review): `input_ids` is not referenced by the later cells visible in this
# notebook (they build `input_dict` instead) — confirm whether this cell is still needed.
input_ids = tokenizer.apply_chat_template(
    messages, return_tensors="pt", add_generation_prompt=True
)
# Move the prompt tokens onto the device the model was loaded on.
input_ids = input_ids.to(model.device)

In [7]:
# Relies on `messages` from the earlier conversation cell.
# 1. Render the chat template to a plain prompt *string*: tokenize=False skips
#    tokenization, and add_generation_prompt=True appends the assistant header.
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False
)

In [8]:
# 2. Tokenize the rendered prompt string into the tensors the model expects
# ('input_ids' and 'attention_mask') and move them to the model's device.
# The string produced by apply_chat_template(tokenize=False) already contains
# the template's special tokens, so add_special_tokens=False stops the
# tokenizer from adding them a second time (e.g. a duplicated BOS token).
input_dict = tokenizer(
    prompt,
    return_tensors="pt",
    add_special_tokens=False,
).to(model.device)

In [9]:
import torch

# Sample a completion for the prepared prompt; no gradients are needed for inference.
with torch.no_grad():
    outputs = model.generate(
        **input_dict,  # input_ids + attention_mask
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        # Pass the padding id explicitly so generate() does not have to guess
        # it (and warn) on every call.
        pad_token_id=tokenizer.pad_token_id,
    )

# Full decoded transcript (prompt + completion), kept under its original name.
response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Extract only the assistant response. For a causal LM the returned sequence
# starts with the prompt tokens, so drop exactly that many tokens instead of
# splitting the decoded text on "assistant\n", which breaks whenever the reply
# itself contains that substring.
prompt_length = input_dict["input_ids"].shape[-1]
completion_ids = outputs[0][prompt_length:]
assistant_response = tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

print("\n--- ASSISTANT RESPONSE ONLY ---")
print(assistant_response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--- ASSISTANT RESPONSE ONLY ---
The Eurozone was created to facilitate economic integration among European countries. It began with the signing of the Maastricht Treaty in 1992, which established the European Monetary Union (EMU). The treaty aimed to create a single currency, the Euro, to promote economic unity, increase trade, and reduce transaction costs among participating countries. The Euro was introduced in 1999 and replaced the national currencies of participating countries in 2002.
