In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint used for this quick seq2seq sanity check.
model_id = "google/flan-t5-small"

# Keep everything on the CPU.
device = torch.device("cpu")

# Load the tokenizer and the seq2seq model, then place the model on the chosen device.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
model = model.to(device)

# Encode the question as PyTorch tensors living on the same device as the model.
input_text = "What is the capital of India?"
inputs = tokenizer(input_text, return_tensors="pt")
inputs = inputs.to(device)

# Generate with the model's default generation settings and print the
# decoded text of the first (only) returned sequence.
outputs = model.generate(**inputs)
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(answer)


sarajevo


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Use Meta's LLaMA 2 base model
model_id = "meta-llama/Llama-2-7b-hf"  # ✅ Official base model

# Force CPU for compatibility
device = torch.device("cpu")

# Load tokenizer and model.
# `trust_remote_code` is deliberately NOT passed: the Llama architecture is
# implemented inside transformers itself, so there is no custom Hub code to
# run, and enabling the flag would only allow arbitrary repository code to
# execute on this machine.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # Full precision: roughly 4 bytes of RAM per parameter, i.e. on the order
    # of 28 GB for a 7B-parameter checkpoint. Make sure the host has that much.
    torch_dtype=torch.float32,
    # The empty-string key maps the whole model onto a single device (the CPU).
    device_map={"": "cpu"},
)

# Prompt
prompt = "Explain black holes like I'm 10 years old."

# Tokenize and run inference (at most 150 newly generated tokens)
inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=150)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("\n🧠 LLaMA 2 Output:\n", response)


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# Prepare the session: choose the Llama 2 chat checkpoint and load its tokenizer.
# The tokenizer converts text prompts into the token ids the model consumes.
# Only the tokenizer is loaded in this cell; the model weights are loaded when
# the text-generation pipeline is built.
from transformers import AutoTokenizer
import transformers
import torch

# NOTE: despite its name, `model` holds the checkpoint id string, not a model
# object. The pipeline cell passes it on as `model=model`.
model = "meta-llama/Llama-2-7b-chat-hf" # meta-llama/Llama-2-7b-hf

# `token=True` uses the Hugging Face token saved by a previous login. It is the
# replacement for the deprecated `use_auth_token=True` argument.
tokenizer = AutoTokenizer.from_pretrained(model, token=True)

In [None]:
# Build a text-generation pipeline around the checkpoint chosen earlier.
# The pipeline accepts raw prompts and returns generated text, so there is no
# need to tokenize, generate and decode by hand.
# Note: This cell takes 2-3 minutes to run

from transformers import pipeline

# Loading options: half-precision weights and automatic device placement.
pipeline_options = {
    "model": model,
    "torch_dtype": torch.float16,
    "device_map": "auto",
}

# "text-generation" selects the LLM task.
llama_pipeline = pipeline("text-generation", **pipeline_options)

In [None]:
# With everything set up, let's see how Llama responds to some sample queries.

def get_llama_response(prompt: str, max_length: int = 256, top_k: int = 10) -> None:
    """
    Generate a response from the Llama model and print it.

    The end-of-sequence id is taken from the pipeline's own tokenizer rather
    than from the notebook-level `tokenizer` variable. That global is rebound
    by several cells (including one that loads a different model family), so
    relying on it makes the stop token depend on cell execution order.

    Parameters:
        prompt (str): The user's input/question for the model.
        max_length (int): Upper bound passed to the pipeline as `max_length`.
            Defaults to 256.
        top_k (int): Number of highest-probability tokens sampled from at each
            step. Defaults to 10.

    Returns:
        None: Prints the model's response prefixed with "Chatbot:".
    """
    sequences = llama_pipeline(
        prompt,
        do_sample=True,
        top_k=top_k,
        num_return_sequences=1,
        eos_token_id=llama_pipeline.tokenizer.eos_token_id,
        max_length=max_length,
    )
    # Exactly one sequence is requested, so print the first result's text.
    print("Chatbot:", sequences[0]['generated_text'])



# Sample query: ask for recommendations based on two shows the user liked.
prompt = (
    'I liked "Breaking Bad" and "Band of Brothers". '
    'Do you have any recommendations of other shows I might like?\n'
)
get_llama_response(prompt)

In [None]:
# ✅ Step C: Fine-Tuning Meta LLaMA 2–7B Chat using LoRA (Colab Ready)

# ---------------------------------------------
# 🧰 STEP 1: Install Required Packages (Colab)
# ---------------------------------------------
!pip install -q transformers datasets peft accelerate bitsandbytes

# ---------------------------------------------
# ✅ STEP 2: Hugging Face Login
# ---------------------------------------------
from huggingface_hub import notebook_login
notebook_login()

# ---------------------------------------------
# ✅ STEP 3: Load LLaMA 2–7B Chat Model in 4-bit
# ---------------------------------------------
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

model_id = "meta-llama/Llama-2-7b-chat-hf"

# 4-bit NF4 quantization with double quantization; matrix multiplications are
# computed in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
# Reuse the end-of-sequence token for padding so padded batches can be built.
tokenizer.pad_token = tokenizer.eos_token

# `trust_remote_code` is deliberately NOT passed: the Llama architecture is
# implemented inside transformers itself, so enabling the flag would only
# allow arbitrary repository code to execute locally.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)
# The key/value generation cache is of no use while training, so turn it off.
model.config.use_cache = False

# ---------------------------------------------
# ✅ STEP 4: Apply LoRA using PEFT
# ---------------------------------------------
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# Pre-process the quantized base model for training before attaching the
# adapter. This is the preparation step the PEFT quantization guide calls for
# when fine-tuning a model loaded in 4-bit/8-bit.
model = prepare_model_for_kbit_training(model)

# Rank-8 LoRA adapters on the attention query and value projections only;
# bias terms are left untouched.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "v_proj"]
)

model = get_peft_model(model, peft_config)
# Report how many parameters are trainable versus the total.
model.print_trainable_parameters()

# ---------------------------------------------
# ✅ STEP 5: Load a Toy Instruction Dataset (Alpaca-style)
# ---------------------------------------------
from datasets import Dataset

# Column names shared by every sample, in the order used by the rows below.
_SAMPLE_FIELDS = ("instruction", "input", "output")

# One tuple per sample: (instruction, input, output). "input" may be empty.
_SAMPLE_ROWS = [
    ("Translate to French:", "Hello, how are you?", "Bonjour, comment ca va?"),
    ("Summarize:", "LLaMA is an open-source large language model by Meta.", "LLaMA is Meta's open LLM."),
    ("What is the capital of France?", "", "Paris."),
    ("Explain gravity:", "To a 5-year-old", "Gravity is what pulls things down."),
]

# Expand the rows into the list-of-dicts layout expected by Dataset.from_list.
samples = [dict(zip(_SAMPLE_FIELDS, row)) for row in _SAMPLE_ROWS]

train_dataset = Dataset.from_list(samples)

# ---------------------------------------------
# ✅ STEP 6: Preprocess Data
# ---------------------------------------------
def format_prompt(example):
    """
    Turn one instruction sample into fixed-length training features.

    The sample is rendered with the Llama 2 chat template
    (`<s>[INST] ... [/INST] answer </s>`), tokenized, and padded/truncated to
    256 tokens.

    Parameters:
        example (dict): A row with the string fields "instruction", "input"
            (may be empty) and "output".

    Returns:
        dict: "input_ids", "attention_mask" and "labels", each a list of 256
        ints. "labels" equals "input_ids" on real tokens and is -100 on padding
        so that padded positions are ignored by the loss.
    """
    # Leave out the optional input when it is empty so the prompt does not
    # contain a doubled space.
    user_turn = " ".join(
        part for part in (example["instruction"], example["input"]) if part
    )
    text = f"<s>[INST] {user_turn} [/INST] {example['output']} </s>"
    tokenized = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=256,
        # The template already spells out <s> and </s>; letting the tokenizer
        # add its own special tokens as well would duplicate the BOS token.
        add_special_tokens=False,
    )
    input_ids = list(tokenized["input_ids"])
    attention_mask = list(tokenized["attention_mask"])
    # The pad token is the EOS token, so padding cannot be recognised by id;
    # use the attention mask to find padded positions instead.
    labels = [
        token_id if is_real else -100
        for token_id, is_real in zip(input_ids, attention_mask)
    ]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

# Run format_prompt over every sample; the fields it returns are stored on each row.
train_dataset = train_dataset.map(function=format_prompt)

# ---------------------------------------------
# ✅ STEP 7: Fine-tune the Model
# ---------------------------------------------
from transformers import TrainingArguments, Trainer

# No evaluation strategy is passed: the default is already "no", and the old
# `evaluation_strategy` keyword was renamed to `eval_strategy`, so spelling it
# out would tie this cell to one side of that rename.
training_args = TrainingArguments(
    output_dir="lora-llama2-demo",
    per_device_train_batch_size=1,
    num_train_epochs=3,
    logging_steps=1,               # ✅ Log after every step
    save_strategy="no",            # Don't save checkpoints during training
    logging_dir="./logs",          # ✅ Where logs go
    report_to="tensorboard",       # ✅ Log to TensorBoard only; use "none" to disable all integrations (e.g. wandb)
    logging_first_step=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

trainer.train()

# ---------------------------------------------
# ✅ STEP 8: Save the LoRA Adapter
# ---------------------------------------------
# Write the model via save_pretrained and the tokenizer files side by side in
# one directory so they can be reloaded together.
adapter_dir = "./lora_adapter"
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)
