<a href="https://colab.research.google.com/github/ZhuxinJiang/Gen-ai-course/blob/main/lab2_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Key upgrades (still easy for students to understand):

* Clearer SYSTEM prompt + rules (better role anchoring)
* Less random decoding (more stable replies)
* Stop generation when it tries to start a new User: turn (prevents runaway fake dialogue)
* Cleans role leakage (removes stray Assistant: / User: tags)
* Pads correctly (removes that pad_token_id warning)
* Keeps context bounded (avoids slowing down over long chats)



In [None]:
# Install the libraries this notebook uses: transformers, accelerate, huggingface_hub, and bitsandbytes (for 4-bit quantization).
# The ">=" specifiers set minimum versions only; bitsandbytes is left unpinned. "-q" keeps pip's output quiet.
!pip -q install "transformers>=4.45.0" "accelerate>=0.33.0" "huggingface_hub>=0.24.0" bitsandbytes

In [None]:
from google.colab import userdata
from huggingface_hub import login

# Read the Hugging Face token from Colab Secrets so that it is never hardcoded in the notebook.
hf_token = userdata.get("HF_TOKEN")

# Fail fast with an actionable message when the secret is missing or empty.
# The user needs to add 'HF_TOKEN' to Colab Secrets (the key icon on the left panel).
if not hf_token:
    raise ValueError("HF_TOKEN not found. Add it in Colab Secrets (🔑) and rerun.")

# Authenticate this session with the Hugging Face Hub using the retrieved token.
login(token=hf_token)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Model identifier for the Llama 3.2 3B *base* checkpoint (the ID has no "-Instruct" suffix).
# The same ID is used below to fetch both the tokenizer and the pre-trained weights.
model_id = "meta-llama/Llama-3.2-3B"

# Configure BitsAndBytes for 4-bit quantization.
# Loading the weights in 4-bit precision reduces memory usage.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Enable 4-bit quantization.
    bnb_4bit_compute_dtype=torch.float16, # Run computations in float16.
)

# Load the tokenizer for the specified model.
# The tokenizer converts text into the numerical token IDs the model consumes.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

# Load the pre-trained causal language model.
# device_map="auto" lets the library decide device placement (e.g., GPU when available).
# quantization_config applies the BitsAndBytes configuration defined above.
# torch_dtype=torch.float16 requests half precision for loading.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, StoppingCriteria, StoppingCriteriaList

# --- Chatbot loop (Base model) ---
# Note: Base models are trained to continue text, not follow instructions perfectly.
# We can still make the chat feel more natural by:
# 1) Using a clearer conversation template (SYSTEM / User / Assistant)
# 2) Reducing randomness slightly
# 3) Stopping generation when the model tries to start the next "User:" turn
# 4) Cleaning up any role tags that leak into the output

# Make sure a pad token is defined: when the tokenizer has none, reuse the EOS token.
# The resulting pad_token_id is passed to generate() later, which avoids the pad_token_id warning.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

# System prompt that opens every transcript: it sets the assistant's persona and rules.
# Each line ends with "\n" so the first "User:" turn starts on its own line.
SYSTEM_PROMPT = (
    "SYSTEM: You are a helpful teaching assistant for a university GenAI applications course.\n"
    "SYSTEM RULES:\n"
    "- Be friendly and natural.\n"
    "- Keep answers concise (about 3–6 sentences) unless the user asks for more.\n"
    "- If you are unsure, say so and ask a clarifying question.\n"
    "- Do not invent sources or citations.\n"
)

# Strings that, if generated, indicate the model is trying to start a new speaker turn.
STOP_STRINGS = ["\nUser:", "\nSYSTEM:", "\nAssistant:"]

# Tokenize a string without adding special tokens and return only its token IDs.
def _ids(s: str):
    encoding = tokenizer(s, add_special_tokens=False)
    return encoding["input_ids"]

# Token-ID form of each stop string; filter(None, ...) drops any that encode to an empty list.
STOP_IDS_LIST = list(filter(None, map(_ids, STOP_STRINGS)))

# Custom StoppingCriteria that ends generation once a sequence's tail matches a stop-token pattern.
class StopOnSubsequence(StoppingCriteria):
    """Stop generation when a sequence ends with one of the given token-ID patterns.

    Args:
        stop_ids_list: Iterable of token-ID sequences (e.g. lists of ints).
            Empty patterns are ignored.

    Calling the instance returns a bool tensor with one entry per batch row,
    True where that row's most recent tokens equal any stop pattern. Each row
    is checked independently, so batched generation is handled correctly.
    """

    def __init__(self, stop_ids_list):
        # Store every pattern as a 1-D integer tensor so the tail comparison is vectorised.
        self.stop_ids_list = [torch.as_tensor(x, dtype=torch.long) for x in stop_ids_list]

    def __call__(self, input_ids, scores, **kwargs):
        device = input_ids.device
        # Move the patterns to the generation device once instead of on every decoding step.
        if any(t.device != device for t in self.stop_ids_list):
            self.stop_ids_list = [t.to(device) for t in self.stop_ids_list]

        batch_size, seq_len = input_ids.shape[0], input_ids.shape[1]
        done = torch.zeros(batch_size, dtype=torch.bool, device=device)
        for stop_ids in self.stop_ids_list:
            n = stop_ids.numel()
            # Skip empty patterns and patterns longer than what has been produced so far.
            if n == 0 or seq_len < n:
                continue
            # Compare the last n tokens of every row against the pattern.
            done |= (input_ids[:, -n:] == stop_ids).all(dim=1)
        return done

# Build the criteria container passed to generate() and register our custom criterion in it.
stopping = StoppingCriteriaList()
stopping.append(StopOnSubsequence(STOP_IDS_LIST))

# Trim an over-long transcript without losing the system prompt at its start.
def _trim_transcript(transcript: str, system_prompt: str, max_chars: int) -> str:
    if len(transcript) <= max_chars:
        return transcript
    # Split off the system prompt (when present) so that only the dialogue history is cut.
    if system_prompt and transcript.startswith(system_prompt):
        head, history = system_prompt, transcript[len(system_prompt):]
    else:
        head, history = "", transcript
    budget = max_chars - len(head)
    history = history[-budget:] if budget > 0 else ""
    # Resume at a turn boundary so the kept history does not begin mid-sentence.
    boundary = history.find("\nUser:")
    if boundary > 0:
        history = history[boundary:]
    return head + history


# Remove leaked role tags and any fabricated follow-up turns from a generated reply.
def _clean_reply(text: str) -> str:
    for tag in ("\nUser:", "\nSYSTEM:", "\nAssistant:"):
        text = text.split(tag)[0]
    return text.replace("Assistant:", "").strip()


# Function to handle a single turn of the chat.
def base_chat_once(user_text: str, transcript: str):
    """Generate one assistant reply and return it with the updated transcript.

    Args:
        user_text: The user's message for this turn.
        transcript: The conversation so far (system prompt plus earlier turns).

    Returns:
        A tuple (assistant_text, transcript): the cleaned reply and the
        transcript extended with this turn, ready for the next call.
    """
    # Keep context bounded so the prompt does not grow without limit over long chats.
    # The system prompt is kept at the front; only older dialogue turns are dropped.
    MAX_CHARS = 6000
    transcript = _trim_transcript(transcript, SYSTEM_PROMPT, MAX_CHARS)

    # Build the prompt: previous conversation, the new user turn, and an open Assistant turn.
    prompt = transcript + f"\nUser: {user_text}\nAssistant:"
    # Tokenize the prompt and move it to the model's device (e.g., GPU).
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate a response without tracking gradients (inference only).
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=180,  # Maximum number of new tokens to generate.
            do_sample=True,      # Enable sampling for more varied responses.
            temperature=0.4,     # Controls randomness (lower means less random).
            top_p=0.9,           # Filters tokens by cumulative probability.
            repetition_penalty=1.08, # Penalizes repeated tokens.
            pad_token_id=tokenizer.pad_token_id, # Specify pad token ID.
            eos_token_id=tokenizer.eos_token_id, # Specify end-of-sequence token ID.
            stopping_criteria=stopping,  # Token-ID based stop on a new speaker turn.
            # String-level stop as well: a stop string may tokenize differently in
            # context than it does on its own, so the token-ID match can miss it.
            stop_strings=STOP_STRINGS,
            tokenizer=tokenizer,  # Passed alongside stop_strings so generate() can match on decoded text.
        )

    # Decode only the newly generated tokens to get the assistant's response.
    new_tokens = output_ids[0][inputs["input_ids"].shape[-1]:]
    assistant_text = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # Clean up "role leakage": cut at the first fabricated turn and drop stray role tags.
    assistant_text = _clean_reply(assistant_text)

    # Update the transcript with the latest turn for the next iteration.
    transcript = prompt + " " + assistant_text
    return assistant_text, transcript


# Initialize conversation transcript with the system prompt.
transcript = SYSTEM_PROMPT
print("IN6241 Lab 2 (HF Base) — type 'exit' to stop.\n")

# Main chat loop: read a message, generate a reply, repeat until the user exits.
while True:
    try:
        user_text = input("You: ")  # Get user input.
    except (EOFError, KeyboardInterrupt):
        # Interrupting the cell or closing the input ends the chat without a traceback.
        print("\nChat ended.")
        break

    command = user_text.strip().lower()
    if command in ("exit", "quit"):  # Check for exit commands.
        print("Chat ended.")
        break
    if not command:
        # Ignore blank input rather than sending an empty user turn to the model.
        continue

    # Get assistant's response and updated transcript.
    assistant_text, transcript = base_chat_once(user_text, transcript)
    print("\nAssistant:", assistant_text, "\n")  # Print assistant's response.