# HuggingFace Inference Basics

This notebook introduces the fundamentals of using HuggingFace Transformers for model inference.

## Learning Objectives
- Load pre-trained models from HuggingFace Hub
- Perform text generation with different models
- Understand tokenization and model outputs
- Explore different model architectures


In [None]:
# Import required libraries
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import warnings
# Silence Python warnings so the tutorial output stays readable
warnings.filterwarnings('ignore')

# Report the PyTorch build and whether a CUDA device can be used
print(
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)
if torch.cuda.is_available():
    # Only query the device name when CUDA is actually usable
    print(f"CUDA device: {torch.cuda.get_device_name()}")


## 1. Simple Text Generation with Pipeline

The easiest way to get started with HuggingFace is the `pipeline` API, which bundles the tokenizer, the model, and the post-processing behind a single call.

In [None]:
# Create a text generation pipeline.
# Generation settings are given at call time only, so there is a single source
# of truth for them (a max_length set here would be overridden by the call below).
generator = pipeline('text-generation', model='gpt2')

# Seed PyTorch's RNG so the sampled continuations are the same on every run
torch.manual_seed(42)

# Generate text
prompt = "The future of artificial intelligence is"
result = generator(
    prompt,
    max_length=50,           # total length in tokens, prompt included
    num_return_sequences=2,
    do_sample=True,          # returning several distinct sequences requires sampling
)

# Each item in `result` is a dict whose 'generated_text' holds prompt + continuation
for i, generation in enumerate(result):
    print(f"Generation {i+1}: {generation['generated_text']}")
    print("-" * 50)


## 2. Manual Model Loading and Tokenization

For more control, we can load models and tokenizers manually.

In [None]:
# Checkpoint on the HuggingFace Hub used for the manual-loading examples
model_name = "microsoft/DialoGPT-medium"

# Load the tokenizer and the causal language model from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Summarise what was loaded
print(
    f"Model loaded: {model_name}",
    f"Vocabulary size: {tokenizer.vocab_size}",
    f"Model parameters: {model.num_parameters():,}",
    sep="\n",
)


In [None]:
# Tokenization round trip: text -> token IDs -> text, plus the token strings
text = "Hello, how are you today?"

# Encode to a PyTorch tensor of token IDs
encoded = tokenizer.encode(text, return_tensors='pt')

# Decode the first (only) row of IDs back into a string
decoded = tokenizer.decode(encoded[0])

# Split the text into token strings without converting them to IDs
tokens = tokenizer.tokenize(text)

print(
    f"Original text: {text}",
    f"Encoded tokens: {encoded}",
    f"Token IDs: {encoded[0].tolist()}",
    f"Decoded text: {decoded}",
    f"Individual tokens: {tokens}",
    sep="\n",
)


## 3. Interactive Chat Example

Let's create a simple chatbot using DialoGPT.

In [None]:
def chat_with_model(model, tokenizer, user_input, chat_history_ids=None,
                    max_length=1000, min_reply_tokens=64):
    """Generate one chatbot reply and return it with the updated history.

    Args:
        model: Causal language model exposing ``generate`` (e.g. DialoGPT).
        tokenizer: Tokenizer paired with ``model``; must provide ``encode``,
            ``decode``, ``eos_token`` and ``eos_token_id``.
        user_input: The user's message for this turn.
        chat_history_ids: 2-D tensor of token IDs for the conversation so far,
            as returned by a previous call, or ``None`` to start a new chat.
        max_length: Upper bound on prompt + reply length, in tokens.
        min_reply_tokens: Number of tokens always kept free for the reply.
            The oldest history tokens are dropped to honour this budget.

    Returns:
        A ``(response, chat_history_ids)`` tuple: the decoded reply text and
        the token IDs of the (possibly trimmed) prompt followed by the reply.

    Raises:
        ValueError: If ``min_reply_tokens`` is not in ``(0, max_length)``.
    """
    if not 0 < min_reply_tokens < max_length:
        raise ValueError("min_reply_tokens must be greater than 0 and less than max_length")

    # Encode user input; the EOS token marks the end of the user's turn
    new_user_input_ids = tokenizer.encode(
        user_input + tokenizer.eos_token,
        return_tensors='pt'
    )

    # Append to chat history
    if chat_history_ids is not None:
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
    else:
        bot_input_ids = new_user_input_ids

    # max_length counts the prompt as well, so an ever-growing history would
    # eventually leave no room to generate. Keep only the most recent tokens.
    max_prompt_length = max_length - min_reply_tokens
    if bot_input_ids.shape[-1] > max_prompt_length:
        bot_input_ids = bot_input_ids[:, -max_prompt_length:]
    prompt_length = bot_input_ids.shape[-1]

    # Every prompt token is real input (there is no padding), so attend to all
    # of them. Passing the mask explicitly avoids it being guessed from the pad
    # token, which is the same ID as the EOS tokens that separate the turns.
    attention_mask = torch.ones_like(bot_input_ids)

    # Generate response
    with torch.no_grad():
        chat_history_ids = model.generate(
            bot_input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_beams=5,
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the tokens that were generated after the prompt
    response = tokenizer.decode(
        chat_history_ids[0, prompt_length:],
        skip_special_tokens=True
    )

    return response, chat_history_ids

# Scripted conversation: the history returned by each turn is fed into the next
chat_history = None

user_inputs = [
    "Hello! How are you?",
    "What's your favorite programming language?",
    "Tell me a joke"
]

for message in user_inputs:
    # Show the user's line before the (slower) generation step runs
    print(f"User: {message}")
    reply, chat_history = chat_with_model(model, tokenizer, message, chat_history)
    print(f"Bot: {reply}", "-" * 50, sep="\n")


## 4. Exploring Different Models

Let's try different types of models for various tasks.

In [None]:
# Text classification
# Name the checkpoint explicitly instead of relying on the task's implicit
# default, so the notebook keeps using the same model across library releases.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

texts = [
    "I love this new AI technology!",
    "This is terrible and frustrating.",
    "The weather is okay today."
]

print("Sentiment Analysis Results:")
# Passing the whole list runs the pipeline once and yields one prediction per
# text, in the same order as the inputs.
for text, prediction in zip(texts, classifier(texts)):
    print(f"Text: {text}")
    print(f"Sentiment: {prediction['label']} (confidence: {prediction['score']:.3f})")
    print()


In [None]:
# Question answering
# Name the checkpoint explicitly instead of relying on the task's implicit
# default, so the notebook keeps using the same model across library releases.
qa_pipeline = pipeline(
    "question-answering",
    model="distilbert/distilbert-base-cased-distilled-squad",
)

# Passage the answers are extracted from
context = """
The Transformer architecture was introduced in the paper "Attention Is All You Need" 
by Vaswani et al. in 2017. It revolutionized natural language processing by using 
self-attention mechanisms instead of recurrent or convolutional layers.
"""

questions = [
    "When was the Transformer architecture introduced?",
    "What did the Transformer architecture use instead of recurrent layers?",
    "Who introduced the Transformer architecture?"
]

print("Question Answering Results:")
for question in questions:
    # The result is a dict; 'answer' is the extracted text and 'score' its confidence
    result = qa_pipeline(question=question, context=context)
    print(f"Q: {question}")
    print(f"A: {result['answer']} (confidence: {result['score']:.3f})")
    print()


## 5. Model Information and Exploration

Let's explore model architectures and configurations.

In [None]:
# Explore model configuration
# Read the library-wide attribute names (hidden_size, num_hidden_layers, ...)
# rather than GPT-2's native ones (n_embd, n_layer, ...). GPT-2 style configs
# expose both, while most other architectures only expose the former, so this
# cell keeps working when a different model is loaded into `model`.
print("Model Configuration:")
print(f"Model type: {model.config.model_type}")
print(f"Hidden size: {model.config.hidden_size}")
print(f"Number of layers: {model.config.num_hidden_layers}")
print(f"Number of attention heads: {model.config.num_attention_heads}")
print(f"Vocabulary size: {model.config.vocab_size}")
print(f"Maximum position embeddings: {model.config.max_position_embeddings}")

# Model size calculation: count every parameter, then only those that receive gradients
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
# 4 bytes per parameter, i.e. the estimate assumes float32 weights
print(f"Model size: ~{total_params * 4 / 1024**2:.1f} MB (float32)")


## 6. Generation Parameters

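Compare decoding strategies (greedy search, beam search, and sampling with top-k, top-p, and temperature) on the same prompt to see how each one changes the generated text.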

In [None]:
def compare_generation_strategies(model, tokenizer, prompt, max_new_tokens=30):
    """Print continuations of ``prompt`` produced by several decoding strategies.

    Args:
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer paired with ``model``; must provide ``encode``,
            ``decode`` and ``eos_token_id``.
        prompt: Text the model should continue.
        max_new_tokens: Maximum number of tokens generated after the prompt.

    Returns:
        None. The prompt and one line per strategy are written to stdout.
    """
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    prompt_length = input_ids.shape[1]

    # A single unpadded prompt: every position is real input
    attention_mask = torch.ones_like(input_ids)

    # Extra keyword arguments for generate(), keyed by the label that is printed
    strategies = {
        'Greedy': {'do_sample': False},
        'Beam Search': {'num_beams': 5, 'do_sample': False},
        'Top-k Sampling': {'do_sample': True, 'top_k': 50},
        'Top-p Sampling': {'do_sample': True, 'top_p': 0.9},
        'Temperature': {'do_sample': True, 'temperature': 0.7}
    }

    print(f"Prompt: {prompt}")
    print("=" * 60)

    for strategy_name, params in strategies.items():
        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_length=prompt_length + max_new_tokens,
                pad_token_id=tokenizer.eos_token_id,
                **params
            )

        # Decode only the tokens after the prompt. Slicing the decoded string by
        # len(prompt) is unreliable because decoding does not always reproduce
        # the prompt character-for-character.
        new_text = tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True
        ).strip()

        print(f"{strategy_name}: {new_text}")
        print()

# Test with a creative prompt.
# Seed PyTorch's RNG first so the sampling-based strategies print the same
# text on every Restart & Run All.
torch.manual_seed(42)
creative_prompt = "Once upon a time in a magical forest"
compare_generation_strategies(model, tokenizer, creative_prompt)


## Next Steps

Congratulations! You've learned the basics of HuggingFace inference. Next, explore:

- **02_trae_intro_and_setup.ipynb**: Introduction to Trae AI integration
- **03_fine_tuning_with_trae.ipynb**: Fine-tuning models with Trae
- **04_rag_with_trae.ipynb**: Building RAG systems

## Exercises

Try the following exercises to reinforce your learning:

1. Load a different model (e.g., 'distilgpt2', 'microsoft/DialoGPT-small')
2. Experiment with different generation parameters
3. Create a simple text summarization pipeline
4. Build a multi-turn conversation system
