# Introduction to Pydantic for LLM Structured Outputs

Pydantic is a Python library that validates data against Python type annotations.
It's particularly useful for checking that the output of a Large Language Model (LLM) matches the structure your code expects.

In [None]:
import pydantic
from pydantic import BaseModel, Field
from typing import List, Optional, Literal

# Show which Pydantic version is installed.
# NOTE(review): the cells below call `model_dump_json` and `Field(pattern=...)`,
# which look like Pydantic v2 spellings — presumably v2 is required; confirm.
print(f"Using Pydantic version: {pydantic.__version__}\n")

In [2]:
# Define a User model: a minimal schema with two required fields
# (neither field declares a default).
class User(BaseModel):
    id: int  # required integer
    name: str  # required string

In [None]:
# Create a valid user: both arguments already match the declared field types
# (an int for `id`, a str for `name`).
user1 = User(id=1, name="John Doe")
print(user1)

In [4]:
# Create an invalid user: "a" cannot be parsed as an int, so validation fails.
# The error is caught and printed (instead of leaving the example commented
# out) so the failure is visible and the remaining cells still run.
try:
    User(id="a", name="John Doe")
except pydantic.ValidationError as e:
    print(f"Invalid data error: {e}")

## Part 1: Common Data Types in Pydantic

### String Types

In [5]:
# String fields: a required one, an optional one, and one constrained by a regex.
class StringExample(BaseModel):
    name: str  # required
    description: Optional[str] = None  # Optional field with default None
    # Required (`...` means "no default"). The pattern only checks the loose
    # shape "<non-space>@<non-space>.<non-space>"; it is not a full email
    # validator.
    email: str = Field(..., pattern=r"^\S+@\S+\.\S+$")

In [None]:
# Demonstration: one instance that passes validation, then one that fails it.
print("\nString Type Example:")
valid_string = StringExample(
    name="John Doe", email="john@example.com", description="A sample user"
)
print(f"Valid data: {valid_string.model_dump_json(indent=2)}")

# Catch only Pydantic's validation error so that unrelated failures (for
# example a NameError when the model cell has not been run) are not
# mislabeled as "invalid data". The result is not assigned: construction
# raises before any name could be bound.
try:
    StringExample(
        name="Jane Doe",
        email="not-an-email",  # Invalid email format: no "@"
    )
except pydantic.ValidationError as e:
    print(f"Invalid data error: {e}")

### Numeric Types

In [7]:
# Numeric fields, with and without range constraints. All four are required.
class NumericExample(BaseModel):
    count: int
    price: float
    # Constrained numeric types (ge/le are inclusive bounds, gt/lt exclusive):
    probability: float = Field(..., ge=0.0, le=1.0)  # Between 0 and 1, endpoints allowed
    age: int = Field(..., gt=0, lt=120)  # Between 1 and 119

In [None]:
# Demonstration: one instance that passes validation, then one that fails it.
print("\nNumeric Type Example:")
valid_numeric = NumericExample(count=5, price=19.99, probability=0.75, age=30)
print(f"Valid data: {valid_numeric.model_dump_json(indent=2)}")

# Catch only Pydantic's validation error so that unrelated failures are not
# reported as "invalid data". The result is not assigned: construction raises
# before any name could be bound.
try:
    NumericExample(
        count=5,
        price=19.99,
        probability=1.5,  # Outside allowed range (must be <= 1.0)
        age=30,
    )
except pydantic.ValidationError as e:
    print(f"Invalid data error: {e}")

### Boolean Types

In [9]:
# Boolean fields: one required, one with a default.
# NOTE: with Pydantic v2's default (non-strict) validation, `bool` fields also
# accept 0/1 and strings such as "yes"/"no"/"true"/"false".
class BooleanExample(BaseModel):
    is_active: bool  # required
    has_subscription: bool = False  # With default value

In [None]:
# Demonstration: one instance that passes validation, then one that fails it.
print("\nBoolean Type Example:")
valid_boolean = BooleanExample(is_active=True)
print(f"Valid data: {valid_boolean.model_dump_json(indent=2)}")

# The invalid value must be something Pydantic cannot read as a boolean.
# A string like "yes" would NOT fail: in the default (lax) mode it is coerced
# to True, so no error would be raised and nothing would be printed.
# Only Pydantic's validation error is caught so that unrelated failures are
# not reported as "invalid data".
try:
    BooleanExample(is_active="maybe")  # Not interpretable as a boolean
except pydantic.ValidationError as e:
    print(f"Invalid data error: {e}")

### Literal Types

In [11]:
# Literal fields: the value must be exactly one of the listed strings.
class LiteralExample(BaseModel):
    status: Literal["pending", "approved", "rejected"]  # required
    role: Literal["admin", "user", "guest"] = "user"  # With default


In [None]:
# Demonstration: one instance that passes validation, then one that fails it.
print("\nLiteral Type Example:")
valid_literal = LiteralExample(status="approved")
print(f"Valid data: {valid_literal.model_dump_json(indent=2)}")

# Catch only Pydantic's validation error so that unrelated failures are not
# reported as "invalid data". The result is not assigned: construction raises
# before any name could be bound.
try:
    LiteralExample(status="waiting")  # Not in allowed literals
except pydantic.ValidationError as e:
    print(f"Invalid data error: {e}")

### List Types

In [13]:
# List field: a required list in which every item must be a string.
class ListExample(BaseModel):
    tags: List[str]

In [None]:
# Demonstration: one instance that passes validation, then one that fails it.
print("\nList Type Example:")
valid_list = ListExample(
    tags=["python", "data", "validation"]
)
print(f"Valid data: {valid_list.model_dump_json(indent=2)}")

# The invalid case must actually differ from the valid one: here one list
# item is not a string, so validation of `tags` fails.
# Only Pydantic's validation error is caught so that unrelated failures are
# not reported as "invalid data".
try:
    ListExample(
        tags=["python", None],  # None is not a string
    )
except pydantic.ValidationError as e:
    print(f"Invalid data error: {e}")

## Part 2: Example Data Models for LLM Structured Outputs

### Example 1: Customer Sentiment Analysis

In [15]:
# Schema for a sentiment-analysis result. All fields are required.
class SentimentAnalysis(BaseModel):
    text: str
    sentiment: Literal["positive", "negative", "neutral"]  # one of three labels
    # Range (0, 1]: exactly 0.0 is rejected (gt), 1.0 is allowed (le).
    confidence: float = Field(..., gt=0.0, le=1.0)
    # At least two phrases must be supplied.
    key_phrases: List[str] = Field(..., min_length=2)

In [None]:
# Test example: the field values are written as a plain dict and unpacked into
# keyword arguments; every value satisfies the model's constraints.
sentiment_analysis = SentimentAnalysis(
    **{
        "text": "I really enjoyed using this product. It's fantastic!",
        "sentiment": "positive",
        "confidence": 0.92,
        "key_phrases": ["enjoyed", "fantastic"],
    }
)
print(f"\nValid SentimentAnalysis:\n{sentiment_analysis.model_dump_json(indent=2)}")

### Example 2: Product Recommendation

In [17]:
# Schema for a single product. All fields are required.
class Product(BaseModel):
    name: str
    description: str
    price: float = Field(..., gt=0)  # strictly positive
    category: str
    in_stock: bool


# Schema for a recommendation response: the query, a list of nested Product
# models, and two fields describing the recommendation. All fields are required.
class ProductRecommendation(BaseModel):
    user_query: str
    recommended_products: List[Product]  # each item is validated as a Product
    reasoning: str
    personalization_level: float = Field(..., ge=0.0, le=1.0)  # 0 to 1, endpoints allowed

In [None]:
# Example recommendation with two products. Both share the same category, so
# only the fields that differ are listed per product and the category is
# supplied once while constructing each Product.
product_recommendation = ProductRecommendation(
    user_query="I need a laptop for gaming under $2000",
    recommended_products=[
        Product(category="Electronics", **details)
        for details in (
            {
                "name": "GamerBook Pro",
                "description": "High-performance gaming laptop with RGB keyboard",
                "price": 1899.99,
                "in_stock": True,
            },
            {
                "name": "PowerGamer X",
                "description": "Affordable gaming laptop with dedicated GPU",
                "price": 1499.99,
                "in_stock": False,
            },
        )
    ],
    reasoning="Selected based on gaming requirements and budget constraints",
    personalization_level=0.85,
)
print(
    f"\nValid ProductRecommendation:\n{product_recommendation.model_dump_json(indent=2)}"
)

### Example 3: Content Generation with Metadata

In [19]:
# Schema for a generated piece of content plus metadata about it.
# Every field except `contains_code_snippets` is required.
class ContentType(BaseModel):
    title: str
    content: str
    word_count: int = Field(..., gt=0)  # strictly positive
    tags: List[str] = Field(..., min_length=1, max_length=10)  # 1 to 10 tags
    target_audience: Literal["general", "technical", "business", "academic"]
    reading_time_minutes: int = Field(..., gt=0)  # strictly positive
    seo_score: float = Field(..., ge=0.0, le=1.0)  # 0 to 1, endpoints allowed
    contains_code_snippets: bool = False  # defaults to False when omitted

In [None]:
# Example content item. The keyword arguments are grouped by theme; their
# order here does not affect the printed JSON, which follows the field order
# declared on the model.
content = ContentType(
    # The article itself
    title="Getting Started with Pydantic",
    content="Pydantic is a powerful library for data validation...",
    contains_code_snippets=True,
    # Classification
    target_audience="technical",
    tags=["python", "data-validation", "pydantic", "tutorial"],
    # Numeric metadata
    word_count=1200,
    reading_time_minutes=6,
    seo_score=0.87,
)
print(f"\nValid ContentType:\n{content.model_dump_json(indent=2)}")