In [None]:
%pip install -U instructor
# Source repository: https://github.com/jxnl/instructor
# Example gallery:   https://python.useinstructor.com/examples/


In [None]:
import instructor
from pydantic import BaseModel
from openai import OpenAI


# Define your desired output structure.
# This class is passed as `response_model` to the completion call below, and the
# returned object is read through these two fields (`name`, `age`).
class UserInfo(BaseModel):
    name: str
    age: int


# Wrap the OpenAI client so that `create` accepts a `response_model`
client = instructor.from_openai(OpenAI())

# Turn one plain-English sentence into a UserInfo instance
user_info = client.chat.completions.create(
    messages=[dict(role="user", content="John Doe is 30 years old.")],
    response_model=UserInfo,
    model="gpt-3.5-turbo",
)

# One field per line
print(user_info.name, user_info.age, sep="\n")
#> John Doe
#> 30

In [None]:
import instructor
from anthropic import Anthropic
from pydantic import BaseModel


# Extraction target for the Anthropic example: passed as `response_model` to
# `client.messages.create`, then checked field by field by the asserts that follow.
class User(BaseModel):
    name: str
    age: int


# Instructor-wrapped Anthropic client
client = instructor.from_anthropic(Anthropic())

# `client.chat.completions.create` is an equivalent entry point
resp = client.messages.create(
    response_model=User,
    messages=[dict(role="user", content="Extract Jason is 25 years old.")],
    model="claude-3-opus-20240229",
    max_tokens=1024,
)

# The result must be a User carrying exactly the values stated in the prompt
assert isinstance(resp, User)
assert (resp.name, resp.age) == ("Jason", 25)

In [None]:
### CREATE
import openai
import instructor
from pydantic import BaseModel


# Schema requested from the model in the `create` call of this cell.
class User(BaseModel):
    name: str
    age: int


# Synchronous, instructor-enabled OpenAI client
client = instructor.from_openai(openai.OpenAI())

# Plain `create`: the result is bound to `user` (expected to be a `User` instance)
user = client.chat.completions.create(
    response_model=User,
    messages=[dict(role="user", content="Create a user")],
    model="gpt-4-turbo-preview",
)

In [None]:
import openai
import instructor
from pydantic import BaseModel


# Async variant: wraps `AsyncOpenAI`, so `create` is awaited inside `extract` below.
client = instructor.from_openai(openai.AsyncOpenAI())


# Schema that `extract` asks the model to fill in.
class User(BaseModel):
    name: str
    age: int


async def extract():
    """Await one `User` from the module-level async client and return it."""
    created_user = await client.chat.completions.create(
        response_model=User,
        messages=[dict(role="user", content="Create a user")],
        model="gpt-4-turbo-preview",
    )
    return created_user

In [None]:
import openai
import instructor
from pydantic import BaseModel


# Synchronous instructor client used by `create_with_completion` below.
client = instructor.from_openai(openai.OpenAI())


# Schema for the first element of the `(user, completion)` pair unpacked below.
class User(BaseModel):
    name: str
    age: int


# `create_with_completion` returns two values: the first is bound to `user`,
# the second to `completion` (presumably the underlying API completion - confirm).
user, completion = client.chat.completions.create_with_completion(
    response_model=User,
    messages=[dict(role="user", content="Create a user")],
    model="gpt-4-turbo-preview",
)

In [None]:
import openai
import instructor
from pydantic import BaseModel


# Synchronous instructor client used by `create_partial` below.
client = instructor.from_openai(openai.OpenAI())


# Schema streamed by `create_partial`; per the sample output in this cell, fields of
# the intermediate objects can still be None while the stream is in progress.
class User(BaseModel):
    name: str
    age: int


# `create_partial` gives back an iterable of progressively filled-in `User` snapshots
user_stream = client.chat.completions.create_partial(
    response_model=User,
    messages=[dict(role="user", content="Create a user")],
    model="gpt-4-turbo-preview",
)

# Print each snapshot as it arrives.
# Two sample traces (repeated lines collapsed; values and chunking differ per run):
#> name=None age=None
#> name=None age=25
#> name='John Doe' age=25
# or, with the name arriving first:
#> name=None age=None
#> name='' age=None
#> name='John' age=None
#> name='John Doe' age=None
#> name='John Doe' age=30
for user in user_stream:
    print(user)

In [None]:
import openai
import instructor
from pydantic import BaseModel


# Synchronous instructor client used by `create_iterable` below.
client = instructor.from_openai(openai.OpenAI())


# Schema of each item yielded by the `create_iterable` call below.
class User(BaseModel):
    name: str
    age: int


# `create_iterable` hands back an iterable that produces one `User` per item
users = client.chat.completions.create_iterable(
    response_model=User,
    messages=[dict(role="user", content="Create 2 users")],
    model="gpt-4-turbo-preview",
)

# Sample output (names and ages differ between runs):
#> name='John' age=30
#> name='Jane' age=25
for user in users:
    print(user)

In [None]:
import anthropic
import instructor
from pydantic import BaseModel, field_validator
from typing import List, Literal
from enum import Enum

# Shared client for every test function in this cell: an Anthropic client wrapped
# by instructor in ANTHROPIC_TOOLS mode.
client = instructor.from_anthropic(
    anthropic.Anthropic(), mode=instructor.Mode.ANTHROPIC_TOOLS
)


def test_simple():
    """Extract a flat `User` whose `name` validator only accepts upper-case text.

    The prompt spells the name as "John", while the final assertion expects
    "JOHN": the validator rejects anything that is not upper-case, and
    `max_retries=2` allows the request to be retried after such a rejection.
    """

    class User(BaseModel):
        name: str
        age: int

        # pydantic v2 field validators are class methods; stacking `@classmethod`
        # under `@field_validator` is the documented form and keeps `cls`
        # meaningful for linters and type checkers.
        @field_validator("name")
        @classmethod
        def name_is_uppercase(cls, v: str):
            assert v.isupper(), "Name must be uppercase"
            return v

    resp = client.messages.create(
        model="claude-3-haiku-20240307",
        max_tokens=1024,
        max_retries=2,
        messages=[
            {
                "role": "user",
                "content": "Extract John is 18 years old.",
            }
        ],
        response_model=User,
    )  # type: ignore

    assert isinstance(resp, User)
    assert resp.name == "JOHN"  # due to validation
    assert resp.age == 18


def test_nested_type():
    """Check that a model nested inside another model is extracted with the right types."""

    class Address(BaseModel):
        house_number: int
        street_name: str

    class User(BaseModel):
        name: str
        age: int
        address: Address

    prompt = "Extract John is 18 years old and lives at 123 First Avenue."
    resp = client.messages.create(
        response_model=User,
        messages=[{"role": "user", "content": prompt}],
        model="claude-3-haiku-20240307",
        max_tokens=1024,
        max_retries=0,
    )  # type: ignore

    # Top-level fields
    assert isinstance(resp, User)
    assert (resp.name, resp.age) == ("John", 18)

    # Nested model and its fields
    address = resp.address
    assert isinstance(address, Address)
    assert (address.house_number, address.street_name) == (123, "First Avenue")


def test_list_str():
    """Check that a `List[str]` field comes back as a list containing only strings."""

    class User(BaseModel):
        name: str
        age: int
        family: List[str]

    resp = client.messages.create(
        model="claude-3-haiku-20240307",
        max_tokens=1024,
        max_retries=0,
        messages=[
            {
                "role": "user",
                "content": "Create a user for a model with a name, age, and family members.",
            }
        ],
        response_model=User,
    )

    assert isinstance(resp, User)
    # Runtime type checks go against the builtin `list`; `typing.List` is an
    # annotation alias and is not meant to be used with isinstance().
    assert isinstance(resp.family, list)
    assert all(isinstance(member, str) for member in resp.family)


def test_enum():
    """Check that a string-valued Enum field is populated with the requested member."""

    class Role(str, Enum):
        ADMIN = "admin"
        USER = "user"

    class User(BaseModel):
        name: str
        role: Role

    prompt = "Create a user for a model with a name and role of admin."
    resp = client.messages.create(
        response_model=User,
        messages=[{"role": "user", "content": prompt}],
        model="claude-3-haiku-20240307",
        max_tokens=1024,
        max_retries=0,
    )  # type: ignore

    assert isinstance(resp, User)
    assert resp.role == Role.ADMIN


def test_literal():
    """Check that a `Literal` field is populated with the requested option."""

    class User(BaseModel):
        name: str
        role: Literal["admin", "user"]

    prompt = "Create a admin user for a model with a name and role."
    resp = client.messages.create(
        response_model=User,
        messages=[{"role": "user", "content": prompt}],
        model="claude-3-haiku-20240307",
        max_tokens=1024,
        max_retries=2,
    )  # type: ignore

    assert isinstance(resp, User)
    assert resp.role == "admin"


def test_nested_list():
    """Check that every element of a list-of-models field is an instance of that model."""

    class Properties(BaseModel):
        key: str
        value: str

    class User(BaseModel):
        name: str
        age: int
        properties: List[Properties]

    resp = client.messages.create(
        model="claude-3-haiku-20240307",
        max_tokens=1024,
        max_retries=0,
        messages=[
            {
                "role": "user",
                "content": "Create a user for a model with a name, age, and properties.",
            }
        ],
        response_model=User,
    )  # type: ignore

    assert isinstance(resp, User)
    # `prop`, not `property`, so the builtin of that name is not shadowed
    for prop in resp.properties:
        assert isinstance(prop, Properties)


def test_system_messages_allcaps():
    """Send an all-caps system instruction and check the extracted name is upper-case."""

    class User(BaseModel):
        name: str
        age: int

    conversation = [
        {"role": "system", "content": "EVERYTHING MUST BE IN ALL CAPS"},
        {"role": "user", "content": "Create a user for a model with a name and age."},
    ]
    resp = client.messages.create(
        response_model=User,
        messages=conversation,
        model="claude-3-haiku-20240307",
        max_tokens=1024,
        max_retries=0,
    )  # type: ignore

    assert isinstance(resp, User)
    assert resp.name.isupper()

In [None]:
import enum
from itertools import product
from typing import List

import pytest
import instructor

from pydantic import BaseModel

from instructor.function_calls import Mode
from ..util import models, modes


# The two classes the single-label test can predict; members are also plain `str` values.
class Labels(str, enum.Enum):
    SPAM = "spam"
    NOT_SPAM = "not_spam"


# Response model for `test_classification`: exactly one `Labels` member.
# NOTE(review): the docstring below is presumably included in the schema sent to
# the model, so treat it as prompt text - confirm before rewording it.
class SinglePrediction(BaseModel):
    """
    Correct class label for the given text
    """

    class_label: Labels


# (text, expected label) pairs consumed by the parametrization of `test_classification`
data = list(
    zip(
        ("I am a spammer", "I am not a spammer"),
        (Labels.SPAM, Labels.NOT_SPAM),
    )
)


@pytest.mark.parametrize("model, data, mode", product(models, data, modes))
def test_classification(model, data, mode, client):
    """Single-label spam classification for every model / sample / mode combination.

    Args:
        model: model name taken from `models`.
        data: `(text, expected_label)` pair taken from the module-level `data` list
            as it was when the decorator was evaluated.
        mode: instructor mode taken from `modes`.
        client: client fixture supplied from outside this file.
    """
    # Decide on skipping first, so skipped combinations never patch the client.
    if mode == instructor.Mode.JSON and model in {"gpt-3.5-turbo", "gpt-4"}:
        pytest.skip(
            "JSON mode is not supported for gpt-3.5-turbo and gpt-4, skipping test"
        )

    client = instructor.patch(client, mode=mode)

    # `text`, not `input`, so the builtin is not shadowed
    text, expected = data
    resp = client.chat.completions.create(
        model=model,
        response_model=SinglePrediction,
        messages=[
            {
                "role": "user",
                "content": f"Classify the following text: {text}",
            },
        ],
    )
    assert resp.class_label == expected


# Define new Enum class for multiple labels.
# Members double as plain `str` values; a ticket may carry several of them.
class MultiLabels(str, enum.Enum):
    BILLING = "billing"
    GENERAL_QUERY = "general_query"
    HARDWARE = "hardware"


# Adjust the prediction model to accommodate a list of labels.
# `test_multi_classify` compares this list to the expectation as a set, so order is ignored.
class MultiClassPrediction(BaseModel):
    predicted_labels: List[MultiLabels]


# (ticket text, expected labels) pairs for `test_multi_classify`.
# Rebinds the module-level name `data` that the single-label test used earlier.
data = [
    ("I am having trouble with my billing", [MultiLabels.BILLING]),
    ("I am having trouble with my hardware", [MultiLabels.HARDWARE]),
    (
        "I have a general query and a billing issue",
        [MultiLabels.GENERAL_QUERY, MultiLabels.BILLING],
    ),
]


@pytest.mark.parametrize("model, data, mode", product(models, data, modes))
def test_multi_classify(model, data, mode, client):
    """Multi-label ticket classification for every model / sample / mode combination.

    Args:
        model: model name taken from `models`.
        data: `(ticket_text, expected_labels)` pair taken from the module-level `data` list.
        mode: instructor mode taken from `modes`.
        client: client fixture supplied from outside this file.
    """
    # Decide on skipping first, so skipped combinations never patch the client.
    if (mode, model) in {
        (Mode.JSON, "gpt-3.5-turbo"),
        (Mode.JSON, "gpt-4"),
    }:
        pytest.skip(f"{mode} mode is not supported for {model}, skipping test")

    client = instructor.patch(client, mode=mode)

    # `ticket`, not `input`, so the builtin is not shadowed
    ticket, expected = data

    resp = client.chat.completions.create(
        model=model,
        response_model=MultiClassPrediction,
        messages=[
            {
                "role": "user",
                "content": f"Classify the following support ticket: {ticket}",
            },
        ],
    )
    # Compared as sets: label order in the response does not matter
    assert set(resp.predicted_labels) == set(expected)

In [None]:
from itertools import product
from typing import List, Literal

import pytest
import instructor

from pydantic import BaseModel

from instructor.function_calls import Mode
from ..util import models, modes


# Literal-based counterpart of the enum version: the label is one of two fixed strings.
# NOTE(review): the docstring below is presumably included in the schema sent to
# the model, so treat it as prompt text - confirm before rewording it.
class SinglePrediction(BaseModel):
    """
    Correct class label for the given text
    """

    class_label: Literal["spam", "not_spam"]


# (text, expected label) pairs consumed by the async `test_classification`
data = list(
    zip(
        ("I am a spammer", "I am not a spammer"),
        ("spam", "not_spam"),
    )
)


@pytest.mark.parametrize("model, data, mode", product(models, data, modes))
@pytest.mark.asyncio
async def test_classification(model, data, mode, aclient):
    """Async single-label spam classification for every model / sample / mode combination.

    Args:
        model: model name taken from `models`.
        data: `(text, expected_label)` pair taken from the module-level `data` list.
        mode: instructor mode taken from `modes`.
        aclient: async client fixture supplied from outside this file.
    """
    # Decide on skipping first, so skipped combinations never patch the client.
    if mode == instructor.Mode.JSON and model in {"gpt-3.5-turbo", "gpt-4"}:
        pytest.skip(
            "JSON mode is not supported for gpt-3.5-turbo and gpt-4, skipping test"
        )

    client = instructor.patch(aclient, mode=mode)

    # `text`, not `input`, so the builtin is not shadowed
    text, expected = data
    resp = await client.chat.completions.create(
        model=model,
        response_model=SinglePrediction,
        messages=[
            {
                "role": "user",
                "content": f"Classify the following text: {text}",
            },
        ],
    )
    assert resp.class_label == expected


# Adjust the prediction model to accommodate a list of labels.
# Each element must be one of the three literal strings; the async
# `test_multi_classify` compares the list to the expectation as a set.
class MultiClassPrediction(BaseModel):
    predicted_labels: List[Literal["billing", "general_query", "hardware"]]


# (ticket text, expected labels) pairs for the async `test_multi_classify`.
# Rebinds the module-level name `data` used by the single-label test above it.
data = [
    ("I am having trouble with my billing", ["billing"]),
    ("I am having trouble with my hardware", ["hardware"]),
    ("I have a general query and a billing issue", ["general_query", "billing"]),
]


@pytest.mark.parametrize("model, data, mode", product(models, data, modes))
@pytest.mark.asyncio
async def test_multi_classify(model, data, mode, aclient):
    """Async multi-label ticket classification for every model / sample / mode combination.

    Args:
        model: model name taken from `models`.
        data: `(ticket_text, expected_labels)` pair taken from the module-level `data` list.
        mode: instructor mode taken from `modes`.
        aclient: async client fixture supplied from outside this file.
    """
    # Decide on skipping first, so skipped combinations never patch the client.
    if (mode, model) in {
        (Mode.JSON, "gpt-3.5-turbo"),
        (Mode.JSON, "gpt-4"),
    }:
        pytest.skip(f"{mode} mode is not supported for {model}, skipping test")

    client = instructor.patch(aclient, mode=mode)

    # `ticket`, not `input`, so the builtin is not shadowed
    ticket, expected = data

    resp = await client.chat.completions.create(
        model=model,
        response_model=MultiClassPrediction,
        messages=[
            {
                "role": "user",
                "content": f"Classify the following support ticket: {ticket}",
            },
        ],
    )
    # Compared as sets: label order in the response does not matter
    assert set(resp.predicted_labels) == set(expected)

In [None]:
from itertools import product
from typing import List
from pydantic import BaseModel, Field
import pytest

import instructor

from instructor.function_calls import Mode
from ..util import models, modes


# One key/value attribute of an Entity (see `Entity.properties`).
class Property(BaseModel):
    key: str
    value: str
    # presumably `value` with relative references resolved to a concrete value - TODO confirm
    resolved_absolute_value: str


# One entity extracted from the document.
# The `description=` strings are runtime field metadata (prompt text), so they are
# kept verbatim here, typos included.
class Entity(BaseModel):
    # Integer identifier; `dependencies` on other entities holds a list of such ids.
    id: int = Field(
        ...,
        description="Unique identifier for the entity, used for deduplication, design a scheme allows multiple entities",
    )
    # NOTE(review): the field name suggests quoted snippets from the document, while the
    # description talks about a "resolved value" and referenced ids - confirm the intent.
    subquote_string: List[str] = Field(
        ...,
        description="Correctly resolved value of the entity, if the entity is a reference to another entity, this should be the id of the referenced entity, include a few more words before and after the value to allow for some context to be used in the resolution",
    )
    entity_title: str
    properties: List[Property] = Field(
        ..., description="List of properties of the entity"
    )
    # Ids of the entities this one relies on in order to be resolved (per the description).
    dependencies: List[int] = Field(
        ...,
        description="List of entity ids that this entity depends  or relies on to resolve it",
    )


# Top-level response model requested by `ask_ai`: a list of Entity objects.
# NOTE(review): the description mentions "facts" with "a body and a list of sources",
# which matches none of Entity's fields - it looks copied from another example; confirm.
class DocumentExtraction(BaseModel):
    entities: List[Entity] = Field(
        ...,
        description="Body of the answer, each fact should be its seperate object with a body and a list of sources",
    )


def ask_ai(content, model, client) -> DocumentExtraction:
    """Run entity extraction over `content` and hand back the parsed result.

    Args:
        content: document text, sent as the user message.
        model: model name forwarded to the completion call.
        client: object exposing `chat.completions.create` that accepts `response_model`.

    Returns:
        Whatever the client returns for `response_model=DocumentExtraction`.
    """
    system_prompt = "You are a perfect entity resolution system that extracts facts from the document. Extract and resolve a list of entities from the following document:"
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": content},
    ]
    extraction: DocumentExtraction = client.chat.completions.create(
        model=model,
        response_model=DocumentExtraction,
        messages=conversation,
    )  # type: ignore
    return extraction


# Fixture document passed to `ask_ai` by `test_extract`: a short sample contract whose
# dates and deadlines refer to one another (e.g. "30 days after the agreement date",
# "[SignDate]", "[DeliveryDate]"). The text is test input - do not reformat it.
content = """
Sample Legal Contract
Agreement Contract

This Agreement is made and entered into on 2020-01-01 by and between Company A ("the Client") and Company B ("the Service Provider").

Article 1: Scope of Work

The Service Provider will deliver the software product to the Client 30 days after the agreement date.

Article 2: Payment Terms

The total payment for the service is $50,000.
An initial payment of $10,000 will be made within 7 days of the the signed date.
The final payment will be due 45 days after [SignDate].

Article 3: Confidentiality

The parties agree not to disclose any confidential information received from the other party for 3 months after the final payment date.

Article 4: Termination

The contract can be terminated with a 30-day notice, unless there are outstanding obligations that must be fulfilled after the [DeliveryDate].
"""


@pytest.mark.parametrize("model, mode", product(models, modes))
def test_extract(model, mode, client):
    """Smoke test: entity extraction over `content` yields at least one entity.

    Args:
        model: model name taken from `models`.
        mode: instructor mode taken from `modes`.
        client: client fixture supplied from outside this file.
    """
    # Decide on skipping first, so skipped combinations never patch the client.
    if (mode, model) in {
        (Mode.JSON, "gpt-3.5-turbo"),
        (Mode.JSON, "gpt-4"),
    }:
        pytest.skip(f"{mode} mode is not supported for {model}, skipping test")

    client = instructor.patch(client, mode=mode)

    # Honestly, if there are no errors, then it's a pass
    extract = ask_ai(content=content, model=model, client=client)
    assert len(extract.entities) > 0

In [None]:
import pytest
from itertools import product
from pydantic import BaseModel
import instructor
from instructor.function_calls import Mode
from ..util import models, modes


# Response model for `test_extract`; both fields are compared against the expected
# values stored in `test_data`.
class UserDetails(BaseModel):
    name: str
    age: int


# (prompt, expected name, expected age) triples; each prompt is "<name> is <age>".
# `models` and `modes` are imported from elsewhere, not defined here.
test_data = [
    (f"{person} is {years}", person, years)
    for person, years in (("Jason", 10), ("Alice", 25), ("Bob", 35))
]


@pytest.mark.parametrize("model, data, mode", product(models, test_data, modes))
def test_extract(model, data, mode, client):
    """Extract `UserDetails` from a one-line prompt and compare with the expected values.

    Args:
        model: model name taken from `models`.
        data: `(prompt, expected_name, expected_age)` triple from `test_data`.
        mode: instructor mode taken from `modes`.
        client: client fixture supplied from outside this file.
    """
    prompt, want_name, want_age = data

    # Combinations that are skipped rather than run
    unsupported = {
        (Mode.JSON, "gpt-3.5-turbo"),
        (Mode.JSON, "gpt-4"),
    }
    if (mode, model) in unsupported:
        pytest.skip(f"{mode} mode is not supported for {model}, skipping test")

    # Patch only once we know the case will actually run
    client = instructor.patch(client, mode=mode)

    response = client.chat.completions.create(
        response_model=UserDetails,
        messages=[{"role": "user", "content": prompt}],
        model=model,
    )

    got_name = response.name
    assert got_name == want_name, f"Expected name {want_name}, got {got_name}"
    got_age = response.age
    assert got_age == want_age, f"Expected age {want_age}, got {got_age}"

In [None]:
import datetime

# Imported here so the cell does not depend on names left behind by other cells.
from openai import OpenAI
from pydantic import BaseModel


# Manual function-calling example: the JSON schema below is written by hand and
# has to be kept in sync with this model.
class PersonBirthday(BaseModel):
    name: str
    age: int
    birthday: datetime.date


# Plain OpenAI client dedicated to this cell. The module-level `client` left over
# from earlier cells is an instructor-wrapped Anthropic client, which cannot serve
# an OpenAI `functions=` request.
openai_client = OpenAI()

schema = {
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "integer"},
        # "date" is the JSON Schema format name for a full calendar date
        "birthday": {
            "type": "string",
            "format": "date",
            "description": "Date in YYYY-MM-DD form",
        },
    },
    # `birthday` has no default on PersonBirthday, so it is required here as well
    "required": ["name", "age", "birthday"],
    "type": "object",
}

resp = openai_client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {
            "role": "user",
            "content": f"Extract `Jason Liu is thirty years old his birthday is yesturday` into json today is {datetime.date.today()}",
        },
    ],
    functions=[{"name": "Person", "parameters": schema}],
    # Force the function call: with "auto" the model may answer in plain text,
    # leaving `message.function_call` as None and breaking the access below.
    function_call={"name": "Person"},
)

# Last expression of the cell: the validated model is displayed by the notebook
PersonBirthday.model_validate_json(resp.choices[0].message.function_call.arguments)

In [None]:
import instructor 
from openai import OpenAI 
from enum import Enum 
from pydantic import BaseModel, Field 
from typing_extensions import Literal 

# Built with `from_openai`, matching how the other OpenAI cells in this notebook
# create their client.
client = instructor.from_openai(OpenAI())


# Tip: Do not use auto() as they cast to 1,2,3,4
# Explicit lower-case string values are used instead; `Character.say_hello`
# title-cases them for display.
class House(Enum):
    Gryffindor = "gryffindor"
    Hufflepuff = "hufflepuff"
    Ravenclaw = "ravenclaw"
    Slytherin = "slytherin"
    
# Response model for the enum example: `house` must be a member of `House`.
class Character(BaseModel):
    age: int
    name: str
    house: House

    def say_hello(self):
        """Print a one-line introduction built from name, age and title-cased house."""
        origin = self.house.value.title()
        greeting = f"Hello, I'm {self.name}, I'm {self.age} years old and I'm from {origin}"
        print(greeting)
        
# Ask for a Character using nothing but the name of the franchise as the prompt
resp = client.chat.completions.create(
    model="gpt-4-1106-preview",
    messages=[{"role": "user", "content": "Harry Potter"}],
    response_model=Character,
)

# Jupyter only echoes a cell's final expression, so a bare `resp.model_dump()`
# in the middle of the cell would show nothing - print it explicitly.
print(resp.model_dump())

resp.say_hello()

In [None]:
# Redefines `Character` with a Literal in place of the `House` enum; the allowed
# strings here are capitalised, unlike the enum's lower-case values.
class Character(BaseModel):
    age: int
    name: str
    house: Literal["Gryffindor", "Hufflepuff", "Ravenclaw", "Slytherin"]
    
# Request a Character validated against the Literal-based model
resp = client.chat.completions.create(
    response_model=Character,
    messages=[dict(role="user", content="Harry Potter")],
    model="gpt-4-1106-preview",
)

# Final expression of the cell, so the notebook displays the resulting dict
resp.model_dump()

In [None]:
from typing import List 
# Free-form key/value attribute of a Character; the `key` description ("Must be
# snake case") is field metadata, i.e. an instruction rather than an enforced validator.
class Property(BaseModel):
    key: str = Field(description="Must be snake case")
    value: str
    
# `Character` once more, now carrying an open-ended list of `Property` entries
# next to the fixed fields.
class Character(BaseModel):
    age: int
    name: str
    house: Literal["Gryffindor", "Hufflepuff", "Ravenclaw", "Slytherin"]
    properties: List[Property]
    
# Request a Character that also carries a list of key/value properties
resp = client.chat.completions.create(
    response_model=Character,
    messages=[dict(role="user", content="Snape from Harry Potter")],
    model="gpt-4-1106-preview",
)

# Final expression of the cell, so the notebook displays the resulting dict
resp.model_dump()