In [1]:
pip install --upgrade langchain-core

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
# configure langsmith
import os
import getpass

os.environ["LANGSMITH_TRACING"]="true"
os.environ["LANGSMITH_API_KEY"] = getpass.getpass("Enter LangSmith API Key: ")

In [3]:
# Use a pydantic model to define an example schema (what type of info we want to extract)
from typing import Optional

from pydantic import BaseModel, Field

class Person(BaseModel):
    """Information about one person"""

    # ^ Doc-string for the entity Person
    # This doc-string can be sent to the LLM as the description of the schema Person
    # It can help to improve extraction results

    # Note that:
    # 1. Each field is an 'optional' - this allows the model to decline extracting it
    # 2. Each field has a 'description' - this description is used by the LLM
    # Having a good description can help improve extraction results
    name: Optional[str] = Field(default=None, description="the name of the person")
    hair_color: Optional[str] = Field(default=None, description="the color of the persons hair if known")
    height_in_meters: Optional[str] = Field(default=None, description="the persons height measured in meters")

In [4]:
# create information extractor using the schema we just defined
from typing import Optional

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from pydantic import BaseModel, Field

# Define a custom prompt to provide instructions and any additional context
# 1. You can add exmaples into the prompt template to improve extraction quality
# 2. Introduce additional parameters to take context into account (e.g., include metadata about the document from which the text was extracted)
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm. "
            "Only extract relevant data from the text. "
            "If you do not know the value of an attribute asked to extract, return null for the attributes value.",
        ),
        (
            "human", "{text}"
        ),

    ]
) 

In [5]:
pip install -qU langchain-openai

Note: you may need to restart the kernel to use updated packages.


In [6]:
# setup openai API and model
import os 
import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter OpenAI API Key: ")

from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini")

In [7]:
structured_llm = llm.with_structured_output(schema=Person)

In [10]:
text = "Alan Smith is 6 feet tall and has blonde hair."
prompt = prompt_template.invoke({"text": text})
structured_llm.invoke(prompt)

Person(name='Alan Smith', hair_color='blonde', height_in_meters='1.83')

In [11]:
# Setup an extraction chain that can extract mulitiple entities
from typing import List, Optional

from pydantic import BaseModel, Field

class Person(BaseModel):
    """Information about a person"""

    name: Optional[str] = Field(default=None, description="The name of the person")
    hair_color: Optional[str] = Field(default=None, description="The persons hair color if known")
    height_in_meters: Optional[str] = Field(default=None, description="Height measured in meters")

class Data(BaseModel):
    """Extract data about people."""

    # Creates a model so that we can extract multiple entities
    people: List[Person]

In [13]:
# test
structured_llm = llm.with_structured_output(schema=Data)
text = "My name is Jeff. My hair is black and I am 6 feet tall. Anna has the same hair color as me."
prompt = prompt_template.invoke({"text": text})
structured_llm.invoke(prompt)

Data(people=[Person(name='Jeff', hair_color='black', height_in_meters='1.83'), Person(name='Anna', hair_color='black', height_in_meters=None)])

In [15]:
from langchain_core.utils.function_calling import tool_example_to_messages

examples = [
    (
        "The ocean is vast and blue. Its more than 20,000 feet deep.",
        Data(people=[]),
    ),
    (
        "Fiona traveleled far from France to Spain",
        Data(people=[Person(name="Fiona", height_in_meters=None, hair_color=None)]),
    ),
]

messages = []

for txt, tool_call in examples:
    if tool_call.people:
        # This final message is optional for some providers
        ai_response = "Detected people."
    else:
        ai_response = "Detected no people."

        messages.extend(tool_example_to_messages(txt, [tool_call], ai_response=ai_response))

  messages.extend(tool_example_to_messages(txt, [tool_call], ai_response=ai_response))


In [16]:
for message in messages:
    message.pretty_print()


The ocean is vast and blue. Its more than 20,000 feet deep.
Tool Calls:
  Data (5b11ee8d-f070-431e-9710-1548f4f1ef96)
 Call ID: 5b11ee8d-f070-431e-9710-1548f4f1ef96
  Args:
    people: []

You have correctly called this tool.

Detected no people.


In [17]:
# Create a message which we intend no people do be extracted without few-shot prompting 
message_no_extraction = {
    "role": "user",
    "content": "The solar system is large, but earth only has 1 moon.",
}

structured_llm = llm.with_structured_output(schema=Data)
structured_llm.invoke([message_no_extraction])

Data(people=[])

In [18]:
# That worked but I dont think it was supposed to. Now we will re-call the model with our few-shot examples
structured_llm.invoke(messages + [message_no_extraction])

Data(people=[])