<a href="https://colab.research.google.com/github/ahsanrazi/LangChain/blob/main/06_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [141]:
!pip install -qU langchain-core
!pip install -qU langchain-google-genai

In [192]:
from google.colab import userdata

# Read the Gemini API key from Colab's `userdata` store instead of hardcoding it;
# .strip() removes any leading/trailing whitespace from the stored value.
gemini_api_key = userdata.get('GEMINI_API_KEY').strip()

# Extraction

In [193]:
# Extract structured data from text and other unstructured media using chat models and few-shot examples.

# The Schema

In [194]:
# First, we need to describe what information we want to extract from the text.

In [263]:
# We'll use Pydantic to define an example schema to extract personal information.

from typing import Optional
from pydantic import BaseModel, Field


class Person(BaseModel):
    """Information about a person."""

    # ^ Docstring for the entity Person.
    # It is sent to the LLM as the description of the Person schema, so a clear
    # docstring can help to improve extraction results.

    # Note that:
    # 1. Every field is Optional with default=None -- this allows the model to
    #    decline to extract a value instead of making one up.
    # 2. Every field has a `description` -- the description is used by the LLM,
    #    so a good one can help improve extraction results.
    # 3. height_in_meters is typed as a string, so an extracted height comes back
    #    as text rather than as a number.

    name: Optional[str] = Field(default=None, description="The name of the person")
    hair_color: Optional[str] = Field(default=None, description="Hair color of the person")
    height_in_meters: Optional[str] = Field(default=None, description="Height in meters")

In [257]:
# There are two best practices when defining a schema:

# 1. Document the attributes and the schema itself: this information is sent to the LLM and is used to improve the quality of information extraction.
# 2. Do not force the LLM to make up information! Above we used Optional for the attributes, allowing the LLM to output None if it doesn't know the answer.

# The Extractor

In [264]:
# Let's create an information extractor using the schema we defined above.

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# Define a custom prompt to provide instructions and any additional context.
# 1) You can add examples into the prompt template to improve extraction quality
# 2) Introduce additional parameters to take context into account (e.g., include metadata about the document from which the text was extracted.)

# Two-message prompt: fixed extraction instructions followed by the text to analyse.
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm. "
            "Only extract relevant information from the text. "
            "If you do not know the value of an attribute asked to extract, "
            "return null for the attribute's value.",
        ),
        # Reference examples can improve extraction quality; to supply them,
        # a MessagesPlaceholder('examples') entry would go here.
        ("human", "{text}"),
    ]
)

In [265]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model, authenticated with the API key loaded earlier in the notebook.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash-exp",
    api_key=gemini_api_key,
)

In [266]:
# Bind the Person schema to the chat model so its answers are returned as Person objects.
structured_llm = llm.with_structured_output(schema=Person)

In [267]:
# Sample sentence to run through the extractor.
text = "Raza is 6 feet tall and has a Red hair."
# Fill the {text} slot of the prompt template, then send the formatted prompt
# to the schema-bound model.
prompt = prompt_template.invoke({"text": text})
response = structured_llm.invoke(prompt)

In [268]:
response

Person(name='Raza', hair_color=None, height_in_meters='1.8288')

# Multiple Entities

In [269]:
# In most cases, we should be extracting a list of entities rather than a single entity.
# This can be easily achieved using pydantic by nesting models inside one another.

from typing import List, Optional
from pydantic import BaseModel, Field


class Person(BaseModel):
    """Information about a person."""

    # Same three optional fields as the Person schema defined earlier in the notebook;
    # running this cell rebinds the name Person for the cells that follow.
    # default=None lets the model leave a value out when the text does not state it.
    name: Optional[str] = Field(default=None, description="The name of the person")
    hair_color: Optional[str] = Field(default=None, description="Hair color of the person")
    height_in_meters: Optional[str] = Field(default=None, description="Height in meters")


class Data(BaseModel):
    """Extracted data about people."""

    # Wrapper model: nesting a list of Person lets a single extraction call
    # return several people instead of just one.
    # `people` has no default value, so it is a required field.
    people: List[Person]

In [270]:
# Rebind the model to the Data schema so one call can return a list of people.
structured_llm = llm.with_structured_output(schema=Data)

In [271]:
# Sample text that mentions two people; Anna's hair colour is only given
# indirectly ("the same color hair as me").
text = "My name is Jeff, my hair is black and I am 6 feet tall. Anna has the same color hair as me."
# Format the prompt with the new text and run it through the Data-bound model.
prompt = prompt_template.invoke({"text": text})
response = structured_llm.invoke(prompt)

In [272]:
response

Data(people=[Person(name='Jeff', hair_color=None, height_in_meters=None), Person(name='Anna', hair_color=None, height_in_meters=None)])

In [273]:
# Structured output often uses tool calling under-the-hood.
# This typically involves the generation of AI messages containing tool calls, as well as tool messages containing the results of tool calls.

# MessagesPlaceholder

In [121]:
from langchain_core.prompts import MessagesPlaceholder

# Required placeholder: formatting without the "history" variable raises KeyError.
# The error is caught and printed so that the remaining statements in this cell
# (and a full "Run All") still execute.
prompt = MessagesPlaceholder("history")
try:
    prompt.format_messages()
except KeyError as err:
    print(f"KeyError: {err}")

# Optional placeholder: a missing "history" yields an empty list instead of an error.
prompt = MessagesPlaceholder("history", optional=True)
print(prompt.format_messages())  # []

# When "history" is supplied, each (role, content) tuple becomes a message object.
prompt.format_messages(
    history=[
        ("system", "You are an AI assistant."),
        ("human", "Hello!"),
    ]
)

# -> [
#     SystemMessage(content="You are an AI assistant."),
#     HumanMessage(content="Hello!"),
# ]

In [None]:
# Building a prompt with chat history:

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# Chat prompt made of three parts: a fixed system message, a slot for earlier
# conversation turns, and the new user question.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant."),
    MessagesPlaceholder("history"),
    ("human", "{question}"),
])

# Supply both template variables; the value of this last expression is the cell output.
prompt.invoke({
    "history": [
        ("human", "what's 5 + 2"),
        ("ai", "5 + 2 is 7"),
    ],
    "question": "now multiply that by 4",
})
# Expected result:
# ChatPromptValue(messages=[
#     SystemMessage(content="You are a helpful assistant."),
#     HumanMessage(content="what's 5 + 2"),
#     AIMessage(content="5 + 2 is 7"),
#     HumanMessage(content="now multiply that by 4"),
# ])

In [None]:
# Limiting the number of messages:

from langchain_core.prompts import MessagesPlaceholder

# n_messages=1 caps how many history messages the placeholder emits.
prompt = MessagesPlaceholder("history", n_messages=1)

# Two messages are passed in; per the expected output below, only the last one
# (the human message) is kept.
prompt.format_messages(
    history=[
        ("system", "You are an AI assistant."),
        ("human", "Hello!"),
    ]
)
# -> [
#     HumanMessage(content="Hello!"),
# ]

# Use a Parsing Approach

In [274]:
# Use a prompt-based approach to extract with models that do not support tool/function calling.

In [None]:
# Tool calling features are not required for generating structured output from LLMs.
# LLMs that are able to follow prompt instructions well can be tasked with outputting information in a given format.

# This approach relies on designing good prompts and then parsing the output of the LLMs to make them extract information well.

# To extract data without tool-calling features:
# 1. Instruct the LLM to generate text following an expected format (e.g., JSON with a certain schema).
# 2. Use output parsers to structure the model response into a desired Python object.

# Using PydanticOutputParser

In [285]:
from typing import List, Optional
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field, validator


class Person(BaseModel):
    """Information about a person."""

    # Field(...) -- the Ellipsis default marks a field as required, so both
    # values below must be present for a Person to validate.
    name: str = Field(..., description="The name of the person")
    # Typed as float, so the parsed height is a number rather than text.
    height_in_meters: float = Field(..., description="The height of the person expressed in meters.")


class People(BaseModel):
    """Identifying information about all people in a text."""

    # Required list holding one Person entry per individual found in the text.
    people: List[Person]

In [286]:
# Parser that turns the model's JSON reply into a People instance.
parser = PydanticOutputParser(pydantic_object=People)

# Prompt whose {format_instructions} slot is pre-filled from the parser,
# leaving only {query} to be supplied at invocation time.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Answer the user query. Wrap the output in `json` tags\n{format_instructions}",
        ),
        ("human", "{query}"),
    ]
).partial(format_instructions=parser.get_format_instructions())

In [287]:
# Let's take a look at what information is sent to the model

# Example query; it also contains an age, which the People schema has no field for.
query = "Anna is 23 years old and she is 6 feet tall"

# Render the fully formatted prompt as plain text to inspect what the model receives.
print(prompt.format_prompt(query=query).to_string())

System: Answer the user query. Wrap the output in `json` tags
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"$defs": {"Person": {"description": "Information about a person.", "properties": {"name": {"description": "The name of the person", "title": "Name", "type": "string"}, "height_in_meters": {"description": "The height of the person expressed in meters.", "title": "Height In Meters", "type": "number"}}, "required": ["name", "height_in_meters"], "title": "Person", "type": "object"}}, "description": "Identifying information about all people in a text.", "properties": {"people": {"items"

In [289]:
# Chain the three steps with the pipe operator: format the prompt, call the model,
# then parse the model's reply with the Pydantic parser.

chain = prompt | llm | parser
# Run the whole pipeline on the example query; the result is the cell output.
chain.invoke({"query": query})

People(people=[Person(name='Anna', height_in_meters=1.8288)])