# Entity Extraction Easy Mode w/ Pydantic

In [1]:
# Run the companion utils notebook so that its definitions are loaded into this
# kernel's namespace.
# NOTE(review): `invoke_claude_3_with_text` and `extract_json_content` are called
# later but are not defined in this notebook -- presumably they come from this
# utils notebook (Bedrock invocation and parsing between XML tags); confirm there.
%run pydantic-entity-extraction-demo-utils.ipynb

### Grab some sample text from which we want to extract entities

In [2]:
# TODO: use curl or something to grab text
# Read the sample document through a context manager so the file handle is
# closed as soon as the text has been read, instead of being left open until
# garbage collection.
# The encoding is pinned so the result does not depend on the platform locale
# (assumes the sample file is UTF-8 -- TODO confirm).
with open("wikipedia-text-example.txt", "r", encoding="utf-8") as sample_file:
    document_text = sample_file.read()

### Describe the entities you want to extract as Fields in a pydantic BaseModel

In [3]:
import pydantic
from pydantic import BaseModel, Field, field_validator

# Record the installed pydantic version alongside the results; the cells below
# rely on `field_validator`, `model_json_schema` and `model_validate_json`.
print(pydantic.__version__)

2.6.3


In [4]:
import enum


# Coarse age bracket assigned to an extracted person. Every member's value is
# the same string as its name.
# NOTE(review): documented with `#` comments rather than a class docstring so
# that the class's __doc__ -- which schema generators may read -- stays unchanged.
class AgeEnum(enum.Enum):
    baby = "baby"
    child = "child"
    adult = "adult"


# One person mentioned in the source text. The field descriptions and examples
# below are part of the schema that gets embedded in the extraction prompt, so
# they double as instructions to the model.
# NOTE(review): documented with `#` comments rather than a class docstring so the
# generated JSON schema (and therefore the prompt) is not altered.
class Person(BaseModel):
    # `...` means the key has no default and must be supplied; the `str | None`
    # annotation still accepts an explicit null for it.
    name: str | None = Field(
        ...,
        description=("The name of the person extracted. This field is required."),
        examples=["John Doe", "David Kaleko", "Rick Astley"],
    )
    age: int | None = Field(
        default=None,
        description=(
            "The age in years of the person extracted. This field is optional."
        ),
        examples=[42, 25],
    )
    age_class: AgeEnum | None = Field(
        default=None,
        description=(
            "The age maturity of the person extracted. This field is optional. "
        ),
        # Examples are built from the public enum iteration API and use member
        # *values* (what appears in the JSON payload) instead of reading the
        # internal `_member_names_` attribute.
        examples=[member.value for member in AgeEnum],
    )
    job_title: str | None = Field(
        default=None,
        description=("The job title of the person extracted. This field is optional."),
        examples=["Software Engineer", "Data Scientist", "Pirate King"],
    )


# One URL extracted from the source text, stored as a plain string (no
# URL-specific type is used for the field).
class URL(BaseModel):
    # No default is given, so the field is required.
    url: str = Field(
        description=(
            "The full URL extracted from the provided text. Copy the URL exactly "
            "character by character, do not modify it."
        ),
    )


# Top-level extraction result: every person and every URL found in one block of
# source text. Its JSON schema is what the prompt asks the model to follow.
# NOTE(review): documented with `#` comments rather than a class docstring so the
# generated JSON schema (and therefore the prompt) is not altered.
class ExtractedEntities(BaseModel):
    people: list[Person] = Field(
        default=[],
        description=(
            "A list of people extracted from the text. The people are ordered by the "
            "order in which they appear in the text."
        ),
    )

    urls: list[URL] = Field(
        default=[],
        description=("A list of all website URLs extracted from the text."),
    )

    @field_validator("urls", mode="after")
    @classmethod
    def validate_url(cls, values: list[URL]) -> list[URL]:
        """Drop any urls which don't contain "www."

        Runs in "after" mode, i.e. on the already-parsed ``URL`` items.

        Args:
            values: The URL entries supplied for the ``urls`` field.

        Returns:
            A new list holding only the entries whose ``url`` string contains
            the substring ``"www."``; an empty input yields an empty list.
        """
        # The comprehension already returns [] for an empty input, so no
        # separate emptiness guard is needed.
        return [candidate for candidate in values if "www." in candidate.url]

### Build a prompt to extract entities

In [5]:
# Opening sentence of the prompt; the schema block is appended right after it.
PROMPT_PREFIX = (
    "Your goal is to extract structured information "
    "from a provided block of source_text "
    "that matches the form described below."
)

# Output-format rules appended after the source text. The sentences are joined
# with single spaces; the final entry keeps its trailing space.
FORMAT_INSTRUCTIONS = " ".join(
    (
        "Please output the extracted information in JSON format.",
        "Do not output anything except for the extracted information.",
        "Do not add any clarifying information.",
        "Do not add any fields that are not in the schema.",
        "All output must be in JSON format and follow the schema specified above.",
        "Wrap the JSON in <json> tags. ",
    )
)


def build_full_prompt(source_text: str) -> str:
    """Assemble the entity-extraction prompt for one block of text.

    The prompt consists of, in order: ``PROMPT_PREFIX``, the JSON schema of
    ``ExtractedEntities`` wrapped in ``<schema>`` tags, the text wrapped in
    ``<source_text>`` tags, and ``FORMAT_INSTRUCTIONS``.

    Args:
        source_text: The raw text from which entities should be extracted.

    Returns:
        The complete prompt string, terminated by a blank line.
    """
    import json  # local import so this cell does not depend on another cell's imports

    # Serialise the schema as real JSON. Interpolating the dict directly would
    # embed its Python repr (single quotes, None/True/False), which is not the
    # JSON the instructions refer to.
    schema_json = json.dumps(ExtractedEntities.model_json_schema(), indent=2)

    # Each section ends with a newline so the closing </source_text> tag does
    # not run straight into the format instructions.
    return (
        f"{PROMPT_PREFIX}\n"
        f"<schema>\n{schema_json}\n</schema>\n"
        f"<source_text>{source_text}</source_text>\n"
        f"{FORMAT_INSTRUCTIONS}\n\n"
    )

### Send the prompt to an LLM and see if it works

In [6]:
# Build the prompt from the sample document and send it to the model.
# NOTE(review): `invoke_claude_3_with_text` is not defined in this notebook --
# presumably it is loaded by the `%run` utils cell; temperature=0 is presumably
# chosen to keep the reply as repeatable as possible -- confirm in the utils notebook.
full_prompt = build_full_prompt(source_text=document_text)
llm_response = invoke_claude_3_with_text(full_prompt, temperature=0)

In [7]:
# Show the raw reply before parsing, to check the <json> wrapper and payload by eye.
print(llm_response)

<json>
{
  "people": [
    {
      "name": "Andy Jassy",
      "age": 56,
      "age_class": "adult",
      "job_title": "President and CEO of Amazon"
    },
    {
      "name": "Jeff Bezos",
      "age": null,
      "age_class": null,
      "job_title": null
    },
    {
      "name": "Everett L. Jassy",
      "age": null,
      "age_class": null,
      "job_title": "senior partner in the corporate law firm Dewey Ballantine in New York City"
    },
    {
      "name": "Margery Jassy",
      "age": null,
      "age_class": null,
      "job_title": null
    },
    {
      "name": "Elana Caplan",
      "age": null,
      "age_class": null,
      "job_title": "fashion designer for Eddie Bauer"
    },
    {
      "name": "Adam Selipsky",
      "age": null,
      "age_class": null,
      "job_title": "CEO of AWS"
    }
  ],
  "urls": [
    {
      "url": "www.thecrimson.com"
    },
    {
      "url": "ww.thisisawebsite.com"
    }
  ]
}
</json>


In [8]:
# Step 1: reduce the raw reply to the payload handed back by `extract_json_content`.
json_payload = extract_json_content(llm_response)
# Step 2: parse and validate that payload against the ExtractedEntities model.
extracted_meta = ExtractedEntities.model_validate_json(json_payload)

In [9]:
from devtools import pprint

# Pretty-print the validated model so the nested Person / URL entries are readable.
pprint(extracted_meta)

ExtractedEntities(
    people=[
        Person(
            name='Andy Jassy',
            age=56,
            age_class=<AgeEnum.adult: 'adult'>,
            job_title='President and CEO of Amazon',
        ),
        Person(
            name='Jeff Bezos',
            age=None,
            age_class=None,
            job_title=None,
        ),
        Person(
            name='Everett L. Jassy',
            age=None,
            age_class=None,
            job_title='senior partner in the corporate law firm Dewey Ballantine in New York City',
        ),
        Person(
            name='Margery Jassy',
            age=None,
            age_class=None,
            job_title=None,
        ),
        Person(
            name='Elana Caplan',
            age=None,
            age_class=None,
            job_title='fashion designer for Eddie Bauer',
        ),
        Person(
            name='Adam Selipsky',
            age=None,
            age_class=None,
            job_title='CEO of AW

In [10]:
# Field-validator demo: the first URL lacks "www." and is dropped by
# `validate_url`; the second contains it and is kept.
for candidate_url in ("ww.amazon.com", "www.amazon.com"):
    print(ExtractedEntities(urls=[URL(url=candidate_url)]))

people=[] urls=[]
people=[] urls=[URL(url='www.amazon.com')]
