In [2]:
!pip install langchain openai python-dotenv markdownify  -q

In [3]:
# LangChain Models
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.schema import HumanMessage, SystemMessage, AIMessage

# Standard Helpers
import pandas as pd
import requests
import time
import json
from datetime import datetime
import os
from dotenv import load_dotenv

load_dotenv()

# Text Helpers
from bs4 import BeautifulSoup
from markdownify import markdownify as md

# For token counting
from langchain.callbacks import get_openai_callback

def printOutput(output):
    print(json.dumps(output,sort_keys=True, indent=3))



In [4]:
# It's better to do this an environment variable but putting it in plain text for clarity
openai_api_key = os.getenv("OPENAI_API_KEY", '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')

In [5]:
chat = ChatOpenAI(
    model_name="gpt-3.5-turbo-0613", # Cheaper but less reliable
    temperature=0,
    max_tokens=2000,
    openai_api_key=openai_api_key
)

  warn_deprecated(


## Function Calling Hello World Example
Create an object that holds information about the fields you'd like to extract

In [6]:
functions = [
    {
        "name": "get_food_mentioned",
        "description": "Get the food that is mentioned in the review from the customer",
        "parameters": {
            "type": "object",
            "properties": {
                "food": {
                    "type": "string",
                    "description": "The type of food mentioned, ex: Ice cream"
                },
                "good_or_bad": {
                    "type": "string",
                    "description": "whether or not the user thought the food was good or bad",
                    "enum": ["good", "bad"]
                }
            },
            "required": ["location"]
        }
    }
]

In [7]:
output = chat(messages=
     [
         SystemMessage(content="You are an helpful AI bot"),
         HumanMessage(content="I thought the burgers were awesome")
     ],
     functions=functions
)

print(json.dumps(output.additional_kwargs, indent=4))

  warn_deprecated(


{
    "function_call": {
        "arguments": "{\n  \"food\": \"burgers\",\n  \"good_or_bad\": \"good\"\n}",
        "name": "get_food_mentioned"
    }
}


## Pydantic Model
Now let's do the same thing but with a pydantic model rather than json schema

In [8]:
from langchain.pydantic_v1 import BaseModel, Field
import enum

class GoodOrBad(str, enum.Enum):
    GOOD = "Good"
    BAD = "Bad"

class Food(BaseModel):
    """Identifying information about a person's food review."""

    name: str = Field(..., description="Name of the food mentioned")
    good_or_bad: GoodOrBad = Field(..., description="Whether or not the user thought the food was good or bad")

In [9]:
output = chat(messages=
     [
         SystemMessage(content="You are an helpful AI bot"),
         HumanMessage(content="I thought the burgers were awesome")
     ],
     functions=[{
         "name": "FoodExtractor",
         "description": (
             "Identifying information about a person's food review."
         ),
         "parameters": Food.schema(),
        }
     ]
)

In [10]:
output

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\n  "name": "burgers",\n  "good_or_bad": "Good"\n}', 'name': 'FoodExtractor'}})

But LangChain has an abstraction for us that we can use

In [11]:
from langchain.chains import create_extraction_chain_pydantic

# Extraction
chain = create_extraction_chain_pydantic(pydantic_schema=Food, llm=chat)

# Run
text = """I like burgers they are great"""
chain.run(text)

  warn_deprecated(


[Food(name='burgers', good_or_bad=<GoodOrBad.GOOD: 'Good'>)]

## Multiple Results
Let's try to extract multiple objects from the same text. I'll create a person object now

In [12]:
from typing import Sequence

chat = ChatOpenAI(
    model_name="gpt-4-0613", # Cheaper but less reliable
    temperature=0,
    max_tokens=2000,
    openai_api_key=openai_api_key
)

class Person(BaseModel):
    """Someone who gives their review on different foods"""

    name: str = Field(..., description="Name of the person")
    foods: Sequence[Food] = Field(..., description="A food that a person mentioned")

In [13]:
# Extraction
chain = create_extraction_chain_pydantic(pydantic_schema=Person, llm=chat)

# Run
text = """amy likes burgers and fries but doesn't like salads"""
output = chain.run(text)

In [14]:
output[0]

Person(name='amy', foods=[Food(name='burgers', good_or_bad=<GoodOrBad.GOOD: 'Good'>), Food(name='fries', good_or_bad=<GoodOrBad.GOOD: 'Good'>), Food(name='salads', good_or_bad=<GoodOrBad.BAD: 'Bad'>)])

## User Query Extraction

Let's do another fun example where we want to extract/convert a query from a user

In [15]:
class Query(BaseModel):
    """Extract the change a user would like to make to a financial forecast"""

    entity: str = Field(..., description="Name of the category or account a person would like to change")
    amount: int = Field(..., description="Amount they would like to change it by")
    year: int = Field(..., description="The year they would like the change to")

In [16]:
chain = create_extraction_chain_pydantic(pydantic_schema=Query, llm=chat)

chain.run("Can you please add 10 more units to inventory in 2022?")

[Query(entity='inventory', amount=10, year=2022)]

In [17]:
chain.run("Remove 3 million from revenue in 2021")

[Query(entity='revenue', amount=-3, year=2021)]

## Opening Attributes - Real World Example


In [18]:
def pull_from_greenhouse(board_token):
    # If doing this in production, make sure you do retries and backoffs

    # Get your URL ready to accept a parameter
    url = f'https://boards-api.greenhouse.io/v1/boards/{board_token}/jobs?content=true'

    try:
        response = requests.get(url)
    except:
        # In case it doesn't work
        print ("Whoops, error")
        return

    status_code = response.status_code

    jobs = response.json()['jobs']

    print (f"{board_token}: {status_code}, Found {len(jobs)} jobs")

    return jobs

In [19]:
jobs = pull_from_greenhouse("okta")

okta: 200, Found 178 jobs


In [20]:
job_index = 0
print ("Preview:\n")
print (json.dumps(jobs[job_index])[:400])

Preview:

{"absolute_url": "https://www.okta.com/company/careers/opportunity/5578093?gh_jid=5578093", "data_compliance": [{"type": "gdpr", "requires_consent": false, "requires_processing_consent": false, "requires_retention_consent": false, "retention_period": null}], "internal_job_id": 2747338, "location": {"name": "Spain"}, "metadata": null, "id": 5578093, "updated_at": "2024-01-16T09:21:28-05:00", "requi


In [21]:
# I parsed through an output to create the function below
def describeJob(job_description):
    print(f"Job ID: {job_description['id']}")
    print(f"Link: {job_description['absolute_url']}")
    print(f"Updated At: {datetime.fromisoformat(job_description['updated_at']).strftime('%B %-d, %Y')}")
    print(f"Title: {job_description['title']}\n")
    print(f"Content:\n{job_description['content'][:550]}")

Let's create a Kor object that will look for tools. This is the meat and potatoes of the application

In [22]:
class Tool(BaseModel):
    """The name of a tool or company"""

    name: str = Field(..., description="Name of the food mentioned")

class Tools(BaseModel):
    """A tool, application, or other company that is listed in a job description."""

    tools: Sequence[Tool] = Field(..., description=""" A tool or technology listed
        Examples:
        * "Experience in working with Netsuite, or Looker a plus." > NetSuite, Looker
        * "Experience with Microsoft Excel" > Microsoft Excel
    """)

In [23]:
chain = create_extraction_chain_pydantic(pydantic_schema=Tools, llm=chat)

In [24]:
output = chain(text)

  warn_deprecated(


In [25]:
with get_openai_callback() as cb:
    result = chain(text)
    print(f"Total Tokens: {cb.total_tokens}")
    print(f"Prompt Tokens: {cb.prompt_tokens}")
    print(f"Completion Tokens: {cb.completion_tokens}")
    print(f"Successful Requests: {cb.successful_requests}")
    print(f"Total Cost (USD): ${cb.total_cost}")

Total Tokens: 151
Prompt Tokens: 133
Completion Tokens: 18
Successful Requests: 1
Total Cost (USD): $0.00507
