# Tagging and Extraction

In [1]:
import os
import openai

from dotenv import load_dotenv, find_dotenv
# Load the dotenv file located by find_dotenv() before reading the key;
# os.environ[...] raises KeyError if OPENAI_API_KEY is still unset afterwards.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)
openai.api_key = os.environ["OPENAI_API_KEY"]

## Tagging (assign single labels)

In [2]:
from typing import List
from pydantic import BaseModel, Field
from langchain_core.utils.function_calling import convert_to_openai_function

In [3]:
# Schema for the tagging function offered to the model. Both fields are plain
# `str`, so the `pos`/`neg`/`neutral` and ISO 639-1 conventions are stated only
# in the Field descriptions; nothing in this class enforces them.
class Tagging(BaseModel):
    """Tag the piece of text with particular info."""
    sentiment: str = Field(description="sentiment of text, should be `pos`, `neg`, or `neutral`")
    language: str = Field(description="language of text (should be ISO 639-1 code)")

# OpenAI function definition derived from the Tagging model; it is bound to the
# chat model in the next cell.
tagging = convert_to_openai_function(Tagging)

In [4]:
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Chat model with the Tagging function definition bound to every call.
model = ChatOpenAI(temperature=0).bind(functions=[tagging])

# Two-message prompt: a fixed system instruction plus the text to tag.
system_message = ("system", "Think carefully, and then tag the text as instructed")
user_message = ("user", "{input}")
prompt = ChatPromptTemplate.from_messages([system_message, user_message])

chain = prompt | model

# One English and one French sample, each wrapped in the {"input": ...} shape
# that the prompt template expects.
inputs = [
    {"input": text}
    for text in (
        "I love LangChain",
        "Les films romantiques sont vraiment nuls",
    )
]
chain.batch(inputs)

[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"sentiment":"pos","language":"en"}', 'name': 'Tagging'}, 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 99, 'total_tokens': 118, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'function_call', 'logprobs': None}, id='run-2700f000-ec5b-4891-a9d2-0be394e5dfe7-0', usage_metadata={'input_tokens': 99, 'output_tokens': 19, 'total_tokens': 118, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}),
 AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"sentiment":"neg","language":"fr"}', 'name': 'Tagging'}, 'refusal': None}, response_metadata={'token_usage': {'co

## Retrieve function output as JSON

In [5]:
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
# Append a parser that decodes the function-call arguments, so the chain
# yields plain dicts instead of AIMessage objects.
arguments_parser = JsonOutputFunctionsParser()
chain = prompt | model | arguments_parser
chain.batch(inputs)

[{'sentiment': 'pos', 'language': 'en'},
 {'sentiment': 'neg', 'language': 'fr'}]

## Extraction (multiple pieces of information)

In [6]:
from typing import Optional

# Schema for one extracted person. Only `name` is mandatory.
class Person(BaseModel):
    """Information about a person."""
    name: str = Field(description="Person's name")
    # Explicit default: under Pydantic v2 an Optional field without a default is
    # still a *required* key, so the generated schema would demand an `age`
    # even when the text gives none. default=None makes it truly optional.
    age: Optional[int] = Field(default=None, description="Person's age")

# Top-level container: the function returns a list of Person entries.
class Information(BaseModel):
    """Information to extract."""
    people: List[Person] = Field(description="List of info about people")

# OpenAI function definition for the nested Information -> Person schema.
information = convert_to_openai_function(Information)

In [7]:
# Rebind `model` and `prompt` for extraction: the model now offers the
# Information function and the system message tells it not to guess.
model = ChatOpenAI(temperature=0).bind(functions=[information])

extraction_messages = [
    ("system", "Extract the relevant information. If not explicitly provided do not guess, extract partial info."),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(extraction_messages)

chain = prompt | model | JsonOutputFunctionsParser()

# Martha's age is never stated, so this input exercises the "do not guess" rule.
chain.invoke({"input": "Joe is 30, his mom is Martha"})

{'people': [{'name': 'Joe', 'age': 30}, {'name': 'Martha', 'age': None}]}

In [8]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

# Same prompt and model, but this parser returns only the value stored under
# "people" instead of the whole arguments dict.
people_parser = JsonKeyOutputFunctionsParser(key_name="people")
chain = prompt | model | people_parser
chain.invoke({"input": "Joe is 30, his mom is Martha"})

[{'name': 'Joe', 'age': 30}, {'name': 'Martha', 'age': None}]

## Applying to a larger text

In [9]:
import re
from langchain.document_loaders import WebBaseLoader

# Fetch the blog post and keep the first loaded document.
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
documents = loader.load()
doc = documents[0]

# Cap the text at 10,000 characters first, then collapse every run of two or
# more newlines into one (so the final string can be shorter than the cap).
page_content = re.sub(r"\n{2,}", "\n", doc.page_content[:10000])

# Preview only the first 500 characters.
print(page_content[:500])

USER_AGENT environment variable not set, consider setting it to identify your requests.



LLM Powered Autonomous Agents | Lil'Log
Lil'Log
Posts
Archive
Search
Tags
FAQ
emojisearch.app
      LLM Powered Autonomous Agents
    
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng
 
Table of Contents
Agent System Overview
Component One: Planning
Task Decomposition
Self-Reflection
Component Two: Memory
Types of Memory
Maximum Inner Product Search (MIPS)
Component Three: Tool Use
Case Studies
Scientific Discovery Agent
Generative Agents Simulation
Proof-of-Concep


In [10]:
# Tagging
# Schema for a document-level tagging function: three free-form string fields
# whose intended content is conveyed only through the Field descriptions.
class Overview(BaseModel):
    """Overview of a section of text."""
    summary: str = Field(description="Provide a concise summary of the content.")
    language: str = Field(description="Provide the language that the content is written in.")
    # Annotated as `str`, so keywords come back as one string, not a list.
    keywords: str = Field(description="Provide keywords related to the content.")

overview = convert_to_openai_function(Overview)

# function_call={"name": "Overview"} requests a call to Overview on every
# invocation instead of a free-text reply. temperature=0 matches the models
# built in the earlier cells so repeated runs stay comparable.
model = ChatOpenAI(temperature=0).bind(functions=[overview], function_call={"name": "Overview"})

# NOTE: `prompt` is still the extraction prompt defined in the previous
# section; only the bound function changes here.
chain = prompt | model | JsonOutputFunctionsParser()

chain.invoke({"input": page_content})

{'summary': 'This article discusses the concept of building autonomous agents powered by LLM (large language model) as the core controller. It explores the components of such agents, including planning, memory, and tool use. It also delves into specific techniques like Task Decomposition, Self-Reflection, and Algorithm Distillation in agent systems.',
 'language': 'English',
 'keywords': 'LLM, autonomous agents, large language model, planning, memory, tool use, task decomposition, self-reflection, algorithm distillation'}

In [11]:
# Extraction
# Schema for one paper reference found in the article.
class Paper(BaseModel):
    """Information about papers mentioned."""
    title: str
    # Explicit default: under Pydantic v2 a bare Optional annotation is still a
    # required key; `= None` lets the author be left out when it is not given.
    author: Optional[str] = None

# Top-level container: the function returns a list of Paper entries.
class Info(BaseModel):
    """Information to extract"""
    papers: List[Paper]

# OpenAI function definition for the nested Info -> Paper schema.
info = convert_to_openai_function(Info)

# System instructions for paper extraction, assembled piece by piece so the
# paragraph breaks (and the trailing space on the first line) are explicit.
template = (
    "An article will be passed to you. Extract from it all papers that are mentioned by this article followed by its author. \n"
    "\n"
    "Do not extract the name of the article itself. If no papers are mentioned that's fine - you don't need to extract any! Just return an empty list.\n"
    "\n"
    "Do not make up or guess ANY extra information. Only extract what exactly is in the text."
)

system_message = ("system", template)
human_message = ("human", "{input}")
prompt = ChatPromptTemplate.from_messages([system_message, human_message])

# Request a call to Info on every invocation so the parser always has function
# arguments to decode. temperature=0 matches the earlier cells; the default
# sampling temperature can make extraction results differ between runs.
model = ChatOpenAI(temperature=0).bind(functions=[info], function_call={"name": "Info"})
chain = prompt | model | JsonOutputFunctionsParser()

chain.invoke({"input": page_content})

{'papers': [{'title': 'Chain of thought (CoT; Wei et al. 2022)',
   'author': 'Wei et al. 2022'},
  {'title': 'Tree of Thoughts (Yao et al. 2023)', 'author': 'Yao et al. 2023'},
  {'title': 'LLM+P (Liu et al. 2023)', 'author': 'Liu et al. 2023'},
  {'title': 'ReAct (Yao et al. 2023)', 'author': 'Yao et al. 2023'},
  {'title': 'Reflexion (Shinn & Labash 2023)',
   'author': 'Shinn & Labash 2023'},
  {'title': 'Chain of Hindsight (CoH; Liu et al. 2023)',
   'author': 'Liu et al. 2023'},
  {'title': 'Algorithm Distillation (AD; Laskin et al. 2023)',
   'author': 'Laskin et al. 2023'}]}

In [12]:
# Negative control: this text mentions no papers, so per the system prompt the
# expected result is an empty `papers` list. Inspect the output: a forced
# function call can still come back with invented entries.
no_paper_input = {"input": "Hi!"}
chain.invoke(no_paper_input)

{'papers': [{'title': 'Paper A', 'author': 'Author A'},
  {'title': 'Paper B', 'author': 'Author B'}]}

In [13]:
# Improve with chunking
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.runnable import RunnableLambda

text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50)


def _to_chain_inputs(text):
    """Split `text` and wrap each piece as an {"input": ...} dict for the chain."""
    return [{"input": piece} for piece in text_splitter.split_text(text)]


prep = RunnableLambda(_to_chain_inputs)
prep.invoke(page_content)  # creates chunks

[{'input': 'LLM Powered Autonomous Agents | Lil\'Log\nLil\'Log\nPosts\nArchive\nSearch\nTags\nFAQ\nemojisearch.app\n      LLM Powered Autonomous Agents\n    \nDate: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng\n \nTable of Contents\nAgent System Overview\nComponent One: Planning\nTask Decomposition\nSelf-Reflection\nComponent Two: Memory\nTypes of Memory\nMaximum Inner Product Search (MIPS)\nComponent Three: Tool Use\nCase Studies\nScientific Discovery Agent\nGenerative Agents Simulation\nProof-of-Concept Examples\nChallenges\nCitation\nReferences\nBuilding agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview#\nIn a LLM-powered autonomous agent system,

In [14]:
def flatten(chunks):
    """Join all chunk results in a single list.

    Args:
        chunks: Iterable of per-chunk result dicts, each expected to hold a
            list under the "papers" key.

    Returns:
        A new list with the items of every chunk's "papers" list, in chunk
        order. Chunks that lack "papers", or map it to None, contribute
        nothing.
    """
    papers = []
    for chunk in chunks:
        # The dicts are unvalidated JSON produced by the model, so tolerate a
        # missing or null list instead of aborting the whole run on one chunk.
        papers.extend(chunk.get("papers") or [])
    return papers

# Split the text, run the extraction chain once per chunk via chain.map(),
# then merge the per-chunk paper lists with flatten.
per_chunk_chain = chain.map()
final_chain = prep | per_chunk_chain | flatten
final_chain.invoke(page_content)

[{'title': 'Chain of thought (CoT; Wei et al. 2022)', 'author': 'Wei'},
 {'title': 'Tree of Thoughts (Yao et al. 2023)', 'author': 'Yao'},
 {'title': 'LLM+P (Liu et al. 2023)', 'author': 'Liu'},
 {'title': 'ReAct', 'author': 'Yao et al. 2023'},
 {'title': 'Reflexion', 'author': 'Shinn & Labash 2023'},
 {'title': 'Chain of Hindsight', 'author': 'Liu et al. 2023'},
 {'title': 'Algorithm Distillation (AD)', 'author': 'Laskin et al. 2023'}]