# Tagging and Extraction

In [None]:
import os
import openai

from dotenv import load_dotenv, find_dotenv
# Locate a .env file with find_dotenv() and load it; the return value is
# deliberately discarded.
_ = load_dotenv(find_dotenv())
# Direct indexing raises KeyError if OPENAI_API_KEY is still unset at this point.
openai.api_key = os.environ["OPENAI_API_KEY"]

## Tagging (assign single labels)

In [None]:
from typing import List
from pydantic import BaseModel, Field
from langchain_core.utils.function_calling import convert_to_openai_function

In [None]:
class Tagging(BaseModel):
    """Tag the piece of text with particular info."""

    # NOTE(review): the docstring and the descriptions below are presumably
    # copied into the generated function schema, so they are kept verbatim.

    # Polarity label; the allowed values are only spelled out in the description.
    sentiment: str = Field(
        description="sentiment of text, should be `pos`, `neg`, or `neutral`",
    )
    # Language label, requested as an ISO 639-1 code.
    language: str = Field(
        description="language of text (should be ISO 639-1 code)",
    )


# Function description derived from the model; it is passed to the chat model
# through `functions=[tagging]`.
tagging = convert_to_openai_function(Tagging)

In [None]:
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Prompt: a fixed system instruction followed by the text to tag.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "Think carefully, and then tag the text as instructed"),
        ("user", "{input}"),
    ]
)

# Chat model with the tagging function description bound to it.
model = ChatOpenAI(temperature=0).bind(functions=[tagging])

# The formatted prompt is piped straight into the model.
chain = prompt | model

# One dict per sample text; the key matches the `{input}` placeholder above.
inputs = [
    {"input": text}
    for text in ("I love LangChain", "Les films romantiques sont vraiment nuls")
]
chain.batch(inputs)

## Retrieve function output as JSON

In [None]:
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
# Rebuild the chain with a JsonOutputFunctionsParser appended so the function
# output is retrieved as JSON, then run it over the same sample inputs.
chain = prompt | model | JsonOutputFunctionsParser()
chain.batch(inputs)

## Extraction (multiple pieces of information)

In [None]:
from typing import Optional

class Person(BaseModel):
    """Information about a person."""

    name: str = Field(description="Person's name")
    # Explicit default of None so the field is genuinely optional. Without a
    # default, pydantic v2 treats an Optional[...] field as required, which
    # would list `age` under "required" in the generated schema and push the
    # model to invent an age when the text does not state one.
    age: Optional[int] = Field(default=None, description="Person's age")

class Information(BaseModel):
    """Information to extract."""

    # All people found in the text, one Person entry each.
    people: List[Person] = Field(
        description="List of info about people",
    )


# Function description for the extraction schema; it is passed to the chat
# model through `functions=[information]`.
information = convert_to_openai_function(Information)

In [None]:
# Extraction prompt: tells the model to return only what the text states.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Extract the relevant information. If not explicitly provided do not guess, extract partial info.",
        ),
        ("human", "{input}"),
    ]
)

# Chat model with the Information function description bound to it.
model = ChatOpenAI(temperature=0).bind(functions=[information])

# Prompt -> model -> parser for the function output.
chain = prompt | model | JsonOutputFunctionsParser()
chain.invoke({"input": "Joe is 30, his mom is Martha"})

In [None]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

# Same prompt and model, but parsed with the key-based parser configured with
# key_name="people" (presumably returning just the value under that key).
chain = prompt | model | JsonKeyOutputFunctionsParser(key_name="people")
chain.invoke({"input": "Joe is 30, his mom is Martha"})

## Applying to a larger text

In [None]:
import re
from langchain.document_loaders import WebBaseLoader

# Load the blog post and keep the first document the loader returns.
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
documents = loader.load()
doc = documents[0]

# Take the first 10,000 characters, then squeeze every run of two or more
# newlines down to one. Truncation happens before the squeeze, so the result
# can end up shorter than 10,000 characters.
page_content = re.sub(r"\n{2,}", "\n", doc.page_content[:10000])

# Preview the first 500 characters of the cleaned text.
print(page_content[:500])

In [None]:
# Tagging
class Overview(BaseModel):
    """Overview of a section of text."""

    # Short free-text summary of the passage.
    summary: str = Field(
        description="Provide a concise summary of the content.",
    )
    # Language the passage is written in.
    language: str = Field(
        description="Provide the language that the content is written in.",
    )
    # Keywords are requested as a single string, not a list.
    keywords: str = Field(
        description="Provide keywords related to the content.",
    )

# Function description for the Overview schema.
overview = convert_to_openai_function(Overview)

# Use a dedicated tagging prompt instead of whichever `prompt` an earlier cell
# last bound. At this point in the notebook that global is the
# person-extraction prompt, which is the wrong instruction for this step.
overview_prompt = ChatPromptTemplate.from_messages([
    ("system", "Think carefully, and then tag the text as instructed"),
    ("user", "{input}"),
])

# temperature=0 matches the other models in this notebook and keeps the
# tagging output repeatable. function_call names the Overview function
# explicitly for every request.
model = ChatOpenAI(temperature=0).bind(functions=[overview], function_call={"name": "Overview"})
chain = overview_prompt | model | JsonOutputFunctionsParser()

chain.invoke({"input": page_content})

In [None]:
# Extraction
class Paper(BaseModel):
    """Information about papers mentioned."""

    title: str
    # Default to None so the author is truly optional. Under pydantic v2 an
    # Optional[...] annotation without a default is still a required field,
    # which would force the model to supply an author even when the article
    # does not name one.
    author: Optional[str] = None

class Info(BaseModel):
    """Information to extract"""
    # Every paper mentioned in the input, one Paper entry each.
    papers: List[Paper]

# Function description for the paper-extraction schema.
info = convert_to_openai_function(Info)

# System instructions: extract only papers the article mentions, return an
# empty list when there are none, and never invent details.
template = """An article will be passed to you. Extract from it all papers that are mentioned by this article followed by its author. 

Do not extract the name of the article itself. If no papers are mentioned that's fine - you don't need to extract any! Just return an empty list.

Do not make up or guess ANY extra information. Only extract what exactly is in the text."""

prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", "{input}")
])

# temperature=0 is consistent with the earlier models in this notebook and
# with the "do not make up" instruction above: extraction should be
# repeatable rather than sampled. function_call names the Info function
# explicitly for every request.
model = ChatOpenAI(temperature=0).bind(functions=[info], function_call={"name": "Info"})
# Parse the function output with key_name="papers".
chain = prompt | model | JsonKeyOutputFunctionsParser(key_name="papers")

chain.invoke({"input": page_content})

In [None]:
# Check the no-papers case: the input mentions no papers, and the system
# prompt asks for an empty list in that situation.
chain.invoke({"input": "Hi!"})

In [None]:
# Improve with chunking
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.runnable import RunnableLambda

# Splitter configured with a 50-character overlap; every other setting is left
# at the library default.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50)

# Turn one long string into a list of {"input": piece} dicts, i.e. one chain
# input per split piece.
prep = RunnableLambda(lambda text: [{"input": piece} for piece in text_splitter.split_text(text)])
prep.invoke(page_content)  # creates chunks

In [None]:
def flatten(chunks: List[Optional[list]]) -> list:
    """Concatenate the per-chunk result lists into one flat list.

    Args:
        chunks: One entry per processed chunk, each a list of extracted
            items. Entries that are ``None`` or empty are skipped.

    Returns:
        A new list containing the items of every chunk, in chunk order.
    """
    papers = []
    for chunk in chunks:
        # Skip falsy entries: extending with None would raise TypeError, and
        # an empty chunk contributes nothing anyway.
        if chunk:
            papers.extend(chunk)
    return papers

# Full pipeline: prep splits the text into per-chunk inputs, chain.map() applies
# the extraction chain to each of them, and flatten merges the per-chunk lists.
final_chain = prep | chain.map() | flatten
final_chain.invoke(page_content)