### Performing Tagging and Extraction on Documents and Texts using Langchain

Langchain enables us to build tagging and data extraction pipelines around our documents to build powerful applications, we would be exploring using openai functions to peform tagging and data extraction in langchain

In [2]:
import os
import openai
from langchain.utils.openai_functions import convert_pydantic_to_openai_function
from pydantic import Field, BaseModel

In [3]:
class Tagging(BaseModel):
    """Tagging a piece of text with a particular information"""
    sentiment: str = Field(description="sentiment of text, should be `pos`, `neg`, or `neutral`")
    language: str = Field(description="language of text (should be ISO 639-1 code)")

In [4]:
tagging_function = convert_pydantic_to_openai_function(Tagging)

In [5]:
tagging_function

{'name': 'Tagging',
 'description': 'Tagging a piece of text with a particular information',
 'parameters': {'description': 'Tagging a piece of text with a particular information',
  'properties': {'sentiment': {'description': 'sentiment of text, should be `pos`, `neg`, or `neutral`',
    'title': 'Sentiment',
    'type': 'string'},
   'language': {'description': 'language of text (should be ISO 639-1 code)',
    'title': 'Language',
    'type': 'string'}},
  'required': ['sentiment', 'language'],
  'title': 'Tagging',
  'type': 'object'}}

In [6]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai.chat_models import ChatOpenAI

In [7]:
from dotenv import load_dotenv
load_dotenv()

True

In [8]:
mistral7b = ChatOpenAI(model="mistralai/Mistral-7B-Instruct-v0.2")

In [None]:
mistral7b._default_params

In [9]:
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant"),
    ("human", "{input}")
])

In [10]:
mistral7b = mistral7b.bind(
    functions=[tagging_function],
    function_call={"name": "Tagging"}
)

In [12]:
tagging_chain = prompt | mistral7b

In [13]:
tagging_chain.invoke({"input": "I love langchain"})

BadRequestError: Error code: 400 - {'error': "mistralai/Mistral-7B-Instruct-v0.2 doesn't support constraints"}

In [14]:
tagging_chain.invoke({"input": "non mi piace questo cibo"})

BadRequestError: Error code: 400 - {'error': "mistralai/Mistral-7B-Instruct-v0.2 doesn't support constraints"}

In [15]:
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser

In [16]:
tagging_chain = prompt | mistral7b | JsonOutputFunctionsParser()

In [17]:
tagging_chain.invoke({"input": "non mi piace questo cibo"})

BadRequestError: Error code: 400 - {'error': "mistralai/Mistral-7B-Instruct-v0.2 doesn't support constraints"}

#### Performing information extraction using langchain and openai function calling

In [18]:
from typing import List

In [19]:
from typing import Optional
class Person(BaseModel):
    """Information about a person."""
    name: str = Field(description="person's name")
    age: Optional[int] = Field(description="person's age")

In [20]:
class Information(BaseModel):
    """Information to extract."""
    people: List[Person] = Field(description="List of info about people")

In [21]:
extraction_function = convert_pydantic_to_openai_function(Information)

In [22]:
extraction_function

{'name': 'Information',
 'description': 'Information to extract.',
 'parameters': {'$defs': {'Person': {'description': 'Information about a person.',
    'properties': {'name': {'description': "person's name",
      'title': 'Name',
      'type': 'string'},
     'age': {'anyOf': [{'type': 'integer'}, {'type': 'null'}],
      'description': "person's age",
      'title': 'Age'}},
    'required': ['name', 'age'],
    'title': 'Person',
    'type': 'object'}},
  'description': 'Information to extract.',
  'properties': {'people': {'description': 'List of info about people',
    'items': {'description': 'Information about a person.',
     'properties': {'name': {'description': "person's name",
       'title': 'Name',
       'type': 'string'},
      'age': {'anyOf': [{'type': 'integer'}, {'type': 'null'}],
       'description': "person's age",
       'title': 'Age'}},
     'required': ['name', 'age'],
     'title': 'Person',
     'type': 'object'},
    'title': 'People',
    'type': 'array'

In [23]:
extraction_model = mistral7b.bind(functions=[extraction_function], function_call={"name": "Information"})

In [24]:
extraction_model.invoke("Joe is 30, his mom is Martha")


BadRequestError: Error code: 400 - {'error': "mistralai/Mistral-7B-Instruct-v0.2 doesn't support constraints"}

In [25]:
prompt = ChatPromptTemplate.from_messages([
    ("system", "Extract the relevant information, if not explicitly provided do not guess. Extract partial info"),
    ("human", "{input}")
])

In [26]:
extraction_chain = prompt | extraction_model | JsonOutputFunctionsParser()

In [28]:
from langchain.output_parsers.openai_functions import  JsonKeyOutputFunctionsParser

In [29]:
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="people")

### Performing extractions on real large body of text
We would be using a real large data corpus to see how it works in a real setting

In [30]:
from langchain.document_loaders import WebBaseLoader
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
documents = loader.load()

In [31]:
len(documents)

1

In [34]:
print(documents[0].page_content)







LLM Powered Autonomous Agents | Lil'Log







































Lil'Log






















Posts




Archive




Search




Tags




FAQ




emojisearch.app









      LLM Powered Autonomous Agents
    
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng


 


Table of Contents



Agent System Overview

Component One: Planning

Task Decomposition

Self-Reflection


Component Two: Memory

Types of Memory

Maximum Inner Product Search (MIPS)


Component Three: Tool Use

Case Studies

Scientific Discovery Agent

Generative Agents Simulation

Proof-of-Concept Examples


Challenges

Citation

References





Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general

In [35]:
class Overview(BaseModel):
    """Overview of a section of text."""
    summary: str = Field(description="Provide a concise summary of the content.")
    language: str = Field(description="Provide the language that the content is written in.")
    keywords: str = Field(description="Provide keywords related to the content.")

In [36]:
overview_functions = [convert_pydantic_to_openai_function(Overview)]

In [37]:
overview_extraction_model = mistral7b.bind(functions=overview_functions, function_call={"name": "Overview"})

In [None]:
tagging_chain = prompt | overview_extraction_model | JsonOutputFunctionsParser()

In [38]:
class Paper(BaseModel):
    """Information about papers mentioned."""
    title: str
    author: Optional[str]


class Info(BaseModel):
    """Information to extract"""
    papers: List[Paper]

In [39]:
info_functions = [convert_pydantic_to_openai_function(Info)]
info_functions

[{'name': 'Info',
  'description': 'Information to extract',
  'parameters': {'$defs': {'Paper': {'description': 'Information about papers mentioned.',
     'properties': {'title': {'title': 'Title', 'type': 'string'},
      'author': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
       'title': 'Author'}},
     'required': ['title', 'author'],
     'title': 'Paper',
     'type': 'object'}},
   'description': 'Information to extract',
   'properties': {'papers': {'items': {'description': 'Information about papers mentioned.',
      'properties': {'title': {'title': 'Title', 'type': 'string'},
       'author': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
        'title': 'Author'}},
      'required': ['title', 'author'],
      'title': 'Paper',
      'type': 'object'},
     'title': 'Papers',
     'type': 'array'}},
   'required': ['papers'],
   'title': 'Info',
   'type': 'object'}}]

In [40]:
paper_info_model = mistral7b.bind(functions=info_functions, function_call={"name": "Info"})

In [41]:
paper_info_model

RunnableBinding(bound=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x127f92410>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x127ae2610>, model_name='mistralai/Mistral-7B-Instruct-v0.2', openai_api_key='fe1f4854dd8970c1d52e05e795d053db950947b1cc4fe010db76f3557f93b3bf', openai_api_base='https://api.together.xyz/v1', openai_proxy=''), kwargs={'functions': [{'name': 'Info', 'description': 'Information to extract', 'parameters': {'$defs': {'Paper': {'description': 'Information about papers mentioned.', 'properties': {'title': {'title': 'Title', 'type': 'string'}, 'author': {'anyOf': [{'type': 'string'}, {'type': 'null'}], 'title': 'Author'}}, 'required': ['title', 'author'], 'title': 'Paper', 'type': 'object'}}, 'description': 'Information to extract', 'properties': {'papers': {'items': {'description': 'Information about papers mentioned.', 'properties': {'title': {'title': 'Title', 'type': 'string'}, 'author': {'anyOf': [{'type

In [42]:
paper_extraction_chain = prompt | paper_info_model | JsonOutputFunctionsParser()

In [43]:
template = """A article will be passed to you. Extract from it all papers that are mentioned by this article. 

Do not extract the name of the article itself. If no papers are mentioned that's fine - you don't need to extract any! Just return an empty list.

Do not make up or guess ANY extra information. Only extract what exactly is in the text."""

prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", "{input}")
])

In [44]:
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="papers")

In [45]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0)

In [48]:
splits = text_splitter.split_text(documents[0].page_content)

In [50]:
len(splits)

14

In [51]:
def flatten(matrix):
    flat_list = []
    for row in matrix:
        flat_list += row
    return flat_list

In [52]:
flatten([[1, 2], [3, 4]])

[1, 2, 3, 4]

In [53]:
from langchain.schema.runnable import RunnableLambda

In [54]:
prep = RunnableLambda(
    lambda x: [{"input": doc} for doc in text_splitter.split_text(x)]
)

In [55]:
prep.invoke("hi")

[{'input': 'hi'}]

In [56]:
chain = prep | extraction_chain.map() | flatten