Taken from: https://github.com/menloparklab/LangGraphJourney

In [1]:

import pdfplumber
import json
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.prompts import PromptTemplate
from langgraph.graph import Graph
from langchain_openai import ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from semantic_text_splitter import TextSplitter
from langchain.retrievers import ParentDocumentRetriever
from dotenv import load_dotenv
import os

# Load environment variables (including OPENAI_API_KEY) from a local .env file.
load_dotenv()

# Fail fast with a readable message when the API key is missing. The previous
# self-assignment (os.environ[...] = os.environ.get(...)) did nothing when the
# key was present and raised "TypeError: str expected, not NoneType" otherwise.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )

# Chat model shared by every chain in this notebook.
model = ChatOpenAI(model_name="gpt-4", temperature=0)

# Folder that holds the source PDFs.
pdfs_path = 'pdfs/ai-in-business/'

# List all files in the given folder. os.listdir returns entries in arbitrary
# order, so sort them to make the concatenated corpus reproducible across runs.
files = sorted(os.listdir(pdfs_path))

# Keep only PDFs; compare case-insensitively so "REPORT.PDF" is not dropped.
pdf_docs = [f for f in files if f.lower().endswith('.pdf')]

# Output schema handed to JsonOutputParser for the fact-extraction chain.
# NOTE(review): documented with comments rather than a class docstring on
# purpose -- a docstring would likely surface as the schema description inside
# the parser's format instructions and alter the prompt; confirm before adding.
class Facts(BaseModel):
    # Text of the extracted fact.
    fact: str = Field(description="document text") 
    # Numerical value(s) pulled out of that fact, kept as a string.
    data: str = Field(description="numerical facts")

# Output schema handed to JsonOutputParser by the writer and proofreader
# agents: one article section per object.
# NOTE(review): documented with comments rather than a class docstring on
# purpose -- a docstring would likely surface as the schema description inside
# the parser's format instructions and alter the prompt; confirm before adding.
class Section(BaseModel):
    # Heading of the section.
    title: str = Field(description="title of article section") 
    # Body text of the section.
    content: str = Field(description="content of article section")

## Context handlers

In [3]:

def get_pdf_text(pdfs_path, pdf_docs):
    """Concatenate the text and table content of several PDF files.

    Args:
        pdfs_path: Directory that contains the PDF files.
        pdf_docs: Iterable of file names located inside ``pdfs_path``.

    Returns:
        A single string. For every page, in order, it holds the
        layout-preserving page text followed by the rows of the page's table
        (cells joined with " | "); every piece ends with a newline. The result
        is an empty string when ``pdf_docs`` is empty or nothing was extracted.
    """
    # Collect the pieces and join once at the end; repeated ``text +=`` on a
    # growing string re-copies everything accumulated so far.
    pieces = []
    for pdf_doc in pdf_docs:
        pdf_path = os.path.join(pdfs_path, pdf_doc)
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Extract text
                page_text = page.extract_text(layout=True)
                if page_text:
                    pieces.append(page_text + "\n")

                # NOTE(review): extract_table() yields one table per page (the
                # largest one, per the pdfplumber docs); switch to
                # extract_tables() if every table on a page is wanted.
                table = page.extract_table()
                if table:
                    for row in table:
                        # Convert each cell to a string, replacing None with an empty string
                        cleaned_row = ["" if cell is None else str(cell) for cell in row]
                        pieces.append(' | '.join(cleaned_row) + "\n")
    return "".join(pieces)

# def get_text_chunks(text):
#     text_splitter = CharacterTextSplitter(
#         separator="\n",
#         chunk_size=1000,
#         chunk_overlap=100,
#         length_function=len
#     )
#     chunks = text_splitter.split_text(text)
#     return chunks

def get_text_chunks(text):
    """Split ``text`` into chunks with the semantic text splitter.

    The splitter is built from the tiktoken tokenizer for "gpt-3.5-turbo" and
    is asked for a chunk capacity in the 500-1000 range (presumably measured
    in tokens, given the tiktoken-based splitter -- confirm against the
    installed semantic_text_splitter version).

    Args:
        text: The full text to split.

    Returns:
        Whatever ``TextSplitter.chunks`` returns for ``text``.
    """
    capacity_range = (500, 1000)
    return TextSplitter.from_tiktoken_model("gpt-3.5-turbo").chunks(
        text, chunk_capacity=capacity_range
    )

def get_vectorstore(text_chunks):
    """Embed ``text_chunks`` with OpenAI embeddings and index them in FAISS.

    Args:
        text_chunks: The texts to embed and store.

    Returns:
        The FAISS vector store built from those texts.
    """
    return FAISS.from_texts(texts=text_chunks, embedding=OpenAIEmbeddings())

# Ingestion pipeline: PDFs -> raw text -> chunks -> FAISS vector store.

# 1. Pull the text (and table rows) out of every PDF found above.
raw_text = get_pdf_text(pdfs_path=pdfs_path, pdf_docs=pdf_docs)

# 2. Cut the text into chunks.
text_chunks = get_text_chunks(text=raw_text)

# 3. Embed the chunks and index them for retrieval by the agents below.
vectorstore = get_vectorstore(text_chunks=text_chunks)

In [4]:
# Retrieve up to 15 chunks for the statistics query, then ask the model to
# extract the numerical facts from each chunk as a Facts-shaped JSON object.
quant = vectorstore.as_retriever(search_kwargs={"k": 15})
stats = quant.get_relevant_documents("Retrieve all the numerical and statistical facts related to the following topic: GenAI for business processes innovation.")
parser = JsonOutputParser(pydantic_object=Facts)
prompt = PromptTemplate(
        template="Answer the user query.\n{format_instructions}\n{query}\n",
        input_variables=["query"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
)
# The chain does not depend on the individual document, so build it once
# instead of on every loop iteration.
chain = prompt | model | parser
stats_data = []
for fact in stats:
        # Prompt fixes: removed the stray "and." and the duplicated "as as",
        # dropped the indentation that leaked in through the backslash
        # continuation, and renamed the key 'data ' (trailing space) to 'data'
        # so it agrees with the Facts schema in the format instructions.
        agent_quant_query = (
                "Your task is to extract all the numerical and statistical facts from this doc: "
                f"{fact.page_content} and "
                "return it as a list of key value pairs. The key should be 'fact' with the fact text "
                "as value and 'data' with the extracted numerical data as value."
        )
        result = chain.invoke({"query": agent_quant_query})
        stats_data.append(result)
print(stats_data)

[{'fact': "Then we'll have another session on November 8th, which will be about how is it being used in business?", 'data': 'November 8th'}, {'fact': '', 'data': ''}, {'fact': 'A recent study suggested that more than 70% of the large companies surveyed were still wondering how to reap the potential benefits that AI can offer.', 'data': '70%'}, {'fact': '', 'data': ''}, {'fact': '3 sets of activities for IT leaders to set up the organization for AI success', 'data': '3'}, {'fact': 'In one product we are building, we found that ChatGPT-4 is better at “understanding” users’ queries, while version 3.5 is faster and better at converting processed output into responses to users.', 'data': 'ChatGPT-4, version 3.5'}, {'fact': 'Today, Serena works as a software developer. She shifts among 21 unique activities. By 2030, Serena is more productive. The automation of some of Serena’s activities allows her to focus on more valuable tasks and brings new opportunities to her firm. The activities that 

## Agents

In [5]:
def agent_editor(state):
    """Ask the model for an eight-item article outline.

    The last entry of ``state['messages']`` is used as the subject. The
    comma-separated answer is parsed into a list of subtopics, which is
    appended to ``state['messages']`` in place.

    Args:
        state: Dict with a ``'messages'`` list whose last item is the topic.

    Returns:
        The same ``state`` object, with the list of subtopics appended.
    """
    subject = state['messages'][-1]
    list_parser = CommaSeparatedListOutputParser()
    outline_prompt = PromptTemplate(
        template="List eight relevant subtopics for the topic {subject}. \
                  Order the subtopics in such a way that they form a logical outline for an article.\n{format_instructions}",
        input_variables=["subject"],
        partial_variables={"format_instructions": list_parser.get_format_instructions()},
        )
    outline_chain = outline_prompt | model | list_parser
    subtopics = outline_chain.invoke({"subject": subject})
    # What gets stored is the parser output (a list of strings), not a raw
    # model message; the writer agent iterates over it.
    state['messages'].append(subtopics)
    return state

def agent_writer(state):
    """Write one article section per subtopic from retrieved PDF chunks.

    Reads ``state['messages'][-2]`` as the main topic and
    ``state['messages'][-1]`` as the list of subtopics. For every subtopic it
    retrieves 3 chunks from the vector store and asks the model for a section
    shaped like ``Section`` (title + content).

    Args:
        state: Dict with a ``'messages'`` list ending in
            ``[..., main_topic, subtopics]``.

    Returns:
        The same ``state`` object, with the list of generated sections
        appended to ``state['messages']``.
    """
    messages = state['messages']
    main_topic = messages[-2]
    subtopics = messages[-1]
    print(subtopics)
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    # Set up a parser + inject instructions into the prompt template.
    parser = JsonOutputParser(pydantic_object=Section)
    # Prompt and chain are identical for every subtopic, so build them once.
    prompt = PromptTemplate(
        template="Answer the user query.\n{format_instructions}\n{query}\n",
        input_variables=["query"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    chain = prompt | model | parser
    article = []
    for topic in subtopics:
        print("Started working on topic: " + topic + "\n")
        docs = retriever.get_relevant_documents("Get all the relevant documents for the following subtopics: " + topic + ".")
        docs_str = '\n\n'.join(doc.page_content for doc in docs)
        # Prompt fixes: the original concatenation produced "article on<topic>"
        # and "is:<topic>" with no separating space, and the backslash
        # continuations leaked source indentation into the prompt text.
        agent_writer_query = (
            f"Your task is to write a section for an article on {main_topic}. "
            f"The topic of the section is: {topic}. "
            f"ONLY use information from these documents for creating content: {docs_str}. "
            "Each section should contain at least 300 words. Add a subtitle to the head of the section. "
            "Provide as much statistical information as possible and insert expert quotes."
        )
        result = chain.invoke({"query": agent_writer_query})
        article.append(result)
    state['messages'].append(article)
    return state

def agent_proofreader(state):
    """Proofread every section of the article and save the result as JSON.

    Reads ``state['messages'][-3]`` as the main topic and
    ``state['messages'][-1]`` as the list of sections (dicts with ``"title"``
    and ``"content"`` keys). Each section is rewritten by the model for
    grammar, style and tone; the rewritten sections are printed, appended to
    ``state['messages']`` and written to ``article.json`` in the working
    directory.

    Args:
        state: Dict with a ``'messages'`` list ending in
            ``[..., main_topic, subtopics, article]``.

    Returns:
        The same ``state`` object, with the proofread article appended.
        (The original returned nothing, unlike the other agents, so the
        graph's finish node produced ``None``.)
    """
    messages = state['messages']
    main_topic = messages[-3]
    article = messages[-1]
    # Set up a parser + inject instructions into the prompt template.
    parser = JsonOutputParser(pydantic_object=Section)
    # Prompt and chain are identical for every section, so build them once.
    prompt = PromptTemplate(
        template="Answer the user query.\n{format_instructions}\n{query}\n",
        input_variables=["query"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    chain = prompt | model | parser
    new_article = []
    for section in article:
        print("Started proofreading section: " + section["title"] + "\n")
        # Prompt fixes: added the missing spaces after "article on" and
        # "is:", corrected "managment", and removed the source indentation
        # that the backslash continuations leaked into the prompt text.
        agent_proofreader_query = (
            f"Your task is to rewrite a section for an article on {main_topic}. "
            f"The topic of the section is: {section['title']}. "
            f"The section content is {section['content']} Only focus on grammar, style and tone. "
            "The style and tone should be appropriate for a business management magazine. Do NOT truncate the text. "
            "Each paragraph in the section should be contained within a <p> </p> tag pair."
        )
        result = chain.invoke({"query": agent_proofreader_query})
        new_article.append(result)

    print(new_article)
    state['messages'].append(new_article)
    # Explicit encoding so the file does not depend on the platform default.
    with open('article.json', 'w', encoding='utf-8') as f:
        json.dump(new_article, f, indent=4)
    return state


In [6]:
# Assemble the three agents into a linear LangGraph pipeline:
# editor -> writer -> proofreader.
workflow = Graph()

# Register each agent function under its node name.
for _node_name, _node_fn in (
    ("editor", agent_editor),
    ("writer", agent_writer),
    ("proofreader", agent_proofreader),
):
    workflow.add_node(_node_name, _node_fn)

# Connect the nodes in execution order.
for _source, _target in (("editor", "writer"), ("writer", "proofreader")):
    workflow.add_edge(_source, _target)

# The run starts at the editor and ends after the proofreader.
workflow.set_entry_point("editor")
workflow.set_finish_point("proofreader")

app = workflow.compile()

In [7]:
# Seed the shared state with the article topic. The agents index into this
# list positionally: the editor reads messages[-1], the writer messages[-2]
# and the proofreader messages[-3] as the main topic.
inputs = {"messages": ["GenAI for business processes innovation."]}
# Run the compiled graph on that state; as the last expression of the cell its
# return value is what the notebook displays.
app.invoke(inputs)

['Understanding the Concept of GenAI', 'The Role of GenAI in Business Innovation', 'Exploring the Benefits of GenAI for Business Processes', 'Real-world Examples of GenAI in Business Innovation', 'Challenges in Implementing GenAI for Business Processes', 'Overcoming the Barriers to GenAI Adoption', 'Future Trends in GenAI for Business Innovation', 'Conclusion: The Impact of GenAI on Business Processes Innovation.']
Started working on topic: Understanding the Concept of GenAI

Started working on topic: The Role of GenAI in Business Innovation

Started working on topic: Exploring the Benefits of GenAI for Business Processes

Started working on topic: Real-world Examples of GenAI in Business Innovation

Started working on topic: Challenges in Implementing GenAI for Business Processes

Started working on topic: Overcoming the Barriers to GenAI Adoption

Started working on topic: Future Trends in GenAI for Business Innovation

Started working on topic: Conclusion: The Impact of GenAI on Bus