# Packages 

- Convert the markdown cell below into a code cell if the packages still need to be installed.

!pip install llama_index
!pip install langchain_openai
!pip install langchain_community
!pip install langgraph
!pip install retriever
!pip install chromadb
!pip install llama-index-utils-workflow

# Chunking
---

In [3]:
import os
import warnings
from glob import glob
import openai
import nest_asyncio
from dotenv import load_dotenv, find_dotenv
from llama_index.llms.openai import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_index.core.node_parser import LangchainNodeParser, SentenceWindowNodeParser,SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.extractors import BaseExtractor,KeywordExtractor,TitleExtractor
from llama_index.core import Settings,SimpleDirectoryReader,StorageContext,VectorStoreIndex,load_index_from_storage, Document
from llama_index.core.schema import MetadataMode
warnings.filterwarnings('ignore')

# Settings---------- #

# Load variables from the nearest .env file into the process environment
_ = load_dotenv(find_dotenv()) 
# Raises KeyError here if OPENAI_API_KEY is not set
openai.api_key  = os.environ['OPENAI_API_KEY']

# Default LLM used by llama-index components (also passed to the extractors below)
Settings.llm = OpenAI(
    model="gpt-4o-mini",
    api_key=openai.api_key,
    temperature=0.1
)
# Embedding model used when building the vector index
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-ada-002",
    api_key=openai.api_key,
    embed_batch_size=100
)
# NOTE(review): Settings.node_parser is assigned a few lines below; confirm whether
# that assignment supersedes this splitter in the installed llama-index version.
Settings.text_splitter = SentenceSplitter(chunk_size=512,chunk_overlap=50)

# Sentence-window parser: stores surrounding context under the "window" metadata key
# and the sentence itself under "original_text"
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=5,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)
Settings.node_parser = node_parser

# Paths for storage (each can be overridden through an environment variable)
DB_DIR = os.getenv("DB_DIR", os.path.join(os.getcwd(), "docs", "chroma"))
INDEX_DIR = os.getenv("INDEX_DIR", os.path.join(os.getcwd(), "index"))
METADATA_ENRICHMENT_INDEX_DIR = os.getenv("METADATA_ENRICHMENT_INDEX_DIR", os.path.join(os.getcwd(), "enriched_index"))

# Folder containing the PDF files
DATA_FOLDER = os.getenv("DATA_FOLDER", os.path.join(os.getcwd(), "docs"))

# Settings end----- #

class CustomExtractor(BaseExtractor):
    """Extractor that merges a node's title and keywords into one ``custom`` field."""

    def extract(self, nodes):
        """
        Build one metadata dict per node.

        Each dict has a single ``"custom"`` key holding the node's
        ``document_title`` and ``excerpt_keywords`` metadata joined by a
        newline. A node lacking either metadata key raises ``KeyError``.
        """
        enriched = []
        for item in nodes:
            meta = item.metadata
            combined = meta["document_title"] + "\n" + meta["excerpt_keywords"]
            enriched.append({"custom": combined})
        return enriched

def metadata_enrichment_index(files=DATA_FOLDER, documents=None):
    """
    Build a vector index whose nodes carry LLM-extracted titles and keywords.

    Args:
        files: Folder searched for ``*.pdf`` files (when a string) or an
            iterable of file paths. Only consulted when ``documents`` is empty.
        documents: Optional pre-loaded document, or list of documents, to
            ingest instead of reading ``files``.

    Returns:
        The ``VectorStoreIndex`` built from the enriched nodes. The index is
        also persisted to ``METADATA_ENRICHMENT_INDEX_DIR``.
    """
    nest_asyncio.apply()
    extractors = [
        TitleExtractor(nodes=5, llm=Settings.llm),
        KeywordExtractor(keywords=10, llm=Settings.llm)
    ]
    # Parse into nodes first, then let each extractor annotate them
    transformations = [Settings.node_parser] + extractors
    pipeline = IngestionPipeline(transformations=transformations)
    docs_nodes = []

    if documents:
        # Callers may hand over a single Document (llama_index_chunk_pdf does);
        # the pipeline expects a list, so wrap it.
        if not isinstance(documents, (list, tuple)):
            documents = [documents]
        docs_nodes.extend(pipeline.run(documents=list(documents)))
    else:
        file_paths = glob(os.path.join(files, "*.pdf")) if isinstance(files, str) else files
        # Files are ingested one at a time through the same pipeline
        for file_path in file_paths:
            docs = SimpleDirectoryReader(input_files=[file_path]).load_data()
            docs_nodes.extend(pipeline.run(documents=docs))

    index = VectorStoreIndex(nodes=docs_nodes, embed_model=Settings.embed_model)
    index.storage_context.persist(persist_dir=METADATA_ENRICHMENT_INDEX_DIR)

    return index

def llama_index_chunk_pdf(files=DATA_FOLDER, index_dir=METADATA_ENRICHMENT_INDEX_DIR):
    """
    Load the persisted index, or create a new one from the PDF files.

    Args:
        files: Directory containing the PDF files.
        index_dir: Directory checked for an existing persisted index. Only the
            load step uses it; a newly built index is persisted by
            ``metadata_enrichment_index`` to ``METADATA_ENRICHMENT_INDEX_DIR``.

    Returns:
        The loaded or newly created index.

    Raises:
        ValueError: If ``files`` contains no PDF files.
    """
    if os.path.exists(index_dir):
        return load_index_from_storage(StorageContext.from_defaults(persist_dir=index_dir))

    # Match the extension case-insensitively so "REPORT.PDF" is picked up too
    file_paths = [
        os.path.join(files, f)
        for f in os.listdir(files)
        if f.lower().endswith('.pdf')
    ]
    if not file_paths:
        raise ValueError(f"No PDF files found in {files}")

    documents = SimpleDirectoryReader(input_files=file_paths).load_data()
    # All loaded pages are merged into one document before ingestion
    document = Document(text="\n\n".join(doc.text for doc in documents))

    # The ingestion pipeline expects a list of documents, not a bare Document
    return metadata_enrichment_index(documents=[document])

def main():
    """Run the chunking step: load the persisted index or build it from DATA_FOLDER."""
    llama_index_chunk_pdf(DATA_FOLDER)


# Only run the chunking step when executed as a script
if __name__ == "__main__":
    main()

# Retrieval & Engine
---

In [4]:
#from transformers import AutoModel, AutoTokenizer
from langchain_core.tools import tool
from llama_index.core import Settings, StorageContext, load_index_from_storage
from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor#, SentenceTransformerRerank
from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.llms.openai import OpenAI
from llama_index.core.question_gen import LLMQuestionGenerator
from llama_index.core.question_gen.prompts import DEFAULT_SUB_QUESTION_PROMPT_TMPL

'''settings'''
# Sentence-window parser for the retrieval stage.
# NOTE(review): window_size is 10 here but 5 in the chunking cell — confirm which
# value the persisted index was actually built with.
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=10,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)
# Replaces the LLM configured in the chunking cell; no api_key is passed here,
# so presumably it is picked up from the environment — verify.
Settings.llm = OpenAI(model="gpt-4o-mini", temperature= 0.1, timeout=60)
Settings.node_parser = node_parser

def get_sentence_window_query_engine(sentence_index, similarity_top_k=6):
    """
    Build a query engine over ``sentence_index`` with a metadata-replacement
    post-processor keyed on ``"window"``.

    ``similarity_top_k`` is forwarded to ``as_query_engine`` unchanged.
    """
    window_swapper = MetadataReplacementPostProcessor(target_metadata_key="window")
    return sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=[window_swapper],
    )

def final_engine(engine, verbose=False):
    """
    Wrap ``engine`` in a sub-question query engine.

    The question generator uses the default sub-question prompt with an extra
    instruction prepended; ``engine`` is exposed to it as a single tool named
    ``"docs"``. ``verbose`` is passed through to the sub-question engine.
    """
    # Custom instruction placed in front of the library's default template
    sub_question_prompt = """
            Instead of giving a question, always prefix the question
            with: 'By first identifying and quoting the most relevant sources, '.
            """ + DEFAULT_SUB_QUESTION_PROMPT_TMPL
    question_gen = LLMQuestionGenerator.from_defaults(
        llm=Settings.llm,
        prompt_template_str=sub_question_prompt,
    )

    docs_tool = QueryEngineTool(
        query_engine=engine,
        metadata=ToolMetadata(
            name="docs",
            description="ESG information and portfolio constructions on companies.",
        ),
    )

    return SubQuestionQueryEngine.from_defaults(
        query_engine_tools=[docs_tool],
        question_gen=question_gen,
        use_async=True,
        verbose=verbose
    )

def llama_index_retriever_tool(index_path: str, index_type='sentence', similarity_top_k=6):
    """
    Build a LangChain tool that answers queries from a persisted llama-index index.

    Args:
        index_path: Directory the index was persisted to.
        index_type: When it contains ``'sentence'`` a sentence-window engine is
            used; otherwise the index's plain query engine.
        similarity_top_k: Number of nodes retrieved per query.

    Returns:
        The ``engine`` tool; running it queries the sub-question engine and
        returns the answer as text.
    """
    # Load the index
    storage_context = StorageContext.from_defaults(persist_dir=index_path)
    index = load_index_from_storage(storage_context)

    # Create query engine
    if 'sentence' in index_type:
        base_engine = get_sentence_window_query_engine(index, similarity_top_k)
    else:
        base_engine = index.as_query_engine(similarity_top_k=similarity_top_k)
    query_engine = final_engine(base_engine, verbose=True)

    @tool
    def engine(query: str = ''):
        """
        RAG query tool.
        """
        response = query_engine.query(query)
        print(f'---RAG---:\n {response}')
        # Return plain text rather than the engine's response object so the
        # tool output can be placed directly into a message.
        return str(response)

    return engine

# Graph
---

In [5]:
from typing import Annotated, Literal, Sequence, TypedDict
from langchain import hub
from langchain_core.messages import BaseMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langgraph.graph.message import add_messages

# Chat model name passed to ChatOpenAI by the grader and the graph nodes
MODEL_NAME = "gpt-4o-mini"
# Number of rewrites allowed before grade_documents routes to "generate_no_ans"
MAX_ATTEMPT = 5

# State carried between the graph's nodes.
class AgentState(TypedDict):
    # The add_messages function defines how an update should be processed
    # Default is to replace. add_messages says "append"
    messages: Annotated[Sequence[BaseMessage], add_messages]
    # Rewrite counter: incremented by `rewrite`, reset to 0 by the generate nodes
    attempt_num: int


### Edges
def grade_documents(state) -> Literal["generate", "rewrite", "generate_no_ans"]:
    """
    Decide which node runs next based on whether the retrieved text is relevant.

    Args:
        state: Graph state. ``state["messages"][0]`` is treated as the user
            question. If the state has an ``"index_path"`` key, documents are
            retrieved afresh from that index; otherwise the content of the
            last message is graded. ``attempt_num`` (0 when absent) is the
            number of rewrites made so far.

    Returns:
        ``"generate"`` when the documents are judged relevant, ``"rewrite"``
        when they are not and fewer than ``MAX_ATTEMPT`` rewrites were made,
        otherwise ``"generate_no_ans"``.

    Raises:
        Exception: Any error is printed and then re-raised unchanged.
    """
    import re

    print("---CHECK RELEVANCE---")

    try:
        messages = state["messages"]
        question = messages[0].content
        print('Question:', question)

        # Use llama-index for retrieval if index_path provided
        if "index_path" in state:
            retriever = llama_index_retriever_tool(
                index_path=state["index_path"],
                index_type='sentence',
                similarity_top_k=10  # Increased for better coverage
            )

            # Get retrieval results (tools are run through invoke; calling the
            # tool object directly is deprecated in LangChain)
            retrieval_response = retriever.invoke({"query": question})
            docs = str(retrieval_response)

            # Normalize text for matching
            docs_lower = docs.lower()

            # Extract year and check for temporal+numeric relevance
            year_match = re.search(r'20\d{2}', question)
            if year_match:
                year = year_match.group()
                has_year = year in docs_lower
                # Mask the year before looking for figures: otherwise the
                # year's own digits satisfy the numeric pattern and this
                # check is true whenever has_year is true.
                docs_without_year = docs_lower.replace(year, " ")
                has_numbers = bool(re.search(
                    r'(?:rm|myr|rp)?\s*\d+(?:\.\d+)?(?:\s*(?:million|m|billion|b))?',
                    docs_without_year,
                ))

                if has_year and has_numbers:
                    print("---DECISION: DOCS RELEVANT (Contains Year and Numbers)---")
                    print("docs:")
                    print(docs)
                    return "generate"

        else:
            docs = messages[-1].content

        print("Retrieved docs:", docs)

        # Grade relevance using LLM
        class grade(BaseModel):
            binary_score: str = Field(description="Relevance score 'yes' or 'no'")

        model = ChatOpenAI(temperature=0.1, model=MODEL_NAME, streaming=True)
        llm_with_tool = model.with_structured_output(grade)

        prompt = PromptTemplate(
            template="""You are a grader assessing relevance of a retrieved document to a user question about financial and ESG data. \n 
           Here is the retrieved document: \n\n {context} \n\n
           Here is the user question: {question} \n
           
           For financial questions, grade the document as relevant if it contains:
           1. Specific numeric values (like revenue, profit, etc.) for the requested time period
           2. Financial figures with currency indicators (USD, MYR, RM, etc.)
           3. Year-specific financial information that matches the question
           4. Comparative financial data between years
           
           The document should be considered relevant even if it needs some interpretation 
           (e.g., if asking about 2023 and document mentions 'FY2023' or 'current year').
           
           If the document only states that information is not found or not available, grade it as not relevant.
           Give a binary score 'yes' or 'no' to indicate whether the document is relevant to the question.""",
            input_variables=["context", "question"],
        )
        chain = prompt | llm_with_tool

        scored_result = chain.invoke({"question": question, "context": docs})
        score = scored_result.binary_score

        # Return decision based on score and attempt count
        if score == "yes":
            print("---DECISION: DOCS RELEVANT---")
            print("docs:")
            print(docs)
            return "generate"

        # A state without attempt_num counts as zero attempts rather than
        # failing with KeyError.
        if state.get("attempt_num", 0) < MAX_ATTEMPT:
            print("---DECISION: DOCS NOT RELEVANT---")
            print(score)
            print("docs:")
            print(docs)
            return "rewrite"

        print("---DECISION: DOCS NOT RELEVANT, MAX_ATTEMPT achieved---")
        print(score)
        print("docs:")
        print(docs)
        return "generate_no_ans"

    except Exception as e:
        print(f"Error in grade_documents: {str(e)}")
        raise


### Nodes

def agent_with_tools(tools):
    """Return an ``agent`` node function whose chat model has ``tools`` bound to it."""
    def agent(state):
        """
        Call the tool-enabled chat model on the conversation so far.

        The model's reply (which may contain tool calls) is returned as the
        message update, together with the current attempt counter.
        """
        print("---CALL AGENT---")
        history = state["messages"]
        # A missing or falsy counter is normalised to 0
        state["attempt_num"] = state.get("attempt_num") or 0
        llm = ChatOpenAI(temperature=0.1, streaming=True, model=MODEL_NAME).bind_tools(tools)
        reply = llm.invoke(history)
        # Wrapped in a list so it is appended to the existing messages
        return {"messages": [reply], "attempt_num": state["attempt_num"]}
    return agent



def rewrite(state):
    """
    Ask the chat model for a clearer version of the user's original question.

    The first message in ``state["messages"]`` is taken as the question.
    Returns a state update holding the model's reply and ``attempt_num``
    incremented by one.
    """

    print("---TRANSFORM QUERY---")
    original_question = state["messages"][0].content

    rewrite_request = HumanMessage(
        content=f""" \n 
    Look at the input and try to reason about the underlying semantic intent / meaning. \n 
    Here is the initial question:
    \n ------- \n
    {original_question} 
    \n ------- \n
    Formulate an improved question: """,
    )

    # Model that produces the reformulated question
    rewriter = ChatOpenAI(temperature=0.1, model=MODEL_NAME, streaming=True)
    improved = rewriter.invoke([rewrite_request])
    return {"messages": [improved], "attempt_num": state["attempt_num"] + 1}

def generate_no_ans(state):
    """
    Produce the fixed fallback reply used when no relevant documents were found.

    ``state`` is not read. The attempt counter is reset to 0.
    """
    print("---GENERATE_NO_ANS---")
    fallback_text = "No Relevant Info found in the documents"
    return dict(messages=[fallback_text], attempt_num=0)

def generate(state):
    """
    Answer the user's question from the retrieved text.

    Args:
        state: Graph state; the first message is treated as the question and
            the content of the last message as the retrieved context.

    Returns:
        State update with the generated answer string as the new message and
        ``attempt_num`` reset to 0.
    """
    print("---GENERATE---")
    messages = state["messages"]
    question = messages[0].content
    last_message = messages[-1]
    docs = last_message.content

    print("Question:", question)
    print("Last Message:", last_message)

    # Prompt (pulled from the hub on every call)
    prompt = hub.pull("rlm/rag-prompt")

    # LLM
    llm = ChatOpenAI(model_name=MODEL_NAME, temperature=0.1, streaming=True)

    # Chain: the context is already plain text (a message's content), so it is
    # passed straight through without any document formatting step.
    rag_chain = prompt | llm | StrOutputParser()

    # Run
    response = rag_chain.invoke({"context": docs, "question": question})
    return {"messages": [response], "attempt_num": 0}

# Workflow
---

In [6]:
from typing import TypedDict
from langgraph.graph import END, StateGraph, START
from langgraph.prebuilt import ToolNode, tools_condition

def build_workflow(vecdb):
    """
    Assemble and compile the agent graph.

    Args:
        vecdb: Path of the persisted index handed to ``llama_index_retriever_tool``.

    Returns:
        The compiled graph: agent -> (retrieve | END); retrieve is routed by
        ``grade_documents`` to generate, rewrite or generate_no_ans; rewrite
        loops back to the agent.
    """
    tools = [llama_index_retriever_tool(vecdb)]

    graph_builder = StateGraph(AgentState)
    # NOTE(review): plain attribute on the builder; nothing visible here reads it
    graph_builder.attempt_num = 0

    # Nodes, registered in the same order as before
    node_table = (
        ("agent", agent_with_tools(tools)),        # agent
        ("retrieve", ToolNode(tools)),             # retrieval
        ("rewrite", rewrite),                      # re-writing the question
        ("generate_no_ans", generate_no_ans),      # reply when no document is relevant
        ("generate", generate),                    # reply when the documents are relevant
    )
    for node_name, node_fn in node_table:
        graph_builder.add_node(node_name, node_fn)

    # Entry point: the agent decides whether to retrieve
    graph_builder.add_edge(START, "agent")

    # Map the tools_condition outputs onto nodes of this graph
    agent_routes = {"tools": "retrieve", END: END}
    graph_builder.add_conditional_edges("agent", tools_condition, agent_routes)

    # After retrieval, grade_documents picks the next node
    graph_builder.add_conditional_edges("retrieve", grade_documents)
    graph_builder.add_edge("generate", END)
    graph_builder.add_edge("generate_no_ans", END)
    graph_builder.add_edge("rewrite", "agent")

    return graph_builder.compile()

# Interface
---

In [7]:
import sys
import gradio as gr
from langchain_core.messages import HumanMessage

# Gradio Integration
def get_answer_func(graph):
    """
    Create the chat callback used by the Gradio interface.

    Args:
        graph: Compiled workflow exposing ``invoke``.

    Returns:
        A function ``(question, history) -> str`` that runs the graph on the
        question and returns the content of the last message.
    """
    def get_answer_chat(question, history):
        """Run the graph for one user question and return the answer text."""
        print("User question:", question)
        result = graph.invoke(
            {"messages": [HumanMessage(content=question)]},
            config={"configurable": {"thread_id": 42, "max_attempt": 5}}
        )
        # `history` is owned by gr.ChatInterface, which records the new turn
        # itself from the returned value; appending to it here as well could
        # duplicate the turn, so the callback only returns the response.
        return result["messages"][-1].content
    return get_answer_chat

# Persisted index location: the "enriched_index" folder under the working directory
index_path = os.path.join(os.getcwd(), "enriched_index")
# Build the compiled graph once; the chat callback below reuses it for every question
graph = build_workflow(index_path)

# Start the chat UI with the graph-backed callback
gr.ChatInterface(
    get_answer_func(graph),
    chatbot=gr.Chatbot(height=300),
    title="Agent",
    description="Ask me any question",
    theme="ocean"
).launch()

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




User question: hi
---CALL AGENT---
User question: What's Public Shareholders' share for SOL?
---CALL AGENT---
Generated 3 sub questions.
[1;3;38;2;237;90;200m[docs] Q: By first identifying and quoting the most relevant sources, what is the percentage of public shareholders' ownership in SOL?
[0m[1;3;38;2;90;149;237m[docs] Q: By first identifying and quoting the most relevant sources, what are the recent changes in public shareholders' share for SOL?
[0m[1;3;38;2;11;159;203m[docs] Q: By first identifying and quoting the most relevant sources, how does the public shareholders' share in SOL compare to industry standards?
[0m[1;3;38;2;237;90;200m[docs] A: The percentage of public shareholders' ownership in Shui On Land (SOL) is 43.77%. This is stated in the excerpt: "Shui On Group 56.23% Public Shareholders 43.77%."
[0m[1;3;38;2;90;149;237m[docs] A: The recent changes in public shareholders' share for Shui On Land (SOL) indicate that public shareholders hold 43.77% of the company,

---

---

---

---
# CHECK FOR ERRORS, FUNCTION BY FUNCTION
---

## CHUNK

In [9]:
def metadata_enrichment_index(files=DATA_FOLDER, documents=None):
    """
    Build and persist the metadata-enriched index, recording what was processed.

    Args:
        files: Path to a data folder, path to a single file, or list of file
            paths (default: DATA_FOLDER). Ignored when ``documents`` is given.
        documents: Optional pre-loaded document or list of documents (default: None)

    Returns:
        tuple: (VectorStoreIndex, dict) - Enriched index with metadata and processing logs
    """
    nest_asyncio.apply()

    os.makedirs(DATA_FOLDER, exist_ok=True)
    os.makedirs(METADATA_ENRICHMENT_INDEX_DIR, exist_ok=True)

    # Initialize logging dictionary
    processing_log = {
        'processed_files': [],
        'total_documents': 0,
        'nodes_generated': 0,
        'processing_details': []
    }
    # Number of inputs ingested without error (files, or pre-loaded documents)
    successful_documents = 0

    # Initialize extractors and transformations
    extractors = [
        TitleExtractor(nodes=5, llm=Settings.llm),
        KeywordExtractor(keywords=10, llm=Settings.llm)
    ]
    transformations = [Settings.node_parser] + extractors
    pipeline = IngestionPipeline(transformations=transformations)
    docs_nodes = []

    if documents:
        # Handle pre-loaded documents; a single document is wrapped in a list
        if isinstance(documents, list):
            doc_count = len(documents)
        else:
            doc_count = 1
            documents = [documents]

        processing_log['total_documents'] = doc_count
        new_nodes = pipeline.run(documents=documents)
        docs_nodes.extend(new_nodes)
        processing_log['nodes_generated'] += len(new_nodes)
        successful_documents = doc_count

    else:
        # Get PDF files from the data folder
        if isinstance(files, str):
            if os.path.isdir(files):
                file_paths = glob(os.path.join(files, "*.pdf"))
            else:
                file_paths = [files]
        else:
            file_paths = files

        # Describe the location actually searched (not always DATA_FOLDER)
        if isinstance(files, str):
            source_label = files
        else:
            source_label = f"{len(file_paths)} supplied file path(s)"

        # Log the search path
        processing_log['data_folder'] = source_label
        processing_log['total_documents'] = len(file_paths)

        print(f"\nSearching for PDF files in: {source_label}")

        if not file_paths:
            print(f"No PDF files found in {source_label}")
            processing_log['processing_details'].append({
                'status': 'warning',
                'message': f'No PDF files found in directory: {source_label}'
            })

        for file_path in file_paths:
            if not os.path.exists(file_path):
                processing_log['processing_details'].append({
                    'file_path': file_path,
                    'file_name': os.path.basename(file_path),
                    'status': 'failed',
                    'error': f'File does not exist: {file_path}'
                })
                continue

            # Best effort: a failure on one file is logged and the rest still run
            try:
                print(f"Processing: {os.path.basename(file_path)}")
                reader = SimpleDirectoryReader(input_files=[file_path])
                docs = reader.load_data()

                # Log document details
                doc_info = {
                    'file_path': file_path,
                    'file_name': os.path.basename(file_path),
                    'file_size': f"{os.path.getsize(file_path) / 1024:.2f} KB",
                }

                # Process document
                new_nodes = pipeline.run(documents=docs)
                docs_nodes.extend(new_nodes)

                # Update log with success details
                doc_info.update({
                    'nodes_generated': len(new_nodes),
                    'status': 'success'
                })
                processing_log['processed_files'].append(doc_info)
                processing_log['nodes_generated'] += len(new_nodes)
                successful_documents += 1
                print(f"Generated {len(new_nodes)} nodes")

            except Exception as e:
                print(f"Error processing {os.path.basename(file_path)}: {str(e)}")
                processing_log['processing_details'].append({
                    'file_path': file_path,
                    'file_name': os.path.basename(file_path),
                    'status': 'failed',
                    'error': str(e)
                })

    index = VectorStoreIndex(nodes=docs_nodes, embed_model=Settings.embed_model)
    index.storage_context.persist(persist_dir=METADATA_ENRICHMENT_INDEX_DIR)

    # Add final statistics
    processing_log['total_nodes'] = len(docs_nodes)
    processing_log['index_path'] = METADATA_ENRICHMENT_INDEX_DIR

    # Print summary
    print("\nDocument Processing Summary:")
    print(f"Total documents processed successfully: {successful_documents}")
    print(f"Total nodes generated: {processing_log['nodes_generated']}")
    print(f"Index saved to: {METADATA_ENRICHMENT_INDEX_DIR}")

    if processing_log['processed_files']:
        print("\nSuccessfully processed files:")
        for file_info in processing_log['processed_files']:
            print(f"- {file_info['file_name']}: {file_info.get('nodes_generated', 'N/A')} nodes")

    if processing_log['processing_details']:
        print("\nIssues encountered:")
        for file_info in processing_log['processing_details']:
            # Folder-level warnings carry a 'message'; per-file failures carry an 'error'
            if 'message' in file_info:
                print(f"- {file_info['message']}")
            else:
                print(f"- {file_info['file_name']}: {file_info['error']}")

    return index, processing_log

In [10]:
# Build the index from the default data folder; the cell displays the returned (index, log) tuple
metadata_enrichment_index()


Searching for PDF files in: c:\Users\Yueyi\OneDrive\桌面\Cneutral\codes\LLM tool\docs
Processing: 797324_e_SOL_Sustainability Report 2023_240422.pdf


100%|██████████| 1/1 [00:01<00:00,  1.72s/it]
100%|██████████| 1/1 [00:01<00:00,  1.23s/it]
100%|██████████| 5/5 [00:01<00:00,  4.87it/s]
100%|██████████| 5/5 [00:01<00:00,  4.45it/s]
100%|██████████| 5/5 [00:00<00:00,  5.48it/s]
100%|██████████| 5/5 [00:00<00:00,  5.12it/s]
100%|██████████| 1/1 [00:00<00:00,  1.94it/s]
100%|██████████| 5/5 [00:01<00:00,  4.15it/s]
100%|██████████| 5/5 [00:01<00:00,  3.13it/s]
100%|██████████| 5/5 [00:01<00:00,  4.35it/s]
100%|██████████| 5/5 [00:00<00:00,  5.61it/s]
100%|██████████| 5/5 [00:01<00:00,  4.48it/s]
100%|██████████| 5/5 [00:00<00:00,  5.35it/s]
100%|██████████| 5/5 [00:03<00:00,  1.31it/s]
100%|██████████| 5/5 [00:00<00:00,  5.65it/s]
100%|██████████| 5/5 [00:01<00:00,  4.64it/s]
100%|██████████| 5/5 [00:00<00:00,  5.58it/s]
100%|██████████| 5/5 [00:01<00:00,  4.68it/s]
100%|██████████| 5/5 [00:01<00:00,  4.38it/s]
100%|██████████| 5/5 [00:00<00:00,  5.95it/s]
100%|██████████| 5/5 [00:00<00:00,  5.20it/s]
100%|██████████| 5/5 [00:01<00:00,

Generated 1148 nodes
Processing: 797418_GCCP-Sustainability_Report_FY2023.pdf


100%|██████████| 5/5 [00:01<00:00,  3.88it/s]
100%|██████████| 5/5 [00:04<00:00,  1.19it/s]
100%|██████████| 5/5 [00:01<00:00,  4.71it/s]
100%|██████████| 5/5 [00:05<00:00,  1.02s/it]
100%|██████████| 5/5 [00:00<00:00,  5.53it/s]
100%|██████████| 5/5 [00:01<00:00,  4.31it/s]
100%|██████████| 5/5 [00:01<00:00,  4.78it/s]
100%|██████████| 1/1 [00:00<00:00,  1.38it/s]
100%|██████████| 5/5 [00:05<00:00,  1.04s/it]
100%|██████████| 5/5 [00:01<00:00,  2.99it/s]
100%|██████████| 5/5 [00:00<00:00,  5.76it/s]
100%|██████████| 5/5 [00:00<00:00,  6.09it/s]
100%|██████████| 5/5 [00:03<00:00,  1.66it/s]
100%|██████████| 5/5 [00:01<00:00,  4.75it/s]
100%|██████████| 5/5 [00:00<00:00,  6.58it/s]
100%|██████████| 5/5 [00:01<00:00,  4.86it/s]
100%|██████████| 5/5 [00:00<00:00,  5.12it/s]
100%|██████████| 5/5 [00:01<00:00,  4.72it/s]
100%|██████████| 5/5 [00:00<00:00,  5.14it/s]
100%|██████████| 1/1 [00:01<00:00,  1.07s/it]
100%|██████████| 262/262 [00:56<00:00,  4.65it/s]


Generated 262 nodes
Processing: cvx_portfolio.pdf


100%|██████████| 3/3 [00:00<00:00,  5.31it/s]
100%|██████████| 5/5 [00:00<00:00,  5.86it/s]
100%|██████████| 5/5 [00:01<00:00,  4.43it/s]
100%|██████████| 5/5 [00:01<00:00,  3.81it/s]
100%|██████████| 5/5 [00:00<00:00,  5.14it/s]
100%|██████████| 5/5 [00:00<00:00,  5.12it/s]
100%|██████████| 5/5 [00:00<00:00,  5.04it/s]
100%|██████████| 5/5 [00:01<00:00,  3.87it/s]
100%|██████████| 5/5 [00:00<00:00,  6.36it/s]
100%|██████████| 5/5 [00:00<00:00,  6.72it/s]
100%|██████████| 5/5 [00:01<00:00,  3.18it/s]
100%|██████████| 5/5 [00:00<00:00,  5.23it/s]
100%|██████████| 5/5 [00:03<00:00,  1.47it/s]
100%|██████████| 5/5 [00:00<00:00,  7.15it/s]
100%|██████████| 5/5 [00:00<00:00,  5.10it/s]
100%|██████████| 5/5 [00:00<00:00,  5.49it/s]
100%|██████████| 5/5 [00:00<00:00,  5.10it/s]
100%|██████████| 5/5 [00:00<00:00,  5.58it/s]
100%|██████████| 5/5 [00:00<00:00,  5.42it/s]
100%|██████████| 5/5 [00:01<00:00,  3.82it/s]
100%|██████████| 5/5 [00:01<00:00,  2.99it/s]
100%|██████████| 5/5 [00:00<00:00,

Generated 1809 nodes
Processing: MachineLearning-Lecture01.pdf


100%|██████████| 5/5 [00:02<00:00,  1.79it/s]
100%|██████████| 5/5 [00:01<00:00,  3.82it/s]
100%|██████████| 5/5 [00:01<00:00,  4.65it/s]
100%|██████████| 5/5 [00:00<00:00,  5.20it/s]
100%|██████████| 5/5 [00:00<00:00,  6.05it/s]
100%|██████████| 5/5 [00:00<00:00,  5.51it/s]
100%|██████████| 5/5 [00:01<00:00,  3.55it/s]
100%|██████████| 5/5 [00:00<00:00,  5.42it/s]
100%|██████████| 5/5 [00:00<00:00,  5.29it/s]
100%|██████████| 5/5 [00:01<00:00,  4.91it/s]
100%|██████████| 5/5 [00:01<00:00,  4.65it/s]
100%|██████████| 5/5 [00:00<00:00,  5.57it/s]
100%|██████████| 5/5 [00:00<00:00,  5.77it/s]
100%|██████████| 5/5 [00:01<00:00,  4.01it/s]
100%|██████████| 5/5 [00:01<00:00,  4.87it/s]
100%|██████████| 5/5 [00:01<00:00,  4.84it/s]
100%|██████████| 5/5 [00:04<00:00,  1.25it/s]
100%|██████████| 5/5 [00:01<00:00,  4.50it/s]
100%|██████████| 5/5 [00:00<00:00,  5.92it/s]
100%|██████████| 5/5 [00:00<00:00,  6.17it/s]
100%|██████████| 5/5 [00:00<00:00,  6.48it/s]
100%|██████████| 1/1 [00:00<00:00,

Generated 564 nodes

Document Processing Summary:
Total documents processed successfully: 4
Total nodes generated: 3783
Index saved to: c:\Users\Yueyi\OneDrive\桌面\Cneutral\codes\LLM tool\enriched_index

Successfully processed files:
- 797324_e_SOL_Sustainability Report 2023_240422.pdf: 1148 nodes
- 797418_GCCP-Sustainability_Report_FY2023.pdf: 262 nodes
- cvx_portfolio.pdf: 1809 nodes
- MachineLearning-Lecture01.pdf: 564 nodes


(<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x26b0248f3d0>,
 {'processed_files': [{'file_path': 'c:\\Users\\Yueyi\\OneDrive\\桌面\\Cneutral\\codes\\LLM tool\\docs\\797324_e_SOL_Sustainability Report 2023_240422.pdf',
    'file_name': '797324_e_SOL_Sustainability Report 2023_240422.pdf',
    'file_size': '11688.18 KB',
    'nodes_generated': 1148,
    'status': 'success'},
   {'file_path': 'c:\\Users\\Yueyi\\OneDrive\\桌面\\Cneutral\\codes\\LLM tool\\docs\\797418_GCCP-Sustainability_Report_FY2023.pdf',
    'file_name': '797418_GCCP-Sustainability_Report_FY2023.pdf',
    'file_size': '326.74 KB',
    'nodes_generated': 262,
    'status': 'success'},
   {'file_path': 'c:\\Users\\Yueyi\\OneDrive\\桌面\\Cneutral\\codes\\LLM tool\\docs\\cvx_portfolio.pdf',
    'file_name': 'cvx_portfolio.pdf',
    'file_size': '771.07 KB',
    'nodes_generated': 1809,
    'status': 'success'},
   {'file_path': 'c:\\Users\\Yueyi\\OneDrive\\桌面\\Cneutral\\codes\\LLM tool\\docs\\MachineLearning-Le

In [13]:
def llama_index_chunk_pdf(files=DATA_FOLDER, index_dir=METADATA_ENRICHMENT_INDEX_DIR):
    """
    Load/create new one from PDF files with validation checks.

    Args:
        files: Directory containing PDF files
        index_dir: Directory checked for an existing persisted index

    Returns:
        Loaded or newly created index

    Raises:
        ValueError: If a new index cannot be created (the original error is
            attached as the cause)
    """
    # Load existing index if available
    if os.path.exists(index_dir):
        # Any failure while loading or validating falls back to a rebuild
        try:
            index = load_index_from_storage(StorageContext.from_defaults(persist_dir=index_dir))

            # Validate the loaded index
            if index is None:
                raise ValueError("Index loaded as None")

            # Check if index has documents
            doc_count = len(index.docstore.docs)
            if doc_count == 0:
                raise ValueError("Loaded index contains no documents")

            print(f"Successfully loaded existing index with {doc_count} documents")
            return index

        except Exception as e:
            print(f"Error loading existing index: {str(e)}")
            print("Falling back to creating new index...")

    # Create new index from PDF files
    try:
        # Match the extension case-insensitively so "REPORT.PDF" is picked up too
        file_paths = [
            os.path.join(files, f)
            for f in os.listdir(files)
            if f.lower().endswith('.pdf')
        ]

        if not file_paths:
            raise ValueError(f"No PDF files found in {files}")

        print(f"Found {len(file_paths)} PDF files")

        documents = SimpleDirectoryReader(input_files=file_paths).load_data()
        if not documents:
            raise ValueError("No documents loaded from PDF files")

        document = Document(text="\n\n".join(doc.text for doc in documents))

        result = metadata_enrichment_index(documents=[document])
        # The logging variant of metadata_enrichment_index returns
        # (index, processing_log); keep only the index so this function
        # returns the same kind of object as the load path above.
        index = result[0] if isinstance(result, tuple) else result
        if index is None:
            raise ValueError("Failed to create new index")

        print(f"Successfully created new index from {len(file_paths)} PDF files")
        return index

    except Exception as e:
        # Chain the cause so the original traceback is not lost
        raise ValueError(f"Failed to create new index: {str(e)}") from e
    
# Smoke test: load the persisted index (or rebuild it) with the default paths
llama_index_chunk_pdf()

Successfully loaded existing index with 3783 documents


<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x26b689064d0>

Check: `'nodes_generated': 3783` from the chunking run matches the 3783 documents reported for the loaded index.

## RETRIEVAL

In [17]:
# Keep the loaded index around for the retrieval checks in the following cells
index_test = llama_index_chunk_pdf()

INFO:llama_index.core.indices.loading:Loading all indices.


Successfully loaded existing index with 3783 documents


In [40]:
import logging
from typing import Optional, Union

from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core.base.base_query_engine import BaseQueryEngine
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.question_gen import LLMQuestionGenerator
from llama_index.core.tools import QueryEngineTool, ToolMetadata

# Configure root logging at INFO level and create the module-level logger
# that the retrieval helpers below write their progress messages to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def get_sentence_window_query_engine(sentence_index, similarity_top_k: int = 6) -> "BaseQueryEngine":
    """Build a query engine that swaps each hit for its sentence window.

    Every step is logged through the module-level ``logger``.

    Args:
        sentence_index: Index object exposing ``as_query_engine``; must not
            be ``None``.
        similarity_top_k: Number of top matches requested from the index;
            must be at least 1.

    Returns:
        The query engine produced by ``sentence_index.as_query_engine``.
        The function never returns ``None``, a string or an exception
        object; failures are raised.

    Raises:
        ValueError: If ``sentence_index`` is ``None``, ``similarity_top_k``
            is below 1, or the index hands back ``None`` as the engine.
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        logger.info("starting to create sentence engine...")

        if sentence_index is None:
            raise ValueError("sentence_index cannot be None")
        if similarity_top_k < 1:
            raise ValueError("similarity_top_k must be greater than 0")

        # "window" is the metadata key the sentence-window node parser is
        # configured with at the top of the file, so retrieved nodes get
        # their text replaced by the surrounding window stored there.
        postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
        logger.info("postprocessor created successfully")

        sentence_window_engine = sentence_index.as_query_engine(
            similarity_top_k=similarity_top_k,
            node_postprocessors=[postproc],
        )

        # Defensive check: fail loudly here rather than on the first query.
        if sentence_window_engine is None:
            raise ValueError("fail to create sentence window engine")

        logger.info("sentence window engine created successfully")
        return sentence_window_engine

    except Exception as e:
        # Log for the notebook output, then let the caller see the real error.
        logger.error("error in creating postprocess engine: %s", e)
        raise
# Smoke test: build the sentence-window engine from the index loaded above;
# the resulting engine object is echoed as the cell output.
get_sentence_window_query_engine(index_test)

INFO:__main__:starting to create sentence engine...
INFO:__main__:postprocessor created successfully
INFO:__main__:sentence window engine created successfully


<llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine at 0x26bc5e78e90>

In [29]:
def final_engine(engine, verbose=False):
    """Wrap ``engine`` in a sub-question query engine and log each step.

    The question generator is given the default sub-question prompt with an
    extra instruction in front of it asking for the most relevant sources to
    be identified and quoted first. ``engine`` is registered as the single
    tool, named "docs".

    Args:
        engine: Query engine to wrap; must not be ``None``.
        verbose: Forwarded to ``SubQuestionQueryEngine.from_defaults``.

    Returns:
        The ``SubQuestionQueryEngine`` built around ``engine``.

    Raises:
        ValueError: If ``engine`` is ``None`` or no engine could be built.
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        logger.info("start to create final engine...")

        if engine is None:
            raise ValueError("input engine cannot be None")

        # Instruction placed in front of the stock sub-question prompt.
        source_quoting_prefix = """
                Instead of giving a question, always prefix the question
                with: 'By first identifying and quoting the most relevant sources, '.
                """
        sub_question_generator = LLMQuestionGenerator.from_defaults(
            llm=Settings.llm,
            prompt_template_str=source_quoting_prefix + DEFAULT_SUB_QUESTION_PROMPT_TMPL,
        )
        logger.info("question generator created")

        # The wrapped engine is the only tool the sub-questions can target.
        docs_tool = QueryEngineTool(
            query_engine=engine,
            metadata=ToolMetadata(
                name="docs",
                description="ESG information and portfolio constructions on companies.",
            ),
        )
        sub_question_engine = SubQuestionQueryEngine.from_defaults(
            query_engine_tools=[docs_tool],
            question_gen=sub_question_generator,
            use_async=False,
            verbose=verbose,
        )

        # Guard against a silent failure of the factory.
        if sub_question_engine is None:
            raise ValueError("failed to create final engine")

        logger.info("final engine created")
        return sub_question_engine

    except Exception as err:
        logger.error(f"error in building final engine: {str(err)}")
        raise

In [30]:
def llama_index_retriever_tool(index_path: str, index_type='sentence', similarity_top_k=6):
    """
    Load a persisted index and expose it as a callable RAG query tool.

    The index stored under ``index_path`` is loaded and turned into a query
    engine: the sentence-window engine when ``index_type`` contains the
    substring ``'sentence'``, otherwise the index's plain query engine. That
    engine is wrapped by ``final_engine`` (verbose) and then by a
    ``@tool``-decorated function.

    Args:
        index_path: Directory the index was persisted to; must be a
            non-empty string.
        index_type: Engine selector, tested with ``'sentence' in index_type``.
        similarity_top_k: Number of top matches requested from the index;
            must be at least 1.

    Returns:
        The object produced by applying ``@tool`` to the inner ``engine``
        function. For a non-empty query it returns the query engine's
        response object; for an empty query it returns the string
        "queries cannot be empty".

    Raises:
        ValueError: If ``index_path`` or ``similarity_top_k`` is invalid, or
            the index could not be loaded.
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        # Validate first so the "loading" message is only logged for a
        # path we are actually going to read.
        if not isinstance(index_path, str) or not index_path:
            raise ValueError("invalid index path")
        if similarity_top_k < 1:
            raise ValueError("similarity_top_k must be greater than 0")

        logger.info(f"loading index from {index_path}...")

        # load index
        storage_context = StorageContext.from_defaults(persist_dir=index_path)
        index = load_index_from_storage(storage_context)

        # validate index
        if index is None:
            raise ValueError("failed to load index")

        logger.info("index loaded successfully")

        # create the engine
        if 'sentence' in index_type:
            query_engine = get_sentence_window_query_engine(index, similarity_top_k)
        else:
            query_engine = index.as_query_engine(similarity_top_k=similarity_top_k)
        query_engine = final_engine(query_engine, verbose=True)

        logger.info("query engine created successfully")

        # NOTE: the docstring of ``engine`` is left untouched on purpose; the
        # ``@tool`` decorator reads it at runtime.
        @tool
        def engine(query=''):
            """RAG query tool"""
            try:
                if not query:
                    logger.warning("recieved empty queries")
                    return "queries cannot be empty"

                logger.info(f"processing: {query}")
                response = query_engine.query(query)

                # test response
                if response is None:
                    raise ValueError("returned empty response")

                logger.info("response finished")
                print(f'---RAG---:\n {response}')
                return response

            except Exception as e:
                logger.error(f"error in responsing: {str(e)}")
                raise

        return engine

    except Exception as e:
        logger.error(f"error in creating retriever: {str(e)}")
        raise

In [41]:
# Build the RAG tool from the persisted enriched index.
engine_test = llama_index_retriever_tool(METADATA_ENRICHMENT_INDEX_DIR)
# NOTE(review): llama_index_retriever_tool already passes its query engine
# through final_engine and returns a @tool-wrapped function, so this second
# call wraps a tool rather than a query engine. Construction succeeds (see the
# log below), but confirm that this double wrapping is intended.
engine_test_test = final_engine(engine_test)

INFO:__main__:loading index fromc:\Users\Yueyi\OneDrive\桌面\Cneutral\codes\LLM tool\enriched_index...
INFO:llama_index.core.indices.loading:Loading all indices.
INFO:__main__:index loaded successfully
INFO:__main__:starting to create sentence engine...
INFO:__main__:postprocessor created successfully
INFO:__main__:sentence window engine created successfully
INFO:__main__:start to create final engine...
INFO:__main__:question generator created
INFO:__main__:final engine created
INFO:__main__:query engine created successfully
INFO:__main__:start to create final engine...
INFO:__main__:question generator created
INFO:__main__:final engine created


## GRAPH

In [53]:
def validate_query_engine_response(response: Any) -> bool:
    """
    Report whether a query engine produced any response at all.

    Only ``None`` is rejected; every other value, including empty strings
    and other falsy objects, counts as valid.

    Args:
        response: Value returned by the query engine.

    Returns:
        bool: ``False`` (after logging an error) when ``response`` is
        ``None``, ``True`` otherwise.
    """
    is_present = response is not None
    if not is_present:
        logging.error("Query engine response cannot be None")
    return is_present

def _has_figure_other_than_year(text: str) -> bool:
    """Return True if *text* contains a digit that is not part of a 20xx year."""
    import re  # local import keeps the helper independent of file-level imports

    # Blank out standalone years (2000-2099) first. Otherwise the year the
    # caller has already matched would itself count as "a number", and the
    # numeric check could never fail.
    without_years = re.sub(r'(?<!\d)20\d{2}(?!\d)', ' ', text)
    return re.search(r'\d', without_years) is not None


def _retry_or_give_up(state) -> str:
    """Return "rewrite" while attempts remain, otherwise "generate_no_ans"."""
    return "rewrite" if state["attempt_num"] < MAX_ATTEMPT else "generate_no_ans"


def grade_documents(state) -> Literal["generate", "rewrite", "generate_no_ans"]:
    """
    Determines whether the retrieved documents are relevant to the question.

    When ``state`` carries an ``index_path`` the question is first answered by
    the llama-index retriever tool and two cheap heuristics run on the answer:

    1. If the question names a 20xx year and the answer contains that year
       plus at least one number that is not itself a year, the answer is
       accepted straight away.
    2. If the answer contains a "no information" phrase, it is rejected.

    Anything the heuristics do not settle (and every call without
    ``index_path``, which grades the content of the last message) is scored
    'yes'/'no' by an LLM grader.

    Args:
        state (messages): The current state. Reads ``state["messages"]``
            (the first message is the question), ``state["attempt_num"]``
            and, optionally, ``state["index_path"]``.

    Returns:
        str: Decision on next action - "generate" when the documents are
        relevant, "rewrite" when they are not and attempts remain, or
        "generate_no_ans" once ``MAX_ATTEMPT`` is reached.

    Raises:
        ValueError: If the state, the retrieval response or the grader's
            result is invalid.
        Exception: Any other error is printed and re-raised unchanged.
    """
    print("---CHECK RELEVANCE---")
    
    try:
        # Validate input state format
        if not validate_state(state):
            raise ValueError("Invalid state format")
        
        messages = state["messages"]
        question = messages[0].content
        print('Question:', question)
        
        # Use llama-index for retrieval if index_path is provided
        if "index_path" in state:
            # Create retriever with increased similarity_top_k for better coverage
            retriever = llama_index_retriever_tool(
                index_path=state["index_path"],
                index_type='sentence',
                similarity_top_k=10  # Increased from 6 to 10
            )
            
            # Get retrieval results and validate
            retrieval_response = retriever(question)
            if not validate_query_engine_response(retrieval_response):
                raise ValueError("Invalid retrieval response")
                
            docs = str(retrieval_response)
            
            # Normalize the response for case-insensitive phrase matching
            docs_lower = docs.lower()
            
            # Extract year from question for temporal relevance
            import re
            year_match = re.search(r'20\d{2}', question)
            if year_match:
                year = year_match.group()
                
                # The answer must mention the requested year AND a figure
                # besides year numbers; an answer that merely repeats the
                # year must not pass.
                has_year = year in docs_lower
                has_numbers = _has_figure_other_than_year(docs_lower)
                
                if has_year and has_numbers:
                    print("---DECISION: DOCS RELEVANT (Contains Year and Numbers)---")
                    print("docs:")
                    print(docs)
                    return "generate"
            
            # Check for "no information" type responses
            no_info_phrases = [
                "do not contain",
                "does not specify",
                "cannot provide",
                "no information",
                "not available",
                "could not find",
                "not found",
                "therefore, i cannot"
            ]
            
            if any(phrase in docs_lower for phrase in no_info_phrases):
                print("---DECISION: DOCS NOT RELEVANT (No Information Found)---")
                print("docs:")
                print(docs)
                return _retry_or_give_up(state)
                
        else:
            docs = messages[-1].content
            
        print("Retrieved docs:", docs)
        
        # Set up relevance scoring with improved prompt
        class grade(BaseModel):
            binary_score: str = Field(description="Relevance score 'yes' or 'no'")
        
        model = ChatOpenAI(temperature=0.1, model=MODEL_NAME, streaming=True)
        llm_with_tool = model.with_structured_output(grade)
        
        prompt = PromptTemplate(
            template="""You are a grader assessing relevance of a retrieved document to a user question about financial data. \n 
            Here is the retrieved document: \n\n {context} \n\n
            Here is the user question: {question} \n
            
            For financial or ESG related questions, grade the document as relevant if it contains:
            1. Specific numeric values (like revenue, profit, etc.) for the requested time period
            2. Financial figures with currency indicators (USD, MYR, RM, etc.)
            3. Year-specific financial information that matches the question
            4. Comparative financial data between years
            
            The document should be considered relevant even if it needs some interpretation 
            (e.g., if asking about 2023 and document mentions 'FY2023' or 'current year'; ).
            
            If the document only states that information is not found or not available, grade it as not relevant.
            Give a binary score 'yes' or 'no' to indicate whether the document is relevant to the question.""",
            input_variables=["context", "question"],
        )
        chain = prompt | llm_with_tool

        scored_result = chain.invoke({"question": question, "context": docs})
        if not hasattr(scored_result, 'binary_score'):
            raise ValueError("Invalid scoring result")
            
        score = scored_result.binary_score

        if score == "yes":
            print("---DECISION: DOCS RELEVANT---")
            print("docs:")
            print(docs)
            return "generate"

        # Not relevant: retry with a rewritten question while attempts remain.
        decision = _retry_or_give_up(state)
        if decision == "rewrite":
            print("---DECISION: DOCS NOT RELEVANT---")
        else:
            print("---DECISION: DOCS NOT RELEVANT, MAX_ATTEMPT achieved---")
        print(score)
        print("docs:")
        print(docs)
        return decision
            
    except Exception as e:
        print(f"Error in grade_documents: {str(e)}")
        raise

In [59]:
# Minimal state for exercising grade_documents: one user question, no
# rewrite attempts so far, and the enriched index to retrieve from.
test_state = {
    "messages": [
        HumanMessage(content="What's Public Shareholders' share for SOL?"),
    ],
    "attempt_num": 0,
    "index_path": METADATA_ENRICHMENT_INDEX_DIR   
}

# Run the grader once and print its routing decision; any error is reported
# instead of aborting the cell.
try:
    result = grade_documents(test_state)
    print(f"Grade result: {result}")
except Exception as e:
    print(f"Test failed: {str(e)}") 

INFO:__main__:loading index fromc:\Users\Yueyi\OneDrive\桌面\Cneutral\codes\LLM tool\enriched_index...


---CHECK RELEVANCE---
Question: What's Public Shareholders' share for SOL?


INFO:llama_index.core.indices.loading:Loading all indices.
INFO:__main__:index loaded successfully
INFO:__main__:starting to create sentence engine...
INFO:__main__:postprocessor created successfully
INFO:__main__:sentence window engine created successfully
INFO:__main__:start to create final engine...
INFO:__main__:question generator created
INFO:__main__:final engine created
INFO:__main__:query engine created successfully
INFO:__main__:processing: What's Public Shareholders' share for SOL?
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Generated 3 sub questions.
[1;3;38;2;237;90;200m[docs] Q: By first identifying and quoting the most relevant sources, what is the percentage of public shareholders' ownership in SOL?
[0m

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[1;3;38;2;237;90;200m[docs] A: The percentage of public shareholders' ownership in Shui On Land (SOL) is 43.77%. This information is found in the excerpt stating, "Shui On Land is jointly owned by the Shui On Group and public shareholders, with a 100% ownership stake shared between them. Shui On Group 56.23% Public Shareholders 43.77%."
[0m[1;3;38;2;90;149;237m[docs] Q: By first identifying and quoting the most relevant sources, what is the total number of shares outstanding for SOL?
[0m

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[1;3;38;2;90;149;237m[docs] A: The provided excerpts do not contain specific information regarding the total number of shares outstanding for Shui On Land Limited (SOL). Therefore, I cannot provide that information based on the available context.
[0m[1;3;38;2;11;159;203m[docs] Q: By first identifying and quoting the most relevant sources, what are the recent changes in SOL's shareholder structure?
[0m

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[1;3;38;2;11;159;203m[docs] A: The recent changes in Shui On Land's (SOL) shareholder structure indicate that the company is jointly owned by the Shui On Group and public shareholders, with a 100% ownership stake shared between them. Specifically, the Shui On Group holds a 56.23% stake, while public shareholders account for 43.77%. This structure reflects a significant commitment to maintaining a balance between private and public ownership.

Additionally, there has been a focus on enhancing corporate governance and stakeholder engagement, which may influence shareholder dynamics. The report emphasizes the importance of strong sustainability performance to attract shareholders who are focused on creating long-term value. 

These insights highlight the current ownership distribution and the strategic emphasis on sustainability as a means to engage and retain shareholders.
[0m

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:response finished


---RAG---:
 Public shareholders' share for Shui On Land (SOL) is 43.77%.
Retrieved docs: Public shareholders' share for Shui On Land (SOL) is 43.77%.


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


---DECISION: DOCS RELEVANT---
docs:
Public shareholders' share for Shui On Land (SOL) is 43.77%.
Grade result: generate
