In [3]:
from copy import deepcopy
from pathlib import Path
import os
import re
from copy import deepcopy
from pathlib import Path
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_parse import LlamaParse
from llama_index.core.schema import TextNode
from typing import Optional


## Setting Up LlamaIndex and OpenAI Models

In this section, we initialize the `Settings` for LlamaIndex with the OpenAI embedding and language models. We use the `OpenAIEmbedding` model for embeddings and the `OpenAI` model for language processing.

In [33]:
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') 
LLAMA_API_KEY = os.getenv('LLAMA_CLOUD_API_KEY')

In [5]:
embed_model = OpenAIEmbedding(model="text-embedding-3-large", api_key=OPENAI_API_KEY)
llm = OpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)

Settings.embed_model = embed_model
Settings.llm = llm

In [36]:
parsing_instructions = """
Do not parse slide pages that contains:
- Let's Know Each Other...
    - Ground Rules
    - Lesson Plan
    - Skills Framework
    - Skills Framework TSC
    - Learning Outcomes
    - Course Outline
    - Final Assessment
"""

In [38]:
import nest_asyncio

nest_asyncio.apply()

parser_text = LlamaParse(
    result_type="text",
    api_key=LLAMA_API_KEY,
    show_progress=True,
    verbose=True,
    parsing_instruction=parsing_instructions,
    num_workers=8
)

parser_gpt4o = LlamaParse(
    api_key=LLAMA_API_KEY,
    result_type="markdown",
    show_progress=True,
    verbose=True,
    parsing_instruction=parsing_instructions,
    num_workers=8,
    gpt4o_mode=True)

## Use LlamaParse to Parse Text and Images

In this example, use LlamaParse to parse both the text and images from the document.

We parse out the text in two ways: 
- in regular `text` mode using our default text layout algorithm
- in `markdown` mode using GPT-4o (`gpt4o_mode=True`). This also allows us to capture page screenshots

In [39]:
print(f"Parsing text...")
docs_text = parser_text.load_data(r"C:\Users\dljh1\Documents\courseware_autogen\Assessment\input\WSQ- Learner Guide Slides - Develop Artificial Intelligence and Large Language Model (LLM) Applications with Google Gemini - v5.pdf")
print(f"Parsing PDF file...")
md_json_objs = parser_gpt4o.get_json_result(r"C:\Users\dljh1\Documents\courseware_autogen\Assessment\input\WSQ- Learner Guide Slides - Develop Artificial Intelligence and Large Language Model (LLM) Applications with Google Gemini - v5.pdf")
md_json_list = md_json_objs[0]["pages"]

Parsing text...
Error while parsing the file 'C:\Users\dljh1\Documents\courseware_autogen\Assessment\input\WSQ- Learner Guide Slides - Develop Artificial Intelligence and Large Language Model (LLM) Applications with Google Gemini - v5.pdf': Failed to parse the file: {"detail":"You've exceeded the maximum number of pages you can parse in a day (1000). Please contact support to increase your limit."}
Parsing PDF file...
Error while parsing the file 'C:\Users\dljh1\Documents\courseware_autogen\Assessment\input\WSQ- Learner Guide Slides - Develop Artificial Intelligence and Large Language Model (LLM) Applications with Google Gemini - v5.pdf': Failed to parse the file: {"detail":"You've exceeded the maximum number of pages you can parse in a day (1000). Please contact support to increase your limit."}


IndexError: list index out of range

In [None]:
image_dicts = parser_gpt4o.get_images(md_json_objs, download_path="data_images")

> Image for page 1: [{'name': 'page_1.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 2: [{'name': 'page_2.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 3: [{'name': 'page_3.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 4: [{'name': 'page_4.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 5: [{'name': 'page_5.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 6: [{'name': 'page_6.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 7: [{'name': 'page_7.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 8: [{'name': 'page_8.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 9: [{'name': 'page_9.jpg', 'height': 0,

In [9]:
# get pages loaded through llamaparse
def get_page_number(file_name):
    match = re.search(r"-page-(\d+)\.jpg$", str(file_name))
    if match:
        return int(match.group(1))
    return 0


def _get_sorted_image_files(image_dir):
    """Get image files sorted by page."""
    raw_files = [f for f in list(Path(image_dir).iterdir()) if f.is_file()]
    sorted_files = sorted(raw_files, key=get_page_number)
    return sorted_files

In [10]:
# attach image metadata to the text nodes
def get_text_nodes(docs, image_dir=None, json_dicts=None):
    """Split docs into nodes, by separator."""
    nodes = []

    image_files = _get_sorted_image_files(image_dir) if image_dir is not None else None
    md_texts = [d["md"] for d in json_dicts] if json_dicts is not None else None

    doc_chunks = [c for d in docs for c in d.text.split("---")]
    for idx, doc_chunk in enumerate(doc_chunks):
        chunk_metadata = {"page_num": idx + 1}
        if image_files is not None:
            image_file = image_files[idx]
            chunk_metadata["image_path"] = str(image_file)
        if md_texts is not None:
            chunk_metadata["parsed_text_markdown"] = md_texts[idx]
        chunk_metadata["parsed_text"] = doc_chunk
        node = TextNode(
            text="",
            metadata=chunk_metadata,
        )
        nodes.append(node)

    return nodes

In [11]:
# this will split into pages
text_nodes = get_text_nodes(docs_text, image_dir="data_images", json_dicts=md_json_list)

In [12]:
print(text_nodes[10].get_content(metadata_mode="all"))

page_num: 11
image_path: data_images\359dd6bb-0f2e-49c6-90aa-1da883b8bf28-page_108.jpg
parsed_text_markdown: # Final Assessment

- Written Assessment (SAQ) - 1 hr
- Practical Performance (PP) - 1 hr

This material belongs to Tertiary Infotech Pte Ltd (UEN: 201200096W). All Rights Reserved.
parsed_text:                             Final Assessment
●  Written Assessment (SAQ) - 1 hr
●  Practical Performance (PP) - 1 hr

                                                                                                            11
                        This material belongs to Tertiary Infotech Pte Ltd (UEN: 20120096W). All Rights Reserved


In [13]:
import os
from llama_index.core import (
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)

if not os.path.exists("storage_nodes"):
    index = VectorStoreIndex(text_nodes, embed_model=embed_model)
    # save index to disk
    index.set_index_id("vector_index")
    index.storage_context.persist("./storage_nodes")
else:
    # rebuild storage context
    storage_context = StorageContext.from_defaults(persist_dir="storage_nodes")
    # load index
    index = load_index_from_storage(storage_context, index_id="vector_index")

retriever = index.as_retriever()

In [None]:
from llama_index.core.query_engine import CustomQueryEngine, SimpleMultiModalQueryEngine
from llama_index.core.retrievers import BaseRetriever
from llama_index.multi_modal_llms.openai import OpenAIMultiModal
from llama_index.core.schema import ImageNode, NodeWithScore, MetadataMode
from llama_index.core.prompts import PromptTemplate
from llama_index.core.base.response.schema import Response
from typing import Optional


gpt_4o = OpenAIMultiModal(model="gpt-4o-mini", max_new_tokens=4096)

QA_PROMPT_TMPL = """\
Below we give parsed text from slides in two different formats, as well as the image.

We parse the text in both 'markdown' mode as well as 'raw text' mode. Markdown mode attempts \
to convert relevant diagrams into tables, whereas raw text tries to maintain the rough spatial \
layout of the text.

Use the image information first and foremost. ONLY use the text/markdown information 
if you can't understand the image.

---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query. Explain whether you got the answer
from the parsed markdown or raw text or image, and if there's discrepancies, and your reasoning for the final answer.

Query: {query_str}
Answer: """

QA_PROMPT = PromptTemplate(QA_PROMPT_TMPL)


class MultimodalQueryEngine(CustomQueryEngine):
    """Custom multimodal Query Engine.

    Takes in a retriever to retrieve a set of document nodes.
    Also takes in a prompt template and multimodal model.

    """

    qa_prompt: PromptTemplate
    retriever: BaseRetriever
    multi_modal_llm: OpenAIMultiModal

    def __init__(self, qa_prompt: Optional[PromptTemplate] = None, **kwargs) -> None:
        """Initialize."""
        super().__init__(qa_prompt=qa_prompt or QA_PROMPT, **kwargs)

    def custom_query(self, query_str: str):
        # retrieve text nodes
        nodes = self.retriever.retrieve(query_str)
        # create ImageNode items from text nodes
        image_nodes = [
            NodeWithScore(node=ImageNode(image_path=n.metadata["image_path"]))
            for n in nodes
        ]

        # create context string from text nodes, dump into the prompt
        context_str = "\n\n".join(
            [r.get_content(metadata_mode=MetadataMode.LLM) for r in nodes]
        )
        fmt_prompt = self.qa_prompt.format(context_str=context_str, query_str=query_str)

        # synthesize an answer from formatted text and images
        llm_response = self.multi_modal_llm.complete(
            prompt=fmt_prompt,
            image_documents=[image_node.node for image_node in image_nodes],
        )
        return Response(
            response=str(llm_response),
            source_nodes=nodes,
            metadata={"text_nodes": text_nodes, "image_nodes": image_nodes},
        )

        return response

In [None]:
query_engine = MultimodalQueryEngine(
    retriever=index.as_retriever(similarity_top_k=9), multi_modal_llm=gpt_4o
)

In [20]:
def get_nodes(docs):
    """Split docs into nodes, by separator."""
    nodes = []
    for doc in docs:
        doc_chunks = doc.text.split("\n---\n")
        for doc_chunk in doc_chunks:
            node = TextNode(
                text=doc_chunk,
                metadata=deepcopy(doc.metadata),
            )
            nodes.append(node)

    return nodes

In [21]:
base_nodes = get_nodes(docs_text)

In [22]:
print(base_nodes[13].get_content(metadata_mode="all"))

                 CERTIFICATE
Two e-certificates will be awarded to trainees who have
demonstrated competency in the WSQ assessment and
achieved at least 75% attendance.

 ●   A SkillsFuture WSQ Statement of Attainment (SOA) issued
     by WSG. Typically take 4 weeks
 ●   Certification of Completion issued by Tertiary Infotech Pte
     Ltd, immediately after the course
                                                                                                 14
                       This material belongs to Tertiary Infotech Pte Ltd (UEN: 20120096W). All Rights Reserved
                                                                                                               This material belongs to Tertiary Infotech Pte Ltd (UEN: 20120096W). All Rights Reserved


In [49]:
from typing import List
from pydantic import BaseModel


class RetrievedContent(BaseModel):
    """Data model for a biography."""

    knowledge_statement: str
    retrieved_info: str

In [53]:
base_index = VectorStoreIndex(base_nodes, embed_model=embed_model)
base_query_engine = base_index.as_query_engine(output_cls=RetrievedContent, response_mode="compact", llm=llm, similarity_top_k=9)

In [24]:
from llama_index.core.tools import QueryEngineTool
from llama_index.core.agent import FunctionCallingAgentWorker


vector_tool = QueryEngineTool.from_defaults(
    query_engine=query_engine,
    name="vector_tool",
    description=(
        "Useful for retrieving specific context from the data. Do NOT select if question asks for a summary of the data."
    ),
)
agent = FunctionCallingAgentWorker.from_tools(
    [vector_tool], llm=llm, verbose=True
).as_agent()

In [54]:
# define a similar agent for the baseline
base_vector_tool = QueryEngineTool.from_defaults(
    query_engine=base_query_engine,
    name="vector_tool",
    description=(
        "Useful for retrieving specific context from the data. Do NOT select if question asks for a summary of the data."
    ),
)
base_agent = FunctionCallingAgentWorker.from_tools(
    [base_vector_tool], llm=llm, verbose=True
).as_agent()

In [55]:
# Assuming tsc_knowledges is a dictionary mapping ks_id to description
tsc_knowledges = {
    'K1': 'Range of AI applications',
    'K2': 'Concepts pertaining to performance effectiveness and analysis',
    'K3': 'Methods of evaluating effectiveness of AI applications',
    'K4': 'Algorithm design and implementation',
    'K5': 'Methods of evaluating process improvements to the engineering processes using AI',
    'K6': 'Applicability of AI in the industry'
}

# topics data as defined previously
topics = [
    {
        'topic_number': 1,
        'title': 'Overview of Large Language Model (LLM)',
        'tsc_knowledges': ['K1', 'K6'],
        'tsc_abilities': ['A1', 'A3']
    },
    {
        'topic_number': 2,
        'title': 'Multimodal Prompting with Google Gemini LLM',
        'tsc_knowledges': ['K4', 'K5'],
        'tsc_abilities': ['A2', 'A6']
    },
    {
        'topic_number': 3,
        'title': 'Building LLM Applications with Google Gemini LLM',
        'tsc_knowledges': ['K3'],
        'tsc_abilities': ['A5']
    },
    {
        'topic_number': 4,
        'title': 'Implementing Retrieval Augmented Generation (RAG)',
        'tsc_knowledges': ['K2'],
        'tsc_abilities': ['A4']
    }
]

In [56]:
# Build a mapping of knowledge statements to a single topic
knowledge_to_topic = {}
for topic in topics:
    topic_title = topic['title']
    tsc_knowledges_in_topic = topic['tsc_knowledges']
    
    for ks_id in tsc_knowledges_in_topic:
        if ks_id not in knowledge_to_topic:
            knowledge_to_topic[ks_id] = topic_title

# Process each unique knowledge statement only once
for ks_id, topic_title in knowledge_to_topic.items():
    ks_full = f"{ks_id}: {tsc_knowledges[ks_id]}"
    
    # Craft the query
    query = f"""From the topic '{topic_title}' contents, retrieve all relevant information most aligned with the knowledge statement '{ks_full}'.
    Provide structured summaries, key points, detailed explanations, and any practical examples explicitly covered in the course material.
    Strictly retrieve information starting from slide 15 onwards, ensuring no content from slides 1 to 14 is included."""

    # Execute the query
    result = base_agent.query(query)
    # Process or display the result as needed
    print(f"Results for {ks_full} under {topic_title}:\n{result}\n")

Added user message to memory: From the topic 'Overview of Large Language Model (LLM)' contents, retrieve all relevant information most aligned with the knowledge statement 'K1: Range of AI applications'.
    Provide structured summaries, key points, detailed explanations, and any practical examples explicitly covered in the course material.
    Strictly retrieve information starting from slide 15 onwards, ensuring no content from slides 1 to 14 is included.
=== Calling Function ===
Calling function: vector_tool with args: {"input": "Overview of Large Language Model (LLM) K1: Range of AI applications"}
=== Function Output ===
{"knowledge_statement":"Range of AI applications","retrieved_info":"The applications of Large Language Models (LLMs) include content creation and assistance, customer support and chatbots, language translation and localization, educational tools, business intelligence and analytics, accessibility for disabled persons, coding and development, legal and compliance as