In [1]:
!pip install llama-index-multi-modal-llms-openai
!pip install llama-index-vector-stores-qdrant
!pip install llama_index ftfy regex tqdm
!pip install torch torchvision
!pip install matplotlib scikit-image
!pip install -U qdrant_client

Collecting llama-index-vector-stores-qdrant
  Downloading llama_index_vector_stores_qdrant-0.4.0-py3-none-any.whl.metadata (767 bytes)
Collecting grpcio<2.0.0,>=1.60.0 (from llama-index-vector-stores-qdrant)
  Downloading grpcio-1.68.0-cp311-cp311-win_amd64.whl.metadata (4.0 kB)
Collecting qdrant-client>=1.7.1 (from llama-index-vector-stores-qdrant)
  Downloading qdrant_client-1.12.1-py3-none-any.whl.metadata (10 kB)
Collecting grpcio-tools>=1.41.0 (from qdrant-client>=1.7.1->llama-index-vector-stores-qdrant)
  Downloading grpcio_tools-1.68.0-cp311-cp311-win_amd64.whl.metadata (5.5 kB)
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant-client>=1.7.1->llama-index-vector-stores-qdrant)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting h2<5,>=3 (from httpx[http2]>=0.20.0->qdrant-client>=1.7.1->llama-index-vector-stores-qdrant)
  Using cached h2-4.1.0-py3-none-any.whl.metadata (3.6 kB)
Collecting hyperframe<7,>=6.0 (from h2<5,>=3->httpx[http2]>=0.20.0->qdrant-cl

## Importing Dependencies

To start, we need to import the necessary libraries and modules. These include:

- `os` for interacting with the operating system.
- `requests` for making HTTP requests.
- `Path` from `pathlib` for handling file paths.
- `matplotlib.pyplot` for plotting.
- `Image` from `PIL` for image processing.
- `nest_asyncio` for applying nested asyncio loops.

Additionally, we will import specific classes and functions from the `llama_index` and `llama_parse` libraries to work with OpenAI and Llama APIs.

In [30]:
import os
import nest_asyncio

nest_asyncio.apply()

OPENAI_API_KEY = os.getenv('TERTIARY_INFOTECH_API_KEY') 
LLAMA_API_KEY = os.getenv('LLAMA_CLOUD_API_KEY')

In [31]:
from llama_parse import LlamaParse

parsingInstructions= """
The provided document is an educational learning resource.
Output any code in markdown (between ```).
Output any math equation in LATEX markdown (between $$).
"""

parser = LlamaParse(
    api_key=LLAMA_API_KEY,
    result_type="markdown",
    parsing_instruction=parsingInstructions,
    # use_vendor_multimodal_model=True,
    # vendor_multimodal_api_key=OPENAI_API_KEY,
    # vendor_multimodal_model_name="openai-gpt-4o-mini",
    show_progress=True,
    verbose=True,
    # invalidate_cache=True, # Turned off for prototyping
    # do_not_cache=True, # Turned off for prototyping
    num_workers=8,
    language="en"
)

### Helper Function to Get All Data Files

The following function, `get_data_files`, is a helper function designed to retrieve all data files from the specified directory (`DATA_DIR`). It returns a list of file paths for further processing.

In [32]:
DATA_DIR = "input_documents"


def get_data_files(data_dir=DATA_DIR) -> list[str]:
    files = []
    for f in os.listdir(data_dir):
        fname = os.path.join(data_dir, f)
        if os.path.isfile(fname):
            files.append(fname)
    return files


files = get_data_files()

print(files)

['input_documents\\Web_API_Development_with_Python_A_Beginners_Guide_using_Flask_and.pdf']


In [33]:
jsonObjs = parser.get_json_result(files)
jsonList = jsonObjs[0]["pages"]

Parsing files: 100%|██████████| 1/1 [00:12<00:00, 12.05s/it]

Error while parsing the file 'input_documents\Web_API_Development_with_Python_A_Beginners_Guide_using_Flask_and.pdf': Failed to parse the file: {"detail":"You've exceeded the maximum number of pages you can parse in a day (1000). Please contact support to increase your limit."}





IndexError: list index out of range

In [None]:
from typing import List
from llama_index.core.schema import ImageDocument, TextNode

def get_image_nodes(json_objs: List[dict], download_path: str) -> List[ImageDocument]:
    image_dicts = parser.get_images(json_objs, download_path=download_path)
    return [ImageDocument(image_path=image_dict["path"]) for image_dict in image_dicts]

imageDicts = get_image_nodes(jsonObjs, "data_images")

> Image for page 1: [{'name': 'img_p0_1.png', 'height': 1568, 'width': 980, 'x': 0, 'y': 0, 'original_width': 980, 'original_height': 1568, 'ocr': [{'x': 67, 'y': 64, 'w': 281, 'h': 112, 'confidence': '0.9999450086157631', 'text': 'Web'}, {'x': 385, 'y': 67, 'w': 240, 'h': 108, 'confidence': '0.9998567066524358', 'text': 'API'}, {'x': 61, 'y': 194, 'w': 859, 'h': 144, 'confidence': '0.9723038822429899', 'text': 'Development'}, {'x': 59, 'y': 331, 'w': 781, 'h': 140, 'confidence': '0.9998620322869443', 'text': 'with Python'}, {'x': 74, 'y': 528, 'w': 846, 'h': 53, 'confidence': '0.8022458180164818', 'text': "A Beginner's Guide using Flask and FastAPI"}, {'x': 663, 'y': 655, 'w': 252, 'h': 40, 'confidence': '0.9863252243150417', 'text': 'First Edition'}, {'x': 151, 'y': 1313, 'w': 274, 'h': 76, 'confidence': '0.9945141016542265', 'text': 'FastAPI'}, {'x': 587, 'y': 1279, 'w': 262, 'h': 102, 'confidence': '0.9999964814062795', 'text': 'Flask'}, {'x': 194, 'y': 1454, 'w': 582, 'h': 77, 'co

In [None]:
import re
from pathlib import Path
import typing as t
from llama_index.core.schema import TextNode

def get_page_number(file_name):
    """Gets page number of images using regex on file names"""
    match = re.search(r"-page-(\d+)\.jpg$", str(file_name))
    if match:
        return int(match.group(1))
    return 0

def _get_sorted_image_files(image_dir):
    """Get image files sorted by page."""
    raw_files = [f for f in list(Path(image_dir).iterdir()) if f.is_file()]
    sorted_files = sorted(raw_files, key=get_page_number)
    return sorted_files


def get_text_nodes(md_json_objs, image_dir) -> t.List[TextNode]:
    """Creates nodes from json + images"""

    nodes = []

    for result in md_json_objs:
      json_dicts = result["pages"]
      document_name = result["file_path"].split('/')[-1]

      print(json_dicts)

      docs = [doc["md"] for doc in json_dicts] # extract text
      image_files = _get_sorted_image_files(image_dir) # extract images

      for idx, doc in enumerate(docs):
          # adds both a text node and the corresponding image node (jpg of the page) for each page
          node = TextNode(
              text=doc,
              metadata={"image_path": str(image_files[idx]), "page_num": idx + 1, "document_name": document_name},
          )
          nodes.append(node)

    return nodes

text_nodes = get_text_nodes(jsonObjs, "data_images")




IndexError: list index out of range

In [None]:
from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
    Settings
)
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

embed_model = OpenAIEmbedding(model="text-embedding-3-small")
llm = OpenAI(model="gpt-4o-mini")

Settings.llm = llm
Settings.embed_model = embed_model

if not os.path.exists("storage_manuals"):
    index = VectorStoreIndex(text_nodes, embed_model=embed_model)
    index.storage_context.persist(persist_dir="./storage_manuals")
else:
    ctx = StorageContext.from_defaults(persist_dir="./storage_manuals")
    index = load_index_from_storage(ctx)

retriever = index.as_retriever()

In [None]:
from llama_index.core.query_engine import CustomQueryEngine
from llama_index.core.retrievers import BaseRetriever
from llama_index.multi_modal_llms.openai import OpenAIMultiModal
from llama_index.core.schema import NodeWithScore, MetadataMode, QueryBundle
from llama_index.core.base.response.schema import Response
from llama_index.core.prompts import PromptTemplate
from llama_index.core.schema import ImageNode

from typing import Any, List, Optional, Tuple
from llama_index.core.postprocessor.types import BaseNodePostprocessor

QA_PROMPT_TMPL = """\
You are an educational assistant that helps users retrieve relevant information and images from educational PDF materials.

Below we provide parsed text from the documents in two different formats, as well as the image.

We parse the text in both 'markdown' mode as well as 'raw text' mode. Markdown mode attempts \
to convert relevant diagrams into tables, whereas raw text tries to maintain the rough spatial \
layout of the text.

Use the image information first and foremost. ONLY use the text/markdown information 
if you can't understand the image.

When you reply, provide both text explanations and relevant images or tables.

Context:
---------------------
{context_str}
---------------------

Given the context information and not prior knowledge, answer the query using ONLY the context information. If you don't find the answer in the context, reply that you don't know and provide the page number and document name where the user can find similar information.

Query: {query_str}
Answer: """

QA_PROMPT = PromptTemplate(QA_PROMPT_TMPL)

gpt_4o_mm = OpenAIMultiModal(model="gpt-4o-mini")


class MultimodalQueryEngine(CustomQueryEngine):
    qa_prompt: PromptTemplate
    retriever: BaseRetriever
    multi_modal_llm: OpenAIMultiModal
    node_postprocessors: Optional[List[BaseNodePostprocessor]]

    def __init__(
        self,
        qa_prompt: PromptTemplate,
        retriever: BaseRetriever,
        multi_modal_llm: OpenAIMultiModal,
        node_postprocessors: Optional[List[BaseNodePostprocessor]] = [],
    ):
        super().__init__(
            qa_prompt=qa_prompt,
            retriever=retriever,
            multi_modal_llm=multi_modal_llm,
            node_postprocessors=node_postprocessors
        )

    def custom_query(self, query_str: str):
        # retrieve most relevant nodes
        nodes = self.retriever.retrieve(query_str)

        for postprocessor in self.node_postprocessors:
            nodes = postprocessor.postprocess_nodes(
                nodes, query_bundle=QueryBundle(query_str)
            )


        # create image nodes from the image associated with those nodes
        image_nodes = [
            NodeWithScore(node=ImageNode(image_path=n.node.metadata["image_path"]))
            for n in nodes
        ]

        # create context string from parsed markdown text
        ctx_str = "\n\n".join(
            [r.node.get_content(metadata_mode=MetadataMode.LLM).strip() for r in nodes]
        )

        # prompt for the LLM
        fmt_prompt = self.qa_prompt.format(context_str=ctx_str, query_str=query_str)

        # use the multimodal LLM to interpret images and generate a response to the prompt
        llm_repsonse = self.multi_modal_llm.complete(
            prompt=fmt_prompt,
            image_documents=[image_node.node for image_node in image_nodes],
        )
        return Response(
            response=str(llm_repsonse),
            source_nodes=nodes,
            metadata={"text_nodes": text_nodes, "image_nodes": image_nodes},
        )

In [None]:
from llama_index.postprocessor.flag_embedding_reranker import (
    FlagEmbeddingReranker,
)

rerank = FlagEmbeddingReranker(model="BAAI/bge-reranker-large", top_n=5)

In [None]:
# Insert reranking here only if after some test it increase the accuracy
query_engine = MultimodalQueryEngine(
    qa_prompt=QA_PROMPT,
    retriever=index.as_retriever(similarity_top_k=9),
    multi_modal_llm=gpt_4o_mm,
    node_postprocessors=[rerank]
)

In [None]:
from IPython.display import display, Markdown

response = query_engine.query("What is Fast API?")
display(Markdown(str(response)))

pre tokenize:   0%|          | 0/1 [00:00<?, ?it/s]You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 200.03it/s]


FileNotFoundError: [Errno 2] No such file or directory: 'data_images\\a87214e4-4f9a-483a-af5f-b84c45901e54-img_p215_1.png'

In [None]:
print(response)

### How GitHub Copilot Works

GitHub Copilot is an AI-powered code completion tool that assists developers by providing intelligent code suggestions. Here’s a breakdown of how it functions:

1. **OpenAI Codex Model**: The core of GitHub Copilot is the OpenAI Codex model, which is trained on a vast dataset of code from public repositories.

2. **Contextual Understanding**: As you write code, Copilot analyzes the context, including comments, variable names, and existing code, to generate relevant suggestions.

3. **Code Suggestions**: Based on the context, Copilot offers code completions, snippets, or entire functions to help you code faster.

4. **Integration with IDEs**: Copilot integrates seamlessly with popular code editors like Visual Studio Code, IntelliJ IDEA, and PyCharm.

5. **Learning and Improvement**: Copilot learns from interactions and feedback, continually improving its suggestions over time.

### Key Features

- **Code Completion**: Provides intelligent code completion su