In [70]:
import os
from pathlib import Path
from tempfile import mkdtemp

from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate


In [71]:
pip install docling


3322.20s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Note: you may need to restart the kernel to use updated packages.


In [72]:
pip install torch torchvision torchaudio

3328.61s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Note: you may need to restart the kernel to use updated packages.


In [73]:
pip install vllm==0.6.6

3334.91s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Note: you may need to restart the kernel to use updated packages.


In [74]:
from langchain_docling.loader import ExportType

In [75]:
def _get_env_from_colab_or_os(key):
    try:
        from google.colab import userdata

        try:
            return userdata.get(key)
        except userdata.SecretNotFoundError:
            pass
    except ImportError:
        pass
    return os.getenv(key)

In [76]:
# Load variables from a .env file before any of them are read below.
load_dotenv()

# See https://github.com/huggingface/transformers/issues/5486
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Hugging Face access token (Colab secret or environment variable).
HF_TOKEN = _get_env_from_colab_or_os("HF_TOKEN")

# Input document(s) handed to the Docling loader.
FILE_PATH = ["paper.pdf"]  # the FastVPINNs paper, per the chunk preview further down

# Model ids: one for embeddings / chunk tokenization, one for generation.
EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
GEN_MODEL_ID = "ibm-granite/granite-vision-3.1-2b-preview"
# Alternative generator, kept for reference:
# GEN_MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# How the loader exports the converted document.
EXPORT_TYPE = ExportType.DOC_CHUNKS

# Query sent through the RAG chain.
QUESTION = "explain me what error used in the uploaded document?"

# Prompt with two placeholders: {context} (retrieved text) and {input} (the query).
PROMPT = PromptTemplate.from_template(
    "Context information is below.\n"
    "---------------------\n"
    "{context}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, answer the query.\n"
    "Query: {input}\n"
    "Answer:\n",
)

# Value passed as "k" in the retriever's search kwargs.
TOP_K = 3

# Path of a database file inside a freshly created temporary directory.
MILVUS_URI = str(Path(mkdtemp()) / "docling.db")

In [77]:
from langchain_huggingface import HuggingFaceEndpoint

# Create the Hugging Face endpoint for the generator model.
# The repo id is taken from GEN_MODEL_ID (set in the configuration cell) instead
# of repeating the literal, so changing the model there affects every endpoint
# built in this notebook.
llm = HuggingFaceEndpoint(
    repo_id=GEN_MODEL_ID,
    huggingfacehub_api_token=HF_TOKEN,  # token resolved from Colab secrets or the environment
    task="text-generation",  # NOTE(review): confirm this task is appropriate for the chosen model
)


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [78]:
from langchain_docling import DoclingLoader

from docling.chunking import HybridChunker

In [79]:
# Build the Docling loader for the configured file(s) and export type; the
# chunker receives the embedding model id as its tokenizer.
loader = DoclingLoader(
    chunker=HybridChunker(tokenizer=EMBED_MODEL_ID),
    export_type=EXPORT_TYPE,
    file_path=FILE_PATH,
)

# Run the conversion and keep the resulting documents for the next cells.
docs = loader.load()

Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors


In [80]:
# Derive `splits` from `docs` according to the export type chosen in the
# configuration cell.
if EXPORT_TYPE == ExportType.MARKDOWN:
    # Markdown export: split each document's text on its first three header levels.
    from langchain_text_splitters import MarkdownHeaderTextSplitter

    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("#", "Header_1"), ("##", "Header_2"), ("###", "Header_3")],
    )
    splits = [
        piece
        for loaded_doc in docs
        for piece in splitter.split_text(loaded_doc.page_content)
    ]
elif EXPORT_TYPE == ExportType.DOC_CHUNKS:
    # Chunk export: the loaded documents are used as the splits directly.
    splits = docs
else:
    raise ValueError(f"Unexpected export type: {EXPORT_TYPE}")

In [81]:
# Preview the first three splits. The `=` specifier in the f-string makes each
# printed line start with the literal text "d.page_content=" followed by the
# repr of the value, so the loop variable name is part of the output.
for d in splits[:3]:
    print(f"- {d.page_content=}")
print("...")  # indicates that the remaining splits are not shown

- d.page_content='FastVPINNs: Tensor-Driven Acceleration of VPINNs for Complex Geometries\nThivin Anandh 1\nthivinanandh@iisc.ac.in\nDivij Ghose 1\ndivijghose@iisc.ac.in\nHimanshu Jain 2\nms19026@iisermohali.ac.in\nSashikumaar Ganesan 1 ∗\nsashi@iisc.ac.in\n1 Department of Computational and Data Sciences\nIndian Institute of Science, Bangalore\nKarnataka, India\n2 Department of Physical Sciences\nIndian Institute of Science Education and Research, Mohali\nPunjab, India\n∗ Corresponding author'
- d.page_content="Abstract\nVariational Physics-Informed Neural Networks (VPINNs) utilize a variational loss function to solve partial differential equations, mirroring Finite Element Analysis techniques. Traditional hp-VPINNs, while effective for high-frequency problems, are computationally intensive and scale poorly with increasing element counts, limiting their use in complex geometries. This work introduces FastVPINNs, a tensor-based advancement that significantly reduces computational overhe

In [82]:
pip show grpcio


3357.42s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Name: grpcio
Version: 1.67.1
Summary: HTTP/2-based RPC framework
Home-page: https://grpc.io
Author: The gRPC Authors
Author-email: grpc-io@googlegroups.com
License: Apache License 2.0
Location: /home/parani/IBM/env1/lib/python3.10/site-packages
Requires: 
Required-by: grpcio-tools, pymilvus
Note: you may need to restart the kernel to use updated packages.


In [83]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import VectorDBQA
from langchain.prompts import PromptTemplate

# The two chain builders used at the bottom of this cell are otherwise only
# imported in a later cell; importing them here keeps this cell runnable on a
# fresh kernel (Restart & Run All) instead of failing with a NameError.
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Embed documents using HuggingFaceEmbeddings
embedding = HuggingFaceEmbeddings(model_name=EMBED_MODEL_ID)

# Build a FAISS vector store from the splits produced earlier in the notebook.
vectorstore = FAISS.from_documents(documents=splits, embedding=embedding)

# Retriever over the store; TOP_K is passed as "k" in the search kwargs.
retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})

# Endpoint for the generator model configured in GEN_MODEL_ID.
llm = HuggingFaceEndpoint(
    repo_id=GEN_MODEL_ID,
    huggingfacehub_api_token=HF_TOKEN,
    task="text-generation"
)

# Combine retrieved documents into PROMPT, then wrap retrieval + answering
# into a single chain.
question_answer_chain = create_stuff_documents_chain(llm, PROMPT)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# Invoke the RAG chain with the configured question.
resp_dict = rag_chain.invoke({"input": QUESTION})


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [84]:
# import json
# from pathlib import Path
# from tempfile import mkdtemp
# from langchain_milvus import Milvus
# from langchain_huggingface.embeddings import HuggingFaceEmbeddings


# embedding = HuggingFaceEmbeddings(model_name=EMBED_MODEL_ID)


# milvus_uri = str(Path(mkdtemp()) / "docling.db")  # or set as needed
# vectorstore = Milvus.from_documents(
#     documents=splits,
#     embedding=embedding,
#     collection_name="docling_demo",
#     connection_args={"uri": milvus_uri},
#     index_params={"index_type": "FLAT"},
#     drop_old=True,
# )

In [85]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_huggingface import HuggingFaceEndpoint

# Retriever over the existing vector store, with TOP_K passed as "k".
retriever = vectorstore.as_retriever(search_kwargs=dict(k=TOP_K))

# Text-generation endpoint for the configured generator model.
llm = HuggingFaceEndpoint(
    task="text-generation",
    huggingfacehub_api_token=HF_TOKEN,
    repo_id=GEN_MODEL_ID,
)


def clip_text(text, threshold=100):
    """Shorten ``text`` for display.

    Args:
        text: Value to clip; its length and a slice of it are taken.
        threshold: Maximum length returned unchanged (default 100).

    Returns:
        ``text`` itself when ``len(text)`` does not exceed ``threshold``;
        otherwise a string made of the first ``threshold`` items of ``text``
        followed by ``"..."``.
    """
    if len(text) <= threshold:
        return text
    head = text[:threshold]
    return f"{head}..."

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [86]:
# json.dumps is used below to print source text with escapes; no live cell of
# this notebook imports json, so import it here to keep the cell runnable on a
# fresh kernel.
import json

# Build the RAG chain and run the configured question through it.
question_answer_chain = create_stuff_documents_chain(llm, PROMPT)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)
resp_dict = rag_chain.invoke({"input": QUESTION})

# Print the question and an answer clipped to 200 characters.
clipped_answer = clip_text(resp_dict["answer"], threshold=200)
print(f"Question:\n{resp_dict['input']}\n\nAnswer:\n{clipped_answer}")

# Print each retrieved source: clipped text first, then its metadata entries.
for i, doc in enumerate(resp_dict["context"]):
    print()
    print(f"Source {i+1}:")
    print(f"  text: {json.dumps(clip_text(doc.page_content, threshold=350))}")
    for key, val in doc.metadata.items():
        if key != "pk":  # the "pk" metadata entry is deliberately not printed
            # Only string values are clipped; other values are printed as-is.
            clipped_val = clip_text(val) if isinstance(val, str) else val
            print(f"  {key}: {clipped_val}")



Question:
explain me what error used in the uploaded document?

Answer:
In the uploaded document, the authors introduce and use several types of errors or loss functions to train their Physics-Informed Neural Networks (PINNs) and hp-VPINNs for solving forward and inverse ...

Source 1:
  text: "2.1 Governing Equations\nConsider a two-dimensional steady-state convection-diffusion equation:\n\nHere, x \u2208 \u2126, \u03b5 , and b are the diffusion coefficient and convective velocity, respectively. In addition, f ( x ) is a known source function with appropriate smoothness. The Dirichlet boundary condition u x ( ) = g x ( ) is imposed on the domain boundary ..."
  source: paper.pdf
  dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/33', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 2, 'bbox': {'l': 90.0, 't': 203.5019989013672, 'r': 427.9049987792969, 

In [87]:
from langchain_docling.loader import DoclingLoader, ExportType

# PDF file(s) to load in this pass.
FILE_PATH = ["paper.pdf"]  # replace with your file path

# Export type handed to the loader.
EXPORT_TYPE = ExportType.DOC_CHUNKS

# No chunker is supplied here; pass one explicitly if specific text handling
# is needed.
loader = DoclingLoader(
    chunker=None,
    export_type=EXPORT_TYPE,
    file_path=FILE_PATH,
)

# Rebinds `docs` with the documents produced by this loader.
docs = loader.load()


Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors


In [97]:
from PIL import Image
import io
import os

# Directory to save the extracted images. A path relative to the working
# directory is used instead of a user-specific absolute path, so the cell also
# runs on machines where that home directory does not exist.
output_dir = 'extracted_images'
os.makedirs(output_dir, exist_ok=True)  # Create the directory if it doesn't exist

# Walk the loaded documents and save any images attached to them.
# NOTE(review): the run recorded in this notebook reports no `images`
# attribute on the first document; confirm the loader ever populates it.
for doc_idx, doc in enumerate(docs):
    print(f"Page content: {doc.page_content}")

    # A missing or empty `images` attribute yields no iterations.
    for img_idx, img in enumerate(getattr(doc, 'images', None) or []):
        try:
            # Raw bytes are wrapped in a file-like object; anything else is
            # handed to Pillow as-is (e.g. a file path).
            img_source = io.BytesIO(img) if isinstance(img, bytes) else img

            image_path = os.path.join(output_dir, f"extracted_image_{doc_idx}_{img_idx}.png")
            # The context manager closes the opened image once it is saved.
            with Image.open(img_source) as image:
                image.save(image_path)
            print(f"Extracted image saved at: {image_path}")
        except Exception as e:
            # Best effort: report the failure and continue with the next image.
            print(f"Error extracting image: {e}")


Page content: FastVPINNs: Tensor-Driven Acceleration of VPINNs for Complex Geometries
Thivin Anandh 1
thivinanandh@iisc.ac.in
Divij Ghose 1
divijghose@iisc.ac.in
Himanshu Jain 2
ms19026@iisermohali.ac.in
Sashikumaar Ganesan 1 ∗
sashi@iisc.ac.in
1 Department of Computational and Data Sciences
Indian Institute of Science, Bangalore
Karnataka, India
2 Department of Physical Sciences
Indian Institute of Science Education and Research, Mohali
Punjab, India
∗ Corresponding author
Page content: Abstract
Variational Physics-Informed Neural Networks (VPINNs) utilize a variational loss function to solve partial differential equations, mirroring Finite Element Analysis techniques. Traditional hp-VPINNs, while effective for high-frequency problems, are computationally intensive and scale poorly with increasing element counts, limiting their use in complex geometries. This work introduces FastVPINNs, a tensor-based advancement that significantly reduces computational overhead and improves scalabili

In [98]:
# Same extraction pass as before, with extra diagnostics about what each
# document exposes. Relies on `docs`, `output_dir`, `io`, `os` and `Image`
# already being defined in the kernel.
for doc_idx, doc in enumerate(docs):
    print(f"Page content: {doc.page_content}")

    # Documents without an `images` attribute are reported and skipped.
    if not hasattr(doc, 'images'):
        print(f"No images found in doc {doc_idx}.")
        continue

    print(f"Found images in doc {doc_idx}: {len(doc.images)} images found.")

    for img_idx, img in enumerate(doc.images):
        print(f"Image {img_idx}: {type(img)}")
        try:
            if isinstance(img, bytes):
                # Raw bytes: wrap them so Pillow can read them like a file.
                img_data = io.BytesIO(img)
                image = Image.open(img_data)
            else:
                # Anything else is passed to Pillow directly (e.g. a file path).
                image = Image.open(img)

            # Write the image out under a name derived from both indices.
            image_path = os.path.join(output_dir, f"extracted_image_{doc_idx}_{img_idx}.png")
            image.save(image_path)
            print(f"Extracted image saved at: {image_path}")
        except Exception as exc:
            print(f"Error extracting image: {exc}")


Page content: FastVPINNs: Tensor-Driven Acceleration of VPINNs for Complex Geometries
Thivin Anandh 1
thivinanandh@iisc.ac.in
Divij Ghose 1
divijghose@iisc.ac.in
Himanshu Jain 2
ms19026@iisermohali.ac.in
Sashikumaar Ganesan 1 ∗
sashi@iisc.ac.in
1 Department of Computational and Data Sciences
Indian Institute of Science, Bangalore
Karnataka, India
2 Department of Physical Sciences
Indian Institute of Science Education and Research, Mohali
Punjab, India
∗ Corresponding author
No images found in doc 0.
Page content: Abstract
Variational Physics-Informed Neural Networks (VPINNs) utilize a variational loss function to solve partial differential equations, mirroring Finite Element Analysis techniques. Traditional hp-VPINNs, while effective for high-frequency problems, are computationally intensive and scale poorly with increasing element counts, limiting their use in complex geometries. This work introduces FastVPINNs, a tensor-based advancement that significantly reduces computational overh

In [107]:
import os
import fitz  # PyMuPDF

# Path to your PDF file
pdf_path = 'paper.pdf'

# Folder where images will be saved
output_dir = 'extracted_images'

# Create the directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Open the PDF.
# NOTE(review): the document is left open because `doc` stays bound after this
# cell; call doc.close() once nothing else needs it.
doc = fitz.open(pdf_path)

# Loop through each page and dump every image referenced on it.
for page_num in range(len(doc)):
    page = doc.load_page(page_num)
    image_list = page.get_images(full=True)

    for img_index, img in enumerate(image_list):
        xref = img[0]  # first item of each entry is the image's xref
        image = doc.extract_image(xref)
        if not image or "image" not in image:
            # Nothing extractable for this xref; skip it instead of failing.
            continue
        image_bytes = image["image"]

        # The bytes are written unchanged, so the file extension must reflect
        # their actual encoding; fall back to "png" only if none is reported.
        image_ext = image.get("ext") or "png"

        # Define the path where the image will be saved
        image_filename = os.path.join(output_dir, f"image_{page_num}_{img_index}.{image_ext}")

        # Save the image to the specified folder
        with open(image_filename, "wb") as img_file:
            img_file.write(image_bytes)
        print(f"check {image_filename}")


Extracted image saved at extracted_images/image_6_0.png
Extracted image saved at extracted_images/image_7_0.png
Extracted image saved at extracted_images/image_7_1.png
Extracted image saved at extracted_images/image_8_0.png
Extracted image saved at extracted_images/image_10_0.png
Extracted image saved at extracted_images/image_11_0.png
Extracted image saved at extracted_images/image_13_0.png
Extracted image saved at extracted_images/image_14_0.png
Extracted image saved at extracted_images/image_15_0.png
Extracted image saved at extracted_images/image_16_0.png
Extracted image saved at extracted_images/image_17_0.png
Extracted image saved at extracted_images/image_18_0.png
Extracted image saved at extracted_images/image_19_0.png
Extracted image saved at extracted_images/image_20_0.png
Extracted image saved at extracted_images/image_21_0.png
Extracted image saved at extracted_images/image_22_0.png
Extracted image saved at extracted_images/image_28_0.png
Extracted image saved at extracted_