In [1]:
# !pip install chromadb langchain langchain-community pypdf sentence-transformers
# !pip install pymupdf
# !pip install camelot-py[cv]
# !pip install tabula-py

In [2]:

import requests
from huggingface_hub import configure_http_backend

def backend_factory() -> requests.Session:
    """Create the ``requests.Session`` passed to ``configure_http_backend``.

    If ``REQUESTS_CA_BUNDLE`` or ``CURL_CA_BUNDLE`` names a CA bundle (for
    example the root certificate of a TLS-intercepting proxy), certificate
    verification stays enabled and uses that bundle. Only when neither variable
    is set does the session fall back to the previous behaviour of disabling
    verification.

    Returns:
        A new session whose ``verify`` attribute is the CA bundle path, or
        ``False`` when no bundle is configured.
    """
    import os  # local import: keeps this cell independent of other cells' imports

    session = requests.Session()
    ca_bundle = os.environ.get("REQUESTS_CA_BUNDLE") or os.environ.get("CURL_CA_BUNDLE")
    # WARNING: verify=False accepts any certificate, so downloads made through this
    # session are not authenticated. Prefer exporting a CA bundle path instead.
    session.verify = ca_bundle if ca_bundle else False
    return session


# Register the factory so huggingface_hub builds its sessions through it.
configure_http_backend(backend_factory=backend_factory)


In [3]:
from langchain_community.document_loaders import PyPDFLoader


In [4]:
# PDF that every later cell reads. The path is relative, so it resolves against the
# kernel's working directory.
file_path = "data/ISO_14229-1_2013.en.PDF.pdf"

In [5]:
# Load the PDF into LangChain Document objects. The search output further down shows
# each Document carrying per-page metadata such as 'page', 'page_label' and 'total_pages'.
loader = PyPDFLoader(file_path)
documents = loader.load()

In [6]:
from sentence_transformers import SentenceTransformer

In [7]:
# Load the all-MiniLM-L6-v2 sentence-embedding model by name.
# NOTE(review): the vector store in this notebook is built with HuggingFaceEmbeddings
# pointed at "./all-MiniLM-L6-v2", not with this object, and `model` is rebound to a
# Qwen2-VL model near the end of the notebook — confirm this cell is still needed.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')



In [8]:
# !pip install langchain
# !pip install langchain-text-splitters

In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [10]:
# Chunking parameters, named so they can be tuned in one place.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 150

# Split the loaded documents into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunks = text_splitter.split_documents(documents)

In [17]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Embedding function handed to Chroma when the index is built and when it is reopened.
# model_name is a relative filesystem path, so the model files are presumably expected
# in ./all-MiniLM-L6-v2 under the working directory — TODO confirm and document how
# to obtain them.
embedding_function = HuggingFaceEmbeddings(
    model_name="./all-MiniLM-L6-v2"
)


In [18]:
from langchain_community.vectorstores import Chroma

# Directory holding the persisted Chroma collection.
PERSIST_DIR = "./chroma_iso14229"

# Open the collection first and index only when it is empty.
# Chroma.from_documents adds to whatever is already stored under persist_directory,
# so re-running an unconditional build would index every chunk again and produce
# duplicate search hits.
# Delete PERSIST_DIR to force a rebuild after changing the chunking or embedding model.
vector_db = Chroma(
    persist_directory=PERSIST_DIR,
    embedding_function=embedding_function,
)

if not vector_db.get(limit=1)["ids"]:
    vector_db = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_function,
        persist_directory=PERSIST_DIR,
    )
    # Explicit flush to disk, kept from the original cell.
    vector_db.persist()


In [19]:
# Reopen the store from ./chroma_iso14229 with the same embedding function and rebind
# `vector_db` to it, so the queries below run against what is on disk.
vector_db = Chroma(
    persist_directory="./chroma_iso14229",
    embedding_function=embedding_function
)


In [20]:
# Free-text question used for the similarity search below.
query = "value of cvt in negative response for A_PDU parameter TA"

In [21]:
# Retrieve the 5 stored chunks closest to `query`. The recorded output shows the result
# is a list of (Document, score) tuples.
# NOTE(review): for Chroma the score is presumably a distance (lower = closer) — confirm.
results_with_scores = vector_db.similarity_search_with_score(
    query=query,
    k=5
)



In [22]:
# Display the retrieved (Document, score) pairs, including full metadata and page text.
results_with_scores

[(Document(metadata={'producer': 'Acrobat Distiller 6.0.1 (Windows); modified using iText® 5.5.0 ©2000-2013 iText Group NV (AGPL-version)', 'page': 40, 'page_label': '41', 'creationdate': '2013-03-13T15:31:47+01:00', 'author': '', 'creator': 'Acrobat PDFMaker 6.0 for Word', 'title': 'Microsoft Word - C055283e.doc', 'moddate': '2014-11-19T15:41:18+01:00', 'source': 'data/ISO_14229-1_2013.en.PDF.pdf', 'total_pages': 402}, page_content='service. \nTable 16 defines the positive response A_PDU. \nTable 16 — Positive response A_PDU \nA_PDU parameter Parameter Name Cvt Byte Value Mnemonic \nSA Source Address M xxxx SA \nTA Target Address M xxxx TA \nTAtype Target Address type M xx TAT \nRA Remote Address C xxxx RA \nA_Data.A_PCI.SI <Service Name> Response SID \nS xx SIDPR \nA_Data.Parameter 1 data-parameter#1 U xx DP_…#1 \n :  : : :  : \nA_Data.Parameter k data-parameter#k U xx DP_…#k \nC: The RA (Remote Address) PDU parameter is only present in case of remote addressing. \n \nIn all response

In [3]:
import mineru

In [4]:
# List the names defined on the mineru package. The recorded output contains only
# dunder attributes, i.e. nothing public is exported at the top level.
dir(mineru)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__']

In [5]:
from mineru import MinerU

ImportError: cannot import name 'MinerU' from 'mineru' (C:\Users\aendra.shukla\AppData\Local\miniconda3\Lib\site-packages\mineru\__init__.py)

In [6]:
from mineru.mineru import MinerU


ModuleNotFoundError: No module named 'mineru.mineru'

In [7]:
from mineru.pdf import PDFMiner


ModuleNotFoundError: No module named 'mineru.pdf'

In [8]:
from mineru.pdf_miner import PDFMiner


ModuleNotFoundError: No module named 'mineru.pdf_miner'

In [9]:
import mineru
# Print the package help. The recorded output lists the subpackages backend, cli, data,
# model and utils plus a version module — none of the MinerU / pdf / pdf_miner names
# that the failed imports in this notebook tried.
help(mineru)


Help on package mineru:

NAME
    mineru - # Copyright (c) Opendatalab. All rights reserved.

PACKAGE CONTENTS
    backend (package)
    cli (package)
    data (package)
    model (package)
    utils (package)
    version

FILE
    c:\users\aendra.shukla\appdata\local\miniconda3\lib\site-packages\mineru\__init__.py




In [4]:
import fitz  # PyMuPDF

# Extract layout-aware text blocks from every page with PyMuPDF.
# The document is opened in a `with` block so the file handle is released once
# extraction finishes; `doc` is closed afterwards.
with fitz.open(file_path) as doc:
    pages = [
        {
            "page": page_index + 1,  # 1-based page number
            "blocks": [
                {
                    "text": block[4],   # text content of the block
                    "bbox": block[:4],  # first four tuple fields: the block rectangle
                    "type": block[6],   # PyMuPDF block type (0 = text, 1 = image per its docs)
                }
                for block in page.get_text("blocks")
            ],
        }
        for page_index, page in enumerate(doc)
    ]


In [10]:
import camelot

# Run Camelot over every page with the lattice parser
# (use flavor="stream" instead if the tables have no borders).
tables = camelot.read_pdf(file_path, flavor="lattice", pages="all")

# Convert each detected table's DataFrame into a plain dict.
tables_json = []
for table in tables:
    tables_json.append(table.df.to_dict())


Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is a

In [11]:
# Display every extracted table as nested dicts.
# NOTE(review): this prints the tables from all pages at once — consider showing a
# slice such as tables_json[:1] to keep the saved output small.
tables_json

[{0: {0: 'Applicability',
   1: 'Seven layer \naccording to \nISO/IEC 7498-1  \nand  \nISO/IEC 10731',
   2: '',
   3: '',
   4: '',
   5: '',
   6: '',
   7: ''},
  1: {0: 'OSI seven \nlayer',
   1: 'Application \n(layer 7)',
   2: 'Presentation \n(layer 6)',
   3: 'Session  \n(layer 5)',
   4: 'Transport  \n(layer 4)',
   5: 'Network  \n(layer 3)',
   6: 'Data link  \n(layer 2)',
   7: 'Physical  \n(layer 1)'},
  2: {0: 'Enhanced diagnostics services',
   1: 'ISO 14229-1, ISO 14229-3 UDSonCAN, ISO 14229-4 UDSonFR, ISO 14229-5 \nUDSonIP, ISO 14229-6 UDSonK-Line, ISO 14229-7 UDSonLIN, further standards',
   2: 'vehicle manufacturer specific',
   3: 'ISO 14229-2',
   4: 'ISO \n15765-2',
   5: '',
   6: 'ISO \n11898-1, \nISO \n11898-2',
   7: ''},
  3: {0: '',
   1: '',
   2: '',
   3: '',
   4: 'ISO \n10681-2',
   5: '',
   6: 'ISO \n17458-2',
   7: 'ISO \n17458-4'},
  4: {0: '',
   1: '',
   2: '',
   3: '',
   4: 'ISO \n13400-2',
   5: '',
   6: 'ISO \n13400-3,\nIEEE \n802.3',
   7: '

In [12]:
# Re-run Camelot on page 30 only (no flavor argument, so Camelot's default is used).
# NOTE(review): this rebinds `tables`, so the all-pages result from the earlier cell is
# no longer reachable under that name; `tables_json` is unaffected.
tables = camelot.read_pdf(file_path, pages='30')

# Access the first extracted table as a pandas DataFrame
# (this indexing assumes Camelot found at least one table on the page).
df_table1 = tables[0].df 

In [18]:
# Display the first table Camelot extracted from page 30.
df_table1

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Server case #,Client request message,,Server capability,,,Server response,,Comments \nto \nserver response
1,,Address- \ning \nscheme,sub-function \n(suppress-\nPosRspMsg-\nIndicat...,SI supported,SubFunction \nsupported,Data parameter \nsupported \n(only if applicable),Message,NRC,
2,a) \nb) \nc) \nd) \ne) \nf) \ng) \nh) \ni) \nj),physical,FALSE \n(bit = 0),YES,YES,At least 1,PosRsp,---,Server sends positive \nresponse
3,,,,,,At least 1,NegRsp,NRC= \n0xXX,Server \nsends \nnegative \nresponse \nbecause...
4,,,,,,,,NRC= \nROOR,Negative \nresponse \nwith NRC 0x31
5,,,,NO,--,--,,NRC= \nSNS or \nSNSIAS,Negative \nresponse \nwith NRC \n0x11 \nor \n...
6,,,,YES,NO,--,,NRC= \nSFNS or \nSFNSIAS,Negative \nresponse \nwith NRC \n0x12\nor \nN...
7,,,TRUE \n(bit = 1),YES,YES,At least 1,NoRsp,---,Server \ndoes \nNOT \nsend a response
8,,,,,,At least 1,NegRsp,NRC= \n0xXX,Server \nsends \nnegative \nresponse \nbecause...
9,,,,,,,,NRC= \nROOR,Negative \nresponse \nwith NRC 0x31


In [26]:
# Source - https://stackoverflow.com/a
# Posted by Himanshu Poddar, modified by community. See post 'Timeline' for change history
# Retrieved 2025-12-24, License - CC BY-SA 4.0

# NOTE(review): pandas is imported here but `pd` is not referenced in this cell.
import pandas as pd
from tabula.io import read_pdf
# file = file_path
path = file_path
# Extract the tables on page 30 with tabula.
# NOTE(review): with multiple_tables=True tabula presumably returns a list of
# DataFrames rather than a single frame, so `df` is a misleading name — confirm.
# The recorded run fails here with JavaNotFoundError: tabula needs a `java`
# executable on PATH.
df = read_pdf(path, pages = '30', multiple_tables = True)
print(df)


Failed to import jpype dependencies. Fallback to subprocess.
No module named 'jpype'


JavaNotFoundError: `java` command is not found from this Python process.Please ensure Java is installed and PATH is set for `java`

In [27]:
!pip install mineru-vl-utils[transformers]
!pip install mineru-vl-utils[vllm]
!pip install pdf2image pillow


Collecting vllm<=0.11.0,>=0.10.0 (from mineru-vl-utils[vllm])
  Downloading vllm-0.11.0.tar.gz (10.8 MB)
     ---------------------------------------- 0.0/10.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/10.8 MB ? eta -:--:--
     --------- ------------------------------ 2.6/10.8 MB 13.1 MB/s eta 0:00:01
     ------------------------------ --------- 8.1/10.8 MB 19.6 MB/s eta 0:00:01
     ---------------------------------------- 10.8/10.8 MB 19.7 MB/s  0:00:00
  Installing build dependencies: started
  Installing build dependencies: still running...
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting sentencepiece (from vllm<=0.11.0,>=0.10.0->mineru-vl-utils[vllm])
  Downloading sentencepiece-0.2.1-cp313-cp313-win_amd64.

In [29]:
# Echo the current value of file_path.
file_path

'data/ISO_14229-1_2013.en.PDF.pdf'

In [30]:
from pdf2image import convert_from_path

import os  # local import: this cell needs it before the later `import os` cell runs

pdf_path = file_path

# Zero-based page indices to render: 30, 31, 32 are PDF pages 31, 32, 33.
pages_to_extract = [30, 31, 32]

# pdf2image needs the Poppler binaries. Set the POPPLER_PATH environment variable to
# override the machine-specific default below.
poppler_path = os.environ.get(
    "POPPLER_PATH",
    r"C:\Users\aendra.shukla\Downloads\Release-23.11.0-0\poppler-23.11.0\Library\bin",
)

first_index = min(pages_to_extract)
last_index = max(pages_to_extract)

# first_page / last_page are 1-based and inclusive, hence the +1 on each index.
# Every page in the range is rendered, including any not listed in pages_to_extract.
images = convert_from_path(
    pdf_path,
    first_page=first_index + 1,
    last_page=last_index + 1,
    poppler_path=poppler_path,
)

# Keep only the requested pages: images[0] corresponds to index first_index.
selected_images = [images[i - first_index] for i in pages_to_extract]


In [39]:
import ssl
# Replace the stdlib's default HTTPS context factory with the unverified one, turning
# off certificate checks for code that builds its context through this hook.
# WARNING: this is a private attribute and the change is process-wide.
ssl._create_default_https_context = ssl._create_unverified_context
import os
# NOTE(review): the from_pretrained call later in this notebook still fails with
# CERTIFICATE_VERIFY_FAILED after this cell ran, so neither setting appears to reach
# the HTTP client used for the Hub download — confirm whether this variable is
# honoured at all.
os.environ["HF_HUB_DISABLE_SSL_VERIFICATION"] = "1"


In [40]:
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from mineru_vl_utils import MinerUClient
from PIL import Image

In [41]:
# Load the "opendatalab/MinerU2.5-2509-1.2B" checkpoint through the Qwen2-VL model
# class, passing "auto" for both dtype and device_map.
# NOTE(review): this rebinds `model`, which earlier in the notebook held the
# SentenceTransformer instance. The recorded run fails here with an SSLError while
# requesting config.json from huggingface.co.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "opendatalab/MinerU2.5-2509-1.2B",
    dtype="auto",
    device_map="auto"
)

'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /opendatalab/MinerU2.5-2509-1.2B/resolve/main/config.json (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1032)')))"), '(Request ID: 1bca72fb-029c-4b80-8c7a-b9129d50d133)')' thrown while requesting HEAD https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B/resolve/main/config.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /opendatalab/MinerU2.5-2509-1.2B/resolve/main/config.json (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1032)')))"), '(Request ID: a081eadd-9536-49e8-86a5-02cd46abf8d0)')' thrown while requesting HEAD https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B

SSLError: (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /opendatalab/MinerU2.5-2509-1.2B/resolve/main/config.json (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1032)')))"), '(Request ID: fc306b5c-7b49-4410-94d1-92bd30f1f1e5)')