In [7]:
import fitz
from ollama import Client
import json

In [8]:
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the plain text of every page of a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The text of all pages in document order, with a newline appended
        after each page. An empty string if the document yields no pages.

    Any error raised by ``fitz.open`` or ``page.get_text`` propagates to
    the caller.
    """
    # The context manager closes the document even if extraction fails
    # part-way through; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        # Join once instead of repeated `+=`, which re-copies the growing
        # string for every page of a long document.
        return "".join(page.get_text("text") + "\n" for page in doc)


In [None]:
# Extract the raw page text of the thesis PDF into `text` via extract_text_from_pdf.
# NOTE(review): 'Uniersity' is presumably the on-disk spelling of the file name — verify before "fixing" it.
text = extract_text_from_pdf('Uniersity Queensland AHE Thesis.pdf')

In [12]:
# Preview only the beginning of the extracted text; displaying the entire
# thesis floods the cell output and bloats the saved notebook.
text[:1000]

"Faculty of Engineering, Architecture and Information Technology \n \n \n \n \n \n \n \n \n \n \n \nTHE UNIVERSITY OF QUEENSLAND \n \n \nEnhanced Rail Test Automation \n \n \n \nStudent Name:  Isha, JOSHI \n \nCourse Code:  ENGG7290 \n \nSupervisor:  Graeme Smith – Associate Professor \nSchool of Information Technology and Electrical Engineering \n \nSubmission date:  25th June 2020 \n \n \n\nPage | III  \n \nEXECUTIVE SUMMARY \nHitachi Rail STS (Hitachi) has been contracted to deliver and maintain an Automated Train Operation \n(ATO) system for Rio Tinto Iron Ore as a part of their AutoHaul® project. This ATO system facilitates \nthe driverless movement of trains in a railway network in Western Australia. Because of its safety \ncritical nature, any modifications made to the AutoHaul® system require extensive testing before they \ncan be rolled out. Presently, this testing is performed manually and uses a lot of testing time and \nresources. Hitachi has commissioned the design and dev

In [14]:
import pathlib

import pymupdf4llm

# Convert the thesis PDF into Markdown text.
file_path = 'Uniersity Queensland AHE Thesis.pdf'
md_text = pymupdf4llm.to_markdown(file_path)

# Save the Markdown as UTF-8 bytes; as the last expression of the cell, the
# byte count returned by write_bytes is what gets displayed.
pathlib.Path("UQ AHE Thesis.md").write_bytes(md_text.encode("utf-8"))

124555

In [20]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

# (markdown prefix, metadata key) pairs handed to the splitter.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
    # NOTE(review): five hashes are labelled "Header 4" and "####" is not
    # listed at all — confirm this matches the headings in the document.
    ("#####", "Header 4")
]

markdown_document_path = 'UQ AHE Thesis.md'

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)

# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, which would garble non-ASCII characters on some systems.
# NOTE: this rebinds `text`, which an earlier cell used for the raw PDF text.
with open(markdown_document_path, encoding="utf-8") as md_file:
    text = md_file.read()

# Splitting needs only the string, so it runs after the file is closed.
md_header_splits = markdown_splitter.split_text(text)

# Preview at most the first ten chunks; slicing (rather than range(10))
# avoids an IndexError when the document produces fewer than ten chunks.
for chunk in md_header_splits[:10]:
    print(f"{chunk}\n\n")

page_content='_Faculty of Engineering, Architecture and Information Technology_' metadata={'Header 1': '**T HE U NIVERSITY OF Q UEENSLAND**', 'Header 3': 'Enhanced Rail Test Automation', 'Header 4': 'Student Name: Isha, JOSHI Course Code: ENGG7290 Supervisor: Graeme Smith – Associate Professor School of Information Technology and Electrical Engineering Submission date: 25 [th] June 2020'}


page_content='Hitachi Rail STS (Hitachi) has been contracted to deliver and maintain an Automated Train Operation  
(ATO) system for Rio Tinto Iron Ore as a part of their AutoHaul® project. This ATO system facilitates  
the driverless movement of trains in a railway network in Western Australia. Because of its safety  
critical nature, any modifications made to the AutoHaul® system require extensive testing before they  
can be rolled out. Presently, this testing is performed manually and uses a lot of testing time and  
resources. Hitachi has commissioned the design and development of an automated 

In [22]:
# Number of whitespace-separated words in each chunk, in chunk order.
i = list(map(lambda doc: len(doc.page_content.split()), md_header_splits))
print(i)

[7, 455, 185, 417, 402, 320, 238, 84, 593, 2023, 1498, 162, 526, 145, 139, 38, 2193, 1289, 1310, 912, 32, 217, 283, 173, 663, 634, 595, 150, 142, 329, 88, 126, 89, 10, 26, 9, 11]


In [31]:
# Pair every word count with its chunk position as (index, word_count) tuples.
j = list(enumerate(i))
print(j)

[(0, 7), (1, 455), (2, 185), (3, 417), (4, 402), (5, 320), (6, 238), (7, 84), (8, 593), (9, 2023), (10, 1498), (11, 162), (12, 526), (13, 145), (14, 139), (15, 38), (16, 2193), (17, 1289), (18, 1310), (19, 912), (20, 32), (21, 217), (22, 283), (23, 173), (24, 663), (25, 634), (26, 595), (27, 150), (28, 142), (29, 329), (30, 88), (31, 126), (32, 89), (33, 10), (34, 26), (35, 9), (36, 11)]


In [36]:
# Order the (index, word_count) pairs from the largest word count to the
# smallest, working on a copy so that `j` keeps its original order.
sorted_list = list(j)
sorted_list.sort(key=lambda pair: pair[1], reverse=True)

In [37]:
# First entry after the descending sort: the (index, word_count) pair with the highest word count.
sorted_list[0]

(16, 2193)

In [38]:
# Show the chunk with the highest word count. The index is taken from
# `sorted_list` instead of being hard-coded, so the cell stays correct if the
# document or the splitting rules change.
print(md_header_splits[sorted_list[0][0]])

page_content='The aim of testing the FIP – IXL interface was to check the behaviour of the FIP by ensuring that its  
responses to IXL messages are correct. Hence, it was decided that a FIP testing tool (FIP Tester) would  
be developed. This tool would simulate IXL devices in a fashion similar to how aTest (Hitachi’s  
proprietary ATOC simulation tool) operates in the TCS testing.  
4.1.1 Test Framework Design  
The design process used to design this testing tool was based on the top-down methodology  
recommended by Méndez-Porras et al. [31] in their paper. As recommended by Méndez-Porras, the  
system requirements were defined, then a high level system was designed.  
**System Requirements**  
The requirements for the FIP Tester are provided in Table 9.  
Page | 17  
_Table 9 System requirements for the FIP Tester_  
|Requirement|Col2|
|---|---|
|**Implementation Requirements**|**Implementation Requirements**|
|**1 **|All IXL simulations shall connect to the FIP in the same way that

In [None]:
import markdown

def markdown_to_html_file(md_text, output_file="output.html"):
    """Render Markdown text as a standalone, styled HTML page and save it.

    Args:
        md_text: Markdown source to convert.
        output_file: Path of the HTML file to write; it is opened in "w"
            mode with UTF-8 encoding.

    Prints a confirmation line naming the written file and returns nothing.
    """
    # Render the Markdown body with the table and fenced-code extensions on.
    enabled_extensions = ['tables', 'fenced_code']
    body_html = markdown.markdown(md_text, extensions=enabled_extensions)

    # Embed the rendered body in a complete HTML document with basic CSS.
    page = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>Markdown Preview</title>
        <style>
            body {{
                font-family: Arial, sans-serif;
                margin: 40px;
                line-height: 1.6;
            }}
            table {{
                border-collapse: collapse;
                width: 100%;
                margin: 20px 0;
            }}
            th, td {{
                border: 1px solid #333;
                padding: 8px;
                text-align: left;
            }}
            th {{
                background-color: #f2f2f2;
            }}
            pre {{
                background-color: #f4f4f4;
                padding: 10px;
                overflow-x: auto;
            }}
        </style>
    </head>
    <body>
        {body_html}
    </body>
    </html>
    """

    # Write the finished page to disk.
    with open(output_file, "w", encoding="utf-8") as handle:
        handle.write(page)

    print(f"HTML file generated: {output_file}")

# Example usage: render the saved Markdown document to an HTML preview.
# The `with` block closes the file as soon as it has been read; the previous
# open(...).read() one-liner left the handle open.
with open(markdown_document_path, "r", encoding="utf-8") as md_file:
    md_text = md_file.read()
markdown_to_html_file(md_text, output_file="document_preview.html")
