# Notebook for testing

### Scrape IRD documents

Use Scrapy to scrape documents from the IRD (Inland Revenue Department) website.

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess

class IrdTableSpider(scrapy.Spider):
    """Spider that yields one item per data row of the table on the start page.

    Each item is a dict with the keys ``case_no``, ``case_link``,
    ``provision`` and ``index``.
    """

    name = "ird_table_spider"
    start_urls = ['https://www.ird.gov.hk/eng/ppr/arc.htm']

    def parse(self, response):
        """Extract every non-header row of each ``border_table`` table.

        Args:
            response: The Scrapy response for one of ``start_urls``.

        Yields:
            dict: ``case_no`` (link text of column 1), ``case_link`` (absolute
            URL of that link, or ``None``), ``provision`` (stripped first text
            node of column 2, or ``None``) and ``index`` (stripped ``<li>``
            texts of column 3).
        """
        tables = response.xpath('//table[contains(@class, "border_table")]')
        for table in tables:
            # Browsers insert <tbody> into the DOM even when the served HTML
            # has none, whereas Scrapy parses the HTML as served. Accept both
            # layouts so the spider does not silently yield nothing when the
            # raw markup has no <tbody>. position()>1 skips the header row.
            rows = table.xpath('./tbody/tr[position()>1] | ./tr[position()>1]')
            for row in rows:
                case_no = row.xpath('td[1]/a/text()').get()
                case_link = row.xpath('td[1]/a/@href').get()
                provision = row.xpath('td[2]/text()').get()
                # Get all <li> items in the 3rd column
                index_items = row.xpath('td[3]//li/text()').getall()
                yield {
                    'case_no': case_no,
                    'case_link': response.urljoin(case_link) if case_link else None,
                    'provision': provision.strip() if provision else None,
                    'index': [item.strip() for item in index_items],
                }

def run_spider(output_path="results.json"):
    """Crawl with ``IrdTableSpider`` and export the scraped items as JSON.

    Args:
        output_path (str): File the JSON feed is written to. Defaults to
            ``"results.json"``, the path that was previously hard-coded.
    """
    process = CrawlerProcess(settings={
        "LOG_LEVEL": "ERROR",
        "FEEDS": {
            output_path: {
                "format": "json",
                # Local feed files are appended to by default, and appending a
                # second JSON array makes the file invalid JSON. Replace the
                # file on every run so the cell can be re-executed safely.
                "overwrite": True,
            },
        },
    })
    process.crawl(IrdTableSpider)
    process.start()

if __name__ == "__main__":
    run_spider()

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess

class IrdCaseContentSpider(scrapy.Spider):
    """Spider that yields the page title of a fixed set of ``advanceN.htm`` pages."""

    name = "ird_case_content_spider"
    start_urls = [
        f'https://www.ird.gov.hk/eng/ppr/advance{case_number}.htm'
        for case_number in (13, 16, 26, 44)
    ]

    def parse(self, response):
        """Print the raw title text and yield it stripped (``None`` if absent).

        Args:
            response: The Scrapy response for one of ``start_urls``.

        Yields:
            dict: A single-key dict ``{'title': <stripped title or None>}``.
        """
        raw_title = response.css('div.content-title-div h1.content-title::text').get()
        print(f'title: {raw_title}')

        cleaned_title = None
        if raw_title:
            cleaned_title = raw_title.strip()

        yield {'title': cleaned_title}

def run_spider(output_path="results.json"):
    """Crawl with ``IrdCaseContentSpider`` and export the scraped items as JSON.

    Args:
        output_path (str): File the JSON feed is written to. Defaults to
            ``"results.json"``, the path that was previously hard-coded.

    NOTE(review): the output recorded after this cell is
    ``RuntimeError: This event loop is already running``. Presumably the
    Jupyter kernel's own event loop conflicts with ``process.start()``;
    confirm, and consider running the crawl from a plain Python script.
    """
    process = CrawlerProcess(settings={
        "LOG_LEVEL": "ERROR",
        "FEEDS": {
            output_path: {
                "format": "json",
                # Local feed files are appended to by default, and appending a
                # second JSON array makes the file invalid JSON. Replace the
                # file on every run so the cell can be re-executed safely.
                "overwrite": True,
            },
        },
    })
    process.crawl(IrdCaseContentSpider)
    process.start()

if __name__ == "__main__":
    run_spider()

RuntimeError: This event loop is already running

: 

Use Selenium to scrape IRD documents.

In [2]:
# Selenium-based scraper: read the table at https://www.ird.gov.hk/eng/ppr/arc.htm and save it as a JSON file
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import json
import time

In [None]:
def scrape_ird_table(output_path='./output_files/ird_results.json'):
    """Scrape the ``border_table`` table on the IRD ``arc.htm`` page into a JSON file.

    Every data row becomes a dict with the keys ``case_no`` and ``case_link``
    (inner HTML and ``href`` of the first link in column 1, or ``None`` when
    the cell has no link), ``provision`` (inner HTML of column 2) and
    ``index`` (inner HTML of each ``<li>`` in column 3).

    Args:
        output_path (str): Destination JSON file. Defaults to the path that
            was previously hard-coded. Its parent directory is created if it
            does not exist.
    """
    import os  # local import so this cell does not rely on a later import cell

    options = Options()
    # The ``Options.headless`` attribute was deprecated and then removed in
    # later Selenium 4 releases, where assigning it no longer enables headless
    # mode. Passing the Chrome flag works regardless of the Selenium version.
    options.add_argument('--headless=new')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    results = []
    try:
        driver.get('https://www.ird.gov.hk/eng/ppr/arc.htm')
        time.sleep(5)  # wait for the page to load

        table = driver.find_element(By.CSS_SELECTOR, 'table.border_table')
        rows = table.find_elements(By.TAG_NAME, 'tr')[1:]  # skip header row

        for row in rows:
            cols = row.find_elements(By.TAG_NAME, 'td')
            if len(cols) < 3:
                # Rows without the three expected cells would raise IndexError below.
                continue

            # Look the link up once and reuse it for both the text and the href.
            links = cols[0].find_elements(By.TAG_NAME, 'a')
            link = links[0] if links else None

            results.append({
                'case_no': link.get_attribute('innerHTML') if link else None,
                'case_link': link.get_attribute('href') if link else None,
                'provision': cols[1].get_attribute('innerHTML'),
                'index': [li.get_attribute('innerHTML') for li in cols[2].find_elements(By.TAG_NAME, 'li')],
            })
    finally:
        # Always close the browser, even when scraping fails half-way.
        driver.quit()

    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

if __name__ == "__main__":
    scrape_ird_table()

<selenium.webdriver.remote.webelement.WebElement (session="d6d308dab4a380b0608ce8dbce7a0b47", element="f.E406D1A056F22A83968F0DBC1D753D7B.d.D010D05BAB9ABE93990C532DA6EDC61D.e.70")>
<selenium.webdriver.remote.webelement.WebElement (session="d6d308dab4a380b0608ce8dbce7a0b47", element="f.E406D1A056F22A83968F0DBC1D753D7B.d.D010D05BAB9ABE93990C532DA6EDC61D.e.68")>
[<selenium.webdriver.remote.webelement.WebElement (session="d6d308dab4a380b0608ce8dbce7a0b47", element="f.E406D1A056F22A83968F0DBC1D753D7B.d.D010D05BAB9ABE93990C532DA6EDC61D.e.71")>, <selenium.webdriver.remote.webelement.WebElement (session="d6d308dab4a380b0608ce8dbce7a0b47", element="f.E406D1A056F22A83968F0DBC1D753D7B.d.D010D05BAB9ABE93990C532DA6EDC61D.e.72")>]
<selenium.webdriver.remote.webelement.WebElement (session="d6d308dab4a380b0608ce8dbce7a0b47", element="f.E406D1A056F22A83968F0DBC1D753D7B.d.D010D05BAB9ABE93990C532DA6EDC61D.e.76")>
<selenium.webdriver.remote.webelement.WebElement (session="d6d308dab4a380b0608ce8dbce7a0b47"

### Download IRD PDF documents

In [6]:
import subprocess

In [7]:
# Scratch check of the ":02d" format spec (zero-pad to at least two digits).
i = 63
format(i, "02d")

'63'

In [8]:
# download pdf documents to output_files/ird_pdfs

destination_directory = "./output_files/ird_pdfs"

# Handle errors per file: with a single try around the whole loop, the first
# failed download would abort all remaining ones.
failed_downloads = []
for i in range(1, 64):
    pdf_url = f"https://www.ird.gov.hk/eng/pdf/dipn{i:02d}.pdf"
    wget_command = ["wget", "-P", destination_directory, pdf_url]
    try:
        subprocess.run(wget_command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        failed_downloads.append(pdf_url)
        print(f"Error downloading file: {e}")
        print(f"Stderr: {e.stderr}")
    else:
        print(f"File downloaded successfully to: {destination_directory}")

# Summarise failures so they are not lost among the per-file messages.
if failed_downloads:
    print(f"{len(failed_downloads)} file(s) failed to download: {failed_downloads}")

File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to

In [9]:
# download pdf 13A document (its file name has a letter suffix, unlike the
# two-digit numbered files)
pdf_url = "https://www.ird.gov.hk/eng/pdf/dipn13a.pdf"
wget_command = ["wget", "-P", destination_directory, pdf_url]
try:
    subprocess.run(wget_command, check=True, capture_output=True, text=True)
except subprocess.CalledProcessError as download_error:
    print(f"Error downloading file: {download_error}")
    print(f"Stderr: {download_error.stderr}")
else:
    print(f"File downloaded successfully to: {destination_directory}")

File downloaded successfully to: ./output_files/ird_pdfs


### Scrape PDF file metadata

In [2]:
import re

In [6]:
# utils functions
def extract_only_alphanumeric(text: str) -> str:
    '''
    Extract the first continuous run of ASCII letters and digits from the input text.

    Args:
        text (str): The input text to search. ``None`` or an empty string is
            accepted and yields an empty result instead of raising.
    Returns:
        str: The first run of letters/digits, or an empty string if the input
            is empty/``None`` or contains no letters or digits.
    '''
    # Callers may pass None when the source element was missing; re.search
    # would raise TypeError on it.
    if not text:
        return ''

    match = re.search(r'[A-Za-z0-9]+', text)
    return match.group(0) if match else ''
    

def remove_html_tags(text: str) -> str:
    '''
    Remove HTML tags from the input text.

    Everything from a ``<`` up to the next ``>`` is dropped, including tags
    whose attributes are spread over several lines. Text between tags is
    returned unchanged (HTML entities are not decoded).

    Args:
        text (str): The input text containing HTML tags.
    Returns:
        str: The text with HTML tags removed.
    '''
    # re.DOTALL lets "." match newlines; without it a tag that spans more than
    # one line is left in the output.
    return re.sub(r'<.*?>', '', text, flags=re.DOTALL)
    

In [7]:
def scrape_ird_pdf_metadata(output_path='./output_files/ird_pdf_results.json'):
    """Scrape the ``border_table`` table on the IRD ``dip.htm`` page into a JSON file.

    Every data row becomes a dict with the keys ``pdf_no`` (first alphanumeric
    run of the column-1 link's inner HTML, or ``None`` when the cell has no
    link), ``pdf_link`` (``href`` of that link, or ``None``), ``pdf_notes``
    (column-2 content with HTML tags removed) and ``index`` (inner HTML of
    column 3).

    Args:
        output_path (str): Destination JSON file. Defaults to the path that
            was previously hard-coded. Its parent directory is created if it
            does not exist.
    """
    import os  # local import so this cell does not rely on a later import cell

    options = Options()
    # The ``Options.headless`` attribute was deprecated and then removed in
    # later Selenium 4 releases, where assigning it no longer enables headless
    # mode. Passing the Chrome flag works regardless of the Selenium version.
    options.add_argument('--headless=new')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    results = []
    try:
        driver.get('https://www.ird.gov.hk/eng/ppr/dip.htm')
        time.sleep(5)  # wait for the page to load

        table = driver.find_element(By.CSS_SELECTOR, 'table.border_table')
        rows = table.find_elements(By.TAG_NAME, 'tr')[1:]  # skip header row

        for row in rows:
            cols = row.find_elements(By.TAG_NAME, 'td')
            if len(cols) < 3:
                # Rows without the three expected cells would raise IndexError below.
                continue

            # Column 1: look the link up once and reuse it for text and href.
            no_links = cols[0].find_elements(By.TAG_NAME, 'a')
            no_link = no_links[0] if no_links else None
            pdf_no = no_link.get_attribute('innerHTML') if no_link else None
            pdf_link = no_link.get_attribute('href') if no_link else None

            # Column 2: prefer the link's content; fall back to the whole cell
            # when there is no link instead of raising NoSuchElementException.
            note_links = cols[1].find_elements(By.TAG_NAME, 'a')
            notes_source = note_links[0] if note_links else cols[1]
            pdf_notes = notes_source.get_attribute('innerHTML')
            pdf_notes = remove_html_tags(text=pdf_notes) if pdf_notes else None

            pdf_date = cols[2].get_attribute('innerHTML')

            results.append({
                # Only call the helper with real text; pdf_no is None when the
                # first cell has no link.
                'pdf_no': extract_only_alphanumeric(text=pdf_no) if pdf_no else None,
                'pdf_link': pdf_link,
                'pdf_notes': pdf_notes,
                # NOTE(review): the third column's value is stored under
                # 'index' although the variable is a date. The key is kept so
                # existing readers of the JSON keep working; confirm whether
                # it should be renamed.
                'index': pdf_date,
            })
    finally:
        # Always close the browser, even when scraping fails half-way.
        driver.quit()

    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

if __name__ == "__main__":
    scrape_ird_pdf_metadata()

### Parse PDF files using LlamaParse

In [8]:
import nest_asyncio
nest_asyncio.apply()
import os
from dotenv import load_dotenv
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
load_dotenv(override=True)

# Build the LlamaParse client; the API key is read from the LLAMAINDEX_KEY
# environment variable and results are requested as markdown.
parser = LlamaParse(
    api_key=os.getenv('LLAMAINDEX_KEY'),
    result_type="markdown",
    num_workers=4,
    language='en',
    verbose=True,
)

# Route every .pdf file found in output_files/ird_pdfs through the parser.
file_extractor = {".pdf": parser}
pdf_reader = SimpleDirectoryReader(
    input_dir="./output_files/ird_pdfs",
    file_extractor=file_extractor,
)
documents = pdf_reader.load_data()

# NOTE(review): the output recorded below this cell shows
# "Error while parsing the file '<bytes/buffer>': Event loop is closed" after
# each upload, so `documents` may be missing files; confirm the cause.
documents


2025-08-23 21:54:29,889 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 62dca517-8c4a-435e-bc2c-25820356aedf


2025-08-23 21:54:31,239 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/62dca517-8c4a-435e-bc2c-25820356aedf "HTTP/1.1 200 OK"
2025-08-23 21:54:33,620 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/62dca517-8c4a-435e-bc2c-25820356aedf "HTTP/1.1 200 OK"
2025-08-23 21:54:36,967 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/62dca517-8c4a-435e-bc2c-25820356aedf "HTTP/1.1 200 OK"
2025-08-23 21:54:41,310 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/62dca517-8c4a-435e-bc2c-25820356aedf "HTTP/1.1 200 OK"
2025-08-23 21:54:48,571 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/62dca517-8c4a-435e-bc2c-25820356aedf "HTTP/1.1 200 OK"
2025-08-23 21:54:54,422 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/62dca517-8c4a-435e-bc2c-25820356aedf "HTTP/1.1 200 OK"
2025-08-23 21:54:55,088 - INFO - HTTP Request: GET https://api.cloud.llamain

Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 21:55:00,472 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 6a0dc642-7d1c-48fb-8786-bf3a7408b6c4


2025-08-23 21:55:01,823 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/6a0dc642-7d1c-48fb-8786-bf3a7408b6c4 "HTTP/1.1 200 OK"
2025-08-23 21:55:04,175 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/6a0dc642-7d1c-48fb-8786-bf3a7408b6c4 "HTTP/1.1 200 OK"
2025-08-23 21:55:07,534 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/6a0dc642-7d1c-48fb-8786-bf3a7408b6c4 "HTTP/1.1 200 OK"
2025-08-23 21:55:11,889 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/6a0dc642-7d1c-48fb-8786-bf3a7408b6c4 "HTTP/1.1 200 OK"
2025-08-23 21:55:18,355 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/6a0dc642-7d1c-48fb-8786-bf3a7408b6c4 "HTTP/1.1 200 OK"
2025-08-23 21:55:19,854 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/6a0dc642-7d1c-48fb-8786-bf3a7408b6c4/result/markdown "HTTP/1.1 200 OK"


Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 21:55:23,147 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 6631f29c-4905-4d01-822c-9f1948552f19


2025-08-23 21:55:24,463 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/6631f29c-4905-4d01-822c-9f1948552f19 "HTTP/1.1 200 OK"
2025-08-23 21:55:27,122 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/6631f29c-4905-4d01-822c-9f1948552f19 "HTTP/1.1 200 OK"
2025-08-23 21:55:30,472 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/6631f29c-4905-4d01-822c-9f1948552f19 "HTTP/1.1 200 OK"
2025-08-23 21:55:35,025 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/6631f29c-4905-4d01-822c-9f1948552f19 "HTTP/1.1 200 OK"
2025-08-23 21:55:41,087 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/6631f29c-4905-4d01-822c-9f1948552f19 "HTTP/1.1 200 OK"
2025-08-23 21:55:46,914 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/6631f29c-4905-4d01-822c-9f1948552f19 "HTTP/1.1 200 OK"
2025-08-23 21:55:47,520 - INFO - HTTP Request: GET https://api.cloud.llamain

Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 21:55:51,818 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 20f34868-e239-48e4-ad6f-e21339a424d4


2025-08-23 21:55:53,190 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/20f34868-e239-48e4-ad6f-e21339a424d4 "HTTP/1.1 200 OK"
2025-08-23 21:55:55,537 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/20f34868-e239-48e4-ad6f-e21339a424d4 "HTTP/1.1 200 OK"
2025-08-23 21:55:58,897 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/20f34868-e239-48e4-ad6f-e21339a424d4 "HTTP/1.1 200 OK"
2025-08-23 21:56:03,262 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/20f34868-e239-48e4-ad6f-e21339a424d4 "HTTP/1.1 200 OK"
2025-08-23 21:56:09,431 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/20f34868-e239-48e4-ad6f-e21339a424d4 "HTTP/1.1 200 OK"
2025-08-23 21:56:15,271 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/20f34868-e239-48e4-ad6f-e21339a424d4 "HTTP/1.1 200 OK"
2025-08-23 21:56:21,310 - INFO - HTTP Request: GET https://api.cloud.llamain

Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 21:56:25,969 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 046907d4-6bbd-4819-b164-e20c11c202d7


2025-08-23 21:56:27,669 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/046907d4-6bbd-4819-b164-e20c11c202d7 "HTTP/1.1 200 OK"
2025-08-23 21:56:30,150 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/046907d4-6bbd-4819-b164-e20c11c202d7 "HTTP/1.1 200 OK"
2025-08-23 21:56:33,550 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/046907d4-6bbd-4819-b164-e20c11c202d7 "HTTP/1.1 200 OK"
2025-08-23 21:56:37,922 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/046907d4-6bbd-4819-b164-e20c11c202d7 "HTTP/1.1 200 OK"
2025-08-23 21:56:44,203 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/046907d4-6bbd-4819-b164-e20c11c202d7 "HTTP/1.1 200 OK"
2025-08-23 21:56:44,765 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/046907d4-6bbd-4819-b164-e20c11c202d7/result/markdown "HTTP/1.1 200 OK"


Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 21:56:49,146 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 11ef45d4-d1f0-4cac-abf0-614b765af766


2025-08-23 21:56:50,506 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/11ef45d4-d1f0-4cac-abf0-614b765af766 "HTTP/1.1 200 OK"
2025-08-23 21:56:52,870 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/11ef45d4-d1f0-4cac-abf0-614b765af766 "HTTP/1.1 200 OK"
2025-08-23 21:56:56,235 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/11ef45d4-d1f0-4cac-abf0-614b765af766 "HTTP/1.1 200 OK"
2025-08-23 21:57:00,571 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/11ef45d4-d1f0-4cac-abf0-614b765af766 "HTTP/1.1 200 OK"
2025-08-23 21:57:07,603 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/11ef45d4-d1f0-4cac-abf0-614b765af766 "HTTP/1.1 200 OK"
2025-08-23 21:57:14,364 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/11ef45d4-d1f0-4cac-abf0-614b765af766 "HTTP/1.1 200 OK"
2025-08-23 21:57:20,211 - INFO - HTTP Request: GET https://api.cloud.llamain

Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 21:57:27,565 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 7d7682cb-90b2-46da-9e0c-7cd9e80d77bc


2025-08-23 21:57:29,044 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/7d7682cb-90b2-46da-9e0c-7cd9e80d77bc "HTTP/1.1 200 OK"
2025-08-23 21:57:31,387 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/7d7682cb-90b2-46da-9e0c-7cd9e80d77bc "HTTP/1.1 200 OK"
2025-08-23 21:57:34,732 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/7d7682cb-90b2-46da-9e0c-7cd9e80d77bc "HTTP/1.1 200 OK"
2025-08-23 21:57:39,117 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/7d7682cb-90b2-46da-9e0c-7cd9e80d77bc "HTTP/1.1 200 OK"
2025-08-23 21:57:45,021 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/7d7682cb-90b2-46da-9e0c-7cd9e80d77bc "HTTP/1.1 200 OK"
2025-08-23 21:57:45,466 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/7d7682cb-90b2-46da-9e0c-7cd9e80d77bc/result/markdown "HTTP/1.1 200 OK"


Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 21:57:48,578 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 799ce9e9-9cf3-4f4a-bdab-fe9ccd402c5c


2025-08-23 21:57:49,963 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/799ce9e9-9cf3-4f4a-bdab-fe9ccd402c5c "HTTP/1.1 200 OK"
2025-08-23 21:57:52,347 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/799ce9e9-9cf3-4f4a-bdab-fe9ccd402c5c "HTTP/1.1 200 OK"
2025-08-23 21:57:55,758 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/799ce9e9-9cf3-4f4a-bdab-fe9ccd402c5c "HTTP/1.1 200 OK"
2025-08-23 21:58:00,132 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/799ce9e9-9cf3-4f4a-bdab-fe9ccd402c5c "HTTP/1.1 200 OK"
2025-08-23 21:58:06,036 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/799ce9e9-9cf3-4f4a-bdab-fe9ccd402c5c "HTTP/1.1 200 OK"
2025-08-23 21:58:12,777 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/799ce9e9-9cf3-4f4a-bdab-fe9ccd402c5c "HTTP/1.1 200 OK"
2025-08-23 21:58:13,326 - INFO - HTTP Request: GET https://api.cloud.llamain

Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 21:58:17,538 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id f38e5812-aca7-408e-ab3c-91d7283a66f5


2025-08-23 21:58:18,914 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/f38e5812-aca7-408e-ab3c-91d7283a66f5 "HTTP/1.1 200 OK"
2025-08-23 21:58:21,264 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/f38e5812-aca7-408e-ab3c-91d7283a66f5 "HTTP/1.1 200 OK"
2025-08-23 21:58:25,013 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/f38e5812-aca7-408e-ab3c-91d7283a66f5 "HTTP/1.1 200 OK"
2025-08-23 21:58:29,365 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/f38e5812-aca7-408e-ab3c-91d7283a66f5 "HTTP/1.1 200 OK"
2025-08-23 21:58:35,276 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/f38e5812-aca7-408e-ab3c-91d7283a66f5 "HTTP/1.1 200 OK"
2025-08-23 21:58:35,763 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/f38e5812-aca7-408e-ab3c-91d7283a66f5/result/markdown "HTTP/1.1 200 OK"


Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 21:58:39,861 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 07fdf5ae-f1b9-4f04-ad04-1a51fa5e66e3


2025-08-23 21:58:41,196 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/07fdf5ae-f1b9-4f04-ad04-1a51fa5e66e3 "HTTP/1.1 200 OK"
2025-08-23 21:58:43,549 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/07fdf5ae-f1b9-4f04-ad04-1a51fa5e66e3 "HTTP/1.1 200 OK"
2025-08-23 21:58:46,995 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/07fdf5ae-f1b9-4f04-ad04-1a51fa5e66e3 "HTTP/1.1 200 OK"
2025-08-23 21:58:51,351 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/07fdf5ae-f1b9-4f04-ad04-1a51fa5e66e3 "HTTP/1.1 200 OK"
2025-08-23 21:58:57,575 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/07fdf5ae-f1b9-4f04-ad04-1a51fa5e66e3 "HTTP/1.1 200 OK"
2025-08-23 21:59:03,472 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/07fdf5ae-f1b9-4f04-ad04-1a51fa5e66e3 "HTTP/1.1 200 OK"
2025-08-23 21:59:04,232 - INFO - HTTP Request: GET https://api.cloud.llamain

Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 21:59:09,074 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 384a6aa3-8fbc-484f-a030-2b76a9582a52


2025-08-23 21:59:10,431 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/384a6aa3-8fbc-484f-a030-2b76a9582a52 "HTTP/1.1 200 OK"
2025-08-23 21:59:12,784 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/384a6aa3-8fbc-484f-a030-2b76a9582a52 "HTTP/1.1 200 OK"
2025-08-23 21:59:16,247 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/384a6aa3-8fbc-484f-a030-2b76a9582a52 "HTTP/1.1 200 OK"
2025-08-23 21:59:20,615 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/384a6aa3-8fbc-484f-a030-2b76a9582a52 "HTTP/1.1 200 OK"
2025-08-23 21:59:26,480 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/384a6aa3-8fbc-484f-a030-2b76a9582a52 "HTTP/1.1 200 OK"
2025-08-23 21:59:27,011 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/384a6aa3-8fbc-484f-a030-2b76a9582a52/result/markdown "HTTP/1.1 200 OK"


Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 21:59:30,742 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id f0de8478-fe5c-4868-b52a-f1c1b5512900


2025-08-23 21:59:32,137 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/f0de8478-fe5c-4868-b52a-f1c1b5512900 "HTTP/1.1 200 OK"
2025-08-23 21:59:34,461 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/f0de8478-fe5c-4868-b52a-f1c1b5512900 "HTTP/1.1 200 OK"
2025-08-23 21:59:37,909 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/f0de8478-fe5c-4868-b52a-f1c1b5512900 "HTTP/1.1 200 OK"
2025-08-23 21:59:42,247 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/f0de8478-fe5c-4868-b52a-f1c1b5512900 "HTTP/1.1 200 OK"
2025-08-23 21:59:48,221 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/f0de8478-fe5c-4868-b52a-f1c1b5512900 "HTTP/1.1 200 OK"
2025-08-23 21:59:48,747 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/f0de8478-fe5c-4868-b52a-f1c1b5512900/result/markdown "HTTP/1.1 200 OK"


Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 21:59:51,802 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 423c5c9f-dfd1-42fb-8ac9-32835fb4b648


2025-08-23 21:59:53,189 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/423c5c9f-dfd1-42fb-8ac9-32835fb4b648 "HTTP/1.1 200 OK"
2025-08-23 21:59:55,541 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/423c5c9f-dfd1-42fb-8ac9-32835fb4b648 "HTTP/1.1 200 OK"
2025-08-23 21:59:58,917 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/423c5c9f-dfd1-42fb-8ac9-32835fb4b648 "HTTP/1.1 200 OK"
2025-08-23 22:00:03,523 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/423c5c9f-dfd1-42fb-8ac9-32835fb4b648 "HTTP/1.1 200 OK"
2025-08-23 22:00:09,329 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/423c5c9f-dfd1-42fb-8ac9-32835fb4b648 "HTTP/1.1 200 OK"
2025-08-23 22:00:09,767 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/423c5c9f-dfd1-42fb-8ac9-32835fb4b648/result/markdown "HTTP/1.1 200 OK"


Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 22:00:12,612 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id fa1e7593-0bfe-460d-888f-03f36b81c7f9


2025-08-23 22:00:13,967 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/fa1e7593-0bfe-460d-888f-03f36b81c7f9 "HTTP/1.1 200 OK"
2025-08-23 22:00:16,356 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/fa1e7593-0bfe-460d-888f-03f36b81c7f9 "HTTP/1.1 200 OK"
2025-08-23 22:00:19,762 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/fa1e7593-0bfe-460d-888f-03f36b81c7f9 "HTTP/1.1 200 OK"
2025-08-23 22:00:24,871 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/fa1e7593-0bfe-460d-888f-03f36b81c7f9 "HTTP/1.1 200 OK"
2025-08-23 22:00:25,437 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/fa1e7593-0bfe-460d-888f-03f36b81c7f9/result/markdown "HTTP/1.1 200 OK"


Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 22:00:28,198 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 335726b2-946c-44fc-a37c-f7c02f8db2b0


2025-08-23 22:00:29,531 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/335726b2-946c-44fc-a37c-f7c02f8db2b0 "HTTP/1.1 200 OK"
2025-08-23 22:00:31,936 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/335726b2-946c-44fc-a37c-f7c02f8db2b0 "HTTP/1.1 200 OK"
2025-08-23 22:00:35,321 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/335726b2-946c-44fc-a37c-f7c02f8db2b0 "HTTP/1.1 200 OK"
2025-08-23 22:00:39,671 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/335726b2-946c-44fc-a37c-f7c02f8db2b0 "HTTP/1.1 200 OK"
2025-08-23 22:00:46,458 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/335726b2-946c-44fc-a37c-f7c02f8db2b0 "HTTP/1.1 200 OK"
2025-08-23 22:00:46,908 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/335726b2-946c-44fc-a37c-f7c02f8db2b0/result/markdown "HTTP/1.1 200 OK"


Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 22:00:49,910 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 57883ca4-7c5a-4f6d-8f6a-9698cae3f8a7


2025-08-23 22:00:51,258 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/57883ca4-7c5a-4f6d-8f6a-9698cae3f8a7 "HTTP/1.1 200 OK"
2025-08-23 22:00:53,613 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/57883ca4-7c5a-4f6d-8f6a-9698cae3f8a7 "HTTP/1.1 200 OK"
2025-08-23 22:00:56,992 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/57883ca4-7c5a-4f6d-8f6a-9698cae3f8a7 "HTTP/1.1 200 OK"
2025-08-23 22:01:01,379 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/57883ca4-7c5a-4f6d-8f6a-9698cae3f8a7 "HTTP/1.1 200 OK"
2025-08-23 22:01:08,061 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/57883ca4-7c5a-4f6d-8f6a-9698cae3f8a7 "HTTP/1.1 200 OK"
2025-08-23 22:01:08,650 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/57883ca4-7c5a-4f6d-8f6a-9698cae3f8a7/result/markdown "HTTP/1.1 200 OK"


Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 22:01:12,520 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 72757e42-5300-421e-a1ed-6a364e208ec9


2025-08-23 22:01:13,891 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/72757e42-5300-421e-a1ed-6a364e208ec9 "HTTP/1.1 200 OK"
2025-08-23 22:01:17,145 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/72757e42-5300-421e-a1ed-6a364e208ec9 "HTTP/1.1 200 OK"
2025-08-23 22:01:20,481 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/72757e42-5300-421e-a1ed-6a364e208ec9 "HTTP/1.1 200 OK"
2025-08-23 22:01:24,828 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/72757e42-5300-421e-a1ed-6a364e208ec9 "HTTP/1.1 200 OK"
2025-08-23 22:01:31,588 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/72757e42-5300-421e-a1ed-6a364e208ec9 "HTTP/1.1 200 OK"
2025-08-23 22:01:37,742 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/72757e42-5300-421e-a1ed-6a364e208ec9 "HTTP/1.1 200 OK"
2025-08-23 22:01:44,176 - INFO - HTTP Request: GET https://api.cloud.llamain

.

2025-08-23 22:02:11,617 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/72757e42-5300-421e-a1ed-6a364e208ec9 "HTTP/1.1 200 OK"
2025-08-23 22:02:17,417 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/72757e42-5300-421e-a1ed-6a364e208ec9 "HTTP/1.1 200 OK"
2025-08-23 22:02:17,917 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/72757e42-5300-421e-a1ed-6a364e208ec9/result/markdown "HTTP/1.1 200 OK"


Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 22:02:25,348 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 6ce0a8f8-30bf-45f3-bba5-a0cb2e203b80


2025-08-23 22:02:26,695 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/6ce0a8f8-30bf-45f3-bba5-a0cb2e203b80 "HTTP/1.1 200 OK"
2025-08-23 22:02:29,059 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/6ce0a8f8-30bf-45f3-bba5-a0cb2e203b80 "HTTP/1.1 200 OK"
2025-08-23 22:02:32,387 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/6ce0a8f8-30bf-45f3-bba5-a0cb2e203b80 "HTTP/1.1 200 OK"
2025-08-23 22:02:36,724 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/6ce0a8f8-30bf-45f3-bba5-a0cb2e203b80 "HTTP/1.1 200 OK"
2025-08-23 22:02:42,877 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/6ce0a8f8-30bf-45f3-bba5-a0cb2e203b80 "HTTP/1.1 200 OK"
2025-08-23 22:02:49,503 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/6ce0a8f8-30bf-45f3-bba5-a0cb2e203b80 "HTTP/1.1 200 OK"
2025-08-23 22:02:50,139 - INFO - HTTP Request: GET https://api.cloud.llamain

Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 22:02:52,691 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 3db02e76-1d82-4ac2-9648-b5d5ef2f27dc


2025-08-23 22:02:54,033 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/3db02e76-1d82-4ac2-9648-b5d5ef2f27dc "HTTP/1.1 200 OK"
2025-08-23 22:02:57,349 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/3db02e76-1d82-4ac2-9648-b5d5ef2f27dc "HTTP/1.1 200 OK"
2025-08-23 22:03:01,019 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/3db02e76-1d82-4ac2-9648-b5d5ef2f27dc "HTTP/1.1 200 OK"
2025-08-23 22:03:05,381 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/3db02e76-1d82-4ac2-9648-b5d5ef2f27dc "HTTP/1.1 200 OK"
2025-08-23 22:03:11,196 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/3db02e76-1d82-4ac2-9648-b5d5ef2f27dc "HTTP/1.1 200 OK"
2025-08-23 22:03:17,021 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/3db02e76-1d82-4ac2-9648-b5d5ef2f27dc "HTTP/1.1 200 OK"
2025-08-23 22:03:17,479 - INFO - HTTP Request: GET https://api.cloud.llamain

Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 22:03:22,591 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id b5d89fbe-2efb-4510-a4a8-6dc47557b16a


2025-08-23 22:03:23,942 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b5d89fbe-2efb-4510-a4a8-6dc47557b16a "HTTP/1.1 200 OK"
2025-08-23 22:03:26,301 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b5d89fbe-2efb-4510-a4a8-6dc47557b16a "HTTP/1.1 200 OK"
2025-08-23 22:03:29,717 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b5d89fbe-2efb-4510-a4a8-6dc47557b16a "HTTP/1.1 200 OK"
2025-08-23 22:03:34,079 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b5d89fbe-2efb-4510-a4a8-6dc47557b16a "HTTP/1.1 200 OK"
2025-08-23 22:03:39,938 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b5d89fbe-2efb-4510-a4a8-6dc47557b16a "HTTP/1.1 200 OK"
2025-08-23 22:03:45,820 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b5d89fbe-2efb-4510-a4a8-6dc47557b16a "HTTP/1.1 200 OK"
2025-08-23 22:03:46,361 - INFO - HTTP Request: GET https://api.cloud.llamain

Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 22:03:50,553 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 8162c0c7-ae1e-4209-8c85-dead16a406ea


2025-08-23 22:03:51,882 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/8162c0c7-ae1e-4209-8c85-dead16a406ea "HTTP/1.1 200 OK"
2025-08-23 22:03:54,211 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/8162c0c7-ae1e-4209-8c85-dead16a406ea "HTTP/1.1 200 OK"
2025-08-23 22:03:57,567 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/8162c0c7-ae1e-4209-8c85-dead16a406ea "HTTP/1.1 200 OK"
2025-08-23 22:04:01,976 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/8162c0c7-ae1e-4209-8c85-dead16a406ea "HTTP/1.1 200 OK"
2025-08-23 22:04:07,817 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/8162c0c7-ae1e-4209-8c85-dead16a406ea "HTTP/1.1 200 OK"
2025-08-23 22:04:13,655 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/8162c0c7-ae1e-4209-8c85-dead16a406ea "HTTP/1.1 200 OK"
2025-08-23 22:04:14,102 - INFO - HTTP Request: GET https://api.cloud.llamain

Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 22:04:19,527 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 45c30a27-66ee-46c8-b433-d6f03ade6269


2025-08-23 22:04:20,869 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/45c30a27-66ee-46c8-b433-d6f03ade6269 "HTTP/1.1 200 OK"
2025-08-23 22:04:23,204 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/45c30a27-66ee-46c8-b433-d6f03ade6269 "HTTP/1.1 200 OK"
2025-08-23 22:04:26,532 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/45c30a27-66ee-46c8-b433-d6f03ade6269 "HTTP/1.1 200 OK"
2025-08-23 22:04:30,871 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/45c30a27-66ee-46c8-b433-d6f03ade6269 "HTTP/1.1 200 OK"
2025-08-23 22:04:36,786 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/45c30a27-66ee-46c8-b433-d6f03ade6269 "HTTP/1.1 200 OK"
2025-08-23 22:04:44,865 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/45c30a27-66ee-46c8-b433-d6f03ade6269 "HTTP/1.1 200 OK"
2025-08-23 22:04:45,497 - INFO - HTTP Request: GET https://api.cloud.llamain

Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 22:04:56,641 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 063a49df-71d4-4290-828a-c0845afda18d


2025-08-23 22:04:58,024 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/063a49df-71d4-4290-828a-c0845afda18d "HTTP/1.1 200 OK"
2025-08-23 22:05:01,000 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/063a49df-71d4-4290-828a-c0845afda18d "HTTP/1.1 200 OK"
2025-08-23 22:05:04,354 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/063a49df-71d4-4290-828a-c0845afda18d "HTTP/1.1 200 OK"
2025-08-23 22:05:08,853 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/063a49df-71d4-4290-828a-c0845afda18d "HTTP/1.1 200 OK"
2025-08-23 22:05:15,921 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/063a49df-71d4-4290-828a-c0845afda18d "HTTP/1.1 200 OK"
2025-08-23 22:05:21,875 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/063a49df-71d4-4290-828a-c0845afda18d "HTTP/1.1 200 OK"
2025-08-23 22:05:22,405 - INFO - HTTP Request: GET https://api.cloud.llamain

Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 22:05:32,750 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 30ab4a2b-957d-4c3d-b688-691cb668b1f3


2025-08-23 22:05:34,517 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/30ab4a2b-957d-4c3d-b688-691cb668b1f3 "HTTP/1.1 200 OK"
2025-08-23 22:05:36,849 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/30ab4a2b-957d-4c3d-b688-691cb668b1f3 "HTTP/1.1 200 OK"
2025-08-23 22:05:40,182 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/30ab4a2b-957d-4c3d-b688-691cb668b1f3 "HTTP/1.1 200 OK"
2025-08-23 22:05:44,563 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/30ab4a2b-957d-4c3d-b688-691cb668b1f3 "HTTP/1.1 200 OK"
2025-08-23 22:05:50,505 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/30ab4a2b-957d-4c3d-b688-691cb668b1f3 "HTTP/1.1 200 OK"
2025-08-23 22:05:50,942 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/30ab4a2b-957d-4c3d-b688-691cb668b1f3/result/markdown "HTTP/1.1 200 OK"


Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 22:05:58,094 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id bcacf26e-ff3e-4396-a328-cddcfb4c918b


2025-08-23 22:05:59,444 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/bcacf26e-ff3e-4396-a328-cddcfb4c918b "HTTP/1.1 200 OK"
2025-08-23 22:06:01,809 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/bcacf26e-ff3e-4396-a328-cddcfb4c918b "HTTP/1.1 200 OK"
2025-08-23 22:06:05,162 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/bcacf26e-ff3e-4396-a328-cddcfb4c918b "HTTP/1.1 200 OK"
2025-08-23 22:06:09,531 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/bcacf26e-ff3e-4396-a328-cddcfb4c918b "HTTP/1.1 200 OK"
2025-08-23 22:06:15,441 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/bcacf26e-ff3e-4396-a328-cddcfb4c918b "HTTP/1.1 200 OK"
2025-08-23 22:06:23,329 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/bcacf26e-ff3e-4396-a328-cddcfb4c918b "HTTP/1.1 200 OK"
2025-08-23 22:06:23,837 - INFO - HTTP Request: GET https://api.cloud.llamain

Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 22:06:40,913 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 3a5199ae-c7e3-4f43-9e53-601c10875f43


2025-08-23 22:06:42,288 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/3a5199ae-c7e3-4f43-9e53-601c10875f43 "HTTP/1.1 200 OK"
2025-08-23 22:06:45,017 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/3a5199ae-c7e3-4f43-9e53-601c10875f43 "HTTP/1.1 200 OK"
2025-08-23 22:06:48,706 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/3a5199ae-c7e3-4f43-9e53-601c10875f43 "HTTP/1.1 200 OK"
2025-08-23 22:06:53,061 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/3a5199ae-c7e3-4f43-9e53-601c10875f43 "HTTP/1.1 200 OK"
2025-08-23 22:07:00,011 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/3a5199ae-c7e3-4f43-9e53-601c10875f43 "HTTP/1.1 200 OK"
2025-08-23 22:07:06,192 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/3a5199ae-c7e3-4f43-9e53-601c10875f43 "HTTP/1.1 200 OK"
2025-08-23 22:07:07,053 - INFO - HTTP Request: GET https://api.cloud.llamain

Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 22:07:15,833 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id b06c7a10-e75b-4ad9-ba10-5987d85eccd2


2025-08-23 22:07:17,179 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b06c7a10-e75b-4ad9-ba10-5987d85eccd2 "HTTP/1.1 200 OK"
2025-08-23 22:07:19,573 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b06c7a10-e75b-4ad9-ba10-5987d85eccd2 "HTTP/1.1 200 OK"
2025-08-23 22:07:22,930 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b06c7a10-e75b-4ad9-ba10-5987d85eccd2 "HTTP/1.1 200 OK"
2025-08-23 22:07:27,403 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b06c7a10-e75b-4ad9-ba10-5987d85eccd2 "HTTP/1.1 200 OK"
2025-08-23 22:07:33,486 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b06c7a10-e75b-4ad9-ba10-5987d85eccd2 "HTTP/1.1 200 OK"
2025-08-23 22:07:34,054 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b06c7a10-e75b-4ad9-ba10-5987d85eccd2/result/markdown "HTTP/1.1 200 OK"


Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 22:07:43,790 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 326c3623-de69-463e-bc47-806470c3126c


2025-08-23 22:07:45,144 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/326c3623-de69-463e-bc47-806470c3126c "HTTP/1.1 200 OK"
2025-08-23 22:07:47,530 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/326c3623-de69-463e-bc47-806470c3126c "HTTP/1.1 200 OK"
2025-08-23 22:07:50,914 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/326c3623-de69-463e-bc47-806470c3126c "HTTP/1.1 200 OK"
2025-08-23 22:07:55,285 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/326c3623-de69-463e-bc47-806470c3126c "HTTP/1.1 200 OK"
2025-08-23 22:08:01,433 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/326c3623-de69-463e-bc47-806470c3126c "HTTP/1.1 200 OK"
2025-08-23 22:08:07,246 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/326c3623-de69-463e-bc47-806470c3126c "HTTP/1.1 200 OK"
2025-08-23 22:08:08,159 - INFO - HTTP Request: GET https://api.cloud.llamain

Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 22:08:17,918 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 1e83a901-4c29-4aa2-96dc-51e0bd2b2467


2025-08-23 22:08:19,325 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/1e83a901-4c29-4aa2-96dc-51e0bd2b2467 "HTTP/1.1 200 OK"
2025-08-23 22:08:21,701 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/1e83a901-4c29-4aa2-96dc-51e0bd2b2467 "HTTP/1.1 200 OK"
2025-08-23 22:08:25,055 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/1e83a901-4c29-4aa2-96dc-51e0bd2b2467 "HTTP/1.1 200 OK"
2025-08-23 22:08:29,409 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/1e83a901-4c29-4aa2-96dc-51e0bd2b2467 "HTTP/1.1 200 OK"
2025-08-23 22:08:35,703 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/1e83a901-4c29-4aa2-96dc-51e0bd2b2467 "HTTP/1.1 200 OK"
2025-08-23 22:08:41,806 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/1e83a901-4c29-4aa2-96dc-51e0bd2b2467 "HTTP/1.1 200 OK"
2025-08-23 22:08:42,307 - INFO - HTTP Request: GET https://api.cloud.llamain

Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 22:08:48,003 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id be38ed11-7b33-4283-9b32-d888432c2fa7


2025-08-23 22:08:49,450 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/be38ed11-7b33-4283-9b32-d888432c2fa7 "HTTP/1.1 200 OK"
2025-08-23 22:08:51,832 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/be38ed11-7b33-4283-9b32-d888432c2fa7 "HTTP/1.1 200 OK"
2025-08-23 22:08:55,178 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/be38ed11-7b33-4283-9b32-d888432c2fa7 "HTTP/1.1 200 OK"
2025-08-23 22:08:59,519 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/be38ed11-7b33-4283-9b32-d888432c2fa7 "HTTP/1.1 200 OK"
2025-08-23 22:09:05,438 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/be38ed11-7b33-4283-9b32-d888432c2fa7 "HTTP/1.1 200 OK"
2025-08-23 22:09:11,419 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/be38ed11-7b33-4283-9b32-d888432c2fa7 "HTTP/1.1 200 OK"
2025-08-23 22:09:11,889 - INFO - HTTP Request: GET https://api.cloud.llamain

Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 22:09:20,861 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id e2861734-642c-4324-b43a-fa7079e8679c


2025-08-23 22:09:22,295 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/e2861734-642c-4324-b43a-fa7079e8679c "HTTP/1.1 200 OK"
2025-08-23 22:09:24,855 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/e2861734-642c-4324-b43a-fa7079e8679c "HTTP/1.1 200 OK"
2025-08-23 22:09:28,507 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/e2861734-642c-4324-b43a-fa7079e8679c "HTTP/1.1 200 OK"
2025-08-23 22:09:32,872 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/e2861734-642c-4324-b43a-fa7079e8679c "HTTP/1.1 200 OK"
2025-08-23 22:09:40,934 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/e2861734-642c-4324-b43a-fa7079e8679c "HTTP/1.1 200 OK"
2025-08-23 22:09:47,547 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/e2861734-642c-4324-b43a-fa7079e8679c "HTTP/1.1 200 OK"
2025-08-23 22:09:48,058 - INFO - HTTP Request: GET https://api.cloud.llamain

Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-23 22:09:56,911 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id cad1f0ba-f81a-4803-8770-321a9af9ac1d


2025-08-23 22:09:58,291 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/cad1f0ba-f81a-4803-8770-321a9af9ac1d "HTTP/1.1 200 OK"
2025-08-23 22:10:00,691 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/cad1f0ba-f81a-4803-8770-321a9af9ac1d "HTTP/1.1 200 OK"
2025-08-23 22:10:04,057 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/cad1f0ba-f81a-4803-8770-321a9af9ac1d "HTTP/1.1 200 OK"
2025-08-23 22:10:08,443 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/cad1f0ba-f81a-4803-8770-321a9af9ac1d "HTTP/1.1 200 OK"
2025-08-23 22:10:15,554 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/cad1f0ba-f81a-4803-8770-321a9af9ac1d "HTTP/1.1 200 OK"
2025-08-23 22:10:21,434 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/cad1f0ba-f81a-4803-8770-321a9af9ac1d "HTTP/1.1 200 OK"
2025-08-23 22:10:30,739 - INFO - HTTP Request: GET https://api.cloud.llamain

Error while parsing the file '<bytes/buffer>': Event loop is closed


[Document(id_='57e57e01-b5b6-4b9e-8e37-8a97d6b68470', embedding=None, metadata={'file_path': '/Users/tongcc/dev/projects/interview/ird_data_pipeline/tests/output_files/ird_pdfs/dipn01.pdf', 'file_name': 'dipn01.pdf', 'file_type': 'application/pdf', 'file_size': 502041, 'creation_date': '2020-09-30', 'last_modified_date': '2020-09-30'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='\n# Inland Revenue Department\n\n# The Government of the Hong Kong Special Administrative Region of the People’s Republic of China\n\n# DEPARTMENTAL INTERPRETATION AND PRACTICE NOTES\n\n# NO. 1 (REVISED)\n\n# PROFITS TAX\n\n# PART A: COMPUTING ASSESSABLE PROFIT

In [1]:
import pickle

In [10]:
# Serialize the in-memory `documents` list to a pickle file under ./objects
with open('./objects/ird_llamaindex_pdf_documents.pkl', 'wb') as pkl_out:
    pickle.dump(documents, pkl_out)

In [2]:
# Read the pickled documents back from ./objects
# NOTE: pickle.load executes code embedded in the file, so only load pickles you produced yourself.
with open('./objects/ird_llamaindex_pdf_documents.pkl', 'rb') as pkl_in:
    documents_loaded = pickle.load(pkl_in)

# Preview the first five entries
documents_loaded[:5]


[Document(id_='57e57e01-b5b6-4b9e-8e37-8a97d6b68470', embedding=None, metadata={'file_path': '/Users/tongcc/dev/projects/interview/ird_data_pipeline/tests/output_files/ird_pdfs/dipn01.pdf', 'file_name': 'dipn01.pdf', 'file_type': 'application/pdf', 'file_size': 502041, 'creation_date': '2020-09-30', 'last_modified_date': '2020-09-30'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='\n# Inland Revenue Department\n\n# The Government of the Hong Kong Special Administrative Region of the People’s Republic of China\n\n# DEPARTMENTAL INTERPRETATION AND PRACTICE NOTES\n\n# NO. 1 (REVISED)\n\n# PROFITS TAX\n\n# PART A: COMPUTING ASSESSABLE PROFIT

In [3]:
# Expose the documents reloaded from the pickle under the name `documents` used by the following cells.
documents = documents_loaded

In [4]:
# Number of entries in `documents`.
len(documents)

1142

### Converting pdf files to markdown files

##### using pandoc

In [14]:
import subprocess

In [18]:
# Try to convert every PDF in ./output_files/ird_pdfs to Markdown by shelling out to pandoc,
# writing the results into ./output_files/ird_md_pandoc.
# NOTE: pandoc rejects this with "Unknown input format pdf" (it can write PDF but not read it),
# so this cell documents a dead end rather than a working converter.
directory_path = "./output_files/ird_pdfs"
output_directory = "./output_files/ird_md_pandoc"
os.makedirs(output_directory, exist_ok=True)  # make sure the target folder exists before writing into it
try:
    for filename in os.listdir(directory_path):
        if not filename.endswith(".pdf"):
            continue
        pdf_path = os.path.join(directory_path, filename)
        # swap only the trailing extension; str.replace would rewrite every ".pdf" inside the name
        md_filename = os.path.splitext(filename)[0] + ".md"
        md_path = os.path.join(output_directory, md_filename)
        print(f'pdf_path: {pdf_path}, md_path: {md_path}')
        # lower-case executable name so the lookup also works on case-sensitive filesystems
        pandoc_command = ["pandoc", "-f", "pdf", "-t", "markdown", "-o", md_path, pdf_path]
        subprocess.run(pandoc_command, check=True, capture_output=True, text=True)
        print(f"Converted {pdf_path} to {md_path}")
except subprocess.CalledProcessError as e:
    # a non-zero pandoc exit is a conversion failure, not a download failure
    print(f"Error converting file: {e}")
    print(f"Stderr: {e.stderr}")

pdf_path: ./output_files/ird_pdfs/dipn19.pdf, md_path: ./output_files/ird_md_pandoc/dipn19.md
Error downloading file: Command '['Pandoc', '-f', 'pdf', '-t', 'markdown', '-o', './output_files/ird_md_pandoc/dipn19.md', './output_files/ird_pdfs/dipn19.pdf']' returned non-zero exit status 21.
Stderr: Unknown input format pdf
Pandoc can convert to PDF, but not from PDF.



##### using markitdown

In [25]:
from markitdown import MarkItDown

# One MarkItDown converter is built once and reused for every file
converter = MarkItDown()

# Walk the PDF folder and write a same-named .md file for each PDF found
directory_path = "./output_files/ird_pdfs"
output_directory = "./output_files/ird_md_markitdown"
for filename in os.listdir(directory_path):
    if not filename.endswith(".pdf"):
        continue  # ignore anything that is not a PDF

    pdf_path = os.path.join(directory_path, filename)
    md_filename = filename.replace(".pdf", ".md")
    md_path = os.path.join(output_directory, md_filename)

    # `markdown_text` is the conversion result object; its text lives in .text_content
    markdown_text = converter.convert(pdf_path)

    # Save the extracted text as UTF-8
    with open(md_path, "w", encoding="utf-8") as md_file:
        md_file.write(markdown_text.text_content)


In [24]:
# Show the text of whichever PDF the markitdown conversion loop processed last.
markdown_text.text_content

"Inland Revenue Department\nThe Government of the Hong Kong Special Administrative Region\nof the People's Republic of China\n\nDEPARTMENTAL INTERPRETATION AND PRACTICE NOTES\n\nNO. 14 (REVISED)\n\nPROPERTY TAX\n\nThese  notes  are  issued  for  the  information  of  taxpayers  and  their  tax\nrepresentatives.  They  contain  the  Department’s  interpretation  and  practices  in\nrelation to the law as it stood at the date of publication.  Taxpayers are reminded\nthat their right of objection against the assessment and their right of appeal to the\nCommissioner,  the  Board  of  Review  or  the  Court  are  not  affected  by  the\napplication of these notes.\n\nThese notes replace those issued in February 2005.\n\nCHU Yam-yuen\nCommissioner of Inland Revenue\n\nMarch 2011\n\nOur web site : www.ird.gov.hk\n\n\x0cDEPARTMENTAL INTERPRETATION AND PRACTICE NOTES\n\nNo. 14 (REVISED)\n\nCONTENT\n\nParagraph\n\nIntroduction\n\nCharge of property tax\n\nMeanings of “Owners”\nMeanings of “Land 

##### using unstructured.io

In [32]:
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_md

# Convert every PDF to Markdown with unstructured:
# partition each file into elements, then write those elements out as a .md file.
directory_path = "./output_files/ird_pdfs"
output_directory = "./output_files/ird_md_unstructuredio"
for filename in os.listdir(directory_path):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(directory_path, filename)
        md_filename = filename.replace(".pdf", ".md")
        md_path = os.path.join(output_directory, md_filename)

        # The keyword was previously misspelled "stragtegy", so the intended
        # hi_res partitioning strategy was not actually being requested.
        elements = partition_pdf(filename=pdf_path, strategy="hi_res")
        elements_to_md(elements=elements, filename=md_path)




### TokenTextSplitter research

In [5]:
import sys
sys.path.append('/Users/tongcc/dev/projects/interview/ird_data_pipeline')
from llama_index.core.node_parser import TokenTextSplitter
from src.config.settings import CHUNK_SIZE, CHUNK_OVERLAP

In [6]:
# Split the documents into nodes with TokenTextSplitter, using the chunk size and
# overlap imported from src.config.settings.
splitter = TokenTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
token_nodes = splitter.get_nodes_from_documents(documents, show_progress=True)

  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 1142/1142 [00:00<00:00, 2015.35it/s]


In [7]:
# Peek at the first five nodes produced by the splitter.
token_nodes[:5]

[TextNode(id_='e1b48477-8502-418e-a932-c3fbd03127b0', embedding=None, metadata={'file_path': '/Users/tongcc/dev/projects/interview/ird_data_pipeline/tests/output_files/ird_pdfs/dipn01.pdf', 'file_name': 'dipn01.pdf', 'file_type': 'application/pdf', 'file_size': 502041, 'creation_date': '2020-09-30', 'last_modified_date': '2020-09-30'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='57e57e01-b5b6-4b9e-8e37-8a97d6b68470', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '/Users/tongcc/dev/projects/interview/ird_data_pipeline/tests/output_files/ird_pdfs/dipn01.pdf', 'file_name': 'dipn01.pdf', 'file_type': 'application/pdf', 'file_size': 502041, 'creation_date': '2020-09-30', 'last_modified_date': 

### Embedding generation research with LlamaIndex

In [8]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import torch
import sys
sys.path.append('/Users/tongcc/dev/projects/interview/ird_data_pipeline')
from src.config.settings import EMBEDDING_MODEL_NAME, MAX_LENGTH

In [9]:
# Use the CUDA GPU when PyTorch reports one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [10]:

# Instantiate the HuggingFace embedding model named in src.config.settings on the selected device.
embedding_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL_NAME, max_length=MAX_LENGTH, device=device)

In [11]:
# Embed a short sample string and record the vector length.
# `dim` is later passed to the OpenSearch client as the embedding dimension.
embeddings = embedding_model.get_text_embedding("box")
dim = len(embeddings)
print("embedding dimension of example text ===>",dim)

embedding dimension of example text ===> 768


### OpenSearch research with LlamaIndex

In [12]:
from llama_index.vector_stores.opensearch import (
    OpensearchVectorStore,
    OpensearchVectorClient,
)

In [22]:
# Connection settings for a local OpenSearch instance and the index used by this experiment.
opensearch_endpoint = "http://localhost:9200"
index_name = "my-nlp-index"

# NOTE(review): these field names are passed explicitly below; the client's built-in
# defaults are believed to be different names, so do not rely on them being defaults — confirm.
text_field = "context_text" # field name given to OpensearchVectorClient for the node text
embedding_field = "passage_embedding" # field name given to OpensearchVectorClient for the embedding vector
search_pipeline="nlp-search-pipeline"

# `dim` comes from the embedding-dimension probe cell, so the index dimension matches the model output.
opensearch_client = OpensearchVectorClient(
    endpoint=opensearch_endpoint,
    index=index_name,
    dim=dim,
    embedding_field=embedding_field,
    text_field=text_field,
    search_pipeline=search_pipeline,
)

In [23]:
# Wrap the OpenSearch client in a LlamaIndex vector store object.
vector_store = OpensearchVectorStore(opensearch_client)

### VectorStoreIndex and Store Embeddings with Llama-Index

In [24]:
from llama_index.core import VectorStoreIndex, StorageContext

In [25]:
# Build a storage context around the OpenSearch-backed vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [26]:
# Build a VectorStoreIndex over the token nodes, using the storage context and embedding model created above.
index = VectorStoreIndex(
    token_nodes, storage_context=storage_context, embed_model=embedding_model
)

### Retriever with Llama-Index

In [27]:
from llama_index.core.vector_stores.types import VectorStoreQueryMode
# import sys
# sys.path.append('/Users/tongcc/dev/projects/interview/ird_data_pipeline')
# from src.config.settings import TOP_K
# TOP_K is hard-coded for this experiment; the settings import above is left disabled.
TOP_K = 3

In [28]:
# Retriever over the index that asks for TOP_K results and queries the vector store in hybrid mode.
# NOTE(review): hybrid mode presumably depends on the "nlp-search-pipeline" search pipeline
# configured on the OpenSearch client — confirm it exists on the server.
retriever = index.as_retriever(
    similarity_top_k=TOP_K,
    vector_store_query_mode=VectorStoreQueryMode.HYBRID
)

In [29]:
# Run one sample question through the retriever and print every hit
query = "What is the tax treatment of a gain arising from the sale of a capital asset?"

# NOTE: despite its name, `prompt` holds the retrieved results, not a prompt string
prompt = retriever.retrieve(query)

for r in prompt:
    # metadata line, then the result's own string form, then an empty separator line
    print(r.metadata, r, "", sep="\n")

{'file_path': '/Users/tongcc/dev/projects/interview/ird_data_pipeline/tests/output_files/ird_pdfs/dipn42.pdf', 'file_name': 'dipn42.pdf', 'file_type': 'application/pdf', 'file_size': 458508, 'creation_date': '2021-06-08', 'last_modified_date': '2021-06-08'}
Node ID: 20e9ce9a-1504-460f-8519-50b2dcf7e268
Text: 85. Exchange gains or losses are neither taxable nor allowable
if they are of a capital nature. In *CIR v General Garment Manufactory
(Hong Kong) Ltd 4 HKTC 532, the exchange loss was found deductible
because, notwithstanding the Board’s limited analysis, the intention
at the time of acquisition of the foreign currency was to dispose of
it quick...
Score:  1.000


{'file_path': '/Users/tongcc/dev/projects/interview/ird_data_pipeline/tests/output_files/ird_pdfs/dipn42.pdf', 'file_name': 'dipn42.pdf', 'file_type': 'application/pdf', 'file_size': 458508, 'creation_date': '2021-06-08', 'last_modified_date': '2021-06-08'}
Node ID: c57e3f1d-2903-4cfd-9951-5af2c554692c
Text: and the gain 

### Testing scripts

In [1]:
import nest_asyncio
nest_asyncio.apply()
import os
import json
import re
from dotenv import load_dotenv
from llama_parse import LlamaParse
from llama_index.core import Document
from llama_index.core import SimpleDirectoryReader
load_dotenv(override=True)

# Create the LlamaParse instance; the API key is read from the LLAMAINDEX_KEY
# environment variable and results are requested as markdown.
parser = LlamaParse(
    api_key=os.getenv('LLAMAINDEX_KEY'),
    result_type="markdown",
    num_workers=4,
    language='en',
    verbose=False
)

# Set up a SimpleDirectoryReader over output_files/ird_pdfs that hands .pdf files to the parser.
# NOTE: `documents` is the reader object itself, not a list of documents;
# later cells call documents.iter_data() / documents.load_data() on it.
file_extractor = {".pdf": parser}
documents = SimpleDirectoryReader(
    input_dir="./output_files/ird_pdfs",
    file_extractor=file_extractor
)



In [2]:
def preprocess_text(text: str) -> str:
    """
    Strip HTML-like tags from the text and trim leading/trailing whitespace.

    A tag is any '<'...'>' span on a single line (the shortest such match is removed).
    Other characters and interior whitespace are left untouched.

    Args:
        text (str): The input text to be preprocessed.

    Returns:
        str: The text without tags and without surrounding whitespace.
    """
    # Non-greedy match so each '<...>' pair is removed on its own
    without_tags = re.sub(r'<.*?>', '', text)
    return without_tags.strip()

In [3]:
# Read the scraped PDF metadata and order it by its 'pdf_link' value
# NOTE(review): metadata is attached to parsed files purely by position, which assumes
# iter_data() yields files in the same order as the sorted 'pdf_link' values — confirm.
with open('./output_files/ird_pdf_results.json', 'r', encoding='utf-8') as f:
    ird_pdf_metadata = sorted(json.load(f), key=lambda x: x['pdf_link'])

docs_pdf = []
for index, docs in enumerate(documents.iter_data()):
    # report how many documents this file produced
    print(len(docs))
    print('------')
    # rebuild each parsed document with cleaned text and the metadata entry at the same position
    docs_pdf.extend(
        Document(text=preprocess_text(doc.text), metadata=ird_pdf_metadata[index])
        for doc in docs
    )

2025-08-27 07:33:34,778 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
2025-08-27 07:33:36,122 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/d3de1712-d292-49d0-987b-dcb63383e3f8 "HTTP/1.1 200 OK"
2025-08-27 07:33:38,453 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/d3de1712-d292-49d0-987b-dcb63383e3f8 "HTTP/1.1 200 OK"
2025-08-27 07:33:38,976 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/d3de1712-d292-49d0-987b-dcb63383e3f8/result/markdown "HTTP/1.1 200 OK"


74
------
Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-27 07:33:41,142 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
2025-08-27 07:33:42,475 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/4b537910-413a-4667-92e0-b126107066f6 "HTTP/1.1 200 OK"
2025-08-27 07:33:44,812 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/4b537910-413a-4667-92e0-b126107066f6 "HTTP/1.1 200 OK"
2025-08-27 07:33:45,261 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/4b537910-413a-4667-92e0-b126107066f6/result/markdown "HTTP/1.1 200 OK"


6
------
Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-08-27 07:33:47,473 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
2025-08-27 07:33:48,799 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/86c53bed-de71-48f2-8b73-9e6dc6848b32 "HTTP/1.1 200 OK"
2025-08-27 07:33:51,134 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/86c53bed-de71-48f2-8b73-9e6dc6848b32 "HTTP/1.1 200 OK"
2025-08-27 07:33:51,672 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/86c53bed-de71-48f2-8b73-9e6dc6848b32/result/markdown "HTTP/1.1 200 OK"


25
------


In [4]:
# Load the whole directory through the reader (named `documents`), requesting 10 workers,
# and keep the result as a list.
all_docs = list(documents.load_data(num_workers=10))

2025-08-27 07:34:36,731 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
2025-08-27 07:34:36,731 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
2025-08-27 07:34:36,929 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
2025-08-27 07:34:37,243 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
2025-08-27 07:34:37,243 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
2025-08-27 07:34:38,036 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/58cd41f1-6d79-42c6-ad93-1ccddbc46286 "HTTP/1.1 200 OK"
2025-08-27 07:34:38,069 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/a2bd8f35-4a55-4ae7-807a-19dbf5fc9805 "HTTP/1.1 200 OK"
2025-08-27 07:34:38,322 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/

In [5]:
all_docs

[Document(id_='6c869203-e642-4402-b7d4-b738a8d8719f', embedding=None, metadata={'file_path': '/Users/tongcc/dev/projects/interview/ird_data_pipeline/tests/output_files/ird_pdfs/dipn01.pdf', 'file_name': 'dipn01.pdf', 'file_type': 'application/pdf', 'file_size': 502041, 'creation_date': '2020-09-30', 'last_modified_date': '2020-09-30'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='\n# Inland Revenue Department\n\n# The Government of the Hong Kong Special Administrative Region of the People’s Republic of China\n\n# DEPARTMENTAL INTERPRETATION AND PRACTICE NOTES\n\n# NO. 1 (REVISED)\n\n# PROFITS TAX\n\n# PART A: COMPUTING ASSESSABLE PROFIT

In [8]:
import pickle

In [9]:
# Pickle-dump `all_docs` to disk.
# The path is relative to the notebook's working directory (the conversion cells
# already use "./output_files/..."), rather than one user's absolute home path,
# so the notebook is runnable from any checkout.
import os

DOCS_PICKLE_PATH = './objects/test_documents.pkl'
# Create the target directory on a fresh checkout instead of failing in open().
os.makedirs(os.path.dirname(DOCS_PICKLE_PATH), exist_ok=True)
with open(DOCS_PICKLE_PATH, 'wb') as f:
    pickle.dump(all_docs, f)

In [10]:
# Pickle-load the documents back into `all_docs`.
# The path is relative to the notebook's working directory (the conversion cells
# already use "./output_files/..."), rather than one user's absolute home path.
# NOTE: pickle.load can execute arbitrary code; only load files this project wrote.
DOCS_PICKLE_PATH = './objects/test_documents.pkl'
with open(DOCS_PICKLE_PATH, 'rb') as f:
    all_docs = pickle.load(f)


In [11]:
# Display the documents again after reloading them from the pickle file.
# NOTE(review): the recorded output's first id_ differs from the one shown before
# the dump cell, so the pickle may come from a different run -- confirm.
all_docs

[Document(id_='281b1788-ebf7-42fa-b316-a0cdad2bad6f', embedding=None, metadata={'file_path': '/Users/tongcc/dev/projects/interview/ird_data_pipeline/tests/output_files/ird_pdfs/dipn01.pdf', 'file_name': 'dipn01.pdf', 'file_type': 'application/pdf', 'file_size': 502041, 'creation_date': '2020-09-30', 'last_modified_date': '2020-09-30'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='\n# Inland Revenue Department\n\n# The Government of the Hong Kong Special Administrative Region of the People’s Republic of China\n\n# DEPARTMENTAL INTERPRETATION AND PRACTICE NOTES\n\n# NO. 1 (REVISED)\n\n# PROFITS TAX\n\n# PART A: COMPUTING ASSESSABLE PROFIT

### Convert PDF files to Markdown

In [10]:
import aspose.words as aw

# Load the source PDF with Aspose.Words and write it back out under a .md name.
# NOTE(review): the output format is presumably chosen from the ".md" extension -- confirm.
doc = aw.Document("./output_files/ird_pdfs/dipn01.pdf")
# Left as the cell's last expression, so the notebook displays the object save() returns.
# NOTE(review): "ird_md_apose" looks like a typo for "aspose"; it is an on-disk path, so
# verify the directory name before renaming it.
doc.save("./output_files/ird_md_apose/dipn01.md")

<aspose.words.saving.SaveOutputParameters object at 0x1222ec1b0>

In [12]:
# Example using PyMuPDF (simplified): extract every page's text and save it as .md.
# NOTE(review): if the import below raises "RuntimeError: Directory 'static/' does
# not exist", the unrelated `fitz` PyPI package is probably installed instead of
# PyMuPDF -- check with `pip show fitz PyMuPDF` and reinstall PyMuPDF.
import os

import fitz # PyMuPDF

doc = fitz.open("./output_files/ird_pdfs/dipn01.pdf")
# Page.get_text() has no Markdown mode; "text" (plain text) is requested explicitly
# instead of an unrecognised option string. Real Markdown output would need a
# dedicated converter -- TODO if Markdown structure is required.
markdown_output = "".join(page.get_text("text") for page in doc)
# Create the output directory on a fresh checkout instead of failing in open().
os.makedirs("./output_files/ird_md_fitz", exist_ok=True)
# Explicit UTF-8 so non-ASCII characters in the PDF text are written consistently
# regardless of the platform's default encoding.
with open("./output_files/ird_md_fitz/dipn01.md", "w", encoding="utf-8") as f:
    f.write(markdown_output)

RuntimeError: Directory 'static/' does not exist

### Convert PDF files to XML