# Notebook for testing

### Scrape IRD documents

Use Scrapy to scrape IRD documents.

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess

class IrdTableSpider(scrapy.Spider):
    """Spider that yields one item per data row of the IRD ``border_table``.

    Each item is a dict with the keys ``case_no``, ``case_link``,
    ``provision`` and ``index``.
    """

    name = "ird_table_spider"
    start_urls = ['https://www.ird.gov.hk/eng/ppr/arc.htm']

    def parse(self, response):
        """Extract the case number, link, provision and index list of every row.

        Args:
            response: The downloaded page handed over by Scrapy.

        Yields:
            dict: ``case_no`` (link text of column 1), ``case_link`` (absolute
            URL or ``None``), ``provision`` (stripped text of column 2 or
            ``None``) and ``index`` (stripped ``<li>`` texts of column 3).
        """
        # Browsers add <tbody> to the DOM even when the served HTML has none,
        # whereas Scrapy parses the served HTML unchanged. Match rows in both
        # layouts so the spider does not silently yield nothing.
        table = '//table[contains(@class, "border_table")]'
        rows = response.xpath(
            f'{table}/tbody/tr[position()>1] | {table}/tr[position()>1]'
        )
        for row in rows:
            case_no = row.xpath('td[1]/a/text()').get()
            case_link = row.xpath('td[1]/a/@href').get()
            provision = row.xpath('td[2]/text()').get()
            # Get all <li> items in the 3rd column
            index_items = row.xpath('td[3]//li/text()').getall()
            yield {
                'case_no': case_no,
                'case_link': response.urljoin(case_link) if case_link else None,
                'provision': provision.strip() if provision else None,
                'index': [item.strip() for item in index_items],
            }

def run_spider():
    """Crawl with IrdTableSpider and export the scraped items to results.json."""
    crawler_settings = {
        "LOG_LEVEL": "ERROR",
        "FEEDS": {"results.json": {"format": "json"}},
    }
    crawler = CrawlerProcess(settings=crawler_settings)
    crawler.crawl(IrdTableSpider)
    crawler.start()

if __name__ == "__main__":
    # Only crawl when executed as the main module (e.g. a notebook cell or script).
    run_spider()

Use Selenium to scrape IRD documents.

In [21]:
# write a python script using selenium to scrape the table data from https://www.ird.gov.hk/eng/ppr/arc.htm and save it as a json file
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import json
import time

In [None]:
def scrape_ird_table():
    """Scrape the ``table.border_table`` rows of the IRD page and save them as JSON.

    Opens https://www.ird.gov.hk/eng/ppr/arc.htm in headless Chrome, reads every
    row after the header row and writes a list of dicts with the keys
    ``case_no``, ``case_link``, ``provision`` and ``index`` to
    ``./output_files/ird_results.json``. Rows with fewer than three ``<td>``
    cells are skipped.
    """
    import os  # local import: only needed to create the output directory

    output_path = './output_files/ird_results.json'

    options = Options()
    # Setting `options.headless = True` is deprecated and ignored by recent
    # Selenium 4 releases; pass the Chrome flag explicitly instead.
    options.add_argument('--headless=new')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.get('https://www.ird.gov.hk/eng/ppr/arc.htm')
        time.sleep(5)  # wait for the page to load

        table = driver.find_element(By.CSS_SELECTOR, 'table.border_table')
        rows = table.find_elements(By.TAG_NAME, 'tr')[1:]  # skip header row

        results = []
        for row in rows:
            cols = row.find_elements(By.TAG_NAME, 'td')
            if len(cols) < 3:
                # Not a data row (e.g. header-only cells); indexing would fail.
                continue

            # Look the link up once; the first column may have no <a>.
            anchors = cols[0].find_elements(By.TAG_NAME, 'a')
            anchor = anchors[0] if anchors else None

            results.append({
                'case_no': anchor.get_attribute('innerHTML') if anchor is not None else None,
                'case_link': anchor.get_attribute('href') if anchor is not None else None,
                'provision': cols[1].get_attribute('innerHTML'),
                'index': [
                    li.get_attribute('innerHTML')
                    for li in cols[2].find_elements(By.TAG_NAME, 'li')
                ],
            })
    finally:
        # Always shut the browser down, even when scraping raised.
        driver.quit()

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

if __name__ == "__main__":
    scrape_ird_table()

<selenium.webdriver.remote.webelement.WebElement (session="d6d308dab4a380b0608ce8dbce7a0b47", element="f.E406D1A056F22A83968F0DBC1D753D7B.d.D010D05BAB9ABE93990C532DA6EDC61D.e.70")>
<selenium.webdriver.remote.webelement.WebElement (session="d6d308dab4a380b0608ce8dbce7a0b47", element="f.E406D1A056F22A83968F0DBC1D753D7B.d.D010D05BAB9ABE93990C532DA6EDC61D.e.68")>
[<selenium.webdriver.remote.webelement.WebElement (session="d6d308dab4a380b0608ce8dbce7a0b47", element="f.E406D1A056F22A83968F0DBC1D753D7B.d.D010D05BAB9ABE93990C532DA6EDC61D.e.71")>, <selenium.webdriver.remote.webelement.WebElement (session="d6d308dab4a380b0608ce8dbce7a0b47", element="f.E406D1A056F22A83968F0DBC1D753D7B.d.D010D05BAB9ABE93990C532DA6EDC61D.e.72")>]
<selenium.webdriver.remote.webelement.WebElement (session="d6d308dab4a380b0608ce8dbce7a0b47", element="f.E406D1A056F22A83968F0DBC1D753D7B.d.D010D05BAB9ABE93990C532DA6EDC61D.e.76")>
<selenium.webdriver.remote.webelement.WebElement (session="d6d308dab4a380b0608ce8dbce7a0b47"

### Download IRD PDF documents

In [16]:
import subprocess

In [18]:
# Quick check of the "02d" format spec (zero-pads to at least two digits) used for the PDF file names.
i = 63
f"{i:02d}"

'63'

In [None]:
# Download the numbered PDF documents (dipn01.pdf ... dipn63.pdf) to output_files/ird_pdfs.

destination_directory = "./output_files/ird_pdfs"

for i in range(1, 64):
    pdf_url = f"https://www.ird.gov.hk/eng/pdf/dipn{i:02d}.pdf"
    wget_command = ["wget", "-P", destination_directory, pdf_url]
    # Handle failures per file: with a single try around the whole loop, the
    # first failed download would abort all remaining ones.
    try:
        subprocess.run(wget_command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Error downloading file: {e}")
        print(f"Stderr: {e.stderr}")
    else:
        print(f"File downloaded successfully to: {destination_directory}")

File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to: ./output_files/ird_pdfs
File downloaded successfully to

In [20]:
# Download the 13A document (dipn13a.pdf) into the same destination directory.
pdf_url = f"https://www.ird.gov.hk/eng/pdf/dipn13a.pdf"
wget_command = ["wget", "-P", destination_directory, pdf_url]
try:
    subprocess.run(wget_command, check=True, capture_output=True, text=True)
except subprocess.CalledProcessError as download_error:
    # wget exited with a non-zero status; show its captured stderr.
    print(f"Error downloading file: {download_error}")
    print(f"Stderr: {download_error.stderr}")
else:
    print(f"File downloaded successfully to: {destination_directory}")

File downloaded successfully to: ./output_files/ird_pdfs


### Scrape PDF file metadata

In [None]:
def scrape_ird_pdf_metadata():
    """Scrape the ``table.border_table`` rows of the IRD page and save them as JSON.

    Opens https://www.ird.gov.hk/eng/ppr/dip.htm in headless Chrome, reads every
    row after the header row and writes a list of dicts to
    ``./output_files/ird_pdf_results.json`` with the keys:

    * ``pdf_no``    - innerHTML of the link in column 1 (``None`` if no link)
    * ``pdf_link``  - href of the link in column 1 (``None`` if no link)
    * ``pdf_notes`` - innerHTML of the link in column 2 (``None`` if no link)
    * ``pdf_date``  - innerHTML of column 3

    Rows with fewer than three ``<td>`` cells are skipped.
    """
    import os  # local import: only needed to create the output directory

    output_path = './output_files/ird_pdf_results.json'

    options = Options()
    # Setting `options.headless = True` is deprecated and ignored by recent
    # Selenium 4 releases; pass the Chrome flag explicitly instead.
    options.add_argument('--headless=new')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.get('https://www.ird.gov.hk/eng/ppr/dip.htm')
        time.sleep(5)  # wait for the page to load

        table = driver.find_element(By.CSS_SELECTOR, 'table.border_table')
        rows = table.find_elements(By.TAG_NAME, 'tr')[1:]  # skip header row

        results = []
        for row in rows:
            cols = row.find_elements(By.TAG_NAME, 'td')
            if len(cols) < 3:
                # Not a data row (e.g. header-only cells); indexing would fail.
                continue

            # find_elements returns [] instead of raising when a cell has no <a>.
            no_anchors = cols[0].find_elements(By.TAG_NAME, 'a')
            notes_anchors = cols[1].find_elements(By.TAG_NAME, 'a')
            no_anchor = no_anchors[0] if no_anchors else None
            notes_anchor = notes_anchors[0] if notes_anchors else None

            results.append({
                'pdf_no': no_anchor.get_attribute('innerHTML') if no_anchor is not None else None,
                'pdf_link': no_anchor.get_attribute('href') if no_anchor is not None else None,
                'pdf_notes': notes_anchor.get_attribute('innerHTML') if notes_anchor is not None else None,
                # Was stored under the copy-pasted key 'index'; name it after its content.
                'pdf_date': cols[2].get_attribute('innerHTML'),
            })
    finally:
        # Always shut the browser down, even when scraping raised.
        driver.quit()

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

if __name__ == "__main__":
    scrape_ird_pdf_metadata()

### Parsing pdf file using llama-parse