The following cell is the modifed cell, where I used @jit "a decorator from Numba" to compile Python code to machine code for faster execution.

The original timing is 0.8582 seconds; and the new timing for the cell is 0.7699 seconds.

In [15]:
# Import required modules
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import time
from numba import jit

# Path to your XML file
xml_file_path = r'C:\Users\boa\Desktop\xml_containing_html.xml'

# Step 1: Extract HTML content from XML
def extract_html_from_xml(xml_content):
    start_time = time.time()  # Start timing
    
    # Parse the XML content
    root = ET.fromstring(xml_content)
    
    # List to store extracted HTML content
    html_pages = []
    
    # Iterate over all <content> elements in the XML
    for content in root.findall('.//content'):
        html_string = content.text.strip() if content.text else ""
        html_pages.append(html_string)
    
    end_time = time.time()  # End timing
    print(f"Time taken to extract HTML from XML: {end_time - start_time:.4f} seconds")
    
    return html_pages

# Step 2: Optimize link counting function with numba
@jit(nopython=True)
def count_links(link_texts):
    return len(link_texts)

# Step 3: Parse HTML content and count links
def parse_html_and_count_links(html_pages):
    for index, html in enumerate(html_pages):
        start_time = time.time()  # Start timing for each HTML page

        soup = BeautifulSoup(html, 'html.parser')
        
        # Extract the title (if available)
        title = soup.title.string if soup.title else "No title found"
        
        # Find all <a> tags (links)
        links = soup.find_all('a')
        num_links = count_links([link.get('href') for link in links])  # Use numba-optimized count

        # Print out the title and number of links
        print(f"HTML Page {index + 1}:")
        print(f"Title: {title}")
        print(f"Number of links: {num_links}")
        
        # Print all the links found
        for i, link in enumerate(links, 1):
            print(f"  Link {i}: {link.get('href')} -> {link.text}")
        
        end_time = time.time()  # End timing for each HTML page
        print(f"Time taken to parse HTML Page {index + 1}: {end_time - start_time:.4f} seconds\n")

# Main execution
# Read the XML file
with open(xml_file_path, 'r', encoding='utf-8') as file:
    xml_content = file.read()

# Measure time for the entire process
start_time_total = time.time()

# Extract HTML content from the XML
html_pages = extract_html_from_xml(xml_content)

# Parse HTML content and count links
parse_html_and_count_links(html_pages)

# End time for the entire process
end_time_total = time.time()
print(f"Total time taken for the entire process: {end_time_total - start_time_total:.4f} seconds")


Time taken to extract HTML from XML: 0.0100 seconds
HTML Page 1:
Title: observations on the role of IT trainer
Number of links: 5
  Link 1: https://www.munoz-peterson.com/ -> to
  Link 2: https://www.munoz-peterson.com/ -> to
  Link 3: https://richardson.net/ -> fact.
  Link 4: https://smith.com/ -> of
  Link 5: https://smith.com/ -> of
Time taken to parse HTML Page 1: 0.4732 seconds

HTML Page 2:
Title: observations on the role of Waste management officer
Number of links: 5
  Link 1: http://carter.net/ -> the
  Link 2: http://carter.net/ -> the
  Link 3: http://huff-hudson.com/ -> to
  Link 4: http://huff-hudson.com/ -> to
  Link 5: https://fuentes.net/ -> forms
Time taken to parse HTML Page 2: 0.0050 seconds

HTML Page 3:
Title: observations on the role of Data processing manager
Number of links: 4
  Link 1: https://avila-thompson.com/ -> quite
  Link 2: https://avila-thompson.com/ -> quite
  Link 3: https://copeland.org/ -> selectional
  Link 4: http://www.grant-williamson.com/ -> F