In [1]:
def singleton(class_):
    instances = {}
    def getinstance(*args, **kwargs):
        if class_ not in instances:
            instances[class_] = class_(*args, **kwargs)
        return instances[class_]
    return getinstance

In [2]:
@singleton
class FileOrganizer():
    CLEANED_HTML_FILE_POSTFIX = "_clean.html"
    HTML_FILE_POSTFIX = ".html"

    def get_html_file_name(self, name: str):
        return name + self.HTML_FILE_POSTFIX
    
    def get_html_cleaned_file_name(self, name: str):
        return name + self.CLEANED_HTML_FILE_POSTFIX

In [3]:
from bs4 import BeautifulSoup
def read_html(name):
    with open(FileOrganizer().get_html_file_name(name), "r") as html_page:
        html = BeautifulSoup(html_page.read(), "html.parser")
    return html


In [4]:
def write_html(html: BeautifulSoup, name: str):
    with open(FileOrganizer().get_html_cleaned_file_name(name), "w") as file:
        file.write(str(html))
    

In [5]:
name: str = "jumia"

In [6]:
html = read_html(name)
write_html(html.prettify(), name)

In [7]:
#html

In [8]:
class CleanElement():
    """This class is to represent a html element in a clean way
    It is similar to BeautifulSoup but we only keep necessary information
    """
    def __init__(self, tag: str, original_html: BeautifulSoup) -> None:
        self.tag = tag
        self.original_html = original_html


In [9]:
from enum import Enum


class HEADER_TAG(str, Enum):
    H1 = "h1"
    H2 = "h2"
    H3 = "h3"
    
class Header(CleanElement):
    
    def __init__(self, tag: HEADER_TAG, original_html: BeautifulSoup, text: str) -> None:
        super().__init__(tag, original_html)
        self.text = text
        # self.text_size = text_size # might use text size information later

In [10]:
from typing import List


HEADERS = ["h1", "h2", "h3"]

def find_all_headers(html: BeautifulSoup):
    header_tags: List[str] = list(HEADER_TAG)
    for h in html.find_all(header_tags):
        print(h.name + ":" + h.text.strip())
        

In [11]:
headers = find_all_headers(html)

h2:Share this product
h1:Binatone Dry Iron (Di1255)
h2:Promotions
h2:Delivery & Returns
h3:
h3:Choose your location
h3:Return Policy
h3:Warranty
h2:Seller Information
h2:Product details
h2:Specifications
h2:Key Features
h2:Specifications
h2:Verified Customer Feedback
h2:Verified Ratings (646)
h2:Comments from Verified Purchases(303)
h3:Nice
h3:Hot
h3:Good iron
h3:Binatone Dry Iron (Di1255)
h2:Recently Viewed
h2:


In [12]:
BeautifulSoup("""<body></body>""", "html.parser").text

''

In [13]:
def tranverse_tree_print(html: BeautifulSoup, indent: str= ""):
    return_str = ""
    children =  html.findChildren(recursive=False)
    is_include = len(children) > 0 or html.text.strip() is not None
    if is_include:
        texts = html.find_all(text=True, recursive=False)
        texts = [t.strip() for t in texts]
        texts = [t for t in texts if t]
        
        # print name        
        return_str += indent + html.name 
        # print text
        if len(texts)>0:
            for t in texts:
                return_str+=indent + t 
        # next line
        return_str+="\n"
        for e in children:
            return_str += tranverse_tree_print(e, indent=indent+"    ")
    return return_str


def test_tranverse_tree_print():
    soup = BeautifulSoup("""<div class="col4 -df -d-co -pvs">
      <span class="f-t -pbm">
       ABOUT JUMIA
      </span>
      <ul class="-lsn">
       <li>
        <a class="_link -pbxs" href="https://www.jumia.com.ng/about_us/">
         About us
        </a>
       </li>
       <li>
        <a class="_link -pbxs" href="https://www.jumia.com.ng/careers/">
         Jumia careers
        </a>
       </li>
      </ul>
     </div>""", "html.parser")
    expect = """[document]
    div
        span        ABOUT JUMIA
        ul
            li
                a                About us
            li
                a                Jumia careers"""
    result = tranverse_tree_print(soup)
    print(result)
    assert expect.strip() == result.strip()

test_tranverse_tree_print()

[document]
    div
        span        ABOUT JUMIA
        ul
            li
                a                About us
            li
                a                Jumia careers



In [14]:
for script in html("script"):
    script.decompose()
print(tranverse_tree_print(html))

[document]
    html
        head
            meta
            title            Binatone Dry Iron (Di1255) | Jumia Nigeria
            meta
            meta
            meta
            meta
            meta
            meta
            meta
            meta
            meta
            meta
            link
            link
            meta
            meta
            meta
            meta
            meta
            meta
            link
            link
            link
            link
            link
            link
            link
            link
            link
            link
            link
            link
            link
            link
            meta
            meta
            meta
            meta
            meta
            meta
            meta
            meta
            meta
            meta
            meta
            meta
        body
            div
                div
                    div
                        a
                            img