In [2]:
def singleton(class_):
    """Class decorator that makes every call return one shared instance.

    The first call constructs ``class_`` with the arguments it receives and
    caches the result. Every later call returns that cached object and
    ignores whatever arguments it was given.

    The decorated name is bound to a factory function, not to the class
    itself. ``functools.wraps`` copies the class's name, module and docstring
    onto the factory (and exposes the class as ``__wrapped__``) so the
    decorated name still identifies itself as the class.

    Args:
        class_: The class to wrap.

    Returns:
        A callable returning the single cached instance of ``class_``.
    """
    import functools  # local import so this cell does not rely on another cell

    instances = {}

    # updated=() copies only the identifying metadata; the class __dict__ is
    # not merged into the factory function.
    @functools.wraps(class_, updated=())
    def getinstance(*args, **kwargs):
        if class_ not in instances:
            instances[class_] = class_(*args, **kwargs)
        return instances[class_]

    return getinstance

In [3]:
@singleton
class FileOrganizer:
    """Derives file names by appending a fixed postfix to a base name."""

    # Postfixes appended to the caller-supplied base name.
    HTML_FILE_POSTFIX = ".html"
    CLEANED_HTML_FILE_POSTFIX = "_clean.html"
    TREE_FILE_POSTFIX = "_tree.txt"

    def get_html_file_name(self, name: str):
        """Return ``name`` followed by ``HTML_FILE_POSTFIX``."""
        postfix = self.HTML_FILE_POSTFIX
        return name + postfix

    def get_html_cleaned_file_name(self, name: str):
        """Return ``name`` followed by ``CLEANED_HTML_FILE_POSTFIX``."""
        postfix = self.CLEANED_HTML_FILE_POSTFIX
        return name + postfix

    def get_tree_file_name(self, name: str):
        """Return ``name`` followed by ``TREE_FILE_POSTFIX``."""
        postfix = self.TREE_FILE_POSTFIX
        return name + postfix

In [4]:
from bs4 import BeautifulSoup
def read_html(name: str) -> BeautifulSoup:
    """Parse the HTML file for ``name`` into a BeautifulSoup tree.

    The path is ``FileOrganizer().get_html_file_name(name)``. The file is
    read as raw bytes so that Beautiful Soup works out the document's
    encoding itself, instead of the result depending on the platform's
    default text encoding.

    Args:
        name: Base name of the page, without the file postfix.

    Returns:
        The document parsed with the ``html.parser`` backend.
    """
    with open(FileOrganizer().get_html_file_name(name), "rb") as html_page:
        html = BeautifulSoup(html_page.read(), "html.parser")
    return html


In [5]:
def write_html(html: BeautifulSoup, name: str):
    """Write ``str(html)`` to the cleaned-HTML file for ``name``.

    The path is ``FileOrganizer().get_html_cleaned_file_name(name)``. The
    file is written as UTF-8 explicitly so that characters outside the
    platform's default encoding do not raise ``UnicodeEncodeError``.

    Args:
        html: The markup to save. It is passed through ``str()``, so an
            already-rendered string (e.g. the result of ``prettify()``) is
            accepted as well as a BeautifulSoup object.
        name: Base name of the page, without the file postfix.
    """
    with open(FileOrganizer().get_html_cleaned_file_name(name), "w", encoding="utf-8") as file:
        file.write(str(html))

In [6]:
# Base name passed to read_html / write_html / write_tree; the FileOrganizer
# postfixes are appended to it to form the actual file names.
name: str = "jumia"

In [7]:
# Parse the source page, then save a pretty-printed rendering of it as the
# "_clean.html" file. `html` itself is reused by the cells below.
html = read_html(name)
write_html(html.prettify(), name)

In [8]:
from enum import Enum

class HEADER_TAG(str, Enum):
    """Tag names that count_headers treats as headers (h1 to h3 only).

    The ``str`` mixin makes each member compare equal to its plain tag name,
    so members can be compared against a tag's ``name`` directly.
    """
    H1 = "h1"
    H2 = "h2"
    H3 = "h3"
    
# Tag name that count_imgs looks for.
IMG_TAG = "img"

In [48]:
def count_imgs(html: BeautifulSoup):
    """Count the ``IMG_TAG`` elements in ``html``, including ``html`` itself.

    Args:
        html: The node (or whole document) to inspect.

    Returns:
        The number of matching descendants, plus one if ``html`` is itself
        an ``IMG_TAG`` element.
    """
    own = 1 if html.name == IMG_TAG else 0
    nested = html.find_all(IMG_TAG)
    return own + len(nested)

# Show how many image tags the loaded page contains.
print(count_imgs(html))

39


In [49]:
def count_headers(html: BeautifulSoup):
    """Count the ``HEADER_TAG`` elements in ``html``, including ``html`` itself.

    Args:
        html: The node (or whole document) to inspect.

    Returns:
        The number of descendants whose tag is a ``HEADER_TAG`` member, plus
        one if ``html`` is itself such a tag.
    """
    header_names = list(HEADER_TAG)
    own = 1 if html.name in header_names else 0
    nested = html.find_all(list(HEADER_TAG))
    return own + len(nested)

# Show how many header tags (h1 to h3) the loaded page contains.
print(count_headers(html))

18


In [90]:
def tranverse_tree_print(html: BeautifulSoup, indent: str = "") -> str:
    """Render the image- and header-bearing part of a tree as indented text.

    A node is rendered only if it is or contains an image, or it is or
    contains a header tag and its combined text is not blank. Every other
    subtree is skipped entirely.

    Each rendered node produces one line::

        <indent><tag>[<id>]: <text>: <text> ...

    ``[<id>]`` appears only when the node has an ``id`` attribute, and the
    ``: <text>`` parts are the node's own non-blank direct strings. An image
    tag with a ``src`` additionally gets ``: <classes joined by ", ">`` (when
    it has a ``class``) followed by `` >><src>``. Children follow on later
    lines, indented by four more spaces.

    Args:
        html: The node (or whole document) to render.
        indent: Prefix for this node's line; grows by four spaces per level.

    Returns:
        The rendered lines, each terminated by a newline, or ``""`` when the
        node is skipped.
    """
    if not (count_imgs(html) > 0
            or (count_headers(html) > 0 and html.text.strip() != "")):
        return ""

    # Strings that are direct children of this node, stripped, blanks dropped.
    # `string=` is the current name of the older `text=` argument.
    own_texts = [s.strip() for s in html.find_all(string=True, recursive=False)]
    own_texts = [t for t in own_texts if t]

    # Collect the pieces and join once instead of repeatedly extending a str.
    parts = [indent + html.name]
    # id
    if html.has_attr("id"):
        parts.append("[" + html["id"] + "]")
    # text
    for t in own_texts:
        parts.append(": " + t)
    # or image source
    if html.name == IMG_TAG and html.has_attr("src"):
        if html.has_attr("class"):
            parts.append(": " + ", ".join(html["class"]))
        parts.append(" >>" + html["src"])
    # next line
    parts.append("\n")

    # Direct child tags only; find_all is what the legacy findChildren aliases.
    for child in html.find_all(recursive=False):
        parts.append(tranverse_tree_print(child, indent=indent + "    "))
    return "".join(parts)


def test_tranverse_tree_print():
    """Check tranverse_tree_print against a small hand-written document.

    The expected tree keeps only the branches leading to the non-empty h1/h2
    headers and to the image; the span, the empty h3 and the trailing empty
    div are expected to be left out.
    """
    markup = """
    <div>
      <span>
       ABOUT JUMIA
      </span>
      <ul>
       <li>
        <h1 class="h1" href="https://www.jumia.com.ng/about_us/">
         About us
        </h1>
       </li>
       <li>
        <h2 class="h2" href="https://www.jumia.com.ng/careers/">
         Jumia careers
        </h2>
        <h3>
        </h3>
       </li>
       <img class="image" alt="product_image_name-Binatone-Dry Iron (Di1255)-1" class="-fw -fh" data-src="https://ng.jumia.is/unsafe/fit-in/500x500/filters:fill(white)/product/54/8889201/1.jpg?5797" src="https://ng.jumia.is/unsafe/fit-in/500x500/filters:fill(white)/product/54/8889201/1.jpg?5797"/>
      </ul>
     </div>
     <div/>"""
    document = BeautifulSoup(markup, "html.parser")
    expected_tree = """[document]
    div
        ul
            li
                h1: About us
            li
                h2: Jumia careers
            img: -fw, -fh >>https://ng.jumia.is/unsafe/fit-in/500x500/filters:fill(white)/product/54/8889201/1.jpg?5797"""
    rendered = tranverse_tree_print(document)
    # Echo the rendering so it is visible in the cell output.
    print(rendered)
    # Surrounding whitespace (e.g. the trailing newline) is not significant.
    assert expected_tree.strip() == rendered.strip()

# Run the check now; it prints the rendered tree and raises AssertionError on mismatch.
test_tranverse_tree_print()

[document]
    div
        ul
            li
                h1: About us
            li
                h2: Jumia careers
            img: -fw, -fh >>https://ng.jumia.is/unsafe/fit-in/500x500/filters:fill(white)/product/54/8889201/1.jpg?5797



In [91]:
def clean_by_tag(html: BeautifulSoup, tag_name: str):
    """Remove every ``tag_name`` element from ``html``.

    The tree is modified in place: each matching element is destroyed with
    ``decompose()``.

    Args:
        html: The tree to clean.
        tag_name: Name of the tag whose elements are removed.

    Returns:
        The same ``html`` object that was passed in.
    """
    matches = html.find_all(tag_name)
    for node in matches:
        node.decompose()
    return html

In [92]:
def write_tree(html: BeautifulSoup, name: str):
    """Strip boilerplate tags from ``html`` and save its text tree.

    Removes all ``script``, ``head``, ``header`` and ``footer`` elements,
    renders what is left with ``tranverse_tree_print`` and writes the result
    to ``FileOrganizer().get_tree_file_name(name)``.

    The removal goes through ``clean_by_tag``, which modifies ``html`` in
    place, so the caller's tree no longer contains those elements afterwards.

    The file is written as UTF-8 explicitly so that page text outside the
    platform's default encoding does not raise ``UnicodeEncodeError``.

    Args:
        html: The tree to clean and render.
        name: Base name of the page, without the file postfix.
    """
    # "link" tags are not removed.
    for tag_name in ("script", "head", "header", "footer"):
        html = clean_by_tag(html, tag_name)
    str_to_write = tranverse_tree_print(html)
    with open(FileOrganizer().get_tree_file_name(name), "w", encoding="utf-8") as file:
        file.write(str_to_write)
        
# Write the "_tree.txt" file. Note: this removes the script/head/header/footer
# elements from `html` in place.
write_tree(html, name)