## Imports

In [7]:
import re
import json
from tqdm import tqdm
from bs4 import BeautifulSoup
import bs4
import os
import sys

In [8]:
# Python has a recursion limit of 3K.
# We raise it up to 10K
sys.setrecursionlimit(10000)

## HTMLCleaner

In [9]:
class HTMLCleaner(object):
    def __init__(self, input_file, output_file):
        self.input_file = input_file
        self.output_file = output_file
        self.line = ""
        
    # CLEAN HTML String --------------------------------------- start
    def clean_tag(self, x):
        '''Split the matched tag into components
        Check if the first component starts with '<'
        If it does, reconstruct the tag with the first and second components
        If not, reconstruct the tag with the first component
        Return the cleaned tag'''

        x = x.group().split()
        if x[0] == '<': 
            y = x[0] + x[1].strip('>') + '>'
        else:
            y = x[0].strip('>') + '>'
        return y
    

    def clean_html(self):
        '''Preprocess an HTML string and cleans it

        Define regular expressions for different HTML elements
        Compile regular expressions into patterns
        Initialize lists to store different elements

        Open file
        - Extract annotations
        - Extract close tags
        - Extract styles
        - Extract all tags

        Store the cleaned line'''

        style_regex = "(?:<style.*?>(?:.|[\r\n])*?</style>|<script.*?>(?:.|[\r\n])*?</script>)"
        all_tag_regex = "(?:<(?:!|/?[a-zA-Z]+).*?/?>)"
        close_tag_regex = '(<(?:!|/?[a-zA-Z]+)[^>]*?/>){1}?'
        annotation_regex = '(?:<!--(?:.|[\r\n])*?-->)'
        close_tag_pattern = re.compile(close_tag_regex)
        annotation_pattern = re.compile(annotation_regex)
        all_tag_pattern = re.compile(all_tag_regex)
        style_pattern = re.compile(style_regex)
        tags = []
        annotation = []
        close_tag = []
        all_tag = []
        style = []
        with open(self.input_file, 'r') as f:
            line = f.read()
            annotation.extend(annotation_pattern.findall(line))
            #print(annotation)
            line = re.sub(annotation_regex, '', line)
            close_tag.extend(close_tag_pattern.findall(line))
            #print(close_tag)
            line = re.sub(close_tag_regex, '', line)
            style.extend(style_pattern.findall(line))
            #print(style)
            line = re.sub(style_regex, '', line)
            all_tag.extend(all_tag_pattern.findall(line))
            #print(all_tag)
            line = re.sub(all_tag_regex, self.clean_tag, line)  # Note: self.clean_tag is missing
            line = re.sub(r'ï»¿', '', line)
        #print(line)
        self.line = line
    # CLEAN HTML String --------------------------------------- end



    # BUILD THREE --------------------------------------- start
    def if_text(self, node):
        '''
        Check if the text content of the HTML or XML node is empty
        Return 0 if the text is empty (considered "falsy")
        Return 1 if the text is not empty (considered "truthy")'''
        if node.get_text('|', strip=True) == "":
            return 0
        else:
            return 1
    
    def get_tree(self,root):
        '''
        Recursively processes the HTML/XML tree rooted at 'root', removing elements with empty text content.
        Args: root (bs4.element.Tag): The root of the HTML/XML tree.
        Returns: bs4.element.Tag: The modified root after removing elements with empty text content.'''
        delete_list = []
        # Iterate through the children of the root
        for child in root.children:
            # Check if the child is a bs4.element.Tag
            if type(child) == bs4.element.Tag:
                # Check if the text content of the tag is empty
                if not self.if_text(child):
                    delete_list.append(child)
                else:
                    # Recursively process child if it's an HTML tag
                    child = self.get_tree(child)
            elif type(child) == bs4.element.NavigableString:
                # Check if the NavigableString (text) is empty
                if str(child).strip() == "":
                    delete_list.append(child)
        # Remove items in the delete_list from the tree
        for item in delete_list:
            item.extract()
        # Return the modified root
        return root
    
    def merge_tree(self,root,k):
        '''
        Merges children of an HTML/XML tree node if the number of children is less than or equal to 'k'.
        Args: root (bs4.element.Tag): The root of the HTML/XML tree node.
              k (int): The threshold for the number of children to trigger merging.
        Returns:  bs4.element.Tag: The modified root after merging children if necessary.'''
        i = 0
        # Iterate through the children of the root
        while(i<len(root.contents)):
            # Check if the child is an HTML tag
            if(type(root.contents[i])==bs4.element.Tag):
                # Check if the number of children is less than or equal to 'k'
                if len(root.contents[i].contents)<=k:
                    tmp = root.contents[i]
                    j = i
                    # Remove the current child from the tree
                    del root.contents[i]
                    # Insert the contents of the removed child at the current position
                    for item in tmp.contents:
                        root.contents.insert(j,item)
                        j+=1
                else:
                    # Recursively merge children if the number of children exceeds 'k'
                    root.contents[i] = self.merge_tree(root.contents[i],k)
                    i+=1
            else:
                # Skip non-tag elements
                i+=1
        # Return the modified root after merging children
        return root
                    
    def clean_tree(self):
        root = BeautifulSoup(self.line,'html.parser').html
        root = self.get_tree(root)
        root = self.merge_tree(root,2)
        self.root = root
    # BUILD THREE --------------------------------------- end


    def store(self):
        with open(self.output_file,'w+')as f:
            f.write(str(self.root))

m_path = "./data/endata/auto/auto-aol(2000)/0000.htm"
m_clean_path = "./data/endata_new_clean/autoauto-aol/clean_0000.htm"
cleaner = HTMLCleaner(m_path, m_clean_path)
cleaner.clean_html()
cleaner.clean_tree()

## HTMLStorer

In [10]:
class HTMLStorer(object):
    def __init__(self,input_file=None,html=None):
        '''
        Accepts as input_file or html, after: cleaner.clean_html() AND cleaner.clean_tree()
        '''
        # Check if neither input_file nor html is provided
        if not input_file and not html:
            #print("CASE-ERROR")
            raise ValueError("lack of input file or html")
        # If input_file is provided, read its contents and assign to html_text
        if input_file:
            #print("CASE-1")
            with open(input_file, 'r') as f:
                self.html_text = f.read()
        # If html is provided, assign it to html_text
        else:
            #print("CASE-2")
            self.html_text = html
        self.soup = BeautifulSoup(self.html_text, 'html.parser')  # Create BeautifulSoup object using html_text and html.parser
        self.root = self.soup      # Set the root of the HTML tree to the BeautifulSoup object
        self.root.depth = 0        # Set the depth of the root to 0
        self.depth = {}            # Initialize an empty dictionary for depth information
        self.idx = 0               # Initialize an index variable
        self.data = []             # Initialize an empty list for storing data
        self.root = self.add_text_node(self.root)  # Add text nodes to the HTML tree and update the root
        self.get_index(self.root)  # Get the index of each node in the HTML tree
        self.get_data(self.root)   # Get data from each node in the HTML tree

    
    def add_text_node(self, root):
        '''
        Recursively traverse the HTML tree and wrap NavigableString nodes in a 'textnode' tag.
        Parameters: root (bs4.element.Tag): The current node in the HTML tree.
        Returns: bs4.element.Tag: The modified HTML tree with text nodes wrapped.
        
        try:
            if: If the child is a Tag and not named 'textnode', recursively call add_text_node
            elif: If the child is a NavigableString, wrap it in a 'textnode' tag
        except:
            Handle exceptions (consider providing a more specific exception type)
        '''
        for child in root.children:
            try:
                if isinstance(child, bs4.element.Tag) and child.name != "textnode":
                    #print('atn-RECURSION')
                    child = self.add_text_node(child)
                elif isinstance(child, bs4.element.NavigableString):
                    #print('atn-DONE')
                    child.wrap(self.soup.new_tag("textnode"))
            except Exception as e:
                print(f"Exception while processing node: {root}. Exception: {e}")
        return root


    def get_index(self, node):
        '''
        Recursively traverse the HTML tree and assign index and depth values to each Tag.
        Parameters: node (bs4.element.Tag): The current node in the HTML tree.
        Returns: None'''
        for child in node.children:
            if type(child) != bs4.element.Tag:
                continue
            # Assign index and depth values to the current child Tag
            child.idx = self.idx
            child.depth = node.depth + 1
            # Update the depth dictionary with the count of tags at the current depth
            if child.depth in self.depth:
                self.depth[child.depth] += 1
            else:
                self.depth[child.depth] = 1
            # Increment the index for the next Tag
            self.idx += 1
            # Recursively call get_index for the child Tag
            self.get_index(child)
    
    def get_tag_text(self, node):
        '''
        Extract and concatenate text content from NavigableString children of the given HTML node.
        Parameters:  node (bs4.element.Tag): The HTML node from which to extract text content.
        Returns: str: Concatenated and formatted text content. '''
        line = ""  # Initialize an empty string to store the concatenated text
        for child in node.children:
            if isinstance(child, bs4.element.NavigableString):
                x = str(child).strip().replace('\n', ' ')  # Convert NavigableString to string, strip, replace newlines, and concatenate
                if x != "":
                    line = line + '\t' + x.strip()

        return line.strip()  # Return the concatenated and formatted text content
    
    def get_data(self, node):
        '''
        Recursively traverse the HTML tree and gather information about each Tag.
        Parameters: node (bs4.element.Tag): The current node in the HTML tree.
        Returns: None
        '''
        for child_node in node.children:
            if not isinstance(child_node, bs4.element.Tag):
                continue # Skip non-Tag elements
            else:
                # Extract information about the current Tag
                name = child_node.name
                node_id = child_node.idx
                node_text = self.get_tag_text(child_node)
                # Gather indices of children that are Tags
                node_child_idx = []
                for item in child_node.children:
                    if type(item) == bs4.element.Tag:
                        node_child_idx.append(item.idx)
                # Create a dictionary representing the current Tag
                line = {"name": name, "id": node_id, "text": node_text, "children": node_child_idx}
                # Append the dictionary to the data list
                self.data.append(line)
                # Recursively call get_data for the child Tag
                self.get_data(child_node)

    def store(self,output_file):
        with open(output_file,'a')as g:
            g.write(str(self.idx)+'\t'+json.dumps(self.depth,ensure_ascii=False)+'\n')



storer = HTMLStorer(input_file=None,html=cleaner.line)


## MAIN

In [11]:
DIR_RAW_DATA_PATH = './data/endata/'
DIR_CLEAN_DATA = './data/endata_new_clean/'
JSON_TRAIN_CORPUS = './data/wiki_html_all.json'

In [15]:

# Open Train Corpus file
with open(JSON_TRAIN_CORPUS,'w')as g:
    for root, dirs, files in os.walk(DIR_RAW_DATA_PATH):

        iter_i = 0
        # Iter ZERO
        for dir in dirs:
            #print("DIR: "+dir)
            sub_DIR_RAW_DATA_PATH = os.path.join(root,dir)
            # Get folder
            iter_j=0
            # Iter ONE
            for sub_root,sub_dirs,sub_files in os.walk(sub_DIR_RAW_DATA_PATH):
                if iter_j > 0:
                    extracted_name = sub_root.split("\\")[1].split("(")[0]
                    #print("FOLDER: "+extracted_name)
                iter_j += 1
                # Get file
                iter_k = 0
                # Iter TWO
                for item_parsed in sub_files:
                    path = os.path.join(sub_root,item_parsed)
                    path = re.sub(r'\\', '/', path)
                    clean_path = os.path.join(DIR_CLEAN_DATA,dir+extracted_name+"/clean_"+item_parsed)
                    clean_path = re.sub(r'\((\d+)\)', '-', clean_path)
                    print(f"{iter_i} - {iter_j} - {iter_k}")
                    print(f"PATH: {path}")
                    print(f"FILE: {clean_path}")
                    cleaner = HTMLCleaner(path,clean_path)
                    cleaner.clean_html()
                    #cleaner.clean_tree()
                    #storer = HTMLStorer(input_file=None,html=cleaner.line)
                    print("")
                    if iter_k == 0:
                        break
                    iter_k += 1

                if iter_j == 2:
                    break
            iter_i =+1 

                

        """ 
        for dir in tqdm(dirs):
            sub_DIR_RAW_DATA_PATH = os.path.join(root,dir)
            for sub_root,sub_dirs,sub_files in os.walk(sub_DIR_RAW_DATA_PATH):
                # foeach file in dir
                for item_parsed in tqdm(sub_files):
                    #index = item_parsed.split('.')[0]
                    path = os.path.join(sub_root,item_parsed)
                    clean_path = DIR_CLEAN_DATA+'_clean.html'
                    cleaner = HTMLCleaner(path,clean_path)
                    cleaner.clean_html()
                    #cleaner.clean_tree()
                    storer = HTMLStorer(input_file=None,html=cleaner.line)
                    g.write(json.dumps(storer.data,ensure_ascii=False)+'\n') """


0 - 2 - 0
PATH: ./data/endata/auto/auto-aol(2000)/0000.htm
FILE: ./data/endata_new_clean/autoauto-aol/clean_0000.htm

1 - 2 - 0
PATH: ./data/endata/book/book-abebooks(2000)/0000.htm
FILE: ./data/endata_new_clean/bookbook-abebooks/clean_0000.htm

1 - 2 - 0
PATH: ./data/endata/camera/camera-amazon(1767)/0000.htm
FILE: ./data/endata_new_clean/cameracamera-amazon/clean_0000.htm

1 - 2 - 0
PATH: ./data/endata/job/job-careerbuilder(2000)/0000.htm
FILE: ./data/endata_new_clean/jobjob-careerbuilder/clean_0000.htm

1 - 2 - 0
PATH: ./data/endata/movie/movie-allmovie(2000)/0000.htm
FILE: ./data/endata_new_clean/moviemovie-allmovie/clean_0000.htm

1 - 2 - 0
PATH: ./data/endata/nbaplayer/nbaplayer-espn(434)/0000.htm
FILE: ./data/endata_new_clean/nbaplayernbaplayer-espn/clean_0000.htm

1 - 2 - 0
PATH: ./data/endata/restaurant/restaurant-fodors(2000)/0000.htm
FILE: ./data/endata_new_clean/restaurantrestaurant-fodors/clean_0000.htm

1 - 2 - 0
PATH: ./data/endata/university/university-collegeboard(2000

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 129214: character maps to <undefined>