In [None]:
import os
import re
# import json
import time
# import dateparser
import pandas as pd
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
# from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.remote.webelement import WebElement
# from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC

In [None]:
class KomdigiScraper:
    """Scrape regulation pages into Markdown text files.

    Each page is loaded with the supplied Selenium driver, the children of
    ``div#produk-content`` (``<p>`` and ``<ol>`` elements) are walked
    recursively, and the text is written out with Markdown headings and
    explicit list labels.
    """

    def __init__(self, web_driver: WebDriver):
        """Store the Selenium driver and the lookup tables used for list labels."""
        self.web_driver = web_driver
        self.ALPHABET = 'abcdefghijklmnopqrstuvwxyz'
        # Maps the marker found in an <ol> opening tag to a numbering style;
        # type="a" is treated as 'lower-alpha'.
        self.OL_DICT = {
            'a': 'lower-alpha',
            'lower-alpha': 'lower-alpha',
            'decimal': 'decimal'
        }

    def __alpha_label(self, index: int) -> str:
        """Return the alphabetic list label for a zero-based ``index``.

        Indexes below ``len(self.ALPHABET)`` are looked up directly (a..z).
        Larger indexes continue as aa, ab, ..., the way a browser numbers a
        ``lower-alpha`` list, instead of raising ``IndexError``.
        """
        base = len(self.ALPHABET)
        if index < base:
            return self.ALPHABET[index]

        label = ''
        number = index + 1  # bijective base-26 works on one-based numbers
        while number > 0:
            number, remainder = divmod(number - 1, base)
            label = self.ALPHABET[remainder] + label
        return label

    def __check_ol_tag(self, web_element: WebElement) -> str:
        """Return the numbering style of an ``<ol>`` element.

        Only the opening tag of the element's ``outerHTML`` is inspected.
        The result is ``'lower-alpha'`` or ``'decimal'``; ``'decimal'`` is
        the fallback when the tag carries no recognised marker.
        """
        # Isolate the opening tag so list content cannot influence the match
        outer_html = web_element.get_attribute('outerHTML')
        tag_html = re.search(r'<\s*([a-zA-Z0-9]+)([^>]*)>', outer_html)[0]
        # Recognised markers: 'a', 'lower-alpha', 'decimal' ('a' == 'lower-alpha')
        ol_type = re.search(r'\b(lower-alpha|decimal|a)\b', tag_html)
        ol_type = ol_type[1] if ol_type is not None else 'decimal'
        return self.OL_DICT[ol_type]

    def __process_parent_element_text(self, web_element: WebElement, level: int, index: int) -> str:
        """Format the text of a non-``<ol>`` element (``<p>`` or untagged).

        Args:
            web_element: Element whose ``.text`` is formatted.
            level: Nesting depth; 1 is a direct child of the regulation box.
            index: One-based position of the enclosing list item (used for
                levels 2 and deeper).

        Returns:
            The text prefixed with newlines and, depending on ``level``, a
            Markdown heading marker or a list label.
        """
        text = web_element.text.strip()

        if level == 1:
            # |(^\d+. .*)
            special_token_pattern_1 = \
                r'(^bab \w+$)|(^pasal \w+$)|(^bagian \w+$)|(^paragraf \w+$)|(^menimbang$)|(^mengingat$)|(^memutuskan$)|(^menetapkan$)|(^memperhatikan$)'
            special_token_pattern_2 = \
                r'(^dengan rahmat Tuhan Yang Maha Esa)|(^dengan persetujuan)|(^agar setiap orang mengetahuinya)|(^ditetapkan di)|(^disahkan di)|(^diundangkan di)'

            # Structural keywords become level-2 Markdown headings
            if re.search(special_token_pattern_1, text, re.IGNORECASE):
                return f'\n\n## {text}'
            # Opening/closing formulas only get a blank line in front
            if re.search(special_token_pattern_2, text, re.IGNORECASE):
                return f'\n\n{text}'
            return f'\n{text}'

        if level == 2:
            return f'\n({index}) {text}'

        # Level 3 and deeper share one format. Previously anything deeper than
        # level 3 returned None, which made the caller's `result +=` raise.
        return f'\n\t{self.__alpha_label(index - 1)}. {text}'

    def __process_child_element_text(self, web_element: WebElement, ol_type: str, level: int, index: int) -> str:
        """Format the text of a list item that has no nested structure.

        Args:
            web_element: Element whose ``.text`` is formatted.
            ol_type: ``'decimal'`` or ``'lower-alpha'``, as returned by
                ``__check_ol_tag``.
            level: Nesting depth of the list the item belongs to.
            index: Zero-based position of the item in its list.

        Returns:
            The labelled text, indented with one tab at level 2 and two tabs
            at deeper levels.
        """
        text = web_element.text.strip()
        alpha_label = f'{self.__alpha_label(index)}.'

        if level == 1:
            label = f'({index + 1})' if ol_type == 'decimal' else alpha_label
            # An empty item at the top level yields only a line break, no label
            if text != '':
                return f'\n{label} {text}'
            return f'\n{text}'

        label = f'{index + 1}.' if ol_type == 'decimal' else alpha_label
        if level == 2:
            return f'\n\t{label} {text}'
        return f'\n\t\t{label} {text}'

    def __regulation_product_content_element(self, web_element: WebElement, index: int, level: int = 1) -> str:
        """Convert one element of the regulation box to text, recursing into lists.

        Args:
            web_element: A ``<p>``, ``<ol>`` or other child element.
            index: Position passed down from the caller; used only when the
                element is not an ``<ol>``.
            level: Current nesting depth, starting at 1.

        Returns:
            The formatted text for the element and everything nested in it.
        """
        # Anything that is not an <ol> is treated as plain text
        if web_element.tag_name != 'ol':
            return self.__process_parent_element_text(
                web_element=web_element,
                level=level,
                index=index
            )

        result = ''
        ol_type = self.__check_ol_tag(web_element=web_element)     # 'lower-alpha' or 'decimal'
        list_items = web_element.find_elements(By.XPATH, './*')    # direct children: the <li> items

        for i, list_item in enumerate(list_items):
            children = list_item.find_elements(By.XPATH, './*')    # direct children of this <li>
            # Label used when an item is emitted as raw text because of a <br>
            raw_label = f'({i + 1})' if ol_type == 'decimal' else f'{self.__alpha_label(i)}.'

            if len(children) > 1:
                for child in children:
                    # A <br> means the item is raw text: emit the whole <li>
                    # once and stop, otherwise it would repeat per <br>.
                    if child.tag_name == 'br':
                        result += f'\n{raw_label} {list_item.text.strip()}'
                        break
                    # Otherwise the child is a <p> or a nested <ol>
                    result += self.__regulation_product_content_element(
                        web_element=child,
                        index=i + 1,
                        level=level + 1
                    )

            elif not children:
                # No child elements: the <li> itself holds the raw text
                result += self.__process_child_element_text(
                    web_element=list_item,
                    ol_type=ol_type,
                    level=level,
                    index=i
                )

            elif children[0].tag_name == 'br':
                # Single <br> child: the <li> is raw text. The <br> itself must
                # not be formatted as well, or an empty labelled line follows.
                result += f'\n{raw_label} {list_item.text.strip()}'

            else:
                # Single non-<br> child: expected to be a <p>
                result += self.__process_child_element_text(
                    web_element=children[0],
                    ol_type=ol_type,
                    level=level,
                    index=i
                )

        return result

    def regulation_product_content(self, regulation_names_and_links: list[dict], output_dir: str, verbose: bool = True) -> None:
        """Scrape every regulation and write one Markdown file per regulation.

        Args:
            regulation_names_and_links: Records with a ``'url'`` key (page to
                load) and a ``'name'`` key (output file name without ``.md``).
            output_dir: Directory for the output files; created if missing.
            verbose: Show the progress bar, per-regulation errors and the
                final summary.

        A failure on one regulation is counted and, when ``verbose``, printed;
        the remaining regulations are still processed. The web driver is quit
        once the loop ends, even if the loop is interrupted.
        """
        regulation_box_css_selector = 'div#produk-content'
        os.makedirs(output_dir, exist_ok=True)
        durations = list()
        success = 0
        failed = 0

        try:
            for regulation in tqdm(iterable=regulation_names_and_links, desc='Scraping regulation content', disable=not verbose):
                start = time.time()

                try:
                    result = ''
                    self.web_driver.get(regulation['url'])
                    wait = WebDriverWait(self.web_driver, timeout=10)
                    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, regulation_box_css_selector)))
                    # Container that holds the regulation text
                    regulation_box = self.web_driver.find_element(By.CSS_SELECTOR, regulation_box_css_selector)
                    # Its direct children: [<p>, <ol>]
                    regulation_contents = regulation_box.find_elements(By.XPATH, './*')

                    for index, regulation_content_element in enumerate(regulation_contents):
                        result += self.__regulation_product_content_element(
                            web_element=regulation_content_element,
                            index=index,
                            level=1
                        )

                    result = result.strip()
                    # Collapse runs of blank lines to a single blank line
                    result = re.sub(r'\n{3,}', '\n\n', result)
                    # No blank line between a "## Pasal N" heading and its body
                    result = re.sub(r'(## pasal \w+)(\n{2})', r'\1\n', result, flags=re.IGNORECASE)

                    # Explicit UTF-8 avoids 'charmap' encode errors:
                    # https://stackoverflow.com/questions/27092833/unicodeencodeerror-charmap-codec-cant-encode-characters
                    file_output = os.path.join(output_dir, f'{regulation["name"]}.md')
                    with open(file_output, 'w', encoding='utf-8') as file:
                        file.write(result)

                    success += 1
                    durations.append(time.time() - start)
                    time.sleep(2)  # Break for 2 seconds

                except Exception as e:
                    failed += 1
                    if verbose:
                        print(f'ERROR scraping content for {regulation["name"]}')
                        print(e)
        finally:
            # Always release the browser, even when the loop is interrupted
            self.web_driver.quit()

        if verbose:
            # Guard against division by zero when nothing was scraped successfully
            average_time = round(sum(durations) / success, 3) if success else 0.0
            print('=' * 76)
            print(f'Output directory  : {os.path.join(output_dir)}')
            print(f'Total regulations : {len(regulation_names_and_links)} regulations')
            print(f'Total success     : {success} regulations')
            print(f'Total failed      : {failed} regulations')
            print(f'Total time        : {round(sum(durations), 3)} seconds')
            print(f'Average time      : {average_time} seconds')
            print('NOTE! Time records do not include the 2 seconds break between each regulation')
            print('=' * 76)

In [None]:
def load_excel_selected_regulations(file_path: str, sheet_name: str, url_type: str = 'url_1',
                                    url_only: bool = True ) -> list[str] | list[dict]:
    """Load the regulations marked as used from one sheet of an Excel workbook.

    Args:
        file_path: Path of the Excel workbook.
        sheet_name: Sheet to read; it must have the columns ``used``,
            ``name`` and the column named by ``url_type``.
        url_type: Name of the column that holds the URL to return.
        url_only: When true, return only the URLs; otherwise return
            ``{'name': ..., 'url': ...}`` records.

    Returns:
        A list of URLs, or a list of name/url dicts, for rows where
        ``used == 1`` and the URL cell is not empty.
    """
    sheet = pd.read_excel(file_path, sheet_name=sheet_name)

    # Keep only the rows flagged as used that actually carry a URL. Filtering
    # empty URLs in both modes keeps NaN out of the url_only list, which
    # previously contained NaN floats despite the list[str] annotation.
    selected = sheet.loc[(sheet['used'] == 1) & sheet[url_type].notna()]

    if url_only:
        return selected[url_type].tolist()

    records = selected[['name', url_type]].rename(columns={url_type: 'url'})
    return records.to_dict(orient='records')

# Adjust these inputs to point at the dataset workbook
FILENAME = 'dataset.xlsx'
DIR_PATH = os.path.join('data', 'active')
FILE_PATH = os.path.join(DIR_PATH, FILENAME)

# Load the name/url records of each regulation type from its own sheet
uu, pp, permenkominfo = (
    load_excel_selected_regulations(file_path=FILE_PATH, sheet_name=sheet, url_type='url_2', url_only=False)
    for sheet in ('UU', 'PP', 'PERMENKOMINFO')
)

# One combined list, in the order UU, PP, PERMENKOMINFO
regulation_names_and_links = [*uu, *pp, *permenkominfo]
print(f'Total regulations: {len(regulation_names_and_links)}')
display(regulation_names_and_links)

In [None]:
# TODO: PERMENKOMINFO_005_2021: Pasal 136 (keyword = '≠') is missing the list item for letter g.
output_dir = os.path.join('data', 'markdown', 'raw', 'komdigi')
web_driver = webdriver.Firefox()

# Scrape every selected regulation into a Markdown file under output_dir
scraper = KomdigiScraper(web_driver)
scraper.regulation_product_content(regulation_names_and_links, output_dir, verbose=True)

In [None]:
# df = pd.read_excel('Dataset Peraturan.xlsx', 'PERMENKOMINFO')
# df = df.loc[df['used'] == 1].copy()
# df = df.loc[:, ['name', 'url_2']].copy()
# df.rename(columns={'name': 'name', 'url_2': 'url'}, inplace=True)
# regulation_names_and_links = df.to_dict(orient='records')
# regulation_names_and_links[:5]

In [None]:
# regulations = [
#     'https://jdih.komdigi.go.id/produk_hukum/view/id/695/t/peraturan+pemerintah+nomor+71+tahun+2019',
#     'https://jdih.komdigi.go.id/produk_hukum/view/id/463/t/peraturan+menteri+komunikasi+dan+informatika+nomor+20pmkominfo82006+tanggal+14+agustus+2006',
#     'https://jdih.komdigi.go.id/produk_hukum/view/id/431/t/peraturan+menteri+komunikasi+dan+informatika+nomor+11permkominfo042007+tanggal+13+april+2007',
#     'https://jdih.komdigi.go.id/produk_hukum/view/id/449/t/peraturan+menteri+komunikasi+dan+informatika+nomor+32permkominfo092007+tanggal+20+september+2007',
#     'https://jdih.komdigi.go.id/produk_hukum/view/id/769/t/peraturan+menteri+komunikasi+dan+informatika+nomor+6+tahun+2021',
#     'https://jdih.komdigi.go.id/produk_hukum/view/id/778/t/peraturan+menteri+komunikasi+dan+informatika+nomor+11+tahun+2021',
#     'https://jdih.komdigi.go.id/produk_hukum/view/id/760/t/peraturan+menteri+komunikasi+dan+informatika+nomor+1+tahun+2021',
#     'https://jdih.komdigi.go.id/produk_hukum/view/id/768/t/peraturan+menteri+komunikasi+dan+informatika+nomor+5+tahun+2021'
# ]

In [None]:
# from bs4 import BeautifulSoup

# html_text = """<ol type="a">
# <li>
# <p style="border-width: 0px; border-style: solid; box-sizing: border-box; margin: 0px;">bahwa dengan adanya perkembangan teknologi informasi yang sangat pesat dalam rangka mendorong pertumbuhan ekonomi digital dan penegakan kedaulatan negara atas informasi elektronik di wilayah Negara Kesatuan Republik Indonesia, perlu pengaturan secara menyeluruh pemanfaatan teknologi informasi dan transaksi elektronik;</p>
# </li>
# <li>
# <p style="border-width: 0px; border-style: solid; box-sizing: border-box; margin: 0px;">bahwa Peraturan Pemerintah Nomor 82 Tahun 2012 tentang Penyelenggaraan Sistem dan Transaksi Elektronik sudah tidak sesuai lagi dengan perkembangan kebutuhan hukum masyarakat sehingga perlu diganti;</p>
# </li>
# <li>
# <p style="border-width: 0px; border-style: solid; box-sizing: border-box; margin: 0px;">bahwa berdasarkan pertimbangan sebagaimana dimaksud dalam huruf a dan huruf b, perlu menetapkan Peraturan Pemerintah tentang Penyelenggaraan Sistem dan Transaksi Elektronik;</p>
# </li>
# </ol>"""

# # Parsing HTML dengan BeautifulSoup
# soup = BeautifulSoup(html_text, "html.parser")

# # Mengambil teks tanpa tag HTML
# clean_text = soup.get_text(separator="\n", strip=True)

# print(clean_text)

In [None]:
# import re

# def get_first_tag(html_text):
#     match = re.match(r"<\s*([a-zA-Z0-9]+)([^>]*)>", html_text)
#     return match.group(0) if match else None

# # Contoh penggunaan
# examples = [
#     '<p style="text-align:center">&nbsp;</p>',
#     '<p style="text-align:center"><strong>PERATURAN MENTERI KOMUNIKASI DAN INFORMATIKA</strong></p>',
#     '<ol style="list-style-type:decimal">'
# ]

# for ex in examples:
#     print(get_first_tag(ex))

In [None]:
# regulations = pd.read_csv('data.csv')
# regulations = regulations['link'].to_list()
# regulations

In [None]:
# def check_ol_tag(web_element: WebElement) -> str:
#     ol_dict = {
#         'a': 'lower-alpha', 
#         'lower-alpha': 'lower-alpha',
#         'decimal': 'decimal'
#     }

#     # Mendapatkan isi full tag <ol> HTML dari element tersebut
#     outer_html = web_element.get_attribute('outerHTML')
#     tag_html = re.search(r'<\s*([a-zA-Z0-9]+)([^>]*)>', outer_html)[0]

#     # Mendapatkan jenis <ol>: ['a', 'lower-alpha', 'decimal'], di mana 'a' == 'lower-alpha'
#     ol_type = re.search(r'\b(lower-alpha|decimal|a)\b', tag_html)
#     ol_type = ol_type[1] if ol_type is not None else 'decimal'

#     return ol_dict[ol_type]


# def scrape_regulation_content_element(web_element: WebElement, index: int, level: int = 1):
    
#     # Hasil akhir
#     result = ''

#     # Urutan lower-alpha
#     alphabet = 'abcdefghijklmnopqrstuvwxyz'

#     # Untuk tag <ol>
#     if web_element.tag_name == 'ol':
#         ol_type = check_ol_tag(web_element=web_element)                         # Dapatkan jenis tag <ol>: lower-alpha ata decimal
#         web_element = web_element.find_elements(By.XPATH, './*')                # Dapatkan list isi pasal (ayat)

#         for i, sub_element in enumerate(web_element):                           # Iterasi semua isi pasal (ayat): <li>
#             sub_element_component = sub_element.find_elements(By.XPATH, './*')  # Ambil semua child element di dalam ayat <li>
#             num_sub_element_component = len(sub_element_component)              # Cek apakah ayat <li> punya > 1 child element
            
#             if num_sub_element_component > 1:  # Cek apakah setiap ayat <li> punya > 1 child element

#                 for sub_sub_element in sub_element_component:  # Iterasi semua child element di dalam ayat <li>
#                     # Cek apakah child element nya ada <br>, jika ada maka hanya raw text
#                     if sub_sub_element.tag_name == 'br':
#                         text = sub_element.text.strip()
#                         index = f'({i + 1})' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                         result += f'\n{index} {text}'
#                         break  # Jika tidak break, maka akan copy output sebanyak jumlah br
#                     # Jika tidak ada <br>, maka pasti <p> atau <ol> lagi
#                     else:
#                         result += scrape_regulation_content_element(web_element=sub_sub_element, index=i + 1, level=level + 1)
            
#             elif num_sub_element_component == 0:  # Cek apakah ayat <li> tidak punya child element
#                 # Jika tidak punya child element, maka hanya raw text saja
#                 text = sub_element.text.strip()
#                 if level == 1:
#                     index = f'({i + 1})' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                     result += f'\n{index} {text}'
#                 elif level == 2:
#                     index = f'{i + 1}.' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                     result += f'\n\t{index} {text}'
#                 elif level == 3:
#                     index = f'{i + 1}.' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                     result += f'\n\t\t{index} {text}'
            
#             else:  # Jika ayat <li> hanya punya 1 child element, maka pasti <p> saja atau <br> saja
#                 # Cek apakah ada <br>                               
#                 for sub_sub_element in sub_element_component:
#                     if sub_sub_element.tag_name == 'br':
#                         text = sub_element.text.strip()
#                         index = f'({i + 1})' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                         result += f'\n{index} {text}'
#                         break  # Jika tidak break, maka akan copy output sebanyak jumlah br
                
#                 # Jika tidak ada <br> maka pasti <p> saja
#                 text = sub_element_component[0].text.strip()
                
#                 if level == 1:
#                     index = f'({i + 1})' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                     if text != '':
#                         result += f'\n{index} {text}'
#                     else:
#                         result += f'\n{text}'
#                 elif level == 2:
#                     index = f'{i + 1}.' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                     result += f'\n\t{index} {text}'
#                 elif level == 3:
#                     index = f'{i + 1}.' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                     result += f'\n\t\t{index} {text}'
            
#     # Untuk tag <p> atau no-tag
#     else:
#         text = web_element.text.strip()
#         if level == 1:
#             special_token_pattern_1 = \
#             r'(^bab \w+)|(^pasal \w+)|(^bagian \w+)|(^paragraf \w+)|(^menimbang)|(^mengingat)|(^memutuskan)|(^menetapkan)'
#             # |(^\d+. .*)
#             special_token_pattern_2 = r'(^agar setiap orang mengetahuinya)|(^ditetapkan di)|(^dengan rahmat Tuhan Yang Maha Esa)'
#             if re.search(special_token_pattern_1, text, re.IGNORECASE):
#                 result += f'\n\n## {text}'
#             elif re.search(special_token_pattern_2, text, re.IGNORECASE):
#                 result += f'\n\n{text}'
#             else: 
#                 result += f'\n{text}'
#         elif level == 2:
#             result += f'\n({index}) {text}'
#         elif level == 3:
#             result += f'\n\t{alphabet[index - 1]}. {text}'
    
#     return result


# for idx, url in enumerate([regulations[7]]):
#     result = ''
#     driver.get(url)
#     wait = WebDriverWait(driver, timeout=10)
#     wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div#produk-content')))

#     # Mendapatkan box peraturan perundang-undangan
#     regulation_box = driver.find_element(By.CSS_SELECTOR, 'div#produk-content')

#     # Dapatkan akses ke setiap element di dalam peraturan perundang-undangan: [<p>, <ol>]
#     regulation_contents = regulation_box.find_elements(By.XPATH, './*')
    
#     # Mengakses setiap element di dalam peraturan perundang-undangan: [<p>, <ol>]
#     start = time.time()
#     for index, regulation_content_element in enumerate(regulation_contents):
#         result = result + scrape_regulation_content_element(web_element=regulation_content_element, index=index, level=1)

#     result = result.strip()
#     result = re.sub(r'\n{3,}', '\n\n', result)
#     result = re.sub(r'(## pasal \w+)(\n{2})', r'\1\n', result, flags=re.IGNORECASE)

#     duration = time.time() - start
    
#     print(result, '\n')
#     print('=' * 100)
#     print(duration)
#     # print('GANTI PERATURAN')
#     # print('=' * 100, '\n')

In [None]:
# def check_ol_tag(web_element: WebElement) -> str:
#     ol_dict = {
#         'a': 'lower-alpha', 
#         'lower-alpha': 'lower-alpha',
#         'decimal': 'decimal'
#     }

#     # Mendapatkan isi full tag <ol> HTML dari element tersebut
#     outer_html = web_element.get_attribute('outerHTML')
#     tag_html = re.search(r'<\s*([a-zA-Z0-9]+)([^>]*)>', outer_html)[0]

#     # Mendapatkan jenis <ol>: ['a', 'lower-alpha', 'decimal'], di mana 'a' == 'lower-alpha'
#     ol_type = re.search(r'\b(lower-alpha|decimal|a)\b', tag_html)
#     ol_type = ol_type[1] if ol_type is not None else 'decimal'

#     return ol_dict[ol_type]

# def scrape_regulation_content_element(web_element: WebElement, index: int, level: int = 1):
    
#     alphabet = 'abcdefghijklmnopqrstuvwxyz'

#     # Untuk tag <ol>
#     if web_element.tag_name == 'ol':
#         ol_type = check_ol_tag(web_element=web_element)
#         web_element = web_element.find_elements(By.XPATH, './*')                # List isi pasal
#         for i, sub_element in enumerate(web_element):                           # Iterasi semua isi pasal (ayat)
#             sub_element_component = sub_element.find_elements(By.XPATH, './*')  # Cek apakah ayat punya > 1 child element
#             num_sub_element_component = len(sub_element_component)
            
#             if num_sub_element_component > 1:  # Cek apakah ayat punya > 1 child element
#                 for sub_sub_element in sub_element_component:
#                     if sub_sub_element.tag_name == 'br':
#                         index = f'({i + 1})' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                         print(f'{index} {sub_element.text}')
#                         break
#                     else:
#                         scrape_regulation_content_element(web_element=sub_sub_element, index=i + 1, level=level + 1)
            
#             elif num_sub_element_component == 0:  # Cek apakah ayat tidak punya child element
#                 if level == 1:
#                     index = f'({i + 1})' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                     print(f'{index} {sub_element.text}')
#                 elif level == 2:
#                     index = f'{i + 1}.' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                     print(f'{index} {sub_element.text}')
#                 elif level == 3:
#                     index = f'{i + 1}.' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                     print(f'\t\t{index} {sub_element.text}')
            
#             else:  # Jika tidak punya > 1 child element, maka pasti <p> atau no-tag 4                               
#                 for sub_sub_element in sub_element_component:
#                     if sub_sub_element.tag_name == 'br':
#                         index = f'({i + 1})' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                         print(f'{index} {sub_element.text}')
#                         break
#                 if level == 1:
#                     text = sub_element_component[0].text.strip()
#                     index = f'({i + 1})' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                     print(f'{index} {text}') if text != '' else print(text)
#                 elif level == 2:
#                     index = f'{i + 1}.' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                     print(f'\t{index} {sub_element_component[0].text}')
#                 elif level == 3:
#                     index = f'{i + 1}.' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                     print(f'\t\t{index} {sub_element_component[0].text}')
            
#     # Untuk tag <p> atau no-tag
#     else:
#         if level == 1:
#             text = web_element.text.strip()
#             special_token_pattern = r'(^bab \w+)|(^pasal \w+)|(^bagian \w+)|(^paragraf \w+)|(^menimbang)|(^mengingat)|(^menetapkan)'
#             is_article = re.search(special_token_pattern, text, re.IGNORECASE)
#             print(f'\n## {text}') if is_article else print(text)
#         elif level == 2:
#             print(f'({index}) {web_element.text}')
#         elif level == 3:
#             print(f'\t{alphabet[index - 1]}. {web_element.text}')

# for idx, url in enumerate([regulations[0]]):
    
#     result = ''
#     driver.get(url)
#     wait = WebDriverWait(driver, timeout=10)
#     wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div#produk-content')))

#     # Mendapatkan box peraturan perundang-undangan
#     regulation_box = driver.find_element(By.CSS_SELECTOR, 'div#produk-content')

#     # Dapatkan akses ke setiap element di dalam peraturan perundang-undangan: [<p>, <ol>]
#     regulation_contents = regulation_box.find_elements(By.XPATH, './*')
    
#     # Mengakses setiap element di dalam peraturan perundang-undangan: [<p>, <ol>]
#     for index, regulation_content_element in enumerate(regulation_contents):
#         scrape_regulation_content_element(web_element=regulation_content_element, index=index, level=1)
    
#     print()
#     print('=' * 100)
#     print('GANTI PERATURAN')
#     print('=' * 100)
#     print()

In [None]:
# def check_ol_tag(web_element: WebElement) -> str:
#     ol_dict = {
#         'a': 'lower-alpha', 
#         'lower-alpha': 'lower-alpha',
#         'decimal': 'decimal'
#     }

#     # Mendapatkan isi full tag <ol> HTML dari element tersebut
#     outer_html = web_element.get_attribute('outerHTML')
#     tag_html = re.search(r'<\s*([a-zA-Z0-9]+)([^>]*)>', outer_html)[0]

#     # Mendapatkan jenis <ol>: ['a', 'lower-alpha', 'decimal'], di mana 'a' == 'lower-alpha'
#     ol_type = re.search(r'\b(lower-alpha|decimal|a)\b', tag_html)
#     ol_type = ol_type[1] if ol_type is not None else 'decimal'
#     ol_type = ol_dict[ol_type]

#     return ol_type


# def scrape_regulation_content_element(web_element: WebElement, index: int, level: int = 1):
    
#     alphabet = 'abcdefghijklmnopqrstuvwxyz'

#     # Untuk tag <ol>
#     if web_element.tag_name == 'ol':
#         ol_type = check_ol_tag(web_element=web_element)
#         web_element = web_element.find_elements(By.XPATH, './*')        # List isi pasal
#         for i, sub_element in enumerate(web_element):                   # Iterasi semua isi pasal (ayat)
#             sub_element_component = sub_element.find_elements(By.XPATH, './*')    # Cek apakah ayat punya > 1 child element
#             num_sub_element_component = len(sub_element_component)
#             if num_sub_element_component > 1:                                    # Cek apakah ayat punya > 1 child element
#                 # print('======' + str(num_sub_element_component))
#                 for sub_sub_element in sub_element_component:
#                     if sub_sub_element.tag_name =='br':
#                         index = f'({i + 1})'
#                         print(f'{index} {sub_element.text}')
#                         break
#                     else:
#                         scrape_regulation_content_element(web_element=sub_sub_element, index=i + 1, level=level + 1)
#             elif num_sub_element_component == 0:
#                 # pass
#                 if level == 1:
#                     index = f'({i + 1})' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                     print(f'{index} {sub_element.text}')
#                 elif level == 2:
#                     index = f'({i + 1})' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                     if ol_type == 'decimal':
#                         print(f'{index} {sub_element.text}')
#                     else:
#                         print(f'\t{index} {sub_element.text}')
#                 elif level == 3:
#                     index = f'{i + 1}.' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                     if ol_type == 'decimal':
#                         print(f'\t\t{index} {sub_element.text}')
#                     else:
#                         print(f'\t{index} {sub_element.text}')
#             else:        
#                 # pass                                               # Jika tidak punya > 1 child element, maka pasti <p> atau no-tag 
#                 if level == 1:
#                     index = f'({i + 1})' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                     print(f'{index} {sub_element_component[0].text}')
#                 elif level == 2:
#                     index = f'({i + 1})' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                     if ol_type == 'decimal':
#                         print(f'{index} {sub_element_component[0].text}')
#                     else:
#                         print(f'\t{index} {sub_element_component[0].text}')
#                 elif level == 3:
#                     index = f'{i + 1}.' if ol_type == 'decimal' else f'{alphabet[i]}.'
#                     if ol_type == 'decimal':
#                         print(f'\t\t{index} {sub_element_component[0].text}')
#                     else:
#                         print(f'\t{index} {sub_element_component[0].text}')
#                 # print(f'LEVEL: {level} ..... {index} {sub_element[0].text}')
            
#     # Untuk tag <p> atau no-tag
#     else:
#         # pass
#         if level == 1:
#             is_article = re.search(r'(^bab \w+)|(^pasal \w+)|(^bagian \w+)', web_element.text, re.IGNORECASE)
#             if is_article:
#                 print(f'\n## {web_element.text}')
#             else:
#                 print(web_element.text)
#         elif level == 2:
#             print(f'({index}) {web_element.text}')
#         elif level == 3:
#             print(f'\t{alphabet[index - 1]}. {web_element.text}')


# for idx, url in enumerate(regulations):
    
#     driver.get(url)
#     wait = WebDriverWait(driver, timeout=5)
#     wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div#produk-content')))

#     # Mendapatkan box peraturan perundang-undangan
#     regulation_box = driver.find_element(By.CSS_SELECTOR, 'div#produk-content')

#     # Dapatkan akses ke setiap element di dalam peraturan perundang-undangan: [<p>, <ol>]
#     regulation_contents = regulation_box.find_elements(By.XPATH, './*')
    
#     # Mengakses setiap element di dalam peraturan perundang-undangan: [<p>, <ol>]
#     for index, regulation_content_element in enumerate(regulation_contents):
#         scrape_regulation_content_element(web_element=regulation_content_element, index=index, level=1)

In [None]:
# # def scrape_komdigi_jdih(url: str) -> list[dict]:


# def check_ol_tag(web_element: WebElement) -> str:
#     ol_dict = {
#         'a': 'lower-alpha', 
#         'lower-alpha': 'lower-alpha',
#         'decimal': 'decimal'
#     }

#     # Mendapatkan isi full tag <ol> HTML dari element tersebut
#     outer_html = web_element.get_attribute('outerHTML')
#     tag_html = re.search(r'<\s*([a-zA-Z0-9]+)([^>]*)>', outer_html)[0]

#     # Mendapatkan jenis <ol>: ['a', 'lower-alpha', 'decimal'], di mana 'a' == 'lower-alpha'
#     ol_type = re.search(r'\b(lower-alpha|decimal|a)\b', tag_html)
#     ol_type = ol_type[1] if ol_type is not None else 'decimal'
#     ol_type = ol_dict[ol_type]

#     return ol_type

# def scrape_regulation_content_element(web_element: WebElement):
    
#     alphabet = 'abcdefghijklmnopqrstuvwxyz'

#     # Untuk tag <ol>
#     if web_element.tag_name == 'ol':
#         ol_type = check_ol_tag(web_element=web_element)
#         web_element = web_element.find_elements(By.XPATH, './*')        # List isi pasal
#         for i, sub_element in enumerate(web_element):                   # Iterasi semua isi pasal (ayat)
#             sub_element = sub_element.find_elements(By.XPATH, './*')    # Collect the direct child elements of this ayat (clause)
#             if len(sub_element) > 1:                                    # Cek apakah ayat punya > 1 child element
#                 for j, sub_sub_element in enumerate(sub_element):
#                     scrape_regulation_content_element(sub_sub_element)
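#             else:                                                       # Not more than one child element, so it must be a <p> or untagged text
#                 # NOTE(review): sub_element was rebound above to the list returned by find_elements(),
#                 # so `.text` would raise AttributeError if this code were re-enabled.
#                 print(sub_element.text)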
            
#     # Untuk tag <p> atau no-tag
#     else:
#         print(web_element.text)

# for idx, url in enumerate([regulations[0]]):
    
#     driver.get(url)
#     wait = WebDriverWait(driver, timeout=5)
#     wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div#produk-content')))

#     # Mendapatkan box peraturan perundang-undangan
#     regulation_box = driver.find_element(By.CSS_SELECTOR, 'div#produk-content')

#     # Dapatkan akses ke setiap element di dalam peraturan perundang-undangan: [<p>, <ol>]
#     regulation_contents = regulation_box.find_elements(By.XPATH, './*')
    
#     # Mengakses setiap element di dalam peraturan perundang-undangan: [<p>, <ol>]
#     for i, regulation_content_element in enumerate(regulation_contents):

#         # LEVEL PASAL

#         # Jika ketemu tag <ol> di awal maka sudah pasti ada beberapa pasal di dalamnya
#         if regulation_content_element.tag_name == 'ol':  # TAG <ol>

#             tag_ol_type = check_ol_tag(web_element=regulation_content_element)
            
#             # Dapatkan akses ke setiap pasal <li>
#             regulation_sub_contents = regulation_content_element.find_elements(By.XPATH, './*')
            
#             # Iterasi setiap pasal <li>
#             for j, regulation_sub_content_element in enumerate(regulation_sub_contents):
            
#                 # Dapatkan seluruh isi di dalam pasal <li>
#                 regulation_sub_sub_contents = regulation_sub_content_element.find_elements(By.XPATH, './*')

#                 # Jika isinya lebih dari satu, maka sudah pasti ada sub pasal
#                 if len(regulation_sub_sub_contents) > 1:
    
#                     ## LEVEL SUB PASAL

#                     ## Iterasi setiap sub pasal di dalam pasal <li>
#                     for k, regulation_sub_sub_content_element in enumerate(regulation_sub_sub_contents):

#                         ## If an <ol> tag is found inside a sub-article, it must contain further nested <li> items
#                         if regulation_sub_sub_content_element.tag_name == 'ol':

#                             tag_ol_type = check_ol_tag(web_element=regulation_sub_sub_content_element)

#                             ## Dapatkan seluruh sub-sub pasal di dalam sub pasal <li> tersebut
#                             regulation_sub_sub_sub_contents = regulation_sub_sub_content_element.find_elements(By.XPATH, './*')

#                             ### Mengakses setiap <li>
#                             for l, regulation_sub_sub_sub_content_element in enumerate(regulation_sub_sub_sub_contents):

#                                 # Dapatkan seluruh isi di dalam pasal <li>
#                                 regulation_sub_sub_sub_sub_contents = regulation_sub_sub_sub_content_element.find_elements(By.XPATH, './*')

#                                 # Jika isinya lebih dari satu, maka sudah pasti ada sub pasal
#                                 if len(regulation_sub_sub_sub_sub_contents) > 1:

#                                     for m, regulation_sub_sub_sub_sub_content_element in enumerate(regulation_sub_sub_sub_sub_contents):

#                                         regulation_sub_sub_sub_sub_sub_contents = regulation_sub_sub_sub_sub_content_element.find_elements(By.XPATH, './*')

#                                         for n, regulation_sub_sub_sub_sub_sub_content_element in enumerate(regulation_sub_sub_sub_sub_sub_contents):
                                            
#                                             print(n + 1, regulation_sub_sub_sub_sub_sub_content_element.text)
#                                             # if regulation_sub_sub_sub_sub_content_element.tag_name == 'ol':

#                                 else:
#                                     pass
#                                     # print(l + 1, regulation_sub_sub_contents[0].text)
#                                 # ## Jika ketemu tag <ol> di dalam sub pasal maka sudah pasti sub <li> lagi di dalamnya
#                                 # print(l + 1, regulation_sub_sub_sub_content_element.text)
#                                 # # pass
            
#                 else:
#                     pass
#                     # print(j + 1, regulation_sub_sub_contents[0].text)
            
#             print('=' * 10)


#                 # Jika tidak ketemu tag <ol> lagi, maka sudah pasti ketemu tag <p> atau tidak ada inner tag
#                 # else:
#                 # print(j + 1, regulation_sub_content_element.text)
#                 # pass

#             # print('=' * 10)
#             # print(regulation_elements.text)
#             # print('=' * 10)

#         # Jika tidak ketemu tag <ol> di awal, maka sudah pasti ketemu tag <p>
#         else:
#             pass
#             # print(regulation_content_element.text)

#             # atribute = re.search(r'style=\"(.*)\"|type=\"(.*)\"', outer_html)
#             # if atribute:
#             #     atribute = atribute[1] or atribute[2]
#             #     atribute_type = 
#             #     print(outer_html + ' : ' + atribute)
#             # else:
#             #     print(outer_html)
#         # else:  # TAG <p>
#             # print(outer_html)
#         # print('=' * 20)

In [None]:
# # def scrape_komdigi_jdih(url: str) -> list[dict]:

# for idx, url in enumerate([regulations[0]]):
    
#     driver.get(url)
#     wait = WebDriverWait(driver, timeout=5)
#     wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div#produk-content')))

#     # Mendapatkan box peraturan perundang-undangan
#     regulation_box = driver.find_element(By.CSS_SELECTOR, 'div#produk-content')

#     # Dapatkan akses ke setiap element di dalam peraturan perundang-undangan: [<p>, <ol>]
#     regulation_contents = regulation_box.find_elements(By.XPATH, './*')
    
#     # Mengakses setiap element di dalam peraturan perundang-undangan: [<p>, <ol>]
#     for i, regulation_content_element in enumerate(regulation_contents):

#         # LEVEL PASAL

#         # Jika ketemu tag <ol> di awal maka sudah pasti ada beberapa pasal di dalamnya
#         if regulation_content_element.tag_name == 'ol':  # TAG <ol>

#             # Get the full opening <ol> tag (including its attributes) from this element's outerHTML
#             outer_html = regulation_content_element.get_attribute('outerHTML')
#             outer_html = re.search(r'<\s*([a-zA-Z0-9]+)([^>]*)>', outer_html)[0]

#             # Mendapatkan jenis <ol>: ['a', 'lower-alpha', 'decimal'], di mana 'a' == 'lower-alpha'
#             tag_ol_type = re.search(r'\b(lower-alpha|decimal|a)\b', outer_html)
#             tag_ol_type = tag_ol_type[1] if tag_ol_type is not None else 'decimal'
            
            
#             # Dapatkan akses ke setiap pasal <li>
#             regulation_sub_contents = regulation_content_element.find_elements(By.XPATH, './*')
            
#             # Iterasi setiap pasal <li>
#             for j, regulation_sub_content_element in enumerate(regulation_sub_contents):
            
#                 # Dapatkan seluruh isi di dalam pasal <li>
#                 regulation_sub_sub_contents = regulation_sub_content_element.find_elements(By.XPATH, './*')

#                 # Jika isinya lebih dari satu, maka sudah pasti ada sub pasal
#                 if len(regulation_sub_sub_contents) > 1:
    
#                     ## LEVEL SUB PASAL

#                     ## Iterasi setiap sub pasal di dalam pasal <li>
#                     for k, regulation_sub_sub_content_element in enumerate(regulation_sub_sub_contents):

#                         ## If an <ol> tag is found inside a sub-article, it must contain further nested <li> items
#                         if regulation_sub_sub_content_element.tag_name == 'ol':

#                             ## Get the full opening <ol> tag (including its attributes) from this element's outerHTML
#                             outer_html = regulation_sub_sub_content_element.get_attribute('outerHTML')
#                             outer_html = re.search(r'<\s*([a-zA-Z0-9]+)([^>]*)>', outer_html)[0]

#                             ## Mendapatkan jenis <ol>: ['a', 'lower-alpha', 'decimal'], di mana 'a' == 'lower-alpha'
#                             tag_ol_type = re.search(r'\b(lower-alpha|decimal|a)\b', outer_html)
#                             tag_ol_type = tag_ol_type[1] if tag_ol_type is not None else 'decimal'

#                             ## Dapatkan seluruh sub-sub pasal di dalam sub pasal <li> tersebut
#                             regulation_sub_sub_sub_contents = regulation_sub_sub_content_element.find_elements(By.XPATH, './*')

#                             ### Mengakses setiap <li>
#                             for l, regulation_sub_sub_sub_content_element in enumerate(regulation_sub_sub_sub_contents):

#                                 # Dapatkan seluruh isi di dalam pasal <li>
#                                 regulation_sub_sub_sub_sub_contents = regulation_sub_sub_sub_content_element.find_elements(By.XPATH, './*')

#                                 # Jika isinya lebih dari satu, maka sudah pasti ada sub pasal
#                                 if len(regulation_sub_sub_sub_sub_contents) > 1:

#                                     for m, regulation_sub_sub_sub_sub_content_element in enumerate(regulation_sub_sub_sub_sub_contents):

#                                         regulation_sub_sub_sub_sub_sub_contents = regulation_sub_sub_sub_sub_content_element.find_elements(By.XPATH, './*')

#                                         for n, regulation_sub_sub_sub_sub_sub_content_element in enumerate(regulation_sub_sub_sub_sub_sub_contents):
                                            
#                                             print(n + 1, regulation_sub_sub_sub_sub_sub_content_element.text)
#                                             # if regulation_sub_sub_sub_sub_content_element.tag_name == 'ol':

#                                 else:
#                                     pass
#                                     # print(l + 1, regulation_sub_sub_contents[0].text)
#                                 # ## Jika ketemu tag <ol> di dalam sub pasal maka sudah pasti sub <li> lagi di dalamnya
#                                 # print(l + 1, regulation_sub_sub_sub_content_element.text)
#                                 # # pass
            
#                 else:
#                     pass
#                     # print(j + 1, regulation_sub_sub_contents[0].text)
            
#             print('=' * 10)


#                 # Jika tidak ketemu tag <ol> lagi, maka sudah pasti ketemu tag <p> atau tidak ada inner tag
#                 # else:
#                 # print(j + 1, regulation_sub_content_element.text)
#                 # pass

#             # print('=' * 10)
#             # print(regulation_elements.text)
#             # print('=' * 10)

#         # Jika tidak ketemu tag <ol> di awal, maka sudah pasti ketemu tag <p>
#         else:
#             pass
#             # print(regulation_content_element.text)

#             # atribute = re.search(r'style=\"(.*)\"|type=\"(.*)\"', outer_html)
#             # if atribute:
#             #     atribute = atribute[1] or atribute[2]
#             #     atribute_type = 
#             #     print(outer_html + ' : ' + atribute)
#             # else:
#             #     print(outer_html)
#         # else:  # TAG <p>
#             # print(outer_html)
#         # print('=' * 20)