In [None]:
import os
import re
import json
import time
# import dateparser
import pandas as pd
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.remote.webdriver import WebDriver
# from selenium.webdriver.remote.webelement import WebElement
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC

In [None]:
class BPKScraper:
    """Collect regulation metadata from paginated search result pages.

    The XPath/CSS selectors used by ``active_regulation`` are hard-coded for the
    markup of the search pages this notebook points at; they must be updated if
    that markup changes.
    """

    def __init__(self, web_driver: 'WebDriver'):
        """Store the Selenium driver used for every page request.

        Args:
            web_driver: An already started Selenium driver. ``active_regulation``
                quits this driver when it finishes, so the driver cannot be
                reused after that call.
        """
        # The annotation above is quoted so it is not evaluated when the class
        # is defined.
        self.web_driver = web_driver
        # Code tables for regulation types and document sections. No method
        # visible in this class reads them.
        self.ENCODE = {
            'type': {
                'UU': '01',
                'PERPPU': '02',
                'PP': '03',
                'PERPRES': '04',
                'PERMENKOMINFO': '05'
            },
            'section': {
                'document': '1',
                'considering': '2',
                'observing': '3',
                'definition': '4',
                'chapter': '5',
                'article': '6',
                'section': '7',
            }
        }

    @staticmethod  # https://stackoverflow.com/questions/735975/static-methods-in-python
    def list_of_dict_to_json(regulation_data: list[dict], output_path: str) -> None:
        """Write ``regulation_data`` to a UTF-8 JSON file indented by 4 spaces.

        Args:
            regulation_data: Records to serialise.
            output_path: Destination path; ``.json`` is appended when missing.
        """
        if not output_path.endswith('.json'):
            output_path = output_path + '.json'
        with open(output_path, 'w', encoding='utf-8') as file:
            json.dump(regulation_data, file, indent=4)

    @staticmethod
    def list_of_dict_to_excel(regulation_data: list[dict], output_path: str) -> None:
        """Write ``regulation_data`` to an Excel workbook without the index column.

        Args:
            regulation_data: Records to convert into a DataFrame (one row each).
            output_path: Destination path; ``.xlsx`` is appended when missing.
        """
        if not output_path.endswith('.xlsx'):
            output_path = output_path + '.xlsx'
        df = pd.DataFrame(regulation_data)
        df.to_excel(output_path, index=False)

    @staticmethod
    def _parse_number_and_year(number_and_year: str) -> tuple[str, str]:
        """Return ``(number, year)`` parsed from a regulation heading.

        The text is lower-cased first. The number is the digits following
        "nomor" or "no."; the year is the digits following "tahun". Either
        value is ``''`` when its pattern is absent, so a heading without a
        number no longer raises.
        """
        text = number_and_year.lower()
        number_match = re.search(r'\b(?:nomor|no\.)\s+(\d+)', text)
        year_match = re.search(r'tahun\s+(\d+)', text)
        number = number_match[1] if number_match is not None else ''
        year = year_match[1] if year_match is not None else ''
        return number, year

    @staticmethod
    def _summary_lines(url: str, regulation_type: str, total: int, durations: list) -> list[str]:
        """Build the lines of the verbose end-of-run report.

        The average is reported as 0.0 when ``total`` is 0 instead of dividing
        by zero.
        """
        total_time = sum(durations)
        average_time = total_time / total if total else 0.0
        return [
            '=' * 76,
            f'URL               : {url}',
            f'Regulation type   : {regulation_type}',
            f'Total regulations : {total} regulations',
            f'Total time        : {round(total_time, 3)} seconds',
            f'Average time      : {round(average_time, 3)} seconds',
            'NOTE! Time records do not include the 2 seconds break between each regulation',
            '=' * 76,
        ]

    def _load_page(self, page_url: str, ready_xpath: str, attempts: int) -> bool:
        """Open ``page_url`` and wait for ``ready_xpath``; retry on timeout.

        Returns True as soon as the element is present, or False once
        ``attempts`` tries have all timed out.
        """
        # TODO: apply this retry-on-timeout access to every web access in the other files
        for _ in range(attempts):
            try:
                self.web_driver.get(page_url)
                wait = WebDriverWait(self.web_driver, timeout=10)
                wait.until(EC.presence_of_element_located((By.XPATH, ready_xpath)))
                return True
            except TimeoutException:
                # If timeout, wait for 2 seconds before the next attempt
                time.sleep(2)
        return False

    def active_regulation(self, url: str, regulation_type: str, verbose: bool = True) -> list[dict]:
        """Scrape every result page of ``url`` and return the active regulations.

        A regulation is skipped when its text contains "Dicabut dengan". The
        web driver is always quit before this method returns or raises.

        Args:
            url: Search URL. An existing ``p=<digits>`` parameter is replaced by
                the page being visited; otherwise ``&p=<page>`` is appended.
            regulation_type: Prefix used in each record's ``name``.
            verbose: Show the progress bar, skip notices and the final report.

        Returns:
            One dict per active regulation with the keys ``name``, ``about``,
            ``subjects``, ``url_1``, ``url_2``, ``good_pdf``, ``used`` and
            ``notes``.

        Raises:
            TimeoutException: If the pagination box does not appear on the
                first page within 10 seconds.
        """
        regulations_box_xpath = '/html/body/div/div/div[2]/div[2]/div[2]'
        regulations_css_selector = 'div.row.mb-8[class="row mb-8"]'
        regulation_href_css_selector = 'div.col-lg-10.fs-2.fw-bold.pe-4 a'
        pagination_box_css_selector = 'ul.pagination.justify-content-center'
        regulation_number_css_selector = 'div.col-lg-8.fw-semibold.fs-5.text-gray-600'
        regulation_title_css_selector = 'div.col-lg-10.fs-2.fw-bold.pe-4'
        regulation_subjects_css_selector = 'span.badge.badge-light-primary.mb-2'
        page_pattern = r'p=(\d+)'
        trial_number = 10

        # Final result
        active_regulations = list()
        durations = list()

        # Check page numbering in URL and build a template with a {page} slot
        if re.search(page_pattern, url):
            new_url = re.sub(page_pattern, 'p={page}', url)
        else:
            new_url = url + '&p={page}'

        try:
            # Get last page number from the href of the pagination box's last child
            self.web_driver.get(new_url.format(page=1))
            wait = WebDriverWait(self.web_driver, timeout=10)
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, pagination_box_css_selector)))

            pagination_box = self.web_driver.find_element(By.CSS_SELECTOR, pagination_box_css_selector)
            last_page_button = pagination_box.find_element(By.XPATH, "./*[last()]")
            last_page_href = last_page_button.find_element(By.CSS_SELECTOR, '[href]').get_attribute('href')
            last_page_number = int(re.search(page_pattern, last_page_href)[1])

            # Iterate for every page
            for page in tqdm(iterable=range(1, last_page_number + 1), desc='Scraping active regulations', disable=not verbose):
                start = time.time()

                # Go to the page, giving up on it after trial_number timeouts
                if not self._load_page(new_url.format(page=page), regulations_box_xpath, trial_number):
                    if verbose:
                        print(f'Unable to access {url} on page={page} after {trial_number} attempts')
                        print(f'Skip the scraping process to page={page + 1}')
                    continue

                # Get all regulation instance
                regulations_box = self.web_driver.find_element(By.XPATH, regulations_box_xpath)
                regulations_all = regulations_box.find_elements(By.CSS_SELECTOR, regulations_css_selector)

                # Iterate for every regulation
                for regulation in regulations_all:
                    # Ignore all ineffective regulations
                    if 'Dicabut dengan' in regulation.text:
                        continue

                    # Get regulation number and year
                    heading = regulation.find_element(By.CSS_SELECTOR, regulation_number_css_selector).text
                    regulation_number, regulation_year = self._parse_number_and_year(heading)

                    # Get regulation title
                    regulation_title = regulation.find_element(By.CSS_SELECTOR, regulation_title_css_selector).text.strip()

                    # Get regulation subjects (empty list when there are no badges)
                    regulation_subjects = [
                        subject.text
                        for subject in regulation.find_elements(By.CSS_SELECTOR, regulation_subjects_css_selector)
                    ]

                    # Get regulation URL link
                    regulation_href = regulation.find_element(By.CSS_SELECTOR, regulation_href_css_selector).get_attribute("href")

                    # Create regulation temporary ID, just for ordering
                    regulation_id = f'{regulation_type}_{regulation_year}_{regulation_number.zfill(3)}'

                    # Append all data
                    active_regulations.append({
                        'name': regulation_id,
                        'about': regulation_title,
                        'subjects': regulation_subjects,
                        'url_1': regulation_href,
                        'url_2': '',
                        'good_pdf': False,
                        'used': False,
                        'notes': ''
                    })

                durations.append(time.time() - start)
                time.sleep(2)  # Break for 2 seconds
        finally:
            # Close the browser even when scraping fails part-way through
            self.web_driver.quit()

        if verbose:
            for line in self._summary_lines(url, regulation_type, len(active_regulations), durations):
                print(line)

        return active_regulations

In [None]:
# Search pages to crawl, one entry per regulation type -- edit this list as needed
input_data = [
    dict(
        regulation_type='UU',
        url='https://peraturan.bpk.go.id/Search?keywords=&tentang=&nomor=&jenis=8&tema=55',
    ),
    dict(
        regulation_type='PP',
        url='https://peraturan.bpk.go.id/Search?keywords=&tentang=&nomor=&jenis=10&tema=55',
    ),
    dict(
        regulation_type='PERMENKOMINFO',
        url='https://peraturan.bpk.go.id/Search?keywords=&tentang=&nomor=&entitas=603&tema=55',
    ),
]

# All output files are written below data/active (created when missing)
DIR_PATH = os.path.join('data', 'active')
os.makedirs(DIR_PATH, exist_ok=True)
# all_active_regulations = list()

for data in input_data:
    # A new browser is opened per regulation type because active_regulation
    # quits the driver it was given once it has finished
    web_driver = webdriver.Firefox()
    scraper = BPKScraper(web_driver=web_driver)
    active_regulations = scraper.active_regulation(
        data['url'],
        data['regulation_type'],
        verbose=True,
    )

    # Persist the records as JSON
    output_path = os.path.join(DIR_PATH, f'{data["regulation_type"]}.json')
    BPKScraper.list_of_dict_to_json(active_regulations, output_path)

    # Persist the same records as an Excel workbook (.xlsx)
    output_path = os.path.join(DIR_PATH, f'{data["regulation_type"]}.xlsx')
    BPKScraper.list_of_dict_to_excel(active_regulations, output_path)

    # all_active_regulations = all_active_regulations + active_regulations

# # Save all output to .json file
# output_path = os.path.join(DIR_PATH, 'active_regulations.json')
# BPKScraper.list_of_dict_to_json(
#     regulation_data=active_regulations,
#     output_path=output_path
# )

# # Save all output to .xlsx file
# output_path = os.path.join(DIR_PATH, 'active_regulations.xlsx')
# BPKScraper.list_of_dict_to_excel(
#     regulation_data=active_regulations,
#     output_path=output_path
# )

In [None]:
# def scrape_active_regulation_links(url: str, keyword: str) -> list[str]:
#     regulations_box_xpath = '/html/body/div/div/div[2]/div[2]/div[2]'
#     regulations_css_selector = 'div.row.mb-8[class="row mb-8"]'
#     regulation_href_css_selector = 'div.col-lg-10.fs-2.fw-bold.pe-4 a'
#     pagination_box_css_selector = 'ul.pagination.justify-content-center'
#     regulation_number_css_selector = 'div.col-lg-8.fw-semibold.fs-5.text-gray-600'
#     regulation_title_css_selector = 'div.col-lg-10.fs-2.fw-bold.pe-4'
#     regulation_subjects_css_selector = 'span.badge.badge-light-primary.mb-2'
#     page_pattern = r'p=(\d+)'

#     driver = webdriver.Firefox()
#     active_regulations_list = list()

#     # CHECK PAGE NUMBERING IN URL
#     if re.search(page_pattern, url):
#         url = re.sub(page_pattern, 'p={page}', url)
#     else:
#         url = url + '&p={page}'

#     # GET LAST PAGE NUMBER
#     driver.get(url.format(page=1))
#     wait = WebDriverWait(driver, timeout=5)
#     wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, pagination_box_css_selector)))
#     pagination_box = driver.find_element(By.CSS_SELECTOR, pagination_box_css_selector)
#     last_page_button = pagination_box.find_element(By.XPATH, "./*[last()]")
#     last_page_href = last_page_button.find_element(By.CSS_SELECTOR, '[href]').get_attribute('href')
#     last_page_number = int(re.search(page_pattern, last_page_href)[1])

#     # ITERATE FOR ALL PAGE
#     for page in tqdm(range(1, last_page_number + 1)):
#         driver.get(url.format(page=page))
#         wait = WebDriverWait(driver, timeout=5)
#         wait.until(EC.presence_of_element_located((By.XPATH, regulations_box_xpath)))
        
#         regulations_box = driver.find_element(By.XPATH, regulations_box_xpath)
#         regulations_all = regulations_box.find_elements(By.CSS_SELECTOR, regulations_css_selector)

#         for regulation in regulations_all:
#             if not re.findall('Dicabut dengan', regulation.text):
                
#                 # Regulation Number & Year
#                 regulation_number_and_year = regulation.find_element(By.CSS_SELECTOR, regulation_number_css_selector).text.lower()
                
#                 # Get Regulation Number
#                 new_regulation_number = re.search(r'\b(?:nomor|no\.)\s+(\d+)', regulation_number_and_year)
#                 old_regulation_number = re.search(r'(\d+)\/', new_regulation_number[0])
#                 regulation_number = new_regulation_number[1] if old_regulation_number is None else old_regulation_number[1]

#                 # Get Regulation Year
#                 regulation_year = re.search(r'tahun\s+(\d+)', regulation_number_and_year)
#                 regulation_year = regulation_year[1] if regulation_year is not None else ''

#                 # Get Regulation Title
#                 regulation_title = regulation.find_element(By.CSS_SELECTOR, regulation_title_css_selector).text.strip()
                
#                 # Get Regulation Subjects
#                 regulation_subjects = list()
#                 regulation_subject_elements = regulation.find_elements(By.CSS_SELECTOR, regulation_subjects_css_selector)
#                 if regulation_subject_elements:
#                     for subject in regulation_subject_elements:
#                         regulation_subjects.append(subject.text)
                
#                 # Get Regulation URL Link
#                 regulation_href = regulation.find_element(By.CSS_SELECTOR, regulation_href_css_selector).get_attribute("href")
                
#                 # Build Regulation ID
#                 number = f'{regulation_year}{regulation_number.zfill(3)}'
#                 regulation_id = f'{keyword}_{regulation_number.zfill(3)}_{regulation_year}'

#                 # Store All Data
#                 active_regulations_list.append({
#                     'no': number,
#                     'name': regulation_id,
#                     'about': regulation_title,
#                     'subjects': regulation_subjects,
#                     'url': regulation_href,
#                     'used': False
#                 })
        
#         time.sleep(1)
    
#     driver.quit()
#     return active_regulations_list

In [None]:
# # Change These Input Data
# # keyword = 'PERMENKOMINFO'
# # url = 'https://peraturan.bpk.go.id/Search?keywords=&tentang=&nomor=&entitas=603&tema=55'

# # keyword = 'UU'
# # url = 'https://peraturan.bpk.go.id/Search?keywords=&tentang=&nomor=&jenis=8&tema=55'

# keyword = 'PP'
# url = 'https://peraturan.bpk.go.id/Search?keywords=&tentang=&nomor=&jenis=10&tema=55'

# # Scrape Active Regulation Links
# active_regulations_list = scrape_active_regulation_links(url, keyword)
# print(f'Successfully scraping {len(active_regulations_list)} regulatory links')

# # Save Output to .xlsx File
# base_directory = os.path.join('output', 'active')
# os.makedirs(base_directory, exist_ok=True)

# output_path = os.path.join(base_directory, f'{keyword}.xlsx')
# df = pd.DataFrame(active_regulations_list)
# df.to_excel(output_path, index=False)

# print(f'Successfully saved {len(active_regulations_list)} regulatory links to {output_path}')

In [None]:
# active_regulations_list