In [None]:
import os
import pathlib

from selenium.webdriver import Firefox

from src.prep import utils
from src.prep import pdf_converter
from src.prep import regulation_parser
from src.prep.regulation_scraper import (
    BPKScraper,
    KomdigiScraper
)

### **Scrape Active Regulations**

In [None]:
# Change These Input Data
# Each entry pairs a regulation type label with the BPK search URL to crawl.
input_data = [
    {
        "regulation_type": "UU",
        "url": "https://peraturan.bpk.go.id/Search?keywords=&tentang=&nomor=&jenis=8&tema=55"
    },
    {
        "regulation_type": "PP",
        "url": "https://peraturan.bpk.go.id/Search?keywords=&tentang=&nomor=&jenis=10&tema=55"
    },
    {
        "regulation_type": "PERMENKOMINFO",
        "url": "https://peraturan.bpk.go.id/Search?keywords=&tentang=&nomor=&entitas=603&tema=55"
    }
]

# Make sure the output directory exists before anything is written to it.
dir_path = os.path.join("data", "active")
os.makedirs(dir_path, exist_ok=True)
output_path = os.path.join(dir_path, "active_regulation.xlsx")

web_driver = Firefox()
try:
    bpk_scraper = BPKScraper(web_driver=web_driver)

    for data in input_data:
        # Scrape active regulation links
        active_regulations = bpk_scraper.active_regulation(
            url=data["url"],
            regulation_type=data["regulation_type"],
            verbose=True
        )

        # Save output to .XLSX file, using the regulation type as the sheet name
        utils.list_of_dict_to_excel(
            data=active_regulations,
            output_path=output_path,
            sheet_name=data["regulation_type"]
        )
finally:
    # Always close the browser, even when scraping or saving raises,
    # so no orphaned Firefox process is left behind.
    web_driver.quit()

Scraping active regulations: 100%|██████████| 1/1 [00:04<00:00,  4.97s/it]


URL                 : https://peraturan.bpk.go.id/Search?keywords=&tentang=&nomor=&jenis=8&tema=55
Regulation type     : UU
Total regulations   : 5 regulations
Gross time          : 4.98 seconds
Net time            : 3.965 seconds
Average gross time  : 0.996 seconds
Average net time    : 0.793 seconds


Scraping active regulations: 100%|██████████| 4/4 [00:16<00:00,  4.23s/it]


URL                 : https://peraturan.bpk.go.id/Search?keywords=&tentang=&nomor=&jenis=10&tema=55
Regulation type     : PP
Total regulations   : 17 regulations
Gross time          : 16.941 seconds
Net time            : 12.928 seconds
Average gross time  : 0.997 seconds
Average net time    : 0.76 seconds


Scraping active regulations: 100%|██████████| 39/39 [02:51<00:00,  4.40s/it]


URL                 : https://peraturan.bpk.go.id/Search?keywords=&tentang=&nomor=&entitas=603&tema=55
Regulation type     : PERMENKOMINFO
Total regulations   : 178 regulations
Gross time          : 171.583 seconds
Net time            : 132.437 seconds
Average gross time  : 0.964 seconds
Average net time    : 0.744 seconds


### **Scrape Regulation Metadata**

In [2]:
# Change These Input
file_path = os.path.join("data", "active", "selected_regulation.xlsx")

# Load the "url_1" links from each sheet of the workbook, in the order
# UU -> PP -> PERMENKOMINFO, requesting URLs only.
uu, pp, permenkominfo = (
    utils.load_excel_selected_regulations(
        file_path=file_path,
        sheet_name=sheet_name,
        url_type="url_1",
        url_only=True,
    )
    for sheet_name in ("UU", "PP", "PERMENKOMINFO")
)

# Concatenate the per-sheet results into a single work list for the scraper.
selected_regulations = uu + pp + permenkominfo
print(f"Total regulations: {len(selected_regulations)}")
display(selected_regulations)

Total regulations: 63


['https://peraturan.bpk.go.id/Details/45357/uu-no-36-tahun-1999',
 'https://peraturan.bpk.go.id/Details/45370/uu-no-40-tahun-1999',
 'https://peraturan.bpk.go.id/Details/37589/uu-no-11-tahun-2008',
 'https://peraturan.bpk.go.id/Details/37582/uu-no-19-tahun-2016',
 'https://peraturan.bpk.go.id/Details/229798/uu-no-27-tahun-2022',
 'https://peraturan.bpk.go.id/Details/274494/uu-no-1-tahun-2024',
 'https://peraturan.bpk.go.id/Details/53323/pp-no-52-tahun-2000',
 'https://peraturan.bpk.go.id/Details/122030/pp-no-71-tahun-2019',
 'https://peraturan.bpk.go.id/Details/126143/pp-no-80-tahun-2019',
 'https://peraturan.bpk.go.id/Details/161970/pp-no-46-tahun-2021',
 'https://peraturan.bpk.go.id/Details/159911/permenkominfo-no-11permkominfo42007-tahun-200',
 'https://peraturan.bpk.go.id/Details/159930/permenkominfo-no-26permkominfo52007-tahun-200',
 'https://peraturan.bpk.go.id/Details/159937/permenkominfo-no-38permkominfo92007-tahun-200',
 'https://peraturan.bpk.go.id/Details/159035/permenkominf

In [None]:
# Destination of the scraped metadata.
output_path = os.path.join("data", "json", "regulation_data.json")

web_driver = Firefox()
try:
    bpk_scraper = BPKScraper(web_driver=web_driver)

    # Scrape metadata for every URL selected in the previous cell.
    regulation_metadata = bpk_scraper.regulation_metadata(
        urls=selected_regulations,
        verbose=True
    )

    # Persist the scraped metadata as JSON.
    utils.list_of_dict_to_json(
        data=regulation_metadata,
        output_path=output_path
    )
finally:
    # Always close the browser, even when scraping or saving raises,
    # so no orphaned Firefox process is left behind.
    web_driver.quit()

Scraping regulation metadata: 100%|██████████| 63/63 [03:15<00:00,  3.11s/it]


Total regulations   : 63 regulations
Gross time          : 195.63 seconds
Net time            : 69.498 seconds
Average gross time  : 3.105 seconds
Average net time    : 1.103 seconds


In [4]:
# Post-process the scraped metadata file. In the recorded run this step
# reported writing "regulation_data_modified.json" in the same directory.
input_json_file = os.path.join("data", "json", "regulation_data.json")
utils.modify_status_json_regulation(
    input_json_file=input_json_file,
    verbose=True,
)

Modify regulation metadata: 63it [00:00, ?it/s]

Successfully modified JSON data to data\json\regulation_data_modified.json





### **Download Regulation PDF**

In [None]:
# Read the status-adjusted metadata and build the download manifest from it.
input_json_file = os.path.join("data", "json", "regulation_data_modified.json")
json_data = utils.read_json(input_path=input_json_file)

# One entry per regulation: the file name to save under and the link to fetch.
download_data = [
    {
        "name": regulation["download_name"],
        "url": regulation["download_link"],
    }
    for regulation in json_data
]

# Absolute target directory, resolved against the current working directory.
download_full_dir_path = os.path.join(os.getcwd(), "data", "pdf")

# Call through the imported class: the import cell binds `BPKScraper` only,
# so the name `regulation_scraper` is undefined on a fresh kernel.
BPKScraper.download_regulation_pdf(
    download_data=download_data,
    download_full_dir_path=download_full_dir_path,
    verbose=True
)

Download files: 100%|██████████| 63/63 [05:20<00:00,  5.09s/it]
Rename files: 100%|██████████| 63/63 [00:00<00:00, 3350.98it/s]


Successfully download PDF file to data\pdf


### **Scrape Regulation Content**

In [None]:
# Change These Input
file_path = os.path.join("data", "active", "selected_regulation.xlsx")

# Load the "url_2" entries from each sheet of the workbook, in the order
# UU -> PP -> PERMENKOMINFO. With url_only=False the recorded output shows
# dicts carrying both a 'name' and a 'url'.
uu, pp, permenkominfo = (
    utils.load_excel_selected_regulations(
        file_path=file_path,
        sheet_name=sheet_name,
        url_type="url_2",
        url_only=False,
    )
    for sheet_name in ("UU", "PP", "PERMENKOMINFO")
)

# Concatenate the per-sheet results into a single work list for the scraper.
regulation_names_and_links = uu + pp + permenkominfo
print(f"Total regulations: {len(regulation_names_and_links)}")
display(regulation_names_and_links)

Total regulations: 55


[{'name': 'UU_1999_036',
  'url': 'https://jdih.komdigi.go.id/produk_hukum/view/id/564/t/undangundang+nomor+36+tahun+1999'},
 {'name': 'UU_2008_011',
  'url': 'https://jdih.komdigi.go.id/produk_hukum/view/id/167/t/undangundang+nomor+11+tahun+2008'},
 {'name': 'UU_2016_019',
  'url': 'https://jdih.komdigi.go.id/produk_hukum/view/id/555/t/undangundang+nomor+19+tahun+2016'},
 {'name': 'UU_2022_027',
  'url': 'https://jdih.komdigi.go.id/produk_hukum/view/id/832/t/undangundang+nomor+27+tahun+2022'},
 {'name': 'UU_2024_001',
  'url': 'https://jdih.komdigi.go.id/produk_hukum/view/id/884/t/undangundang+nomor+1+tahun+2024'},
 {'name': 'PP_2019_071',
  'url': 'https://jdih.komdigi.go.id/produk_hukum/view/id/695/t/peraturan+pemerintah+nomor+71+tahun+2019'},
 {'name': 'PP_2021_046',
  'url': 'https://jdih.komdigi.go.id/produk_hukum/view/id/762/t/peraturan+pemerintah+nomor+46+tahun+2021'},
 {'name': 'PERMENKOMINFO_2007_011',
  'url': 'https://jdih.komdigi.go.id/produk_hukum/view/id/431/t/peraturan+

In [None]:
# Directory that receives the scraped Komdigi content.
output_dir = os.path.join("data", "markdown", "raw", "komdigi")

web_driver = Firefox()
try:
    komdigi_scraper = KomdigiScraper(web_driver=web_driver)

    # Scrape the content of every regulation loaded in the previous cell.
    komdigi_scraper.regulation_product_content(
        regulation_names_and_links=regulation_names_and_links,
        output_dir=output_dir,
        verbose=True
    )
finally:
    # Always close the browser, even when scraping raises,
    # so no orphaned Firefox process is left behind.
    web_driver.quit()

Scraping regulation content:   0%|          | 0/55 [00:00<?, ?it/s]

Scraping regulation content: 100%|██████████| 55/55 [09:30<00:00, 10.36s/it]

Output directory    : data\markdown\raw\komdigi
Total regulations   : 55 regulations
Total success       : 55 regulations
Total failed        : 0 regulations
Gross time          : 570.067 seconds
Net time            : 459.733 seconds
Average gross time  : 10.365 seconds
Average net time    : 8.359 seconds





### **Convert PDF to TXT/MD**

In [4]:
# Source PDFs and the directory that receives the converted text.
input_dir = pathlib.Path("data", "pdf")
output_dir = pathlib.Path("data", "markdown", "raw", "bpk")

# Run the project's converter over the PDF directory.
pdf_converter_obj = pdf_converter.PDFConverter()
pdf_converter_obj.pdf_to_txt(
    input_dir=input_dir,
    output_dir=output_dir,
)

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be found.
Could not load the custom kernel for multi-scale deformable attention: DLL load failed while importing MultiScaleDeformableAttention: The specified module could not be

### **Parse Regulation Content**

In [None]:
parser = regulation_parser.RegulationParser()

# Inputs: the cleaned markdown directory and the status-adjusted metadata.
input_dir = os.path.join("data", "markdown", "clean", "all")
json_input = os.path.join("data", "json", "regulation_data_modified.json")

# Output: the final JSON file produced by the parser.
json_output = os.path.join("data", "json", "regulation_data_final.json")

# Parse every regulation and keep the returned data for inspection.
regulation_data = parser.parse_regulations_content(
    input_dir=input_dir,
    json_input=json_input,
    json_output=json_output,
    verbose=True,
)

Parsing regulations content: 100%|██████████| 63/63 [00:00<00:00, 124.31it/s]


Input directory     : data\markdown\clean\all
Input JSON          : data\json\regulation_data_modified.json
Output JSON         : data\json\regulation_data_final.json
Total regulations   : 63 regulations
Total success       : 63 regulations
Total failed        : 0 regulations
Total articles      : 2423 articles
Total time          : 0.505 seconds
Average time/file   : 8.013 miliseconds
