In [None]:
import os
import sys


sys.path.append(os.path.join(os.getcwd(), ".."))


from pathlib import Path
from selenium.webdriver import Firefox
from src.prep import (
    utils,
    PDFConverter,
    RegulationParser,
    BPKScraper,
    KomdigiScraper
)

### **Scrape Active Regulations**

In [None]:
# Change These Input Data
# Each entry pairs a regulation type with the search URL to scrape for it.
# The regulation type is also used as the sheet name in the output workbook.
input_data = [
    {
        "regulation_type": "UU",
        "url": "https://peraturan.bpk.go.id/Search?keywords=&tentang=&nomor=&jenis=8&tema=55"
    },
    {
        "regulation_type": "PP",
        "url": "https://peraturan.bpk.go.id/Search?keywords=&tentang=&nomor=&jenis=10&tema=55"
    },
    {
        "regulation_type": "PERMENKOMINFO",
        "url": "https://peraturan.bpk.go.id/Search?keywords=&tentang=&nomor=&jenis=106&tema=55"
    }
]

dir_path = os.path.join("..", "data", "active")
os.makedirs(dir_path, exist_ok=True)
output_path = os.path.join(dir_path, "active_regulation.xlsx")

web_driver = Firefox()
try:
    bpk_scraper = BPKScraper(web_driver=web_driver)

    for data in input_data:
        # Scrape active regulation links
        active_regulations = bpk_scraper.active_regulation(
            url=data["url"],
            regulation_type=data["regulation_type"],
            verbose=True
        )

        # Save output to .XLSX file (one sheet per regulation type)
        utils.list_of_dict_to_excel(
            data=active_regulations,
            output_path=output_path,
            sheet_name=data["regulation_type"]
        )
finally:
    # Always close the browser, even when scraping or writing fails;
    # otherwise a failed run leaves an orphaned Firefox process behind.
    web_driver.quit()

### **Scrape Regulation Metadata**

In [None]:
# Change These Input
# NOTE(review): this reads "selected_regulation.xlsx", while the scraping cell
# above writes "active_regulation.xlsx" — presumably the selection workbook is
# curated by hand in between; confirm.
file_path = os.path.join("..", "data", "active", "selected_regulation.xlsx")

# Load the "url_1" links from each regulation-type sheet, in sheet order.
uu, pp, permenkominfo = (
    utils.load_excel_selected_regulations(
        file_path=file_path,
        sheet_name=sheet_name,
        url_type="url_1",
        url_only=True
    )
    for sheet_name in ("UU", "PP", "PERMENKOMINFO")
)

# Combine all sheets into the single collection used by the metadata scraper.
selected_regulations = uu + pp + permenkominfo
print(f"Total regulations: {len(selected_regulations)}")
display(selected_regulations)

In [None]:
output_path = os.path.join("..", "data", "json", "regulation_data.json")
# Create the target folder before the (slow) scrape so that the results are
# not lost to a missing directory at write time. This mirrors the explicit
# os.makedirs call used for the active-regulation export.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

web_driver = Firefox()
try:
    bpk_scraper = BPKScraper(web_driver=web_driver)

    # Scrape metadata for every URL collected in `selected_regulations`.
    regulation_metadata = bpk_scraper.regulation_metadata(
        urls=selected_regulations,
        verbose=True
    )
finally:
    # Always close the browser, even when scraping raises; otherwise a failed
    # run leaves an orphaned Firefox process behind.
    web_driver.quit()

# Persist the scraped metadata once the browser is no longer needed.
utils.list_of_dict_to_json(
    data=regulation_metadata,
    output_path=output_path
)

In [None]:
# Post-process the metadata JSON written by the previous cell.
# NOTE(review): the download cell below reads "regulation_data_modified.json",
# presumably the file this call produces — confirm against utils.
input_json_file = os.path.join("..", "data", "json", "regulation_data.json")
utils.modify_status_json_regulation(
    input_json_file=input_json_file,
    verbose=True,
)

### **Download Regulation PDF**

In [None]:
input_json_file = os.path.join("..", "data", "json", "regulation_data_modified.json")
json_data = utils.read_json(input_path=input_json_file)

# Reduce each regulation record to the two fields the downloader is given:
# the target file name and the link to fetch.
download_data = [
    {
        "name": regulation["download_name"],
        "url": regulation["download_link"],
    }
    for regulation in json_data
]

# The download directory is anchored to the current working directory.
download_full_dir_path = os.path.join(os.getcwd(), "..", "data", "pdf")

BPKScraper.download_regulation_pdf(
    download_data=download_data,
    download_full_dir_path=download_full_dir_path,
    verbose=True,
)

### **Scrape Regulation Content**

In [None]:
# Change These Input
file_path = os.path.join("..", "data", "active", "selected_regulation.xlsx")

# Load the "url_2" entries from each regulation-type sheet, in sheet order.
# url_only=False is passed here, unlike the metadata step, which passes True.
uu, pp, permenkominfo = (
    utils.load_excel_selected_regulations(
        file_path=file_path,
        sheet_name=sheet_name,
        url_type="url_2",
        url_only=False
    )
    for sheet_name in ("UU", "PP", "PERMENKOMINFO")
)

# Combine all sheets into the single collection used by the content scraper.
regulation_names_and_links = uu + pp + permenkominfo
print(f"Total regulations: {len(regulation_names_and_links)}")
display(regulation_names_and_links)

In [None]:
output_dir = os.path.join("..", "data", "markdown", "raw", "komdigi")

web_driver = Firefox()
try:
    komdigi_scraper = KomdigiScraper(web_driver=web_driver)

    # Scrape the content of every regulation in `regulation_names_and_links`,
    # passing `output_dir` as the destination.
    komdigi_scraper.regulation_product_content(
        regulation_names_and_links=regulation_names_and_links,
        output_dir=output_dir,
        verbose=True
    )
finally:
    # Always close the browser, even when scraping raises; otherwise a failed
    # run leaves an orphaned Firefox process behind.
    web_driver.quit()

### **Convert PDF to TXT/MD**

In [None]:
# Source: the directory the PDF download step targets.
input_dir = Path("..", "data", "pdf")
# Destination for the converted BPK documents.
output_dir = Path("..", "data", "markdown", "raw", "bpk")

pdf_converter = PDFConverter()
pdf_converter.pdf_to_txt(input_dir=input_dir, output_dir=output_dir)

### **Parse Regulation Content**

In [None]:
# JSON metadata in, enriched JSON out.
json_input = os.path.join("..", "data", "json", "regulation_data_modified.json")
json_output = os.path.join("..", "data", "json", "regulation_data_final.json")

# NOTE(review): no cell in this notebook writes to markdown/clean/all — the
# earlier steps write to markdown/raw/komdigi and markdown/raw/bpk — so this
# directory is presumably filled by a cleaning step outside the notebook; confirm.
input_dir = os.path.join("..", "data", "markdown", "clean", "all")

parser = RegulationParser()
regulation_data = parser.parse_regulations_content(
    input_dir=input_dir,
    json_input=json_input,
    json_output=json_output,
    verbose=True,
)