In [None]:
import os
import re
import json
import time
import dateparser
import pandas as pd
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.remote.webdriver import WebDriver
# from selenium.webdriver.remote.webelement import WebElement
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC

## **Scraping Regulation Metadata**

In [None]:
def load_excel_selected_regulations(file_path: str, sheet_name: str, url_type: str = 'url_1',
                                    url_only: bool = True ) -> list[str] | list[dict]:

    # Read All Regulation with 'used' == 1
    selected_regulations = pd.read_excel(file_path, sheet_name=sheet_name)
    selected_regulations = selected_regulations.loc[selected_regulations['used'] == 1].copy()
    if url_only:
        return selected_regulations[url_type].tolist()
    else:
        selected_regulations = selected_regulations.loc[selected_regulations[url_type].notna()].copy()
        selected_regulations = selected_regulations.loc[:, ['name', url_type]].copy()
        selected_regulations.rename(columns={'name': 'name', url_type: 'url'}, inplace=True)
        return selected_regulations.to_dict(orient='records')

# Inputs to adjust: workbook name and the directory it lives in
filename = 'dataset.xlsx'
dir_path = os.path.join('data', 'active')
file_path = os.path.join(dir_path, filename)

# Read the selected URLs of each regulation type, one sheet per type
uu, pp, permenkominfo = (
    load_excel_selected_regulations(file_path=file_path, sheet_name=sheet, url_type='url_1', url_only=True)
    for sheet in ('UU', 'PP', 'PERMENKOMINFO')
)

# Concatenate in the order UU, PP, PERMENKOMINFO
selected_regulations = [*uu, *pp, *permenkominfo]
print(f'Total regulations: {len(selected_regulations)}')
display(selected_regulations)

In [None]:
class BPKScraper:
    """Scrape regulation metadata from regulation detail pages with Selenium.

    The XPaths, CSS selector and field labels ('Judul', 'Nomor', ...) used
    here are tied to one specific page layout; the notebook feeds it pages
    under https://peraturan.bpk.go.id/Details/.
    """

    # Absolute XPaths of the three boxes that are read from a detail page.
    _METADATA_BOX_XPATH = '/html/body/div/div/div[2]/div[2]/div/div[1]/div[2]/div'
    _DOWNLOAD_BOX_XPATH = '/html/body/div/div/div[2]/div[2]/div/div[2]/div[1]'
    _STATUS_BOX_XPATH = '/html/body/div/div/div[2]/div[2]/div/div[2]/div[2]'
    # Inner container looked up inside both the metadata box and the status box.
    _INNER_BOX_CSS_SELECTOR = 'div.container.fs-6'
    # Headers that introduce a group of referenced regulations in the status box.
    _STATUS_TYPE_PATTERNS = r'(Dicabut dengan :|Diubah dengan :|Mengubah :|Mencabut :)'
    # Exact header text -> key of the 'status' dictionary in the result.
    _STATUS_KEY_BY_HEADER = {
        'Dicabut dengan :': 'repealed',
        'Mencabut :': 'repeal',
        'Diubah dengan :': 'amended',
        'Mengubah :': 'amend',
    }
    _MAX_ATTEMPTS = 10           # page-load attempts per link
    _PAGE_TIMEOUT_SECONDS = 10   # explicit-wait timeout per attempt
    _RETRY_DELAY_SECONDS = 2     # pause after a timed-out attempt
    _POLITE_DELAY_SECONDS = 2    # pause between two scraped regulations

    def __init__(self, web_driver: 'WebDriver'):
        """Store the Selenium driver and build the lookup tables.

        Args:
            web_driver: Selenium WebDriver used for every page load. The
                annotation is a string so that defining the class does not
                require the selenium name to be in scope.
        """
        self.web_driver = web_driver
        # Codes used when composing a regulation ID (only 'document' of the
        # section codes is used within this class).
        self.ENCODE = {
            'type': {
                'UU': '01',
                'PERPPU': '02',
                'PP': '03',
                'PERPRES': '04',
                'PERMENKOMINFO': '05'
            },
            'section': {
                'document': '1',
                'considering': '2',
                'observing': '3',
                'definition': '4',
                'chapter': '5',
                'article': '6',
                'section': '7',
            }
        }
        # Indonesian ordinal words -> number, used for "Perubahan <ordinal> Atas" titles.
        self.WORD_TO_NUMBER = {
            "kesatu": 1, "kedua": 2, "ketiga": 3, "keempat": 4, "kelima": 5,
            "keenam": 6, "ketujuh": 7, "kedelapan": 8, "kesembilan": 9, "kesepuluh": 10,
            "kesebelas": 11, "kedua belas": 12, "ketiga belas": 13, "keempat belas": 14, "kelima belas": 15,
            "keenam belas": 16, "ketujuh belas": 17, "kedelapan belas": 18, "kesembilan belas": 19, "kedua puluh": 20,
            "kedua puluh satu": 21, "kedua puluh dua": 22, "kedua puluh tiga": 23, "kedua puluh empat": 24, "kedua puluh lima": 25,
            "kedua puluh enam": 26, "kedua puluh tujuh": 27, "kedua puluh delapan": 28, "kedua puluh sembilan": 29, "ketiga puluh": 30,
            "ketiga puluh satu": 31, "ketiga puluh dua": 32, "ketiga puluh tiga": 33, "ketiga puluh empat": 34, "ketiga puluh lima": 35,
            "ketiga puluh enam": 36, "ketiga puluh tujuh": 37, "ketiga puluh delapan": 38, "ketiga puluh sembilan": 39, "keempat puluh": 40,
            "keempat puluh satu": 41, "keempat puluh dua": 42, "keempat puluh tiga": 43, "keempat puluh empat": 44, "keempat puluh lima": 45,
            "keempat puluh enam": 46, "keempat puluh tujuh": 47, "keempat puluh delapan": 48, "keempat puluh sembilan": 49, "kelima puluh": 50
        }


    @staticmethod  # https://stackoverflow.com/questions/735975/static-methods-in-python
    def list_of_dict_to_json(regulation_data: list[dict], output_path: str) -> None:
        """Write ``regulation_data`` to a UTF-8 JSON file indented by 4 spaces.

        Args:
            regulation_data: Records to serialise.
            output_path: Destination path; '.json' is appended when missing.
        """
        if not output_path.endswith('.json'):
            output_path = output_path + '.json'
        with open(output_path, 'w', encoding='utf-8') as file:
            json.dump(regulation_data, file, indent=4)


    @staticmethod
    def _first_group(pattern: str, text: str) -> str:
        """Return group 1 of a case-insensitive search, or '' when there is no match."""
        match = re.search(pattern, text, re.IGNORECASE)
        return match[1] if match is not None else ''


    @staticmethod
    def _parse_date(pattern: str, text: str) -> str:
        """Return the date captured by ``pattern`` as 'YYYY-MM-DD', or '' when absent or unparseable."""
        match = re.search(pattern, text, re.IGNORECASE)
        if match is None:
            return ''
        parsed = dateparser.parse(date_string=match[1], languages=['id'])
        # dateparser returns None for text it cannot interpret
        return parsed.strftime('%Y-%m-%d') if parsed is not None else ''


    def _amendment_number(self, about: str) -> str:
        """Return the amendment number encoded in the 'about' text ('0' when it is not an amendment)."""
        if re.search(r'^Perubahan Atas', about, re.IGNORECASE):
            return '1'
        ordinal = re.search(r'^Perubahan (.+) Atas', about, re.IGNORECASE)
        if ordinal is None:
            return '0'
        # Raises KeyError for an ordinal word missing from WORD_TO_NUMBER
        return str(self.WORD_TO_NUMBER[ordinal[1].strip().lower()])


    def _open_page(self, regulation_link: str) -> bool:
        """Load a page, retrying after timeouts; return True once both required boxes are present."""
        for _ in range(self._MAX_ATTEMPTS):
            try:
                self.web_driver.get(regulation_link)
                wait = WebDriverWait(self.web_driver, timeout=self._PAGE_TIMEOUT_SECONDS)
                wait.until(EC.presence_of_element_located((By.XPATH, self._METADATA_BOX_XPATH)))
                wait.until(EC.presence_of_element_located((By.XPATH, self._DOWNLOAD_BOX_XPATH)))
                return True
            except TimeoutException:
                time.sleep(self._RETRY_DELAY_SECONDS)
        return False


    def _extract_metadata(self) -> dict:
        """Read the metadata rows of the currently loaded page into a dictionary.

        Every field starts from a default, so a page with fewer rows than
        expected can neither reuse values scraped from an earlier page nor
        leave a field undefined.
        """
        fields = {
            'title': '', 'about': '', 'amendment': '0', 'number': '',
            'type': '', 'short_type': '', 'year': '', 'issue_place': '',
            'issue_date': '', 'effective_date': '', 'subjects': [],
            'status': '', 'institution': '',
        }

        metadata_box = self.web_driver.find_element(By.XPATH, self._METADATA_BOX_XPATH)
        metadata_inner_box = metadata_box.find_element(By.CSS_SELECTOR, self._INNER_BOX_CSS_SELECTOR)
        # The last two children are excluded (presumably not metadata rows — TODO confirm)
        metadata_elements = metadata_inner_box.find_elements(By.XPATH, './*')[:-2]

        # Rows are identified by their position within the inner box
        for index, element in enumerate(metadata_elements):
            if index == 1:  # Regulation title, what it is about, amendment number
                fields['title'] = self._first_group(r'Judul\s(.*)', element.text)
                fields['about'] = self._first_group(r'Tentang (.*)', fields['title'])
                fields['amendment'] = self._amendment_number(fields['about'])
            elif index == 3:  # Regulation number
                fields['number'] = self._first_group(r'Nomor\s(\d+)', element.text)
            elif index == 4:  # Regulation type
                fields['type'] = self._first_group(r'Bentuk\s(.*)', element.text)
            elif index == 5:  # Regulation short type
                fields['short_type'] = self._first_group(r'Bentuk Singkat\s(.*)', element.text).upper()
            elif index == 6:  # Regulation year
                fields['year'] = self._first_group(r'Tahun\s(.*)', element.text)
            elif index == 7:  # Regulation issue place
                fields['issue_place'] = self._first_group(r'Tempat Penetapan\s(.*)', element.text)
            elif index == 8:  # Regulation issue date
                fields['issue_date'] = self._parse_date(r'Tanggal Penetapan\s(.*)', element.text)
            elif index == 10:  # Regulation effective date
                fields['effective_date'] = self._parse_date(r'Tanggal Berlaku\s(.*)', element.text)
            elif index == 12:  # Regulation subjects, separated by '-'
                raw_subjects = self._first_group(r'Subjek\s(.*)', element.text)
                # Empty entries are dropped, so "no subjects" is always []
                fields['subjects'] = [part.strip() for part in raw_subjects.split('-') if part.strip()]
            elif index == 13:  # Regulation status
                fields['status'] = self._first_group(r'Status\s(.*)', element.text)
            elif index == 15:  # Regulation institution
                fields['institution'] = self._first_group(r'Lokasi\s(.*)', element.text)

        return fields


    def _extract_status_references(self) -> dict:
        """Collect the links of related regulations from the status box of the current page."""
        references = {'repealed': list(), 'repeal': list(), 'amended': list(), 'amend': list()}

        status_box = self.web_driver.find_element(By.XPATH, self._STATUS_BOX_XPATH)
        try:
            status_inner_box = status_box.find_element(By.CSS_SELECTOR, self._INNER_BOX_CSS_SELECTOR)
        except NoSuchElementException:
            # No inner container: the page lists no related regulations
            return references

        # A header applies only to the element that immediately follows it
        pending_header = None
        for element in status_inner_box.find_elements(By.XPATH, './*'):
            text = element.text.strip()
            if re.search(self._STATUS_TYPE_PATTERNS, text, re.IGNORECASE):
                pending_header = text
                continue

            current_header, pending_header = pending_header, None
            status_key = self._STATUS_KEY_BY_HEADER.get(current_header)
            if status_key is None:
                continue
            for regulation_reference in element.find_elements(By.CSS_SELECTOR, '[href]'):
                references[status_key].append(regulation_reference.get_attribute('href'))

        return references


    def regulation_metadata(self, regulation_links: list[str], verbose: bool = True) -> list[dict]:
        """Scrape the metadata of every regulation page in ``regulation_links``.

        Links that cannot be loaded after the configured number of attempts
        and regulations whose status is 'tidak berlaku' are skipped. The web
        driver is always quit before this method returns or raises.

        Args:
            regulation_links: URLs of the regulation detail pages.
            verbose: Show the progress bar, skip messages and timing summary.

        Returns:
            One dictionary per scraped regulation, in input order.

        Raises:
            KeyError: If a page reports a short type missing from
                ``ENCODE['type']`` or an amendment ordinal missing from
                ``WORD_TO_NUMBER``.
            NoSuchElementException: If an expected box is missing from a
                loaded page.
        """
        results = list()
        durations = list()

        try:
            for regulation_link in tqdm(iterable=regulation_links, desc='Scraping regulation metadata', disable=not verbose):
                start = time.time()

                if not self._open_page(regulation_link):
                    if verbose:
                        print(f'Unable to access {regulation_link} after {self._MAX_ATTEMPTS} attempts')
                        print('Skip the scraping process to the next regulation link')
                    continue

                fields = self._extract_metadata()
                if fields['status'].lower() == 'tidak berlaku':
                    print(f'INEFFECTIVE REGULATION: {regulation_link}')
                    continue

                short_type = fields['short_type']
                year = fields['year']
                number = fields['number']

                # Create regulation ID
                regulation_id = '{year}{type}{number}{section}{section_number}{additional_section_number}'.format(
                    year=year,
                    type=self.ENCODE['type'][short_type],
                    number=str(number).zfill(3),
                    section=self.ENCODE['section']['document'],
                    section_number='000',
                    additional_section_number='00'
                )

                # Extract download link and name
                download_box = self.web_driver.find_element(By.XPATH, self._DOWNLOAD_BOX_XPATH)
                download_link = download_box.find_element(By.CSS_SELECTOR, '[href]').get_attribute('href')
                download_name = f'{short_type}_{year}_{str(number).zfill(3)}'

                # Extract regulation status references
                status_references = self._extract_status_references()

                results.append({
                    'id': regulation_id,                         # Regulation ID
                    'url': regulation_link,                      # Regulation web page
                    'download_link': download_link,              # Regulation download link
                    'download_name': download_name,              # Download file name
                    'title': fields['title'],                    # Full regulation title
                    'about': fields['about'],                    # What the regulation is about
                    'type': fields['type'],                      # Regulation type
                    'short_type': short_type,                    # Regulation type (abbreviation)
                    'amendment': fields['amendment'],            # Amendment number
                    'number': number,                            # Regulation number
                    'year': year,                                # Regulation year
                    'institution': fields['institution'],        # Institution
                    'issue_place': fields['issue_place'],        # Place of enactment
                    'issue_date': fields['issue_date'],          # Date of enactment
                    'effective_date': fields['effective_date'],  # Effective date
                    'subjects': fields['subjects'],              # Subjects
                    'status': {
                        'repealed': status_references['repealed'],  # Repealed by ...
                        'repeal': status_references['repeal'],      # Repeals ...
                        'amended': status_references['amended'],    # Amended by ...
                        'amend': status_references['amend']         # Amends ...
                    }
                })

                durations.append(time.time() - start)
                time.sleep(self._POLITE_DELAY_SECONDS)  # Break between two regulations
        finally:
            # Release the browser even when scraping fails half-way
            self.web_driver.quit()

        if verbose:
            # Durations exist only for scraped regulations, so average over those
            average = sum(durations) / len(durations) if durations else 0.0
            print('=' * 76)
            print(f'Total regulations : {len(regulation_links)} regulations')
            print(f'Total time        : {round(sum(durations), 3)} seconds')
            print(f'Average time      : {round(average, 3)} seconds')
            print('NOTE! Time records do not include the 2 seconds break between each regulation')
            print('=' * 76)

        return results

In [None]:
# Output location of the scraped metadata.
# NOTE: this rebinds `dir_path`, which an earlier cell set to os.path.join('data', 'active').
dir_path = 'data'
output = 'regulation_data.json'
output_path = os.path.join(dir_path, output)

# Start Firefox and scrape every selected regulation URL.
# regulation_metadata() quits the driver itself once scraping is done.
web_driver = webdriver.Firefox()
scraper = BPKScraper(web_driver=web_driver)
regulation_metadata = scraper.regulation_metadata(
    regulation_links=selected_regulations,
    verbose=True
)

# Persist the scraped records as JSON
BPKScraper.list_of_dict_to_json(
    regulation_data=regulation_metadata,
    output_path=output_path
)

In [None]:
# from jdih_scraper.bpk import BPKScraper

# OUTPUT = 'test.json'
# OUTPUT_PATH = os.path.join(DIR_PATH, OUTPUT)

# web_driver = webdriver.Firefox()
# scraper = BPKScraper(web_driver=web_driver)
# regulation_data = scraper.scrape_jdih_bpk_regulation_metadata(
#     regulation_links=selected_regulations[:5],
#     output_path=OUTPUT_PATH,
#     verbose=True
# )

In [None]:
# Read the scraped metadata back from the JSON file
# https://stackoverflow.com/questions/20199126/reading-json-from-a-file
# The file is written as UTF-8, so read it with the same explicit encoding
# instead of the platform-dependent default.
with open(output_path, encoding='utf-8') as input_file:
    json_data = json.load(input_file)

json_data

In [None]:
# Peek at the download file name stored for the first scraped regulation
json_data[0]['download_name']

## **Create CSV File for URL -> ID Mapping**

In [None]:
# One {'url': ..., 'id': ...} record per scraped regulation
mapping_url_id = []
for regulation in tqdm(json_data):
    mapping_url_id.append({key: regulation[key] for key in ('url', 'id')})

In [None]:
# Save the URL -> ID mapping as CSV.
# NOTE: `output` and `output_path` are rebound here and now point at the CSV file.
output = 'mapping_url_id.csv'
output_path = os.path.join(dir_path, output)

# Build the mapping table, write it without the index column, then display it
mapping_url_id_df = pd.DataFrame(mapping_url_id)
mapping_url_id_df.to_csv(output_path, index=False)
mapping_url_id_df

## **Modified Regulation Data Status**

- Mengubah url di 'status' menjadi ID jika ada di dalam Mapping URL -> ID
- Jika tidak ada di dalam Mapping URL -> ID maka tetap biarkan dalam bentuk url

In [None]:
# Spot check: look up the ID mapped to one known regulation URL
# (`.values[0]` raises IndexError when that URL is not in the mapping).
mapping_url_id_df.loc[mapping_url_id_df['url'] == 'https://peraturan.bpk.go.id/Details/45357/uu-no-36-tahun-1999', 'id'].values[0]

In [None]:
# Build the URL -> ID lookup once instead of scanning the DataFrame for every
# reference. drop_duplicates keeps the first row per URL, which matches taking
# `.values[0]` of a filtered column.
url_to_id = (
    mapping_url_id_df
    .drop_duplicates(subset='url')
    .set_index('url')['id']
    .astype(str)
    .to_dict()
)

# Wrap the list itself (not the enumerate object) so tqdm knows the total
for index, regulation in enumerate(tqdm(json_data)):
    for status, values in regulation['status'].items():
        # A reference with a known ID becomes that ID; any other stays a URL.
        # To drop unmapped URLs instead, use:
        #     [url_to_id[val] for val in values if val in url_to_id]
        json_data[index]['status'][status] = [url_to_id.get(val, val) for val in values]

In [None]:
# Save the regulation data after its status references were rewritten.
# NOTE: `output` and `output_path` are rebound to the modified-JSON file here.
output = 'regulation_data_modified.json'
output_path = os.path.join(dir_path, output)

# NOTE(review): list_of_dict_to_json returns None, so `output_json_str` is always None
output_json_str = BPKScraper.list_of_dict_to_json(json_data, output_path)
print(f'Successfully saved {len(json_data)} regulatory data to {output_path}')

json_data

## **Create EXCEL File for Filtering**

TIDAK JADI.

Karena sekarang aku melakukan filtering dulu, baru melakukan semua yang di atas.

In [None]:
# filtering_data = list()
# for regulation in tqdm(json_data):
#     filtering_data.append({
#         'id': regulation['id'],
#         'name': regulation['download_name'],
#         'about': regulation['about'],
#         'subjects': regulation['subjects'],
#         'url': regulation['url'],
#         'used': False
#     })

In [None]:
# filtering_data_df = pd.DataFrame(filtering_data)
# filtering_data_df.to_excel(os.path.join('output', 'regulation_data_filtering_real.xlsx'), index=False)
# filtering_data_df

## **Arsip**

In [None]:
# def scrape_regulation_data(links: list[str]) -> list[dict]:
#     metadata_box_xpath = '/html/body/div/div/div[2]/div[2]/div/div[1]/div[2]/div'
#     download_box_xpath = '/html/body/div/div/div[2]/div[2]/div/div[2]/div[1]'
#     status_box_xpath = '/html/body/div/div/div[2]/div[2]/div/div[2]/div[2]'
#     metadata_inner_box_css_selector = 'div.container.fs-6'
#     status_inner_box_css_selector = 'div.container.fs-6'
#     status_type_patterns = r'(Dicabut dengan :|Diubah dengan :|Mengubah :|Mencabut :)'

#     driver = webdriver.Firefox()
#     regulation_data = list()

#     for regulation_link in tqdm(iterable=links, desc='Scraping regulation data'):
#         # print(f'{str(regulation_index + 1).zfill(3)}. {regulation_link}')
#         driver.get(regulation_link)
#         wait = WebDriverWait(driver, timeout=5)
#         wait.until(EC.presence_of_element_located((By.XPATH, metadata_box_xpath)))
#         wait.until(EC.presence_of_element_located((By.XPATH, download_box_xpath)))

#         # EXTRACT METADATA
#         ineffective = False
#         metadata_box = driver.find_element(By.XPATH, metadata_box_xpath)
#         metadata_inner_box = metadata_box.find_element(By.CSS_SELECTOR, metadata_inner_box_css_selector)
#         metadata_elements = metadata_inner_box.find_elements(By.XPATH, './*')[:-2]

#         for index, element in enumerate(metadata_elements):
#             if index == 1:  # Regulation title and about
#                 title = re.search(r'Judul\s(.*)', element.text)
#                 title = title[1] if title is not None else ''
#                 about = re.search(r'[Tt]entang (.*)', title)
#                 about = about[1] if about is not None else ''
#             elif index == 3:  # Regulation number
#                 number = re.search(r'Nomor\s(\d+)', element.text)
#                 number = number[1] if number is not None else ''
#                 # old_numbering = re.search(r'(\d+)\/', new_numbering[0])
#                 # number = new_numbering[1] if old_numbering is None else old_numbering[1]
#             elif index == 4:  # Regulation type
#                 regulation_type = re.search(r'Bentuk\s(.*)', element.text)
#                 regulation_type = regulation_type[1] if regulation_type is not None else ''
#             elif index == 5:  # Regulation short type
#                 short_type = re.search(r'Bentuk Singkat\s(.*)', element.text)
#                 short_type = short_type[1].upper() if short_type is not None else ''
#             elif index == 6:  # Regulation year
#                 year = re.search(r'Tahun\s(.*)', element.text)
#                 year = year[1] if year is not None else ''
#             elif index == 7:  # Regulation issue palce
#                 issue_place = re.search(r'Tempat Penetapan\s(.*)', element.text)
#                 issue_place = issue_place[1] if issue_place is not None else ''
#             elif index == 8:  # Regulation issue date
#                 issue_date = re.search(r'Tanggal Penetapan\s(.*)', element.text)
#                 if issue_date is not None:
#                     issue_date = dateparser.parse(date_string=issue_date[1], languages=['id'])
#                     issue_date = issue_date.strftime('%Y-%m-%d')
#                 else:
#                     issue_date = ''
#             elif index == 10:  # Regulation effective date
#                 effective_date = re.search(r'Tanggal Berlaku\s(.*)', element.text)
#                 if effective_date is not None:
#                     effective_date = dateparser.parse(date_string=effective_date[1], languages=['id'])
#                     effective_date = effective_date.strftime('%Y-%m-%d')
#                 else:
#                     effective_date = ''
#             elif index == 12:  # Regulation subjects
#                 subjects = re.search(r'Subjek\s(.*)', element.text)
#                 subjects = subjects[1] if subjects is not None else ''
#                 subjects = subjects.split('-')
#                 subjects = [subject.strip() for subject in subjects]
#             elif index == 13:  # Regulation status
#                 status = re.search(r'Status\s(.*)', element.text)
#                 status = status[1] if status is not None else ''
#                 if status.lower() == 'tidak berlaku':
#                     print(f'INEFFECTIVE REGULATION: {regulation_link}')
#                     ineffective = True
#             elif index == 15:  # Regulation institution
#                 institution = re.search(r'Lokasi\s(.*)', element.text)
#                 institution = institution[1] if institution is not None else ''

#         if ineffective:
#             continue

#         # TODO: Ubah '05' menyesuaikan dengan tipe peraturan
#         # 'type': {
#         #     'UU': '01',
#         #     'PERPPU': '02',
#         #     'PP': '03',
#         #     'PERPRES': '04',
#         #     'PERMENKOMINFO': '05'
#         # },
#         regulation_id = f'{year}05{str(number).zfill(3)}1000'

#         # EXTRACT DOWNLOAD LNK AND FILENAME
#         download_box = driver.find_element(By.XPATH, download_box_xpath)
#         download_link = download_box.find_element(By.CSS_SELECTOR, '[href]').get_attribute('href')
#         download_name = f'{short_type}_{str(number).zfill(3)}_{year}'

#         # EXTRACT REGULATION STATUS REFERENCE
#         status_box = driver.find_element(By.XPATH, status_box_xpath)
        
#         try:
#             status_inner_box = status_box.find_element(By.CSS_SELECTOR, status_inner_box_css_selector)
#         except NoSuchElementException as e:
#             status_inner_box = None
        
#         repealed = list()
#         repeal = list()
#         amended = list()
#         amend = list()
        
#         if status_inner_box is not None:
#             status_elements = status_inner_box.find_elements(By.XPATH, './*')
#             current_status = None
#             next_status = None

#             for element in status_elements:
#                 text = element.text.strip()
#                 current_status = next_status
#                 next_status = None
                
#                 if re.search(status_type_patterns, text):
#                     current_status = text
#                     next_status = text
#                     continue
                
#                 # https://peraturan.bpk.go.id/Details/203070/permenkominfo-no-5-tahun-2021
#                 # Kasus unik, di mana Mencabut sebagian masuk ke dalam mencabut
#                 # SUDAH SOLVED
#                 regulation_references = element.find_elements(By.CSS_SELECTOR, '[href]')
#                 for regulation_reference in regulation_references:
#                     href = regulation_reference.get_attribute('href')
#                     if current_status == 'Dicabut dengan :':
#                         repealed.append(href)
#                     elif current_status == 'Mencabut :':
#                         repeal.append(href)
#                     elif current_status == 'Diubah dengan :':
#                         amended.append(href)
#                     elif current_status == 'Mengubah :':
#                         amend.append(href)


        
#         # COMBINE ALL DATA AND APPEND TO regulation_data
#         data = {
#             'id': regulation_id,                # ID peraturan
#             'url': regulation_link,             # Link Web Peraturan
#             'download_link': download_link,     # Link Download Peraturan
#             'download_name': download_name,     # Nama File Download
#             'title': title,                     # Judul Lengkap Peraturan
#             'about': about,                     # Judul Isi Peraturan
#             'type': regulation_type,            # Jenis Peraturan
#             'short_type': short_type,           # Jenis Peraturan (Singkatan)
#             'number': number,                   # Nomor Peraturan
#             'year': year,                       # Tahun Peraturan
#             'institution': institution,         # Lembaga
#             'issue_place': issue_place,         # Tempat Penetapan
#             'issue_date': issue_date,           # Tanggal Penetapan
#             'effective_date': effective_date,   # Tanggal Diberlakukan
#             'subjects': subjects,               # Subjek
#             'status': {
#                 'repealed': repealed,           # Dicabut dengan ..
#                 'repeal': repeal,               # Mencabut ...
#                 'amended': amended,             # Diubah dengan ...
#                 'amend': amend                  # Mengubah ...
#             }
#         }

#         regulation_data.append(data)
        
#         # WAIT 2 SECOND FOR THE NEXT REGULATION
#         time.sleep(2)
    
#     driver.quit()
    
#     return regulation_data

In [None]:
# # https://www.freecodecamp.org/news/how-to-pretty-print-json-in-python/
# # https://www.geeksforgeeks.org/how-to-convert-python-dictionary-to-json/
# # Convert the data to a JSON formatted string with 4 spaces of indentation
# def list_of_dict_to_json(data: list[dict], output_name: str) -> str:
#     if not output_name.endswith('.json'):
#         output_name = output_name + '.json'
#     with open(output_name, 'w') as output_file:
#         json.dump(data, output_file, indent=4)
#         output_json_str = json.dumps(data, indent=4)
#         return output_json_str

In [None]:
# regulation_data = scrape_regulation_data(selected_regulations)
# print(f'[INFO] Successfully scraping {len(regulation_data)} regulatory data')

# OUTPUT = 'regulation_data.json'
# OUTPUT_PATH = os.path.join(DIR_PATH, OUTPUT)
# output_json_str = list_of_dict_to_json(regulation_data, OUTPUT_PATH)
# print(f'[INFO] Successfully saved {len(regulation_data)} regulatory data to {OUTPUT_PATH}')

In [None]:
# # READ JSON
# # https://stackoverflow.com/questions/20199126/reading-json-from-a-file
# with open(OUTPUT_PATH) as input_file:
#     json_data = json.load(input_file)

# json_data