In [None]:
import os
import re
import json
import time
from tqdm import tqdm

In [None]:
# MODEL 2: STRUKTUR PERATURAN DASAR DAN AMANDEMEN SAMA

class RegulationParser:

    def __init__(self):
        # self.ALPHABET_TO_NUMERIC = {chr(i): f"{i - 64:02}" for i in range(65, 91)}
        
        self.WORD_TO_NUMBER = {
            "kesatu": 1, "kedua": 2, "ketiga": 3, "keempat": 4, "kelima": 5,
            "keenam": 6, "ketujuh": 7, "kedelapan": 8, "kesembilan": 9, "kesepuluh": 10,
            "kesebelas": 11, "kedua belas": 12, "ketiga belas": 13, "keempat belas": 14, "kelima belas": 15,
            "keenam belas": 16, "ketujuh belas": 17, "kedelapan belas": 18, "kesembilan belas": 19, "kedua puluh": 20,
            "kedua puluh satu": 21, "kedua puluh dua": 22, "kedua puluh tiga": 23, "kedua puluh empat": 24, "kedua puluh lima": 25,
            "kedua puluh enam": 26, "kedua puluh tujuh": 27, "kedua puluh delapan": 28, "kedua puluh sembilan": 29, "ketiga puluh": 30,
            "ketiga puluh satu": 31, "ketiga puluh dua": 32, "ketiga puluh tiga": 33, "ketiga puluh empat": 34, "ketiga puluh lima": 35,
            "ketiga puluh enam": 36, "ketiga puluh tujuh": 37, "ketiga puluh delapan": 38, "ketiga puluh sembilan": 39, "keempat puluh": 40,
            "keempat puluh satu": 41, "keempat puluh dua": 42, "keempat puluh tiga": 43, "keempat puluh empat": 44, "keempat puluh lima": 45,
            "keempat puluh enam": 46, "keempat puluh tujuh": 47, "keempat puluh delapan": 48, "keempat puluh sembilan": 49, "kelima puluh": 50
        }

        self.REGULATION_ENCODING = {
            "type": {
                "UU": "01",
                "PERPPU": "02",
                "PP": "03",
                "PERPRES": "04",
                "PERMENKOMINFO": "05"
            },
            "section": {
                "document": "1",
                "considering": "2",
                "observing": "3",
                "definition": "4",
                "chapter": "5",
                "article": "6",
                "section": "7",
            }
        }

        self.REGEX_PATTERNS = {
            "document": {
                "metadata": r"^(\w+)_(\w+)_(\w+)"  # Format: Jenis, tahun, dan nomor peraturan
            },

            "main": {
                "considering": r"(?<=## menimbang)([\S\s]*?)(?=## mengingat)",  # Menimbang
                "observing": r"(?<=## mengingat)([\S\s]*?)(?=(?:dengan persetujuan|## memperhatikan|## memutuskan))",  # Mengingat
                "amendment_to": r"^Perubahan",  # Cek apakah merupakan peraturan amandemen
                "chapter": r"(## BAB[\S\s]*?)(?=\s+(?:## BAB|agar setiap orang mengetahuinya|ditetapkan di))"  # Daftar Bab
            },

            # For every chapters (Bab)
            "chapter": {
                "about": r"## (BAB [^#]+)##",  # Nama Bab
                "part": r"(## Bagian [\S\s]*?)(?=\s+(?:## Bagian|$))",  # Daftar Bagian
                "paragraph": r"(## Paragraf [\S\s]*?)(?=\s+(?:## Paragraf|$))",  # Daftar Paragraf
                "article": r"(## Pasal \w+[\S\s]*?)(?=(?:##|ditetapkan di|$))"  # Daftar Pasal
            },

            # For every parts (Bagian)
            "part": {
                "about": r"## (Bagian [^#]+)##",  # Nama Bagian
                "number": r"Bagian (\w+) -"  # Nomor Bagian
            },

            # For every paragraphs (Paragraf)
            "paragraph": {
                "about": r"## (Paragraf [^#]+)##",  # Nama Paragraf
                "number": r"Paragraf (\w+) -"  # Nomor Paragraf

            },

            # For every articles (Pasal)
            "article": {
                "number": r"## Pasal (\d+\w*)",  # Nomor Pasal
                "text": r"## Pasal \w+\n*([\S\s]*)",  # Isi Pasal
                "check_definition": r"^dalam (?:undang-undang|peraturan)",  # Cek apakah Pasal 1 adalah definisi
                "definition": r"\(\d+[a-z]?\) (([A-Z][a-z]*(?:\s[A-Z][a-z]*)*) .*)",  # Daftar definisi dalam Pasal 1
                "no_ref": r"\*{2}NO_REF\*{2}",  # Pasal dengan tanda **NO_REF**
                "reference_1": r"Pasal (\d+\w*)",  # Pasal referensi jenis 1
                "reference_2": r"Pasal (\d+\w*) sampai dengan Pasal (\d+\w*)"  # Pasal referensi jenis 2
            },

            "amendment_to": {
                "amendment_point_1": r"(## \d+\.[\S\s]*?)(?=\s+(?:## \d+\.|## Pasal II))", # Poin amandemen jenis 1
                "amendment_point_2": r"(?<=## Pasal I)([\s\S]*?)(?=## Pasal II)",  # Poin amandemen jenis 2
                "chapter": r"(## BAB[\S\s]*?)(?=\s+(?:## BAB|agar setiap orang mengetahuinya|ditetapkan di))",  # Daftar Bab
                "part": r"(## Bagian [\S\s]*?)(?=\s+(?:## Bagian|$))",  # Daftar Bagian
                "paragraph": r"(## Paragraf [\S\s]*?)(?=\s+(?:## Paragraf|$))",  # Daftar Paragraf
                # "amendment_articles": r"(## Pasal \d+[A-Z]?[\S\s]*?)(?=(?:##|$))"  # Pasal Amandemen
            }
        }

    def parse_regulations_content(
            self,
            input_dir: str,
            json_input: str,
            json_output: str,
            verbose: bool = True
    ) -> list[dict]:
        """
        Parse every regulation Markdown file in a directory into structured data.

        For each ``*.md`` file the regulation ID is derived from the filename
        (``<type>_<year>_<number>``), the matching metadata entry is looked up in
        ``json_input``, and the "considering", "observing" and article sections
        are extracted from the file text. A file that fails to parse is skipped
        (its error is printed only when ``verbose`` is true) and does not appear
        in the result.

        Args:
            input_dir (str): Directory containing regulation Markdown files.
            json_input (str): Path to JSON file containing regulation metadata
                (a list of objects, each with an "id" key).
            json_output (str): Path to save the parsed regulation content as JSON;
                ".json" is appended when missing.
            verbose (bool, optional): Whether to print progress and error messages. Defaults to True.

        Returns:
            list[dict]: A list of parsed regulation dictionaries.

        Raises:
            OSError: If ``input_dir`` cannot be listed or ``json_output`` cannot be written.
        """

        # article_dict is shared by all files: amendment regulations look up
        # articles of previously parsed regulations in it.
        article_dict = {}
        durations = []
        result = []
        success = 0
        failed = 0

        # Metadata index, loaded lazily on the first file so that a missing or
        # broken metadata file is reported per regulation instead of aborting.
        metadata_by_id = None

        # os.listdir returns entries in arbitrary order; sort so that runs are
        # reproducible (processing order matters because of article_dict).
        files = [
            (os.path.join(input_dir, filename), filename)
            for filename in sorted(os.listdir(input_dir))
            if filename.endswith(".md")
        ]

        for filepath, filename in tqdm(iterable=files, desc="Parsing regulations content", disable=not verbose):
            start_time = time.time()

            try:
                with open(filepath, "r", encoding="utf8") as file:
                    text = file.read()

                definition_list = []

                # Extract metadata from filename
                metadata = re.search(self.REGEX_PATTERNS["document"]["metadata"], filename)
                if metadata is None:
                    raise ValueError(f"Filename does not match '<type>_<year>_<number>': {filename}")
                regulation_type = self.REGULATION_ENCODING["type"][metadata[1]]
                regulation_year = metadata[2]
                regulation_num = int(metadata[3])

                # Template shared by every ID of this regulation; only the
                # section part is filled in later.
                id_template = (
                    f"{regulation_year}{regulation_type}{str(regulation_num).zfill(3)}"
                    + "{reg_section}{section_num}{extra_section_number}"
                )

                regulation_id = id_template.format(
                    reg_section=self.REGULATION_ENCODING["section"]["document"],
                    section_num="000",
                    extra_section_number="00"
                )

                # Load the metadata file once and index it by ID (the first
                # entry wins when an ID occurs more than once).
                if metadata_by_id is None:
                    with open(json_input, encoding="utf8") as json_data:
                        loaded = {}
                        for regulation_data in json.load(json_data):
                            loaded.setdefault(regulation_data["id"], regulation_data)
                    metadata_by_id = loaded

                if regulation_id not in metadata_by_id:
                    raise LookupError(f"No metadata entry with id {regulation_id} in {json_input}")

                # Work on a copy so the cached metadata entry is not modified.
                regulation_dict = dict(metadata_by_id[regulation_id])
                regulation_dict["content"] = {}

                # Extract "Menimbang" (considering) section
                regulation_dict["content"]["considering"] = {
                    "id": id_template.format(
                        reg_section=self.REGULATION_ENCODING["section"]["considering"],
                        section_num="000",
                        extra_section_number="00"
                    ),
                    "text": re.search(self.REGEX_PATTERNS["main"]["considering"], text, re.IGNORECASE)[1].strip()
                }

                # Extract "Mengingat" (observing) section
                regulation_dict["content"]["observing"] = {
                    "id": id_template.format(
                        reg_section=self.REGULATION_ENCODING["section"]["observing"],
                        section_num="000",
                        extra_section_number="00"
                    ),
                    "text": re.search(self.REGEX_PATTERNS["main"]["observing"], text, re.IGNORECASE)[1].strip()
                }

                # A regulation whose "about" text starts with "Perubahan" is an amendment
                is_amendment = re.search(
                    self.REGEX_PATTERNS["main"]["amendment_to"],
                    regulation_dict["about"],
                    re.IGNORECASE
                )

                if is_amendment:
                    regulation_dict, definition_list, article_dict = self.__parse_amendment_regulation(
                        text=text,
                        id_template=id_template,
                        regulation_dict=regulation_dict,
                        definition_list=definition_list,
                        article_dict=article_dict,
                        amended_regulations=regulation_dict["status"]["amend"]
                    )
                else:
                    regulation_dict, definition_list, article_dict = self.__parse_base_regulation(
                        text=text,
                        id_template=id_template,
                        regulation_dict=regulation_dict,
                        definition_list=definition_list,
                        article_dict=article_dict
                    )

                result.append(regulation_dict)
                success += 1

            except Exception as e:
                # Best effort: one broken file must not stop the whole batch.
                failed += 1
                if verbose:
                    print(f"ERROR parsing content of {filename}")
                    print(e)

            durations.append(time.time() - start_time)

        # Ensure output file has .json extension
        if not json_output.endswith(".json"):
            json_output = f"{json_output}.json"

        # Save parsed results to JSON file
        with open(json_output, "w", encoding="utf8") as output_file:
            json.dump(result, output_file, indent=4)

        # Print summary if verbose mode is enabled
        if verbose:
            total_ms = sum(durations) * 1000
            # Guard against division by zero when nothing was parsed successfully.
            average_ms = round(total_ms / success, 3) if success else 0.0

            print("=" * 76)
            print(f"Input directory   : {input_dir}")
            print(f"Input JSON        : {json_input}")
            print(f"Output JSON       : {json_output}")
            print(f"Total regulations : {len(files)} regulations")
            print(f"Total success     : {success} regulations")
            print(f"Total failed      : {failed} regulations")
            print(f"Total articles    : {len(article_dict)} articles")
            print(f"Total time        : {round(total_ms, 3)} miliseconds")
            print(f"Average time/file : {average_ms} miliseconds")
            print("=" * 76)

        return result
    

    def __parse_base_regulation(
            self,
            text: str,
            id_template: str,
            regulation_dict: dict,
            definition_list: list,
            article_dict: dict
    ) -> tuple[dict, list, dict]:
        """
        Parses the base regulation text to extract chapters, parts, paragraphs, and articles.

        The text is split into its innermost sections (paragraph, else part,
        else chapter, else the whole text) and the articles of each section are
        parsed in document order. The last article found gets an empty
        "next_article" field.

        Args:
            text (str): The regulation text.
            id_template (str): Template for generating regulation IDs.
            regulation_dict (dict): Dictionary containing regulation metadata and parsed content.
            definition_list (list): List of legal definitions extracted from the text.
            article_dict (dict): Dictionary to store parsed articles.

        Returns:
            tuple[dict, list, dict]: Updated regulation dictionary, definition list, and article dictionary.

        Raises:
            ValueError: If no article was found anywhere in the text.
        """

        regulation_dict["content"]["articles"] = {}

        # Number of the last article seen so far. A section without articles
        # reports an empty number, which must not overwrite an earlier one.
        last_article_number = ""

        for section in self.__iter_base_sections(text):
            regulation_dict, definition_list, article_dict, section_last_number = self.__parse_articles(
                id_template=id_template,
                regulation_dict=regulation_dict,
                definition_list=definition_list,
                article_dict=article_dict,
                **section
            )
            if section_last_number:
                last_article_number = section_last_number

        if not last_article_number:
            raise ValueError("No articles ('## Pasal <number>') found in the regulation text")

        # Mark the last article's "next_article" field as empty
        regulation_dict["content"]["articles"][last_article_number]["next_article"] = ""

        return regulation_dict, definition_list, article_dict

    def __iter_base_sections(self, text: str):
        """
        Yield the innermost sections of a base regulation in document order.

        Each item is a dict of keyword arguments describing one section: its
        "text" plus the chapter/part/paragraph number and heading it belongs to
        (empty strings for levels that do not apply). Numbers are 1-based
        positions within the enclosing level, as strings. Sections are produced
        lazily, so heading extraction for a later section only happens after
        the consumer has finished with the earlier ones.
        """
        patterns = self.REGEX_PATTERNS

        def heading(pattern: str, section_text: str) -> str:
            # First capture group of the heading pattern; line breaks inside
            # the heading are joined with " - ".
            return re.search(pattern, section_text, re.IGNORECASE)[1].strip().replace("\n", " - ")

        def split(pattern: str, section_text: str) -> list:
            # The sub-section patterns require whitespace before the end of the
            # text to match the final sub-section, hence the appended newline.
            return re.findall(pattern, section_text.strip() + "\n", re.IGNORECASE)

        blank = dict.fromkeys(
            (
                "chapter_number", "chapter_about",
                "part_number", "part_about",
                "paragraph_number", "paragraph_about"
            ),
            ""
        )

        chapters = re.findall(patterns["main"]["chapter"], text, re.IGNORECASE)

        if not chapters:
            # If no chapters, the entire text is one section
            yield {**blank, "text": text}
            return

        for chapter_number, chapter in enumerate(chapters, start=1):
            chapter_context = {
                **blank,
                "chapter_number": str(chapter_number),
                "chapter_about": heading(patterns["chapter"]["about"], chapter).upper()
            }
            parts = split(patterns["chapter"]["part"], chapter)

            if not parts:
                # If no parts, the chapter itself is the section
                yield {**chapter_context, "text": chapter}
                continue

            for part_number, part in enumerate(parts, start=1):
                part_context = {
                    **chapter_context,
                    "part_number": str(part_number),
                    "part_about": heading(patterns["part"]["about"], part)
                }
                paragraphs = split(patterns["chapter"]["paragraph"], part)

                if not paragraphs:
                    # If no paragraphs, the part itself is the section
                    yield {**part_context, "text": part}
                    continue

                for paragraph_number, paragraph in enumerate(paragraphs, start=1):
                    yield {
                        **part_context,
                        "text": paragraph,
                        "paragraph_number": str(paragraph_number),
                        "paragraph_about": heading(patterns["paragraph"]["about"], paragraph)
                    }


    def __parse_amendment_regulation(
            self,
            text: str,
            id_template: str,
            regulation_dict: dict,
            definition_list: list,
            article_dict: dict,
            amended_regulations: list
    ) -> tuple[dict, list, dict]:
        
        """
        Parses an amendment regulation text and updates the regulation dictionary.
        
        Args:
            text (str): The amendment regulation text.
            id_template (str): Identifier template for articles.
            regulation_dict (dict): Dictionary to store parsed regulations.
            definition_list (list): List of definitions extracted from the text.
            article_dict (dict): Dictionary to store parsed articles.
            amended_regulations (list): List of amended regulations.
        
        Returns:
            tuple[dict, list, dict]: Updated regulation dictionary, definition list, and article dictionary.
        """
        
        regulation_dict["content"]["articles"] = {}
        
        # Extract amendment points from the text
        amendment_points = re.findall(self.REGEX_PATTERNS["amendment_to"]["amendment_point_1"], text, re.IGNORECASE)

        if not amendment_points:
            # Fallback to another regex pattern if amendment points are not found
            amendment_points = re.search(self.REGEX_PATTERNS["amendment_to"]["amendment_point_2"], text, re.IGNORECASE)
            if amendment_points:
                amendment_points = amendment_points[1].strip()
                first_sentence = re.search(r"^.*", amendment_points, re.IGNORECASE)[0].strip()  # Extract first sentence
                amendment_points = [amendment_points.replace(first_sentence, "").strip()]  # Remove first sentence
        
        # Process each amendment point
        for point in amendment_points:
            # Extract parts within the amendment
            parts = re.findall(self.REGEX_PATTERNS["amendment_to"]["part"], point.strip() + "\n", re.IGNORECASE)

            if parts:
                # Process each part
                for part in parts:
                    part_about = re.search(self.REGEX_PATTERNS["part"]["about"], part, re.IGNORECASE)[1].strip()
                    part_about = re.sub(r"\n", " - ", part_about, flags=re.IGNORECASE)
                    part_number = re.search(self.REGEX_PATTERNS["part"]["number"], part_about, re.IGNORECASE)[1].strip().lower()
                    part_number = self.WORD_TO_NUMBER.get(part_number, 0)

                    # Extract paragraphs within the part
                    paragraphs = re.findall(self.REGEX_PATTERNS["chapter"]["paragraph"], part.strip() + "\n", re.IGNORECASE)

                    if paragraphs:
                        # Process each paragraph
                        for paragraph in paragraphs:
                            paragraph_about = re.search(self.REGEX_PATTERNS["paragraph"]["about"], paragraph, re.IGNORECASE)[1].strip()
                            paragraph_about = re.sub(r"\n", " - ", paragraph_about, flags=re.IGNORECASE)
                            paragraph_number = re.search(self.REGEX_PATTERNS["paragraph"]["number"], paragraph_about, re.IGNORECASE)[1].strip()
                            
                            # Parse articles within the paragraph
                            regulation_dict, definition_list, article_dict, last_article_number = self.__parse_articles(
                                text=paragraph,
                                chapter_number="",
                                chapter_about="",
                                part_number=str(part_number),
                                part_about=part_about,
                                paragraph_number=str(paragraph_number),
                                paragraph_about=paragraph_about,
                                id_template=id_template,
                                regulation_dict=regulation_dict,
                                definition_list=definition_list,
                                article_dict=article_dict,
                                amended_regulations=amended_regulations
                            )

                    else:
                        # If no paragraphs, parse articles within the part
                        regulation_dict, definition_list, article_dict, last_article_number = self.__parse_articles(
                            text=part,
                            chapter_number="",
                            chapter_about="",
                            part_number=str(part_number),
                            part_about=part_about,
                            paragraph_number="",
                            paragraph_about="",
                            id_template=id_template,
                            regulation_dict=regulation_dict,
                            definition_list=definition_list,
                            article_dict=article_dict,
                            amended_regulations=amended_regulations
                        )

            else:
                # Extract paragraphs if no parts are found
                paragraphs = re.findall(self.REGEX_PATTERNS["amendment_to"]["paragraph"], point.strip() + "\n", re.IGNORECASE)

                if paragraphs:
                    # Process each paragraph
                    for paragraph_num, paragraph in enumerate(paragraphs):
                        paragraph_number = paragraph_num + 1
                        paragraph_about = re.search(self.REGEX_PATTERNS["paragraph"]["about"], paragraph, re.IGNORECASE)[1].strip()
                        paragraph_about = re.sub(r"\n", " - ", paragraph_about, flags=re.IGNORECASE)
                        paragraph_number = re.search(self.REGEX_PATTERNS["paragraph"]["number"], paragraph_about, re.IGNORECASE)[1].strip()

                        # Parse articles within the paragraph
                        regulation_dict, definition_list, article_dict, last_article_number = self.__parse_articles(
                            text=paragraph,
                            chapter_number="",
                            chapter_about="",
                            part_number="",
                            part_about="",
                            paragraph_number=str(paragraph_number),
                            paragraph_about=paragraph_about,
                            id_template=id_template,
                            regulation_dict=regulation_dict,
                            definition_list=definition_list,
                            article_dict=article_dict,
                            amended_regulations=amended_regulations
                        )

                else:
                    # Parse articles directly from the amendment point
                    regulation_dict, definition_list, article_dict, last_article_number = self.__parse_articles(
                        text=point,
                        chapter_number="",
                        chapter_about="",
                        part_number="",
                        part_about="",
                        paragraph_number="",
                        paragraph_about="",
                        id_template=id_template,
                        regulation_dict=regulation_dict,
                        definition_list=definition_list,
                        article_dict=article_dict,
                        amended_regulations=amended_regulations
                    )

        return regulation_dict, definition_list, article_dict


    def __parse_articles(
            self,
            text: str,
            chapter_number: str,
            chapter_about: str,
            part_number: str,
            part_about: str,
            paragraph_number: str,
            paragraph_about: str,
            id_template: str,
            regulation_dict: dict,
            definition_list: list,
            article_dict: dict,
            amended_regulations: list = []
    ) -> tuple[dict, list, str]:
        
        """
        Parses articles from the given legal text and updates the regulation dictionary.

        Args:
            text (str): The full legal text to be parsed.
            chapter_number (str): The chapter number.
            chapter_about (str): Description of the chapter.
            part_number (str): The part number.
            part_about (str): Description of the part.
            paragraph_number (str): The paragraph number.
            paragraph_about (str): Description of the paragraph.
            id_template (str): The ID template for generating article IDs.
            regulation_dict (dict): The dictionary containing regulation data.
            definition_list (list): A list of definitions extracted from Article 1.
            article_dict (dict): A dictionary to store parsed articles.
            amended_regulations (list, optional): List of amended regulations. Defaults to [].

        Returns:
            tuple[dict, list, str]: Updated regulation dictionary, definition list, and last article number.
        """
        
        # Store last processed article number
        last_article_number = ""

        # Extract all articles from the text using regex
        articles = re.findall(self.REGEX_PATTERNS["chapter"]["article"], text, re.IGNORECASE)

        # Process each article found in the text
        for article in articles:
            article_number = re.search(self.REGEX_PATTERNS["article"]["number"], article, re.IGNORECASE)[1]
            article_text = re.search(self.REGEX_PATTERNS["article"]["text"], article, re.IGNORECASE)[1].strip()
            article_text = re.sub(r"\n+", "\n", article_text)
        
            # Generate unique ID for the article
            article_id = self.__article_number_to_id(article_number, id_template, return_last_six=False)

            # Update last processed article number
            last_article_number = article_number

            # Determine previous, next article, and list of amended article IDs
            previous_article = ""
            next_article = ""
            amended_article = []

            if not amended_regulations:
                # If not an amendment, calculate previous and next article IDs
                if article_number != "1":
                    previous_article = id_template.format(
                        reg_section=self.REGULATION_ENCODING["section"]["article"],
                        section_num=str(int(article_number) - 1).zfill(3),
                        extra_section_number="00"
                    )
                
                next_article = id_template.format(
                    reg_section=self.REGULATION_ENCODING["section"]["article"],
                    section_num=str(int(article_number) + 1).zfill(3),
                    extra_section_number="00"
                )

            else:
                # Handle amendments: Find the next article from amended regulations
                for amended_regulation_id in amended_regulations:
                    # Generate a list containing 2 possible next article IDs
                    amended_regulation_id_template = amended_regulation_id[:-6] + "{reg_section}{section_num}{extra_section_number}"
                    pred_next_article_ids = self.__get_next_article_ids(article_number, amended_regulation_id_template)
                    
                    # Check if the possible next article ID exists, save and stop if it does
                    for pred_next_article_id in pred_next_article_ids:
                        if pred_next_article_id in article_dict.keys():
                            next_article = pred_next_article_id
                            break
                    if next_article:
                        break
                
                # Handle amendments: Find the previous article from amended regulations
                if article_number.isdigit() and article_number != "1":
                    # Find the largest existing previous article ID within the same regulation
                    # by filtering article IDs that have the same numeric article number (excluding letter variations)
                    filtered_ids = []
                    for regulation_id in [regulation_dict["id"]] + amended_regulations:
                        filtered_ids += list(filter(
                            lambda x: x.startswith(str(int(regulation_id[:-6] + article_id[9:13]) - 1)),
                            article_dict.keys()
                        ))
                    
                    # Create a dictionary with the prefix as the key and the last two digits as the value.
                    id_groups = {}
                    for id in filtered_ids:
                        prefix, suffix = id[:-2], id[-2:]  # Split prefix and suffix (last two digits)
                        if prefix not in id_groups or suffix > id_groups[prefix]:  
                            id_groups[prefix] = suffix  # Store the largest number for each prefix

                    # Collect article IDs that have the largest suffix values
                    max_suffix_ids = [id for id in filtered_ids if id[-2:] == id_groups[id[:-2]]]

                    # Select the highest article ID from the results as the previous article
                    # TODO: HAPUS TRY EXCEPT INI SETELAH SEMUA MARKDOWN PERATURAN SELESAI
                    try:
                        previous_article = max(max_suffix_ids)
                    except Exception as e:
                        print(f"{article_id}")
                        print(e)

                else:
                    # Attempt to find the previous article ID from the lettered article
                    for regulation_id in [regulation_dict["id"]] + amended_regulations:
                        # Generate possible previous article IDs
                        prev_regulation_id_template = regulation_id[:-6] + "{reg_section}{section_num}{extra_section_number}"
                        pred_prev_article_id = self.__get_previous_article_id(article_number, prev_regulation_id_template)

                        # Check if the possible previous article ID exists, save and stop if it does
                        if pred_prev_article_id in article_dict.keys():
                            previous_article = pred_prev_article_id
                            break
                
                # If the previous article belongs to the same regulation, update its next_article field
                if previous_article.startswith(regulation_dict["id"][:-6]):
                    previous_article_number = self.__id_to_article_number(previous_article)
                    regulation_dict["content"]["articles"][previous_article_number]["next_article"] = article_id
                    article_dict[previous_article]["next_article"] = article_id
                
                # Retrieving data for the amendment article
                fetched = False
                for amended_regulation_id in amended_regulations:
                    amended_article_id = amended_regulation_id[:-6] + article_id[-6:]
                    if amended_article_id in article_dict.keys():
                        if not fetched:
                            # Update the chapter/part/paragraph number/about to follow the amended article 
                            # only if the current amandment article does not already have a new number/about 
                            chapter_number = chapter_number if chapter_number else article_dict[amended_article_id]["chapter_number"]
                            chapter_about = chapter_about if chapter_about else article_dict[amended_article_id]["chapter_about"]
                            part_number = part_number if part_number else article_dict[amended_article_id]["part_number"]
                            part_about = part_about if part_about else article_dict[amended_article_id]["part_about"]
                            paragraph_number = paragraph_number if paragraph_number else article_dict[amended_article_id]["paragraph_number"]
                            paragraph_about = paragraph_about if paragraph_about else article_dict[amended_article_id]["paragraph_about"]
                            fetched = True

                        # Store amended article ID
                        amended_article.append(amended_article_id)
                
                if previous_article:
                    # If the chapter/part/paragraph data is still empty, copy it from the previous article.
                    # This usually happens with newly added articles due to amendments (e.g., Pasal 40A UU_2024_001).
                    chapter_number = chapter_number if chapter_number else article_dict[previous_article]["chapter_number"]
                    chapter_about = chapter_about if chapter_about else article_dict[previous_article]["chapter_about"]
                    part_number = part_number if part_number else article_dict[previous_article]["part_number"]
                    part_about = part_about if part_about else article_dict[previous_article]["part_about"]
                    paragraph_number = paragraph_number if paragraph_number else article_dict[previous_article]["paragraph_number"]
                    paragraph_about = paragraph_about if paragraph_about else article_dict[previous_article]["paragraph_about"]

            if article_number == "1":
                # Extract definitions if the article contains legal definitions
                if re.search(self.REGEX_PATTERNS["article"]["check_definition"], article_text, re.IGNORECASE):
                    definitions = re.findall(self.REGEX_PATTERNS["article"]["definition"], article_text)

                    for index, definition_data in enumerate(definitions):
                        definition, name = definition_data
                        definition_list.append({
                            "id": id_template.format(
                                reg_section=self.REGULATION_ENCODING["section"]["definition"],
                                section_num=str(index + 1).zfill(3),
                                extra_section_number="00"
                            ),
                            "name": name.strip(),
                            "definition": definition.strip()
                        })

            # Store definition list
            regulation_dict["content"]["definitions"] = definition_list
            
            # Extract references to other articles within the text
            # if the article does NOT contain the **NO_REF** marker
            all_article_references = []
            if not re.search(self.REGEX_PATTERNS["article"]["no_ref"], article_text, re.IGNORECASE):
                all_article_references = self.__get_article_id_references(
                    article_text=article_text,
                    current_regulation_id=regulation_dict["id"],
                    id_template=id_template,
                    amended_regulations=amended_regulations,
                    article_dict=article_dict
                )
            else:
                article_text = re.sub(self.REGEX_PATTERNS["article"]["no_ref"], "", article_text, flags=re.IGNORECASE).strip()
            
            # Store article data
            regulation_dict["content"]["articles"][article_number] = {
                "id": article_id,
                "chapter_number": chapter_number,
                "chapter_about": chapter_about,
                "part_number": part_number,
                "part_about": part_about,
                "paragraph_number": paragraph_number,
                "paragraph_about": paragraph_about,
                "article_number": article_number,
                "text": article_text,
                "previous_article": previous_article,
                "next_article": next_article,
                "references": all_article_references,
                "amend": amended_article
            }
            
            # Store article to article_dict
            article_dict[article_id] = regulation_dict["content"]["articles"][article_number]

        return regulation_dict, definition_list, article_dict, last_article_number


    def __previous_label(self, label: str) -> str:
        """
        Generate the previous label in a sequence where letters decrement alphabetically.

        If a letter is not "A", it is decremented. If it is "A", it becomes "Z",
        and the decrementation continues to the left.

        Args:
            label (str): The input label consisting of uppercase letters.

        Returns:
            str: The previous label in the sequence.

        Example:
            __previous_label("B") -> "A"
            __previous_label("C") -> "B"
            __previous_label("ABC") -> "ABB"
            __previous_label("AAA") -> "ZZ"
            __previous_label("BAA") -> "AZZ"
        """
        label = list(label)  # Convert string to a list of characters for manipulation

        # Iterate backwards to handle letter decrement
        for i in range(len(label) - 1, -1, -1):
            if label[i] != "A":  
                label[i] = chr(ord(label[i]) - 1)  # Decrement the letter
                return "".join(label)
            label[i] = "Z"  # If "A", change to "Z" and continue to the previous letter

        return "".join(label[1:])  # If all were "A", remove the first character
    

    def __next_label(self, label: str) -> str:
        """
        Generate the next label in a sequence where letters increment alphabetically.

        If a letter is not "Z", it is incremented. If it is "Z", it becomes "A",
        and the incrementation continues to the left.

        Args:
            label (str): The input label consisting of uppercase letters.

        Returns:
            str: The next label in the sequence.

        Example:
            __next_label("A") -> "B"
            __next_label("B") -> "C"
            __next_label("ABC") -> "ABD"
            __next_label("ZZZ") -> "AAAA"
            __next_label("AZZ") -> "BAA"
        """
        label = list(label)  # Convert string to a list of characters for manipulation
        
        # Iterate backwards to handle letter increment
        for i in range(len(label) - 1, -1, -1):
            if label[i] != "Z":  
                label[i] = chr(ord(label[i]) + 1)  # Increment the letter
                return "".join(label)
            label[i] = "A"  # If "Z", change to "A" and continue to the previous letter
        
        return "A" + "".join(label)  # If all were "Z", add "A" in front
    

    def __letter_to_string_number(self, letter: str, default: str = "00") -> str:
        """
        Convert an alphabetical string into a two-digit numerical representation.

        This function follows an Excel-like numbering system:
        - "A" → "01", "B" → "02", ..., "Z" → "26"
        - "AA" → "27", "AB" → "28", ..., "CU" → "99"
        - If the input contains non-alphabet characters, it returns the default value.
        - If the converted number exceeds 99, a ValueError is raised.

        Args:
            letter (str): The alphabetical string to convert (e.g., "A", "Z", "AA", "CU").
            default (str, optional): The default value to return if input is invalid. Defaults to "00".

        Returns:
            str: The corresponding two-digit numerical representation as a string.

        Raises:
            ValueError: If the converted number exceeds 99.

        Examples:
            - __letter_to_string_number("A") -> "01"
            - __letter_to_string_number("Z") -> "26"
            - __letter_to_string_number("AA") -> "27"
            - __letter_to_string_number("CU") -> "99"
            - __letter_to_string_number("") -> "00"
            - __letter_to_string_number("A1") -> "00"
            - __letter_to_string_number("XYZ") -> ValueError
        """

        if not letter.isalpha():
            return default  # Return default if letter is empty

        result = 0
        for char in letter:
            result = result * 26 + (ord(char) - ord('A') + 1)

        # Maximum limit 99 ("CU")
        if result > 99:
            raise ValueError(f"Letter '{letter}' exceeds the maximum allowed value of 99 ('CU').")

        return str(result).zfill(2)


    def __article_number_to_id(self, article_number: str, id_template: str, return_last_six=False) -> str:
        """
        Generate a formatted article ID based on the given article number.

        This function converts an article number (which may contain a numerical part  
        and an optional alphabetical suffix) into a standardized ID format  
        using a provided template.

        If `return_last_six` is set to True, the function returns only the last  
        six characters of the generated article ID.

        Args:
            article_number (str): The article number as a string (e.g., "10A", "12").
            id_template (str): A template string for generating the article ID.
            return_last_six (bool, optional): If True, return only the last six characters. Defaults to False.

        Returns:
            str: The formatted article ID.

        Examples:
            id_template = "202401001" + "{reg_section}{section_num}{extra_section_number}"
            - __article_number_to_id("10A", id_template) → "202401001101001"
            - __article_number_to_id("12", id_template) → "202401001101200"
            - __article_number_to_id("10A", id_template, return_last_six=True) → "101001"
        """

        # Initialize the article ID
        article_id = ""

        # Check if the article number contains an alphabetical suffix
        article_alphabet = re.search(r"\d+([A-Z]+)", str(article_number), re.IGNORECASE)

        if article_alphabet:
            # Format the article ID with the alphabet suffix converted to its numerical equivalent
            number = re.search(r"\d+", str(article_number), re.IGNORECASE)[0] # Extract the numeric part
            article_id = id_template.format(
                reg_section=self.REGULATION_ENCODING["section"]["article"],
                section_num=number.zfill(3),
                extra_section_number=self.__letter_to_string_number(article_alphabet[1])
            )
        else:
            # Format the article ID without a letter suffix (defaulting to "00")
            article_id = id_template.format(
                reg_section=self.REGULATION_ENCODING["section"]["article"],
                section_num=str(article_number).zfill(3),
                extra_section_number="00"
            )
        
        # Return the last six characters if required
        return article_id[-6:] if return_last_six else article_id


    def __id_to_article_number(self, article_id: str) -> str:
        """
        Convert an article ID into a formatted article representation.

        This function processes the last 5 digits of the article ID by extracting its numerical and alphabetical components.
        The first three digits represent the main article number, while the last two digits indicate an 
        alphabetical suffix following an Excel-like numbering system (e.g., 1 → A, 26 → Z, 27 → AA).

        Args:
            article_id (str): The 15-digit article ID (e.g., "202401001101001", "202401001101200").

        Returns:
            str: The formatted article string (e.g., "10A", "12").

        Examples:
            - __id_to_article_number("202401001101001") -> "10A"
            - __id_to_article_number("202401001101200") -> "12"
        """

        # Extract the last 5 characters to ensure proper formatting
        article_id = article_id[-5:]

        # Extract the first three digits as the numerical part, removing leading zeros
        number_part = str(int(article_id[:3]))
        
        # Extract the last two digits as the alphabet index like in Excel (1 → A, 26 → Z, 27 → AA, dst.)
        alphabet_index = int(article_id[-2:])
        
        if alphabet_index == 0:
            return number_part  # If "00", return only the number part

        # Convert the numeric index to an alphabetical suffix (Excel-like system)
        alphabet_part = ""
        while alphabet_index > 0:
            alphabet_index -= 1
            alphabet_part = chr(ord("A") + (alphabet_index % 26)) + alphabet_part
            alphabet_index //= 26

        return number_part + alphabet_part
    

    def __get_previous_article_id(self, article_number: str, id_template: str) -> str:
        """
        Generate the previous article ID based on the current article number.

        The function extracts the numeric and alphabetic parts of the article number.
        If the article ends in "A", it removes the letter (e.g., "10A" → "10").
        Otherwise, it decrements the alphabetic part (e.g., "10C" → "10B"). 
        The input must have an alphabetic part (e.g., "10A", "10B"), not just a number.

        Args:
            article_number (str): The current article number (e.g., "10A", "10B").
            id_template (str): A template string for formatting the output ID.

        Returns:
            str: The previous article ID, formatted using the provided template.

        Examples:
            - __get_previous_article_id("10C", template) -> "10B"
            - __get_previous_article_id("10B", template) -> "10A"
            - __get_previous_article_id("10A", template) -> "10"
        """

        # Extract numeric and alphabet parts
        match = re.match(r"(\d+)([A-Z]+)", article_number, re.IGNORECASE)
        if not match:
            return ""  # Return empty if the input format is invalid (e.g., "10" without a letter)

        article_number = int(match.group(1))  # Extract and convert numeric part
        article_alphabet = match.group(2)  # Extract alphabet part

        if article_alphabet == "A":
            # If the article ends with "A", remove the letter to return only the number
            prev_section_num = article_number
            prev_extra_section = "00"
        else:
            # Otherwise, decrement the alphabetic part (e.g., "10C" → "10B")
            prev_section_num = article_number
            prev_extra_section = self.__previous_label(article_alphabet)

        # Format and return the previous article ID
        return id_template.format(
            reg_section=self.REGULATION_ENCODING["section"]["article"],
            section_num=str(prev_section_num).zfill(3),
            extra_section_number=self.__letter_to_string_number(prev_extra_section, default="00")
        )
    

    def __get_next_article_ids(self, article_number: str, id_template: str) -> list:
        """
        Generate a list containing 2 possible next article IDs

        This function determines the possible next article IDs by either:
        - Incrementing the alphabetic part (e.g., "10A" → "10B").
        - Moving to the next number (e.g., "10A" → "11").

        Args:
            article_number (str): The current article number (e.g., "10", "10A").
            id_template (str): A template string for formatting the output IDs.

        Returns:
            list: A list of the next possible article IDs formatted using the template.

        Examples:
            - __get_next_article_ids("10A", template) -> [template("10B"), template("11")]
            - __get_next_article_ids("10", template) -> [template("10A"), template("11")]
        """

        next_article_ids = []

        # Extract numeric and alphabet parts
        match = re.match(r"(\d+)([A-Z]*)", article_number, re.IGNORECASE)
        if not match:
            return []  # Return empty if the input format is invalid

        article_number = int(match.group(1))  # Extract and convert numeric part
        article_alphabet = match.group(2)  # Extract alphabet part (if any)

        if article_alphabet:
            # Generate next articles for cases like "10A" → "10B" or "11"
            next_sections = [
                (article_number, self.__next_label(article_alphabet)),  # Example: "10A" → "10B"
                (article_number + 1, "00")  # Example: "10A" → "11"
            ]
        else:
            # Generate next articles for cases like "10" → "10A" or "11"
            next_sections = [
                (article_number, "A"),  # Example: "10" → "10A"
                (article_number + 1, "00")  # Example: "10" → "11"
            ]
        
        # Format and store the generated IDs
        for section_num, extra_section in next_sections:
            next_article_ids.append(
                id_template.format(
                    reg_section=self.REGULATION_ENCODING["section"]["article"],
                    section_num=str(section_num).zfill(3),
                    extra_section_number=self.__letter_to_string_number(extra_section, default="00")
                )
            )

        return next_article_ids


    def __generate_article_range(self, list1: list, list2: list) -> list:
        """
        Generate a list of article numbers based on direct references and article ranges.

        This function processes two lists:  
        - `list1`: A list of directly referenced article numbers.  
        - `list2`: A list of article ranges, where each tuple contains a start and end value  
        (e.g., [("10", "12B")] → generates ["10", "11", "12", "12A", "12B"]).  

        The function ensures that all numbers in the specified range are included,  
        along with letter suffixes (e.g., "A", "B", etc.) if present in the end value.  
        The final list is sorted naturally (numerical + alphabetical order).  

        Args:
            list1 (list): A list of individual article numbers as strings.
            list2 (list): A list of tuples representing article number ranges (start, end).

        Returns:
            list: A sorted list of all referenced article numbers.

        Examples:
            __generate_article_range(["5", "8"], [("10", "12B")]) → ["5", "8", "10", "11", "12", "12A", "12B"]
        """

        # Convert list1 to a set for unique references
        set1 = set(list1)
        set2 = set()

        for start, end in list2:
            # Extract the numeric part of the start value
            start_num = int(re.match(r"\d+", start).group())

            # Extract the numeric and optional letter part of the end value
            end_match = re.match(r"(\d+)([A-Z]?)", end, re.IGNORECASE)
            end_num = int(end_match.group(1))  # Extract numeric part of the end range
            end_letter = end_match.group(2)  # Extract letter suffix (if any)

            # Add all numeric values within the range
            for i in range(start_num, end_num + 1):
                set2.add(str(i))

            # If the end value has a letter, generate letter suffixes from "A" to the end letter
            if end_letter:
                for letter in str.ascii_uppercase[:ord(end_letter) - ord("A") + 1]:
                    set2.add(f"{end_num}{letter}")

        # Natural sorting function (handles numerical and alphabetical sorting)
        def natural_sort_key(s):
            return [int(text) if text.isdigit() else text for text in re.split(r"(\d+)", s) if text]

        # Combine both sets and return the sorted result
        return sorted(set1.union(set2), key=natural_sort_key)
    

    def __get_article_id_references(
            self,
            article_text: str,
            current_regulation_id: str,
            id_template: str,
            amended_regulations: list,
            article_dict: dict
    ) -> list:
        """
        Extract referenced article IDs from the given article text.

        This function identifies article references within the given text, processes them into valid article IDs,
        and attempts to match them with the regulation database, considering amendments.

        Args:
            article_text (str): The text containing article references.
            current_regulation_id (str): The regulation ID of the current article.
            id_template (str): The template used for formatting article IDs.
            amended_regulations (list): A list of regulation IDs that amended by the current regulation.
            article_dict (dict): A dictionary mapping valid article IDs to their details.

        Returns:
            list: A list of referenced article IDs.

        """
        
        all_article_references = []

        # Extract article reference numbers using predefined regex patterns
        reference_type_1 = list(set(re.findall(self.REGEX_PATTERNS["article"]["reference_1"], article_text, re.IGNORECASE)))
        reference_type_2 = list(set(re.findall(self.REGEX_PATTERNS["article"]["reference_2"], article_text, re.IGNORECASE)))
        
        if reference_type_1 or reference_type_2:
            # Generate a list of article numbers, considering possible ranges
            article_references = self.__generate_article_range(reference_type_1, reference_type_2)

            # Convert article numbers into formatted article IDs
            article_references = [
                self.__article_number_to_id(number, id_template, return_last_six=True) for number in article_references
            ]

            if amended_regulations:
                # Iterate through each referenced article number
                for article_reference_num in article_references:
                    # Check the most recent regulation first, then move to older amendments
                    for regulation_id in sorted([current_regulation_id] + amended_regulations, reverse=True):
                        # Generate a possible article ID by combining the regulation ID and article number
                        other_article_id = regulation_id[:-6] + article_reference_num

                        # If the generated article ID exists in the article dictionary, store it and stop searching
                        if other_article_id in article_dict.keys():
                            all_article_references.append(other_article_id)
                            break
            else:
                # If no amendments exist, assume all references belong to the current regulation
                for article_reference_num in article_references:
                    all_article_references.append(current_regulation_id[:-6] + article_reference_num)
        
        return all_article_references


In [None]:
# Paths handed to the parser below: a directory (presumably holding the regulation
# Markdown files — confirm against parse_regulations_content), a JSON input file
# and a JSON output file.
input_dir = os.path.join("data", "markdown", "fix", "temp")
json_input = os.path.join("data", "regulation_data_modified.json")
json_output = os.path.join("data", "regulation_data_final.json")

# TODO: The error occurs when the base regulation being amended is not yet present in the folder
# TODO: Remove the try/except above once all regulation Markdown files are finished
parser = RegulationParser()
# Run the parser with verbose=True and keep its return value for the following cells
regulation_data = parser.parse_regulations_content(
    input_dir=input_dir,
    json_input=json_input,
    json_output=json_output,
    verbose=True
)

In [None]:
# import json
# from deepdiff import DeepDiff

# with open("data/regulation_data_final_1.json", encoding="utf-8") as input_file:
#     json_data_1 = json.load(input_file)

# with open("data/regulation_data_final_2.json", encoding="utf-8") as input_file:
#     json_data_2 = json.load(input_file)

# # Cek Perbedaan 2 JSON
# diff = DeepDiff(json_data_1, json_data_2, ignore_order=True)
# display(diff)

In [None]:
# # MODEL 2: STRUKTUR PERATURAN DASAR DAN AMANDEMEN SAMA

# class RegulationParser:

#     def __init__(self):
#         self.ALPHABET_TO_NUMERIC = {chr(i): f'{i - 64:02}' for i in range(65, 91)}
#         self.WORD_TO_NUMBER = {
#             "kesatu": 1, "kedua": 2, "ketiga": 3, "keempat": 4, "kelima": 5,
#             "keenam": 6, "ketujuh": 7, "kedelapan": 8, "kesembilan": 9, "kesepuluh": 10,
#             "kesebelas": 11, "kedua belas": 12, "ketiga belas": 13, "keempat belas": 14, "kelima belas": 15,
#             "keenam belas": 16, "ketujuh belas": 17, "kedelapan belas": 18, "kesembilan belas": 19, "kedua puluh": 20,
#             "kedua puluh satu": 21, "kedua puluh dua": 22, "kedua puluh tiga": 23, "kedua puluh empat": 24, "kedua puluh lima": 25,
#             "kedua puluh enam": 26, "kedua puluh tujuh": 27, "kedua puluh delapan": 28, "kedua puluh sembilan": 29, "ketiga puluh": 30,
#             "ketiga puluh satu": 31, "ketiga puluh dua": 32, "ketiga puluh tiga": 33, "ketiga puluh empat": 34, "ketiga puluh lima": 35,
#             "ketiga puluh enam": 36, "ketiga puluh tujuh": 37, "ketiga puluh delapan": 38, "ketiga puluh sembilan": 39, "keempat puluh": 40,
#             "keempat puluh satu": 41, "keempat puluh dua": 42, "keempat puluh tiga": 43, "keempat puluh empat": 44, "keempat puluh lima": 45,
#             "keempat puluh enam": 46, "keempat puluh tujuh": 47, "keempat puluh delapan": 48, "keempat puluh sembilan": 49, "kelima puluh": 50
#         }
#         self.REGULATION_ENCODING = {
#             'type': {
#                 'UU': '01',
#                 'PERPPU': '02',
#                 'PP': '03',
#                 'PERPRES': '04',
#                 'PERMENKOMINFO': '05'
#             },
#             'section': {
#                 'document': '1',
#                 'considering': '2',
#                 'observing': '3',
#                 'definition': '4',
#                 'chapter': '5',
#                 'article': '6',
#                 'section': '7',
#             }
#         }
#         self.REGEX_PATTERNS = {
#             'document': {
#                 'metadata': r'^(\w+)_(\w+)_(\w+)'  # Jenis, tahun, dan nomor peraturan
#             },
#             'main': {
#                 'considering': r'(?<=## menimbang)([\S\s]*?)(?=## mengingat)',                                         # Menimbang
#                 'observing': r'(?<=## mengingat)([\S\s]*?)(?=(?:dengan persetujuan|## memperhatikan|## memutuskan))',  # Mengingat
#                 'amendment_to': r'^Perubahan',                                                                         # Cek Peraturan Amandemen
#                 'chapter': r'(## BAB[\S\s]*?)(?=\s+(?:## BAB|agar setiap orang mengetahuinya|ditetapkan di))'          # Daftar Bab
#             },
#             'chapter': {
#                 # For every chapters
#                 'about': r'## (BAB [^#]+)##',                                    # Nama Bab
#                 'part': r'(## Bagian [\S\s]*?)(?=\s+(?:## Bagian|$))',           # Daftar Bagian
#                 'paragraph': r'(## Paragraf [\S\s]*?)(?=\s+(?:## Paragraf|$))',  # Daftar Paragraf
#                 'article': r'(## Pasal \w+[\S\s]*?)(?=(?:##|ditetapkan di|$))'   # Daftar Pasal
#             },
#             'part': {
#                 # For every parts
#                 'about': r'## (Bagian [^#]+)##',  # Nama Bagian
#                 'number': r'Bagian (\w+) -'       # Nomor Bagian
#             },
#             'paragraph': {
#                 # For every paragraphs
#                 'about': r'## (Paragraf [^#]+)##',  # Nama Paragraf
#                 'number': r'Paragraf (\w+) -'       # Nomor Paragraf

#             },
#             'article': {
#                 # For every articles
#                 'number': r'## Pasal (\d+\w*)',                                       # Nomor Pasal
#                 'text': r'## Pasal \w+\n*([\S\s]*)',                                  # Isi Pasal
#                 'check_definition': r'^dalam (?:undang-undang|peraturan)',            # Cek apakah Pasal 1 adalah definisi
#                 'definition': r'\(\d+[a-z]?\) (([A-Z][a-z]*(?:\s[A-Z][a-z]*)*) .*)',  # Daftar Definisi
#                 'no_ref': r'\*{2}NO_REF\*{2}',                                        # Pasal dengan tanda **NO_REF**
#                 'reference_1': r'Pasal (\d+\w*)',                                     # Pasal Referensi Jenis 1
#                 'reference_2': r'Pasal (\d+\w*) sampai dengan Pasal (\d+\w*)'         # Pasal Referensi Jenis 2
#             },
#             'amendment_to': {
#                 'amendment_point_1': r'(## \d+\.[\S\s]*?)(?=\s+(?:## \d+\.|## Pasal II))',                      # Poin Amandemen Jenis 1
#                 'amendment_point_2': r'(?<=## Pasal I)([\s\S]*?)(?=## Pasal II)',                               # Poin Amandemen Jenis 2
#                 'chapter': r'(## BAB[\S\s]*?)(?=\s+(?:## BAB|agar setiap orang mengetahuinya|ditetapkan di))',  # Daftar Bab
#                 'part': r'(## Bagian [\S\s]*?)(?=\s+(?:## Bagian|$))',                                          # Daftar Bagian
#                 'paragraph': r'(## Paragraf [\S\s]*?)(?=\s+(?:## Paragraf|$))',                                 # Daftar Paragraf
#                 # 'amendment_articles': r'(## Pasal \d+[A-Z]?[\S\s]*?)(?=(?:##|$))',                            # Pasal Amandemen
#             }
#         }

#     def parse_regulations_content(
#             self,
#             input_dir: str,
#             json_input: str,
#             json_output: str,
#             verbose: bool = True
#     ) -> list[dict]:

#         # Initialize data
#         article_dict = dict()
#         durations = list()
#         result = list()
#         files = list()
#         success = 0
#         failed = 0

#         # Get all Markdown files path and name
#         for filename in os.listdir(input_dir):
#             if filename.endswith(".md"):
#                 files.append((os.path.join(input_dir, filename), filename))

#         # Iterate for every files
#         for regulation_file in tqdm(iterable=files, desc='Parsing regulations content', disable=not verbose):
#             start_time = time.time()
#             filepath, filename = regulation_file

#             try:
#                 with open(filepath, 'r', encoding='utf8') as file:
#                     # Initialize data
#                     text = file.read()
#                     regulation_dict = dict()
#                     definition_list = list()

#                     # Get the file metadata
#                     metadata = re.search(self.REGEX_PATTERNS['document']['metadata'], filename)
#                     regulation_type = self.REGULATION_ENCODING['type'][metadata[1]]
#                     regulation_year = metadata[2]
#                     regulation_num = int(metadata[3])

#                     # Create template ID
#                     id_template = f'{regulation_year}{regulation_type}{str(regulation_num).zfill(3)}' \
#                         + '{reg_section}{section_num}{extra_section_number}'

#                     # Create regulation ID
#                     regulation_id = id_template.format(
#                         reg_section=self.REGULATION_ENCODING['section']['document'],
#                         section_num='000',
#                         extra_section_number='00'
#                     )

#                     # Get regulation data from regulation JSON
#                     with open(json_input) as json_data:
#                         for regulation_data in json.load(json_data):
#                             if regulation_data['id'] == regulation_id:
#                                 regulation_dict = regulation_data

#                     # Initialize the dictionary of parsing results
#                     regulation_dict['content'] = dict()

#                     # Get considering text (Menimbang)
#                     regulation_dict['content']['considering'] = {
#                         'id': id_template.format(
#                             reg_section=self.REGULATION_ENCODING['section']['considering'],
#                             section_num='000',
#                             extra_section_number='00'
#                         ),
#                         'text': re.search(self.REGEX_PATTERNS['main']['considering'], text, re.IGNORECASE)[1].strip()
#                     }

#                     # Get observing text (Mengingat)
#                     regulation_dict['content']['observing'] = {
#                         'id': id_template.format(
#                             reg_section=self.REGULATION_ENCODING['section']['observing'],
#                             section_num='000',
#                             extra_section_number='00'
#                         ), 
#                         'text': re.search(self.REGEX_PATTERNS['main']['observing'], text, re.IGNORECASE)[1].strip()
#                     }

#                     # Check for amendment regulation
#                     is_amendment = re.search(self.REGEX_PATTERNS['main']['amendment_to'], regulation_dict['about'], re.IGNORECASE)
                    
#                     if is_amendment:
#                         regulation_dict, definition_list, article_dict = self.__parse_amendment_regulation(
#                             text=text,
#                             id_template=id_template,
#                             regulation_dict=regulation_dict,
#                             definition_list=definition_list,
#                             article_dict=article_dict,
#                             amend_regulations=regulation_dict['status']['amend']
#                         )
#                     else:
#                         regulation_dict, definition_list, article_dict = self.__parse_base_regulation(
#                             text=text,
#                             id_template=id_template,
#                             regulation_dict=regulation_dict,
#                             definition_list=definition_list,
#                             article_dict=article_dict
#                         )
                    
#                     result.append(regulation_dict)
#                     success += 1
                
#             except Exception as e:
#                 if verbose:
#                     failed += 1
#                     print(f'ERROR parsing content of {filename}')
#                     print(e)

#             durations.append(time.time() - start_time)

#         if not json_output.endswith('.json'):
#             json_output = f'{json_output}.json'

#         with open(json_output, "w") as output_file: 
#             json.dump(result, output_file, indent=4)
        
#         if verbose:
#             print('=' * 76)
#             print(f'Input directory   : {input_dir}')
#             print(f'Input JSON        : {json_input}')
#             print(f'Output JSON       : {json_output}')
#             print(f'Total regulations : {len(files)} regulations')
#             print(f'Total success     : {success} regulations')
#             print(f'Total failed      : {failed} regulations')
#             print(f'Total articles    : {len(article_dict)} articles')
#             print(f'Total time        : {round(sum(durations) * 1000, 3)} miliseconds')
#             print(f'Average time/file : {round(sum(durations) * 1000 / success, 3)} miliseconds')
#             print('=' * 76)

#         filtered_ids = list(filter(
#             lambda x: x.startswith('201601019603100'[:-6] + '6'),
#             article_dict.keys()
#         ))
#         display(filtered_ids)

#         return result
    

#     def __parse_base_regulation(
#             self,
#             text: str,
#             id_template: str,
#             regulation_dict: dict,
#             definition_list: list,
#             article_dict: dict
#     ) -> tuple[dict, list, dict]:

#         # Define last_article_number
#         last_article_number = ''
#         regulation_dict['content']['articles'] = dict()

#         # Get all chapters
#         chapters = re.findall(self.REGEX_PATTERNS['main']['chapter'], text, re.IGNORECASE)

#         # If the chapter exists
#         if chapters:
#             # Iterate for every chapters
#             for chapter_num, chapter in enumerate(chapters):
#                 chapter_number = chapter_num + 1
#                 chapter_about = re.search(self.REGEX_PATTERNS['chapter']['about'], chapter, re.IGNORECASE)[1].strip().upper()
#                 chapter_about = re.sub(r'\n', ' - ', chapter_about, flags=re.IGNORECASE)

#                 # Get all parts
#                 parts = re.findall(self.REGEX_PATTERNS['chapter']['part'], chapter.strip() + '\n', re.IGNORECASE)

#                 # If the part exists
#                 if parts:
#                     # Iterate for every parts
#                     for part_num, part in enumerate(parts):
#                         # Get part about/name
#                         part_number = part_num + 1
#                         part_about = re.search(self.REGEX_PATTERNS['part']['about'], part, re.IGNORECASE)[1].strip()
#                         part_about = re.sub(r'\n', ' - ', part_about, flags=re.IGNORECASE)

#                         # Get all paragraphs
#                         paragraphs = re.findall(self.REGEX_PATTERNS['chapter']['paragraph'], part.strip() + '\n', re.IGNORECASE)
                        
#                         # If the paragraph exists
#                         if paragraphs:
#                             # Iterate for every paragraphs
#                             for paragraph_num, paragraph in enumerate(paragraphs):
#                                 # Get paragraph about/name
#                                 paragraph_number = paragraph_num + 1
#                                 paragraph_about = re.search(self.REGEX_PATTERNS['paragraph']['about'], paragraph, re.IGNORECASE)[1].strip()
#                                 paragraph_about = re.sub(r'\n', ' - ', paragraph_about, flags=re.IGNORECASE)
                                
#                                 regulation_dict, definition_list, article_dict, last_article_number = self.__parse_articles(
#                                     text=paragraph,
#                                     chapter_number=str(chapter_number),
#                                     chapter_about=chapter_about,
#                                     part_number=str(part_number),
#                                     part_about=part_about,
#                                     paragraph_number=str(paragraph_number),
#                                     paragraph_about=paragraph_about,
#                                     id_template=id_template,
#                                     regulation_dict=regulation_dict,
#                                     definition_list=definition_list,
#                                     article_dict=article_dict
#                                 )

#                         else:
#                             regulation_dict, definition_list, article_dict, last_article_number = self.__parse_articles(
#                                 text=part,
#                                 chapter_number=str(chapter_number),
#                                 chapter_about=chapter_about,
#                                 part_number=str(part_number),
#                                 part_about=part_about,
#                                 paragraph_number='',
#                                 paragraph_about='',
#                                 id_template=id_template,
#                                 regulation_dict=regulation_dict,
#                                 definition_list=definition_list,
#                                 article_dict=article_dict
#                             )

#                 else:
#                     regulation_dict, definition_list, article_dict, last_article_number = self.__parse_articles(
#                         text=chapter,
#                         chapter_number=str(chapter_number),
#                         chapter_about=chapter_about,
#                         part_number='',
#                         part_about='',
#                         paragraph_number='',
#                         paragraph_about='',
#                         id_template=id_template,
#                         regulation_dict=regulation_dict,
#                         definition_list=definition_list,
#                         article_dict=article_dict
#                     )
#         else:
#             regulation_dict, definition_list, article_dict, last_article_number = self.__parse_articles(
#                 text=text,
#                 chapter_number='',
#                 chapter_about='',
#                 part_number='',
#                 part_about='',
#                 paragraph_number='',
#                 paragraph_about='',
#                 id_template=id_template,
#                 regulation_dict=regulation_dict,
#                 definition_list=definition_list,
#                 article_dict=article_dict
#             )

#         regulation_dict['content']['articles'][last_article_number]['next_article'] = ''

#         return regulation_dict, definition_list, article_dict


#     def __parse_amendment_regulation(
#             self,
#             text: str,
#             id_template: str,
#             regulation_dict: dict,
#             definition_list: list,
#             article_dict: dict,
#             amend_regulations: list
#     ) -> tuple[dict, list, dict]:
        
#         regulation_dict['content']['articles'] = dict()
        
#         # Get all amendment points
#         amendment_points = re.findall(self.REGEX_PATTERNS['amendment_to']['amendment_point_1'], text, re.IGNORECASE)

#         if not amendment_points:
#             amendment_points = re.search(self.REGEX_PATTERNS['amendment_to']['amendment_point_2'], text, re.IGNORECASE)[1].strip()
#             first_sentence = re.search(r'^.*', amendment_points, re.IGNORECASE)[0].strip()  # Dapatkan kalimat pertama
#             amendment_points = [amendment_points.replace(first_sentence, '').strip()]		# Hapus kalimat pertama
        
#         # Iterate for every points
#         for point in amendment_points:
            

#             # Get all parts
#             parts = re.findall(self.REGEX_PATTERNS['amendment_to']['part'], point.strip() + '\n', re.IGNORECASE)

#             # If the part exists
#             if parts:
#                 # Iterate for every parts
#                 for part in parts:
#                     # Get part about/name
#                     part_about = re.search(self.REGEX_PATTERNS['part']['about'], part, re.IGNORECASE)[1].strip()
#                     part_about = re.sub(r'\n', ' - ', part_about, flags=re.IGNORECASE)
#                     part_number = re.search(self.REGEX_PATTERNS['part']['number'], part_about, re.IGNORECASE)[1].strip().lower()
#                     part_number = self.WORD_TO_NUMBER.get(part_number, 0)

#                     # Get all paragraphs
#                     paragraphs = re.findall(self.REGEX_PATTERNS['chapter']['paragraph'], part.strip() + '\n', re.IGNORECASE)
                    
#                     # If the paragraph exists
#                     if paragraphs:
#                         # Iterate for every paragraphs
#                         for paragraph in paragraphs:
#                             # Get paragraph about/name
#                             paragraph_about = re.search(self.REGEX_PATTERNS['paragraph']['about'], paragraph, re.IGNORECASE)[1].strip()
#                             paragraph_about = re.sub(r'\n', ' - ', paragraph_about, flags=re.IGNORECASE)
#                             paragraph_number = re.search(self.REGEX_PATTERNS['paragraph']['number'], paragraph_about, re.IGNORECASE)[1].strip()
                            
#                             regulation_dict, definition_list, article_dict, last_article_number = self.__parse_articles(
#                                 text=paragraph,
#                                 chapter_number='',
#                                 chapter_about='',
#                                 part_number=str(part_number),
#                                 part_about=part_about,
#                                 paragraph_number=str(paragraph_number),
#                                 paragraph_about=paragraph_about,
#                                 id_template=id_template,
#                                 regulation_dict=regulation_dict,
#                                 definition_list=definition_list,
#                                 article_dict=article_dict,
#                                 amendment=True,
#                                 amend_regulations=amend_regulations
#                             )

#                     else:
#                         regulation_dict, definition_list, article_dict, last_article_number = self.__parse_articles(
#                             text=part,
#                             chapter_number='',
#                             chapter_about='',
#                             part_number=str(part_number),
#                             part_about=part_about,
#                             paragraph_number='',
#                             paragraph_about='',
#                             id_template=id_template,
#                             regulation_dict=regulation_dict,
#                             definition_list=definition_list,
#                             article_dict=article_dict,
#                             amendment=True,
#                             amend_regulations=amend_regulations
#                         )

#             else:
#                 # Get all paragraphs
#                 paragraphs = re.findall(self.REGEX_PATTERNS['amendment_to']['paragraph'], point.strip() + '\n', re.IGNORECASE)
                
#                 # If the paragraph exists
#                 if paragraphs:
#                     # Iterate for every paragraphs
#                     for paragraph_num, paragraph in enumerate(paragraphs):
#                         # Get paragraph about/name
#                         paragraph_number = paragraph_num + 1
#                         paragraph_about = re.search(self.REGEX_PATTERNS['paragraph']['about'], paragraph, re.IGNORECASE)[1].strip()
#                         paragraph_about = re.sub(r'\n', ' - ', paragraph_about, flags=re.IGNORECASE)
#                         paragraph_number = re.search(self.REGEX_PATTERNS['paragraph']['number'], paragraph_about, re.IGNORECASE)[1].strip()

#                         regulation_dict, definition_list, article_dict, last_article_number = self.__parse_articles(
#                             text=paragraph,
#                             chapter_number='',
#                             chapter_about='',
#                             part_number='',
#                             part_about='',
#                             paragraph_number=str(paragraph_number),
#                             paragraph_about=paragraph_about,
#                             id_template=id_template,
#                             regulation_dict=regulation_dict,
#                             definition_list=definition_list,
#                             article_dict=article_dict,
#                             amendment=True,
#                             amend_regulations=amend_regulations
#                         )

#                 else:
#                     regulation_dict, definition_list, article_dict, last_article_number = self.__parse_articles(
#                         text=point,
#                         chapter_number='',
#                         chapter_about='',
#                         part_number='',
#                         part_about='',
#                         paragraph_number='',
#                         paragraph_about='',
#                         id_template=id_template,
#                         regulation_dict=regulation_dict,
#                         definition_list=definition_list,
#                         article_dict=article_dict,
#                         amendment=True,
#                         amend_regulations=amend_regulations
#                     )

#         return regulation_dict, definition_list, article_dict


#     def __parse_articles(
#             self,
#             text: str,
#             chapter_number: str,
#             chapter_about: str,
#             part_number: str,
#             part_about: str,
#             paragraph_number: str,
#             paragraph_about: str,
#             id_template: str,
#             regulation_dict: dict,
#             definition_list: list,
#             article_dict: dict,
#             amendment: bool = False,
#             amend_regulations: list = list() 
#     ) -> tuple[dict, list]:
        
#         # Define last article number
#         last_article_number = ''
        
#         # Get all articles
#         articles = re.findall(self.REGEX_PATTERNS['chapter']['article'], text, re.IGNORECASE)

#         # Iterate for every articles
#         for article in articles:
#             article_number = re.search(self.REGEX_PATTERNS['article']['number'], article, re.IGNORECASE)[1]
#             article_text = re.search(self.REGEX_PATTERNS['article']['text'], article, re.IGNORECASE)[1].strip()
#             article_text = re.sub(r'\n+', '\n', article_text)
        
#             # Create current article ID
#             article_id = self.__article_number_to_id(article_number, id_template, return_last_six=False)

#             # Define last_article_number
#             last_article_number = article_number

#             # Get previous article ID
#             # TODO: Compute previous_article and next_article for amendment articles using the
#             # algorithm drafted in the offline notes (on paper and on the phone). Check ID 202401001603600.
#             previous_article = ''
#             next_article = ''
#             if not amend_regulations:
#                 # Get previous article ID
#                 if article_number != '1':
#                     previous_article = id_template.format(
#                         reg_section=self.REGULATION_ENCODING['section']['article'],
#                         section_num=str(int(article_number) - 1).zfill(3),
#                         extra_section_number='00'
#                     )
                
#                 # Get predicted next article ID
#                 next_article = id_template.format(
#                     reg_section=self.REGULATION_ENCODING['section']['article'],
#                     section_num=str(int(article_number) + 1).zfill(3),
#                     extra_section_number='00'
#                 )

#             else:
#                 # filtered_ids = list(filter(
#                 #     lambda x: x.startswith(amend_regulations[0][:-6] + self.REGULATION_ENCODING['section']['article']),
#                 #     article_dict.keys()
#                 # ))

#                 # The next article here is a prediction [DONE]
#                 # Find the smallest ID that is greater than selected_id
#                 # If the current article is 5, the next one is 5A or 6 (candidates are tried for each amended regulation)
#                 # If the current article is 5B, the next one is 5C or 6 (candidates are tried for each amended regulation)
#                 # next_article = min((x for x in filtered_ids if x > amend_regulations[0][:-6] + article_id[-6:]), default='BESAR')
                
#                 for amended_regulation_id in amend_regulations:
#                     amended_regulation_id_template = amended_regulation_id[:-6] + '{reg_section}{section_num}{extra_section_number}'
#                     pred_next_article_ids = self.__get_next_article_ids(article_number, amended_regulation_id_template)
#                     for pred_next_article_id in pred_next_article_ids:
#                         if pred_next_article_id in article_dict.keys():
#                             next_article = pred_next_article_id
#                             break  # Break dari loop dalam
#                     if next_article:
#                         break  # Break dari loop luar
                
#                 # TODO: previous_article currently relies only on the algorithm drafted in the offline notes (paper and phone)
#                 # Find the largest ID that is smaller than selected_id
#                 # previous_article = max((x for x in filtered_ids if x < amend_regulations[0][:-6] + article_id[-6:]), default='KECIL')

#                 # 2016_01_019_6_045_01
#                 # If the current article is Pasal 45A, the previous one is clearly Pasal 45
#                 # 2016_01_019_6_045_00 [DONE]
#                 # If the current article is Pasal 45, the previous one is ambiguous: it could be 44Z, ..., 44A, or 44
#                 # previous_article = max((x for x in filtered_ids if x < amend_regulations[0][:-6] + article_id[-6:]), default='KECIL')
#                 if article_number.isdigit():
#                     if article_number != '1':
#                         filtered_ids = []
#                         for regulation_id in [regulation_dict['id']] + amend_regulations:
#                             filtered_ids += list(filter(
#                                 lambda x: x.startswith(str(int(regulation_id[:-6] + article_id[9:13]) - 1)),
#                                 article_dict.keys()
#                             ))
                        
#                         # Buat dictionary dengan prefix sebagai key dan angka dua digit terakhir sebagai value
#                         id_groups = {}

#                         for id in filtered_ids:
#                             prefix, suffix = id[:-2], id[-2:]  # Pisahkan prefix dan dua angka terakhir
#                             if prefix not in id_groups or suffix > id_groups[prefix]:  
#                                 id_groups[prefix] = suffix  # Simpan angka terbesar untuk setiap prefix

#                         # Ambil ID yang memiliki dua angka terakhir terbesar
#                         max_suffix_ids = [id for id in filtered_ids if id[-2:] == id_groups[id[:-2]]]

#                         # Ambil ID terbesar dari hasil tersebut
#                         try:
#                             previous_article = max(max_suffix_ids)
#                         except:
#                             print(article_id)
#                             print(filtered_ids)
#                             print(max_suffix_ids)

#                 else:
#                     for regulation_id in [regulation_dict['id']] + amend_regulations:
#                         prev_regulation_id_template = regulation_id[:-6] + '{reg_section}{section_num}{extra_section_number}'
#                         pred_prev_article_id = self.__get_previous_article_id(article_number, prev_regulation_id_template)
#                         if pred_prev_article_id in article_dict.keys():
#                             previous_article = pred_prev_article_id
#                             break  # Break dari loop dalam
                
#                 # If the previous article belongs to this same amendment regulation,
#                 # update that previous article's next_article to point to the current article
#                 if previous_article.startswith(regulation_dict['id'][:-6]):
#                     previous_article_number = self.__id_to_article_number(previous_article)
#                     regulation_dict['content']['articles'][previous_article_number]['next_article'] = article_id
#                     # article_dict[article_id] = regulation_dict['content']['articles'][article_number]



#             # Get definition (Article/Pasal 1)
#             if article_number == '1':
#                 if re.search(self.REGEX_PATTERNS['article']['check_definition'], article_text, re.IGNORECASE):
#                     definitions = re.findall(self.REGEX_PATTERNS['article']['definition'], article_text)

#                     for index, definition_data in enumerate(definitions):
#                         definition, name = definition_data
#                         definition_list.append({
#                             'id': id_template.format(
#                                 reg_section=self.REGULATION_ENCODING['section']['definition'],
#                                 section_num=str(index + 1).zfill(3),
#                                 extra_section_number='00'
#                             ),
#                             'name': name.strip(),
#                             'definition': definition.strip()
#                         })

#             # Store definition list
#             regulation_dict['content']['definitions'] = definition_list
            
#             # Extract article references if the article does NOT contain the **NO_REF** marker
#             all_article_references = list()
#             if not re.search(self.REGEX_PATTERNS['article']['no_ref'], article_text, re.IGNORECASE):
#                 all_article_references = self.__get_article_id_references(
#                     article_text=article_text, current_regulation_id=regulation_dict['id'],
#                     id_template=id_template, amend_regulations=amend_regulations, article_dict=article_dict)
#             else:
#                 article_text = re.sub(self.REGEX_PATTERNS['article']['no_ref'], '', article_text, flags=re.IGNORECASE).strip()

#             # Collect the IDs of the articles that this article amends
#             amended_article = list()

#             if amend_regulations:
#                 fetched = False
#                 # Collect the IDs of the amended articles, inheriting their chapter, part, and paragraph data where this article does not set them
#                 for regulation_id in amend_regulations:
#                     # Rebuild (guess) the article ID as it appears in the pre-revision regulation
#                     other_article_id = regulation_id[:-6] + '6' + article_id[-5:]
#                     # If that ID is present in article_dict, the article exists, so record it
#                     if other_article_id in article_dict.keys():
#                         if not fetched:
#                             # Chapter
#                             chapter_number = chapter_number if chapter_number else article_dict[other_article_id]['chapter_number']
#                             chapter_about = chapter_about if chapter_about else article_dict[other_article_id]['chapter_about']

#                             # Part
#                             part_number = part_number if part_number else article_dict[other_article_id]['part_number']
#                             part_about = part_about if part_about else article_dict[other_article_id]['part_about']

#                             # Paragraph
#                             paragraph_number = paragraph_number if paragraph_number else article_dict[other_article_id]['paragraph_number']
#                             paragraph_about = paragraph_about if paragraph_about else article_dict[other_article_id]['paragraph_about']

#                             fetched = True

#                         # Article ID
#                         amended_article.append(other_article_id)
            
#             # Store article data
#             regulation_dict['content']['articles'][article_number] = {
#                 'id': article_id,
#                 'chapter_number': chapter_number,
#                 'chapter_about': chapter_about,
#                 'part_number': part_number,
#                 'part_about': part_about,
#                 'paragraph_number': paragraph_number,
#                 'paragraph_about': paragraph_about,
#                 'article_number': article_number,
#                 'text': article_text,
#                 'previous_article': previous_article,
#                 'next_article': next_article,
#                 'references': all_article_references,
#                 'amend': amended_article
#             }
            
#             # Store article to article_dict
#             article_dict[article_id] = regulation_dict['content']['articles'][article_number]

#         return regulation_dict, definition_list, article_dict, last_article_number


#     def __previous_label(self, label: str) -> str:
#         """
#         Generate the previous label in a sequence where letters decrement alphabetically.

#         If a letter is not 'A', it is decremented. If it is 'A', it becomes 'Z',
#         and the decrementation continues to the left.

#         Args:
#             label (str): The input label consisting of uppercase letters.

#         Returns:
#             str: The previous label in the sequence.

#         Example:
#             __previous_label("B") -> "A"
#             __previous_label("C") -> "B"
#             __previous_label("ABC") -> "ABB"
#             __previous_label("AAA") -> "ZZ"
#             __previous_label("BAA") -> "AZZ"
#         """
#         label = list(label)  # Convert string to a list of characters for manipulation

#         # Iterate backwards to handle letter decrement
#         for i in range(len(label) - 1, -1, -1):
#             if label[i] != 'A':  
#                 label[i] = chr(ord(label[i]) - 1)  # Decrement the letter
#                 return ''.join(label)
#             label[i] = 'Z'  # If 'A', change to 'Z' and continue to the previous letter

#         return ''.join(label[1:])  # If all were 'A', remove the first character
    

#     def __next_label(self, label: str) -> str:
#         """
#         Generate the next label in a sequence where letters increment alphabetically.

#         If a letter is not 'Z', it is incremented. If it is 'Z', it becomes 'A',
#         and the incrementation continues to the left.

#         Args:
#             label (str): The input label consisting of uppercase letters.

#         Returns:
#             str: The next label in the sequence.

#         Example:
#             __next_label("A") -> "B"
#             __next_label("B") -> "C"
#             __next_label("ABC") -> "ABD"
#             __next_label("ZZZ") -> "AAAA"
#             __next_label("AZZ") -> "BAA"
#         """
#         label = list(label)  # Convert string to a list of characters for manipulation
        
#         # Iterate backwards to handle letter increment
#         for i in range(len(label) - 1, -1, -1):
#             if label[i] != 'Z':  
#                 label[i] = chr(ord(label[i]) + 1)  # Increment the letter
#                 return ''.join(label)
#             label[i] = 'A'  # If 'Z', change to 'A' and continue to the previous letter
        
#         return 'A' + ''.join(label)  # If all were 'Z', add 'A' in front


#     def __article_number_to_id(self, article_number: str, id_template: str, return_last_six=False) -> str:
#         """
#         Generate a formatted article ID based on the given article number.

#         This function converts an article number (which may contain a numerical part  
#         and an optional alphabetical suffix) into a standardized ID format  
#         using a provided template.

#         If `return_last_six` is set to True, the function returns only the last  
#         six characters of the generated article ID.

#         Args:
#             article_number (str): The article number as a string (e.g., "10A", "12").
#             id_template (str): A template string for generating the article ID.
#             return_last_six (bool, optional): If True, return only the last six characters. Defaults to False.

#         Returns:
#             str: The formatted article ID.

#         Examples:
#             id_template = "202401001" + "{reg_section}{section_num}{extra_section_number}"
#             __article_number_to_id("10A", id_template) → "202401001601001"
#             __article_number_to_id("12", id_template) → "202401001601200"
#             __article_number_to_id("10A", id_template, return_last_six=True) → "601001"
#         """

#         # Initialize the article ID
#         article_id = ''

#         # Check if the article number contains an alphabetical suffix
#         article_alphabet = re.search(r'\d+([A-Z]+)', str(article_number), re.IGNORECASE)

#         if article_alphabet:
#             # Format the article ID with the alphabet suffix converted to its numerical equivalent
#             number = re.search(r'\d+', str(article_number), re.IGNORECASE)[0] # Extract the numeric part
#             article_id = id_template.format(
#                 reg_section=self.REGULATION_ENCODING['section']['article'],
#                 section_num=number.zfill(3),
#                 extra_section_number=self.ALPHABET_TO_NUMERIC[article_alphabet[1]]
#             )
#         else:
#             # Format the article ID without a letter suffix (defaulting to "00")
#             article_id = id_template.format(
#                 reg_section=self.REGULATION_ENCODING['section']['article'],
#                 section_num=str(article_number).zfill(3),
#                 extra_section_number='00'
#             )
        
#         # Return the last six characters if required
#         return article_id[-6:] if return_last_six else article_id


#     def __id_to_article_number(self, article_id: str) -> str:
#         """
#         Convert an article ID into a formatted article representation.

#         This function processes the last 5 digits of the article ID by extracting its numerical and alphabetical components.
#         The first three digits represent the main article number, while the last two digits indicate an 
#         alphabetical suffix following an Excel-like numbering system (e.g., 1 → A, 26 → Z, 27 → AA).

#         Args:
#             article_id (str): The 15-digit article ID (e.g., "202401001101001", "202401001101200").

#         Returns:
#             str: The formatted article string (e.g., "10A", "12").

#         Examples:
#             __id_to_article_number("202401001101001") -> "10A"
#             __id_to_article_number("202401001101200") -> "12"
#         """

#         # Extract the last 5 characters to ensure proper formatting
#         article_id = article_id[-5:]

#         # Extract the first three digits as the numerical part, removing leading zeros
#         number_part = str(int(article_id[:3]))
        
#         # Extract the last two digits as the alphabet index like in Excel (1 → A, 26 → Z, 27 → AA, dst.)
#         alphabet_index = int(article_id[-2:])
        
#         if alphabet_index == 0:
#             return number_part  # If "00", return only the number part

#         # Convert the numeric index to an alphabetical suffix (Excel-like system)
#         alphabet_part = ""
#         while alphabet_index > 0:
#             alphabet_index -= 1
#             alphabet_part = chr(ord('A') + (alphabet_index % 26)) + alphabet_part
#             alphabet_index //= 26

#         return number_part + alphabet_part
    

#     def __get_previous_article_id(self, article_number: str, id_template: str) -> str:
#         """
#         Generate the previous article ID based on the current article number.

#         The function extracts the numeric and alphabetic parts of the article number.
#         If the article ends in 'A', it removes the letter (e.g., "10A" → "10").
#         Otherwise, it decrements the alphabetic part (e.g., "10C" → "10B"). 
#         The input must have an alphabetic part (e.g., "10A", "10B"), not just a number.

#         Args:
#             article_number (str): The current article number (e.g., "10A", "10B").
#             id_template (str): A template string for formatting the output ID.

#         Returns:
#             str: The previous article ID, formatted using the provided template.

#         Examples:
#             __get_previous_article_id("10C", template) -> template("10B")
#             __get_previous_article_id("10B", template) -> template("10A")
#             __get_previous_article_id("10A", template) -> template("10")
#         """

#         # Extract numeric and alphabet parts
#         match = re.match(r'(\d+)([A-Z]+)', article_number, re.IGNORECASE)
#         if not match:
#             return ""  # Return empty if the input format is invalid (e.g., "10" without a letter)

#         article_number = int(match.group(1))  # Extract and convert numeric part
#         article_alphabet = match.group(2)  # Extract alphabet part

#         if article_alphabet == 'A':
#             # If the article ends with 'A', remove the letter to return only the number
#             prev_section_num = article_number
#             prev_extra_section = '00'
#         else:
#             # Otherwise, decrement the alphabetic part (e.g., "10C" → "10B")
#             prev_section_num = article_number
#             prev_extra_section = self.__previous_label(article_alphabet)

#         # Format and return the previous article ID
#         return id_template.format(
#             reg_section=self.REGULATION_ENCODING['section']['article'],
#             section_num=str(prev_section_num).zfill(3),
#             extra_section_number=self.ALPHABET_TO_NUMERIC.get(prev_extra_section, '00')
#         )
    

#     def __get_next_article_ids(self, article_number: str, id_template: str) -> list:
#         """
#         Generate a list containing 2 possible next article IDs

#         This function determines the possible next article IDs by either:
#         - Incrementing the alphabetic part (e.g., "10A" → "10B").
#         - Moving to the next number (e.g., "10A" → "11").

#         Args:
#             article_number (str): The current article number (e.g., "10", "10A").
#             id_template (str): A template string for formatting the output IDs.

#         Returns:
#             list: A list of the next possible article IDs formatted using the template.

#         Examples:
#             __get_next_article_ids("10A", template) -> [template("10B"), template("11")]
#             __get_next_article_ids("10", template) -> [template("10A"), template("11")]
#         """

#         next_article_ids = []

#         # Extract numeric and alphabet parts
#         match = re.match(r'(\d+)([A-Z]*)', article_number, re.IGNORECASE)
#         if not match:
#             return []  # Return empty if the input format is invalid

#         article_number = int(match.group(1))  # Extract and convert numeric part
#         article_alphabet = match.group(2)  # Extract alphabet part (if any)

#         if article_alphabet:
#             # Generate next articles for cases like "10A" → "10B" or "11"
#             next_sections = [
#                 (article_number, self.__next_label(article_alphabet)),  # Example: "10A" → "10B"
#                 (article_number + 1, '00')  # Example: "10A" → "11"
#             ]
#         else:
#             # Generate next articles for cases like "10" → "10A" or "11"
#             next_sections = [
#                 (article_number, 'A'),  # Example: "10" → "10A"
#                 (article_number + 1, '00')  # Example: "10" → "11"
#             ]
        
#         # Format and store the generated IDs
#         for section_num, extra_section in next_sections:
#             next_article_ids.append(
#                 id_template.format(
#                     reg_section=self.REGULATION_ENCODING['section']['article'],
#                     section_num=str(section_num).zfill(3),
#                     extra_section_number=self.ALPHABET_TO_NUMERIC.get(extra_section, '00')
#                 )
#             )

#         return next_article_ids


#     def __generate_article_range(self, list1: list, list2: list) -> list:
#         """
#         Generate a list of article numbers based on direct references and article ranges.

#         This function processes two lists:  
#         - `list1`: A list of directly referenced article numbers.  
#         - `list2`: A list of article ranges, where each tuple contains a start and end value  
#         (e.g., [("10", "12B")] → generates ["10", "11", "12", "12A", "12B"]).  

#         The function ensures that all numbers in the specified range are included,  
#         along with letter suffixes (e.g., "A", "B", etc.) if present in the end value.  
#         The final list is sorted naturally (numerical + alphabetical order).  

#         Args:
#             list1 (list): A list of individual article numbers as strings.
#             list2 (list): A list of tuples representing article number ranges (start, end).

#         Returns:
#             list: A sorted list of all referenced article numbers.

#         Examples:
#             __generate_article_range(["5", "8"], [("10", "12B")]) → ["5", "8", "10", "11", "12", "12A", "12B"]
#         """

#         # Convert list1 to a set for unique references
#         set1 = set(list1)
#         set2 = set()

#         for start, end in list2:
#             # Extract the numeric part of the start value
#             start_num = int(re.match(r'\d+', start).group())

#             # Extract the numeric and optional letter part of the end value
#             end_match = re.match(r'(\d+)([A-Z]?)', end, re.IGNORECASE)
#             end_num = int(end_match.group(1))  # Extract numeric part of the end range
#             end_letter = end_match.group(2)  # Extract letter suffix (if any)

#             # Add all numeric values within the range
#             for i in range(start_num, end_num + 1):
#                 set2.add(str(i))

#             # If the end value has a letter, generate letter suffixes from 'A' to the end letter
#             if end_letter:
#                 for letter in map(chr, range(ord('A'), ord(end_letter.upper()) + 1)):  # `str` has no `ascii_uppercase`; build 'A'..end_letter without an extra import
#                     set2.add(f"{end_num}{letter}")

#         # Natural sorting function (handles numerical and alphabetical sorting)
#         def natural_sort_key(s):
#             return [int(text) if text.isdigit() else text for text in re.split(r'(\d+)', s) if text]

#         # Combine both sets and return the sorted result
#         return sorted(set1.union(set2), key=natural_sort_key)
    

#     def __get_article_id_references(
#             self,
#             article_text: str,
#             current_regulation_id: str,
#             id_template: str,
#             amend_regulations: list,
#             article_dict: dict
#     ) -> list:
#         """
#         Extract referenced article IDs from the given article text.

#         This function identifies article references within the given text, processes them into valid article IDs,
#         and attempts to match them with the regulation database, considering amendments.

#         Args:
#             article_text (str): The text containing article references.
#             current_regulation_id (str): The regulation ID of the current article.
#             id_template (str): The template used for formatting article IDs.
#             amend_regulations (list): A list of regulation IDs that amended the current regulation.
#             article_dict (dict): A dictionary mapping valid article IDs to their details.

#         Returns:
#             list: A list of referenced article IDs.

#         """
        
#         all_article_references = []

#         # Extract article reference numbers using predefined regex patterns
#         reference_type_1 = list(set(re.findall(self.REGEX_PATTERNS['article']['reference_1'], article_text, re.IGNORECASE)))
#         reference_type_2 = list(set(re.findall(self.REGEX_PATTERNS['article']['reference_2'], article_text, re.IGNORECASE)))
        
#         if reference_type_1 or reference_type_2:
#             # Generate a list of article numbers, considering possible ranges
#             article_references = self.__generate_article_range(reference_type_1, reference_type_2)

#             # Convert article numbers into formatted article IDs
#             article_references = [
#                 self.__article_number_to_id(number, id_template, return_last_six=True) for number in article_references
#             ]

#             if amend_regulations:
#                 # Iterate through each referenced article number
#                 for article_reference_num in article_references:
#                     # Check the most recent regulation first, then move to older amendments
#                     for regulation_id in sorted([current_regulation_id] + amend_regulations, reverse=True):
#                         # Generate a possible article ID by combining the regulation ID and article number
#                         other_article_id = regulation_id[:-6] + article_reference_num

#                         # If the generated article ID exists in the article dictionary, store it and stop searching
#                         if other_article_id in article_dict.keys():
#                             all_article_references.append(other_article_id)
#                             break
#             else:
#                 # If no amendments exist, assume all references belong to the current regulation
#                 for article_reference_num in article_references:
#                     all_article_references.append(current_regulation_id[:-6] + article_reference_num)
        
#         return all_article_references


In [None]:
# list_1 = ['2', '3', '4', '3A', '5', '1']
# list_1 = sorted(list_1, reverse=False)  # Sort ascending (plain string order, not a true natural sort)

# # Get every element that compares greater than '3A' (string comparison)
# filtered_list = [data for data in list_1 if data > '3A']

# # Take the first (smallest) element, if any
# smallest_after_3A = filtered_list[0] if filtered_list else None

# print(smallest_after_3A)


In [None]:
# input_dir = os.path.join('data', 'markdown', 'fix', 'temp')
# json_input = os.path.join('data', 'regulation_data_modified.json')
# json_output = os.path.join('data', 'regulation_data_final.json')

# # TODO: The error occurs when the base regulation being amended is not yet present in the folder
# parser = RegulationParser()
# regulation_data = parser.parse_regulations_content(
#     input_dir=input_dir,
#     json_input=json_input,
#     json_output=json_output,
#     verbose=True
# )

In [None]:
# # MODEL 2: BASE AND AMENDMENT REGULATIONS SHARE THE SAME STRUCTURE

# class RegulationParser:

#     def __init__(self):
#         self.ALPHABET_TO_NUMERIC = {chr(i): f'{i - 64:02}' for i in range(65, 91)}
#         self.WORD_TO_NUMBER = {
#             "kesatu": 1, "kedua": 2, "ketiga": 3, "keempat": 4, "kelima": 5,
#             "keenam": 6, "ketujuh": 7, "kedelapan": 8, "kesembilan": 9, "kesepuluh": 10,
#             "kesebelas": 11, "kedua belas": 12, "ketiga belas": 13, "keempat belas": 14, "kelima belas": 15,
#             "keenam belas": 16, "ketujuh belas": 17, "kedelapan belas": 18, "kesembilan belas": 19, "kedua puluh": 20,
#             "kedua puluh satu": 21, "kedua puluh dua": 22, "kedua puluh tiga": 23, "kedua puluh empat": 24, "kedua puluh lima": 25,
#             "kedua puluh enam": 26, "kedua puluh tujuh": 27, "kedua puluh delapan": 28, "kedua puluh sembilan": 29, "ketiga puluh": 30,
#             "ketiga puluh satu": 31, "ketiga puluh dua": 32, "ketiga puluh tiga": 33, "ketiga puluh empat": 34, "ketiga puluh lima": 35,
#             "ketiga puluh enam": 36, "ketiga puluh tujuh": 37, "ketiga puluh delapan": 38, "ketiga puluh sembilan": 39, "keempat puluh": 40,
#             "keempat puluh satu": 41, "keempat puluh dua": 42, "keempat puluh tiga": 43, "keempat puluh empat": 44, "keempat puluh lima": 45,
#             "keempat puluh enam": 46, "keempat puluh tujuh": 47, "keempat puluh delapan": 48, "keempat puluh sembilan": 49, "kelima puluh": 50
#         }
#         self.REGULATION_ENCODING = {
#             'type': {
#                 'UU': '01',
#                 'PERPPU': '02',
#                 'PP': '03',
#                 'PERPRES': '04',
#                 'PERMENKOMINFO': '05'
#             },
#             'section': {
#                 'document': '1',
#                 'considering': '2',
#                 'observing': '3',
#                 'definition': '4',
#                 'chapter': '5',
#                 'article': '6',
#                 'section': '7',
#             }
#         }
#         self.REGEX_PATTERNS = {
#             'document': {
#                 'metadata': r'^(\w+)_(\w+)_(\w+)'  # Regulation type, year, and number
#             },
#             'main': {
#                 'considering': r'(?<=## menimbang)([\S\s]*?)(?=## mengingat)',                                         # "Menimbang" (considering) section
#                 'observing': r'(?<=## mengingat)([\S\s]*?)(?=(?:dengan persetujuan|## memperhatikan|## memutuskan))',  # "Mengingat" (observing) section
#                 'amendment_to': r'^Perubahan',                                                                         # Checks for an amendment regulation
#                 'chapter': r'(## BAB[\S\s]*?)(?=\s+(?:## BAB|agar setiap orang mengetahuinya|ditetapkan di))'          # List of chapters (BAB)
#             },
#             'chapter': {
#                 # For every chapter
#                 'about': r'## (BAB [^#]+)##',                                    # Chapter name
#                 'part': r'(## Bagian [\S\s]*?)(?=\s+(?:## Bagian|$))',           # List of parts (Bagian)
#                 'paragraph': r'(## Paragraf [\S\s]*?)(?=\s+(?:## Paragraf|$))',  # List of paragraphs (Paragraf)
#                 'article': r'(## Pasal \w+[\S\s]*?)(?=(?:##|ditetapkan di|$))'   # List of articles (Pasal)
#             },
#             'part': {
#                 # For every part
#                 'about': r'## (Bagian [^#]+)##',  # Part name
#                 'number': r'Bagian (\w+) -'       # Part number
#             },
#             'paragraph': {
#                 # For every paragraph
#                 'about': r'## (Paragraf [^#]+)##',  # Paragraph name
#                 'number': r'Paragraf (\w+) -'       # Paragraph number

#             },
#             'article': {
#                 # For every article
#                 'number': r'## Pasal (\d+\w*)',                                       # Article number
#                 'text': r'## Pasal \w+\n*([\S\s]*)',                                  # Article body text
#                 'check_definition': r'^dalam (?:undang-undang|peraturan)',            # Checks whether Article 1 is a definitions article
#                 'definition': r'\(\d+[a-z]?\) (([A-Z][a-z]*(?:\s[A-Z][a-z]*)*) .*)',  # List of definitions
#                 'reference_1': r'Pasal (\d+\w*)',                                     # Article reference, type 1 (single article)
#                 'reference_2': r'Pasal (\d+\w*) sampai dengan Pasal (\d+\w*)'         # Article reference, type 2 (range of articles)
#             },
#             'amendment_to': {
#                 'amendment_point_1': r'(## \d+\.[\S\s]*?)(?=\s+(?:## \d+\.|## Pasal II))',                      # Amendment point, type 1
#                 'amendment_point_2': r'(?<=## Pasal I)([\s\S]*?)(?=## Pasal II)',                               # Amendment point, type 2
#                 'chapter': r'(## BAB[\S\s]*?)(?=\s+(?:## BAB|agar setiap orang mengetahuinya|ditetapkan di))',  # List of chapters (BAB)
#                 'part': r'(## Bagian [\S\s]*?)(?=\s+(?:## Bagian|$))',                                          # List of parts (Bagian)
#                 'paragraph': r'(## Paragraf [\S\s]*?)(?=\s+(?:## Paragraf|$))',                                 # List of paragraphs (Paragraf)
#                 # 'amendment_articles': r'(## Pasal \d+[A-Z]?[\S\s]*?)(?=(?:##|$))',                            # Amended articles
#             }
#         }

#     def parse_regulations_content(
#             self,
#             input_dir: str,
#             json_input: str,
#             json_output: str,
#             verbose: bool = True
#     ) -> list[dict]:

#         # Initialize data
#         article_dict = dict()
#         durations = list()
#         result = list()
#         files = list()
#         success = 0
#         failed = 0

#         # Get all Markdown files path and name
#         for filename in os.listdir(input_dir):
#             if filename.endswith(".md"):
#                 files.append((os.path.join(input_dir, filename), filename))

#         # Iterate for every files
#         for regulation_file in tqdm(iterable=files, desc='Parsing regulations content', disable=not verbose):
#             start_time = time.time()
#             filepath, filename = regulation_file

#             try:
#                 with open(filepath, 'r', encoding='utf8') as file:
#                     # Initialize data
#                     text = file.read()
#                     regulation_dict = dict()
#                     definition_list = list()

#                     # Get the file metadata
#                     metadata = re.search(self.REGEX_PATTERNS['document']['metadata'], filename)
#                     regulation_type = self.REGULATION_ENCODING['type'][metadata[1]]
#                     regulation_year = metadata[2]
#                     regulation_num = int(metadata[3])

#                     # Create template ID
#                     id_template = f'{regulation_year}{regulation_type}{str(regulation_num).zfill(3)}' \
#                         + '{reg_section}{section_num}{extra_section_number}'

#                     # Create regulation ID
#                     regulation_id = id_template.format(
#                         reg_section=self.REGULATION_ENCODING['section']['document'],
#                         section_num='000',
#                         extra_section_number='00'
#                     )

#                     # Get regulation data from regulation JSON
#                     with open(json_input) as json_data:
#                         for regulation_data in json.load(json_data):
#                             if regulation_data['id'] == regulation_id:
#                                 regulation_dict = regulation_data

#                     # Initialize the dictionary of parsing results
#                     regulation_dict['content'] = dict()

#                     # Get considering text (Menimbang)
#                     regulation_dict['content']['considering'] = {
#                         'id': id_template.format(
#                             reg_section=self.REGULATION_ENCODING['section']['considering'],
#                             section_num='000',
#                             extra_section_number='00'
#                         ),
#                         'text': re.search(self.REGEX_PATTERNS['main']['considering'], text, re.IGNORECASE)[1].strip()
#                     }

#                     # Get observing text (Mengingat)
#                     regulation_dict['content']['observing'] = {
#                         'id': id_template.format(
#                             reg_section=self.REGULATION_ENCODING['section']['observing'],
#                             section_num='000',
#                             extra_section_number='00'
#                         ), 
#                         'text': re.search(self.REGEX_PATTERNS['main']['observing'], text, re.IGNORECASE)[1].strip()
#                     }

#                     # Check for amendment regulation
#                     is_amendment = re.search(self.REGEX_PATTERNS['main']['amendment_to'], regulation_dict['about'], re.IGNORECASE)
                    
#                     if is_amendment:
#                         regulation_dict, definition_list, article_dict = self.__parse_amendment_regulation(
#                             text=text,
#                             id_template=id_template,
#                             regulation_dict=regulation_dict,
#                             definition_list=definition_list,
#                             article_dict=article_dict,
#                             amend_regulations=regulation_dict['status']['amend']
#                         )
#                     else:
#                         regulation_dict, definition_list, article_dict = self.__parse_base_regulation(
#                             text=text,
#                             id_template=id_template,
#                             regulation_dict=regulation_dict,
#                             definition_list=definition_list,
#                             article_dict=article_dict
#                         )
                    
#                     result.append(regulation_dict)
#                     success += 1
                
#             except Exception as e:
#                 if verbose:
#                     failed += 1
#                     print(f'ERROR parsing content of {filename}')
#                     print(e)

#             durations.append(time.time() - start_time)

#         if not json_output.endswith('.json'):
#             json_output = f'{json_output}.json'

#         with open(json_output, "w") as output_file: 
#             json.dump(result, output_file, indent=4)
        
#         if verbose:
#             print('=' * 76)
#             print(f'Input directory   : {input_dir}')
#             print(f'Input JSON        : {json_input}')
#             print(f'Output JSON       : {json_output}')
#             print(f'Total regulations : {len(files)} regulations')
#             print(f'Total success     : {success} regulations')
#             print(f'Total failed      : {failed} regulations')
#             print(f'Total articles    : {len(article_dict)} articles')
#             print(f'Total time        : {round(sum(durations) * 1000, 3)} miliseconds')
#             print(f'Average time/file : {round(sum(durations) * 1000 / success, 3)} miliseconds')
#             print('=' * 76)

#         return result
    

#     def __parse_base_regulation(
#             self,
#             text: str,
#             id_template: str,
#             regulation_dict: dict,
#             definition_list: list,
#             article_dict: dict
#     ) -> tuple[dict, list, dict]:

#         regulation_dict['content']['articles'] = dict()

#         # Get all chapters
#         chapters = re.findall(self.REGEX_PATTERNS['main']['chapter'], text, re.IGNORECASE)

#         # If the chapter exists
#         if chapters:
#             # Iterate for every chapters
#             for chapter_num, chapter in enumerate(chapters):
#                 chapter_number = chapter_num + 1
#                 chapter_about = re.search(self.REGEX_PATTERNS['chapter']['about'], chapter, re.IGNORECASE)[1].strip().upper()
#                 chapter_about = re.sub(r'\n', ' - ', chapter_about, flags=re.IGNORECASE)

#                 # Get all parts
#                 parts = re.findall(self.REGEX_PATTERNS['chapter']['part'], chapter.strip() + '\n', re.IGNORECASE)

#                 # If the part exists
#                 if parts:
#                     # Iterate for every parts
#                     for part_num, part in enumerate(parts):
#                         # Get part about/name
#                         part_number = part_num + 1
#                         part_about = re.search(self.REGEX_PATTERNS['part']['about'], part, re.IGNORECASE)[1].strip()
#                         part_about = re.sub(r'\n', ' - ', part_about, flags=re.IGNORECASE)

#                         # Get all paragraphs
#                         paragraphs = re.findall(self.REGEX_PATTERNS['chapter']['paragraph'], part.strip() + '\n', re.IGNORECASE)
                        
#                         # If the paragraph exists
#                         if paragraphs:
#                             # Iterate for every paragraphs
#                             for paragraph_num, paragraph in enumerate(paragraphs):
#                                 # Get paragraph about/name
#                                 paragraph_number = paragraph_num + 1
#                                 paragraph_about = re.search(self.REGEX_PATTERNS['paragraph']['about'], paragraph, re.IGNORECASE)[1].strip()
#                                 paragraph_about = re.sub(r'\n', ' - ', paragraph_about, flags=re.IGNORECASE)
                                
#                                 regulation_dict, definition_list, article_dict = self.__parse_articles(
#                                     text=paragraph,
#                                     chapter_number=str(chapter_number),
#                                     chapter_about=chapter_about,
#                                     part_number=str(part_number),
#                                     part_about=part_about,
#                                     paragraph_number=str(paragraph_number),
#                                     paragraph_about=paragraph_about,
#                                     id_template=id_template,
#                                     regulation_dict=regulation_dict,
#                                     definition_list=definition_list,
#                                     article_dict=article_dict
#                                 )

#                         else:
#                             regulation_dict, definition_list, article_dict = self.__parse_articles(
#                                 text=part,
#                                 chapter_number=str(chapter_number),
#                                 chapter_about=chapter_about,
#                                 part_number=str(part_number),
#                                 part_about=part_about,
#                                 paragraph_number='',
#                                 paragraph_about='',
#                                 id_template=id_template,
#                                 regulation_dict=regulation_dict,
#                                 definition_list=definition_list,
#                                 article_dict=article_dict
#                             )

#                 else:
#                     regulation_dict, definition_list, article_dict = self.__parse_articles(
#                         text=chapter,
#                         chapter_number=str(chapter_number),
#                         chapter_about=chapter_about,
#                         part_number='',
#                         part_about='',
#                         paragraph_number='',
#                         paragraph_about='',
#                         id_template=id_template,
#                         regulation_dict=regulation_dict,
#                         definition_list=definition_list,
#                         article_dict=article_dict
#                     )
#         else:
#             regulation_dict, definition_list, article_dict = self.__parse_articles(
#                 text=text,
#                 chapter_number='',
#                 chapter_about='',
#                 part_number='',
#                 part_about='',
#                 paragraph_number='',
#                 paragraph_about='',
#                 id_template=id_template,
#                 regulation_dict=regulation_dict,
#                 definition_list=definition_list,
#                 article_dict=article_dict
#             )


#         return regulation_dict, definition_list, article_dict


#     def __parse_amendment_regulation(
#             self,
#             text: str,
#             id_template: str,
#             regulation_dict: dict,
#             definition_list: list,
#             article_dict: dict,
#             amend_regulations: list
#     ) -> tuple[dict, list, dict]:
        
#         regulation_dict['content']['articles'] = dict()
        
#         # Get all amendment points
#         amendment_points = re.findall(self.REGEX_PATTERNS['amendment_to']['amendment_point_1'], text, re.IGNORECASE)

#         if not amendment_points:
#             amendment_points = re.search(self.REGEX_PATTERNS['amendment_to']['amendment_point_2'], text, re.IGNORECASE)[1].strip()
#             first_sentence = re.search(r'^.*', amendment_points, re.IGNORECASE)[0].strip()  # Dapatkan kalimat pertama
#             amendment_points = [amendment_points.replace(first_sentence, '').strip()]		# Hapus kalimat pertama
        
#         # Iterate for every points
#         for point in amendment_points:
            

#             # Get all parts
#             parts = re.findall(self.REGEX_PATTERNS['amendment_to']['part'], point.strip() + '\n', re.IGNORECASE)

#             # If the part exists
#             if parts:
#                 # Iterate for every parts
#                 for part in parts:
#                     # Get part about/name
#                     part_about = re.search(self.REGEX_PATTERNS['part']['about'], part, re.IGNORECASE)[1].strip()
#                     part_about = re.sub(r'\n', ' - ', part_about, flags=re.IGNORECASE)
#                     part_number = re.search(self.REGEX_PATTERNS['part']['number'], part_about, re.IGNORECASE)[1].strip().lower()
#                     part_number = self.WORD_TO_NUMBER.get(part_number, 0)

#                     # Get all paragraphs
#                     paragraphs = re.findall(self.REGEX_PATTERNS['chapter']['paragraph'], part.strip() + '\n', re.IGNORECASE)
                    
#                     # If the paragraph exists
#                     if paragraphs:
#                         # Iterate for every paragraphs
#                         for paragraph in paragraphs:
#                             # Get paragraph about/name
#                             paragraph_about = re.search(self.REGEX_PATTERNS['paragraph']['about'], paragraph, re.IGNORECASE)[1].strip()
#                             paragraph_about = re.sub(r'\n', ' - ', paragraph_about, flags=re.IGNORECASE)
#                             paragraph_number = re.search(self.REGEX_PATTERNS['paragraph']['number'], paragraph_about, re.IGNORECASE)[1].strip()
                            
#                             regulation_dict, definition_list, article_dict = self.__parse_articles(
#                                 text=paragraph,
#                                 chapter_number='',
#                                 chapter_about='',
#                                 part_number=str(part_number),
#                                 part_about=part_about,
#                                 paragraph_number=str(paragraph_number),
#                                 paragraph_about=paragraph_about,
#                                 id_template=id_template,
#                                 regulation_dict=regulation_dict,
#                                 definition_list=definition_list,
#                                 article_dict=article_dict,
#                                 amendment=True,
#                                 amend_regulations=amend_regulations
#                             )

#                     else:
#                         regulation_dict, definition_list, article_dict = self.__parse_articles(
#                             text=part,
#                             chapter_number='',
#                             chapter_about='',
#                             part_number=str(part_number),
#                             part_about=part_about,
#                             paragraph_number='',
#                             paragraph_about='',
#                             id_template=id_template,
#                             regulation_dict=regulation_dict,
#                             definition_list=definition_list,
#                             article_dict=article_dict,
#                             amendment=True,
#                             amend_regulations=amend_regulations
#                         )

#             else:
#                 # Get all paragraphs
#                 paragraphs = re.findall(self.REGEX_PATTERNS['amendment_to']['paragraph'], point.strip() + '\n', re.IGNORECASE)
                
#                 # If the paragraph exists
#                 if paragraphs:
#                     # Iterate for every paragraphs
#                     for paragraph_num, paragraph in enumerate(paragraphs):
#                         # Get paragraph about/name
#                         paragraph_number = paragraph_num + 1
#                         paragraph_about = re.search(self.REGEX_PATTERNS['paragraph']['about'], paragraph, re.IGNORECASE)[1].strip()
#                         paragraph_about = re.sub(r'\n', ' - ', paragraph_about, flags=re.IGNORECASE)
#                         paragraph_number = re.search(self.REGEX_PATTERNS['paragraph']['number'], paragraph_about, re.IGNORECASE)[1].strip()

#                         regulation_dict, definition_list, article_dict = self.__parse_articles(
#                             text=paragraph,
#                             chapter_number='',
#                             chapter_about='',
#                             part_number='',
#                             part_about='',
#                             paragraph_number=str(paragraph_number),
#                             paragraph_about=paragraph_about,
#                             id_template=id_template,
#                             regulation_dict=regulation_dict,
#                             definition_list=definition_list,
#                             article_dict=article_dict,
#                             amendment=True,
#                             amend_regulations=amend_regulations
#                         )

#                 else:
#                     regulation_dict, definition_list, article_dict = self.__parse_articles(
#                         text=point,
#                         chapter_number='',
#                         chapter_about='',
#                         part_number='',
#                         part_about='',
#                         paragraph_number='',
#                         paragraph_about='',
#                         id_template=id_template,
#                         regulation_dict=regulation_dict,
#                         definition_list=definition_list,
#                         article_dict=article_dict,
#                         amendment=True,
#                         amend_regulations=amend_regulations
#                     )

#         return regulation_dict, definition_list, article_dict


#     def __parse_articles(
#             self,
#             text: str,
#             chapter_number: str,
#             chapter_about: str,
#             part_number: str,
#             part_about: str,
#             paragraph_number: str,
#             paragraph_about: str,
#             id_template: str,
#             regulation_dict: dict,
#             definition_list: list,
#             article_dict: dict,
#             amendment: bool = False,
#             amend_regulations: list = list() 
#     ) -> tuple[dict, list]:
        
#         # Get all articles
#         articles = re.findall(self.REGEX_PATTERNS['chapter']['article'], text, re.IGNORECASE)

#         # Iterate for every articles
#         for article in articles:
#             article_number = re.search(self.REGEX_PATTERNS['article']['number'], article, re.IGNORECASE)[1]
#             article_text = re.search(self.REGEX_PATTERNS['article']['text'], article, re.IGNORECASE)[1].strip()
#             article_text = re.sub(r'\n+', '\n', article_text)
            
#             # Get definition (Article/Pasal 1)
#             if article_number == '1':
#                 if re.search(self.REGEX_PATTERNS['article']['check_definition'], article_text, re.IGNORECASE):
#                     definitions = re.findall(self.REGEX_PATTERNS['article']['definition'], article_text)

#                     for index, definition_data in enumerate(definitions):
#                         definition, name = definition_data
#                         definition_list.append({
#                             'id': id_template.format(
#                                 reg_section=self.REGULATION_ENCODING['section']['definition'],
#                                 section_num=str(index + 1).zfill(3),
#                                 extra_section_number='00'
#                             ),
#                             'name': name.strip(),
#                             'definition': definition.strip()
#                         })

#             # Store definition list
#             regulation_dict['content']['definitions'] = definition_list

#             # Define article ID
#             article_id = ''

#             if amendment:
#                 # Create article ID
#                 article_alphabet = re.search(r'\d+([A-Z]+)', str(article_number), re.IGNORECASE)

#                 if article_alphabet:
#                     article_id = id_template.format(
#                         reg_section=self.REGULATION_ENCODING['section']['article'],
#                         section_num=re.search(r'\d+', str(article_number), re.IGNORECASE)[0].zfill(3),
#                         extra_section_number=self.ALPHABET_TO_NUMERIC[article_alphabet[1]]
#                     )
#                 else:
#                     article_id = id_template.format(
#                         reg_section=self.REGULATION_ENCODING['section']['article'],
#                         section_num=str(article_number).zfill(3),
#                         extra_section_number='00'
#                     )

#                 # Store article data
#                 regulation_dict['content']['articles'][article_number] = {
#                     'id': article_id,
#                     'chapter_number': chapter_number,
#                     'chapter_about': chapter_about,
#                     'part_number': part_number,
#                     'part_about': part_about,
#                     'paragraph_number': paragraph_number,
#                     'paragraph_about': paragraph_about,
#                     'article_number': article_number,
#                     'text': article_text
#                 }

#                 # Store '' as previous article ID (for consistency with base regulation / not amendment)
#                 regulation_dict['content']['articles'][article_number]['previous_article'] = ''

#                 # Store empty list as 'references' (for consistency with base regulation / not amandment)
#                 regulation_dict['content']['articles'][article_number]['references'] = list()

#                 # Initialize the list of article IDs that this article amends
#                 regulation_dict['content']['articles'][article_number]['amend'] = list()
                
#                 # Go through the list of regulations being amended
#                 for regulation_id in amend_regulations:
#                     # Reconstruct (guess) the article ID in the regulation before the revision
#                     other_article_id = regulation_id[:-6] + '6' + article_id[-5:]

#                     # If that ID is in article_dict, the article definitely exists, so store it
#                     if other_article_id in article_dict.keys():

#                         # UNDECIDED WHETHER TO USE THESE OR NOT: Chapter, Part, Paragraph
#                         # Article ID is DEFINITELY USED

#                         # Chapter
#                         regulation_dict['content']['articles'][article_number]['chapter_number'] = \
#                             chapter_number if chapter_number else article_dict[other_article_id]['chapter_number']
#                         regulation_dict['content']['articles'][article_number]['chapter_about'] = \
#                             chapter_about if chapter_about else article_dict[other_article_id]['chapter_about']
                        
#                         # Part
#                         regulation_dict['content']['articles'][article_number]['part_number'] = \
#                             part_number if part_number else article_dict[other_article_id]['part_number']
#                         regulation_dict['content']['articles'][article_number]['part_about'] = \
#                             part_about if part_about else article_dict[other_article_id]['part_about']
                        
#                         # Paragraph
#                         regulation_dict['content']['articles'][article_number]['paragraph_number'] = \
#                             paragraph_number if paragraph_number else article_dict[other_article_id]['paragraph_number']
#                         regulation_dict['content']['articles'][article_number]['paragraph_about'] = \
#                             paragraph_about if paragraph_about else article_dict[other_article_id]['paragraph_about']
                        
#                         # Article ID
#                         regulation_dict['content']['articles'][article_number]['amend'].append(other_article_id)
                
#                 # Store article to article_dict
#                 article_dict[article_id] = regulation_dict['content']['articles'][article_number]

#             else:
#                 # Create article ID
#                 article_id = id_template.format(
#                     reg_section=self.REGULATION_ENCODING['section']['article'],
#                     section_num=str(article_number).zfill(3),
#                     extra_section_number='00'
#                 )

#                 # Store article
#                 regulation_dict['content']['articles'][article_number] = {
#                     'id': article_id,
#                     'chapter_number': chapter_number,
#                     'chapter_about': chapter_about,
#                     'part_number': part_number,
#                     'part_about': part_about,
#                     'paragraph_number': paragraph_number,
#                     'paragraph_about': paragraph_about,
#                     'article_number': article_number,
#                     'text': article_text
#                 }

#                 # Get previous article ID
#                 if article_number != '1':
#                     regulation_dict['content']['articles'][article_number]['previous_article'] = \
#                         id_template.format(
#                             reg_section=self.REGULATION_ENCODING['section']['article'],
#                             section_num=str(int(article_number) - 1).zfill(3),
#                             extra_section_number='00'
#                         )
#                 else:
#                     regulation_dict['content']['articles'][article_number]['previous_article'] = ''
                
                
#                 # Get article reference to other article
#                 regulation_dict['content']['articles'][article_number]['references'] = list()
#                 reference_type_1 = list(set(re.findall(self.REGEX_PATTERNS['article']['reference_1'], article_text, re.IGNORECASE)))
#                 reference_type_2 = list(set(re.findall(self.REGEX_PATTERNS['article']['reference_2'], article_text, re.IGNORECASE)))

#                 article_references = self.__merge_article_references(reference_type_1, reference_type_2)

#                 for article_reference_num in article_references:
#                     regulation_dict['content']['articles'][article_number]['references'].append(
#                         id_template.format(
#                             reg_section=self.REGULATION_ENCODING['section']['article'],
#                             section_num=str(article_reference_num).zfill(3),
#                             extra_section_number='00'
#                         )
#                     )

#                 # Store empty list as 'amend' (for consistency with amendment regulation)
#                 regulation_dict['content']['articles'][article_number]['amend'] = list()
                        
#                 # Store article to article_dict
#                 article_dict[article_id] = regulation_dict['content']['articles'][article_number]

#         return regulation_dict, definition_list, article_dict


#     def __merge_article_references(self, list1: list, list2: list) -> list:
#         set1 = set(int(x) for x in list1)
#         set2 = set()

#         for start, end in list2:
#             set2.update(range(int(start), int(end) + 1))

#         return sorted(set1.union(set2))

In [None]:
# input_dir = os.path.join('data', 'markdown', 'fix', 'temp')
# json_input = os.path.join('data', 'regulation_data_modified.json')
# json_output = os.path.join('data', 'regulation_data_final.json')

# parser = RegulationParser()
# regulation_data = parser.parse_regulations_content(
#     input_dir=input_dir,
#     json_input=json_input,
#     json_output=json_output,
#     verbose=True
# )

In [None]:
# # MODEL 1: STRUKTUR PERATURAN DASAR DAN AMANDEMEN BERBEDA

# class RegulationParser:

#     def __init__(self):
#         self.ALPHABET_TO_NUMERIC = {chr(i): f'{i - 64:02}' for i in range(65, 91)}  # A-Z -> 01-26
#         self.REGULATION_ENCODING = {
#             'type': {
#                 'UU': '01',
#                 'PERPPU': '02',
#                 'PP': '03',
#                 'PERPRES': '04',
#                 'PERMENKOMINFO': '05'
#             },
#             'section': {
#                 'document': '1',
#                 'considering': '2',
#                 'observing': '3',
#                 'definition': '4',
#                 'chapter': '5',
#                 'article': '6',
#                 'section': '7',
#             }
#         }
#         self.REGEX_PATTERNS = {
#             'document': {
#                 'metadata': r'^(\w+)_(\w+)_(\w+)'  # Jenis, tahun, dan nomor peraturan
#             },
#             'main': {
#                 'considering': r'(?<=## menimbang)([\S\s]*?)(?=## mengingat)',                                                 # Menimbang
#                 'observing': r'(?<=## mengingat)([\S\s]*?)(?=(?:dengan persetujuan bersama|## memperhatikan|## memutuskan))',  # Mengingat
#                 'amendment_to': r'^Perubahan',                                                                                 # Cek Peraturan Amandemen
#                 'chapter': r'(## BAB[\S\s]*?)(?=\s+(?:## BAB|agar setiap orang mengetahuinya|ditetapkan di))'                  # Daftar Bab
#             },
#             'chapter': {
#                 # For every chapters
#                 'about': r'## (BAB [\w\s\-\/\,]+)##',                            # Nama Bab, ganti [\-\/\,] dengan semua tanda baca kecuali #
#                 'part': r'(## Bagian [\S\s]*?)(?=\s+(?:## Bagian|$))',           # Daftar Bagian
#                 'paragraph': r'(## Paragraf [\S\s]*?)(?=\s+(?:## Paragraf|$))',  # Daftar Paragraf
#                 'article': r'(## Pasal \w+[\S\s]*?)(?=(?:##|$))'                 # Daftar Pasal
#             },
#             'part': {
#                 # For every parts
#                 'about': r'## (Bagian [\w\s\-\/\,]+)##'  # Nama Bagian
#             },
#             'paragraph': {
#                 # For every paragraphs
#                 'about': r'## (Paragraf [\w\s\-\/\,]+)##'  # Nama Paragraf
#             },
#             'article': {
#                 # For every articles
#                 'number': r'## Pasal (\d+\w*)',                                       # Nomor Pasal
#                 'text': r'## Pasal \w+\n*([\S\s]*)',                                  # Isi Pasal
#                 'definition': r'\(\d+[a-z]?\) (([A-Z][a-z]*(?:\s[A-Z][a-z]*)*) .*)',  # Daftar Definisi
#                 'reference_1': r'Pasal (\d+\w*)',                                     # Pasal Referensi Jenis 1
#                 'reference_2': r'Pasal (\d+\w*) sampai dengan Pasal (\d+\w*)'         # Pasal Referensi Jenis 2
#             },
#             'amendment_to': {
#                 'amendment_point_1': r'(## \d+\.[\S\s]*?)(?=\s+(?:## \d+\.|## Pasal II))',  # Poin Amandemen Jenis 1
#                 'amendment_point_2': r'(?<=## Pasal I)([\s\S]*?)(?=## Pasal II)',           # Poin Amandemen Jenis 2
#                 'part': r'(## Bagian [\S\s]*?)(?=\s+(?:## Bagian|$))',                      # Daftar Bagian
#                 'paragraph': r'(## Paragraf [\S\s]*?)(?=\s+(?:## Paragraf|$))',             # Daftar Paragraf
#                 # 'amendment_articles': r'(## Pasal \d+[A-Z]?[\S\s]*?)(?=(?:##|$))',          # Pasal Amandemen
#             }
#         }

#     def parse_regulations_content(
#             self,
#             input_dir: str,
#             json_input: str,
#             json_output: str,
#             verbose: bool = True
#     ) -> list[dict]:

#         # Initialize data
#         article_dict = dict()
#         durations = list()
#         result = list()
#         files = list()
#         success = 0
#         failed = 0

#         # Get all Markdown files path and name
#         for filename in os.listdir(input_dir):
#             if filename.endswith(".md"):
#                 files.append((os.path.join(input_dir, filename), filename))

#         # Iterate for every files
#         for regulation_file in tqdm(iterable=files, desc='Parsing regulations content', disable=not verbose):
#             start_time = time.time()
#             filepath, filename = regulation_file

#             try:
#                 with open(filepath, 'r', encoding='utf8') as file:
#                     # Initialize data
#                     text = file.read()
#                     regulation_dict = dict()
#                     definition_list = list()

#                     # Get the file metadata
#                     metadata = re.search(self.REGEX_PATTERNS['document']['metadata'], filename)
#                     regulation_type = self.REGULATION_ENCODING['type'][metadata[1]]
#                     regulation_year = metadata[2]
#                     regulation_num = int(metadata[3])

#                     # Create template ID
#                     id_template = f'{regulation_year}{regulation_type}{str(regulation_num).zfill(3)}' \
#                         + '{reg_section}{section_num}{extra_section_number}'

#                     # Create regulation ID
#                     regulation_id = id_template.format(
#                         reg_section=self.REGULATION_ENCODING['section']['document'],
#                         section_num='000',
#                         extra_section_number='00'
#                     )

#                     # Get regulation data from regulation JSON
#                     with open(json_input) as json_data:
#                         for regulation_data in json.load(json_data):
#                             if regulation_data['id'] == regulation_id:
#                                 regulation_dict = regulation_data

#                     # Initialize the dictionary of parsing results
#                     regulation_dict['content'] = dict()

#                     # Get considering text (Menimbang)
#                     regulation_dict['content']['considering'] = {
#                         'id': id_template.format(
#                             reg_section=self.REGULATION_ENCODING['section']['considering'],
#                             section_num='000',
#                             extra_section_number='00'
#                         ),
#                         'text': re.search(self.REGEX_PATTERNS['main']['considering'], text, re.IGNORECASE)[1].strip()
#                     }

#                     # Get observing text (Mengingat)
#                     regulation_dict['content']['observing'] = {
#                         'id': id_template.format(
#                             reg_section=self.REGULATION_ENCODING['section']['observing'],
#                             section_num='000',
#                             extra_section_number='00'
#                         ), 
#                         'text': re.search(self.REGEX_PATTERNS['main']['observing'], text, re.IGNORECASE)[1].strip()
#                     }

#                     # Check for amendment regulation
#                     is_amendment = re.search(self.REGEX_PATTERNS['main']['amendment_to'], regulation_dict['about'], re.IGNORECASE)
                    
#                     if is_amendment:
#                         regulation_dict, definition_list, article_dict = self.__parse_amendment_regulation(
#                             text=text,
#                             id_template=id_template,
#                             regulation_dict=regulation_dict,
#                             definition_list=definition_list,
#                             article_dict=article_dict,
#                             amend_regulations=regulation_dict['status']['amend']
#                         )
#                     else:
#                         regulation_dict, definition_list, article_dict = self.__parse_base_regulation(
#                             text=text,
#                             id_template=id_template,
#                             regulation_dict=regulation_dict,
#                             definition_list=definition_list,
#                             article_dict=article_dict
#                         )
                    
#                     result.append(regulation_dict)
#                     success += 1
                
#             except Exception as e:
#                 if verbose:
#                     failed += 1
#                     print(f'ERROR parsing content of {filename}')
#                     print(e)

#             durations.append(time.time() - start_time)

#         if not json_output.endswith('.json'):
#             json_output = f'{json_output}.json'

#         with open(json_output, "w") as output_file: 
#             json.dump(result, output_file, indent=4)
        
#         if verbose:
#             print('=' * 76)
#             print(f'Input directory   : {input_dir}')
#             print(f'Input JSON        : {json_input}')
#             print(f'Output JSON       : {json_output}')
#             print(f'Total regulations : {len(files)} regulations')
#             print(f'Total success     : {success} regulations')
#             print(f'Total failed      : {failed} regulations')
#             print(f'Total articles    : {len(article_dict)} articles')
#             print(f'Total time        : {round(sum(durations) * 1000, 3)} miliseconds')
#             print(f'Average time/file : {round(sum(durations) * 1000 / success, 3)} miliseconds')
#             print('=' * 76)

#         return result
    

#     def __parse_base_regulation(
#             self,
#             text: str,
#             id_template: str,
#             regulation_dict: dict,
#             definition_list: list,
#             article_dict: dict
#     ) -> tuple[dict, list, dict]:

#         regulation_dict['content']['chapters'] = dict()

#         # Get all chapters
#         chapters = re.findall(self.REGEX_PATTERNS['main']['chapter'], text, re.IGNORECASE)

#         # Iterate for every chapters
#         for num, chapter in enumerate(chapters):
#             chapter_num = num + 1
#             chapter_about = re.search(self.REGEX_PATTERNS['chapter']['about'], chapter, re.IGNORECASE)[1].strip().upper()
#             chapter_about = re.sub(r'\n', ': ', chapter_about, flags=re.IGNORECASE)

#             regulation_dict['content']['chapters'][chapter_num] = {
#                 'id': id_template.format(
#                     reg_section=self.REGULATION_ENCODING['section']['chapter'],
#                     section_num=str(chapter_num).zfill(3),
#                     extra_section_number='00'
#                 ),
#                 'about': chapter_about
#             }
        
#             regulation_dict['content']['chapters'][chapter_num]['articles'] = dict()

#             # Get all parts
#             parts = re.findall(self.REGEX_PATTERNS['chapter']['part'], chapter.strip() + '\n', re.IGNORECASE)

#             # If the part exists
#             if parts:
#                 # Iterate for every parts
#                 for part in parts:
#                     # Get part about/name
#                     part_about = re.search(self.REGEX_PATTERNS['part']['about'], part, re.IGNORECASE)[1].strip()
#                     part_about = re.sub(r'\n', ': ', part_about, flags=re.IGNORECASE)

#                     # Get all paragraphs
#                     paragraphs = re.findall(self.REGEX_PATTERNS['chapter']['paragraph'], part.strip() + '\n', re.IGNORECASE)
                    
#                     # If the paragraph exists
#                     if paragraphs:
#                         # Iterate for every paragraphs
#                         for paragraph in paragraphs:
#                             # Get paragraph about/name
#                             paragraph_about = re.search(self.REGEX_PATTERNS['paragraph']['about'], paragraph, re.IGNORECASE)[1].strip()
#                             paragraph_about = re.sub(r'\n', ': ', paragraph_about, flags=re.IGNORECASE)
                            
#                             regulation_dict, definition_list, article_dict = self.__parse_articles(
#                                 text=paragraph,
#                                 chapter_num=chapter_num,
#                                 part_about=part_about,
#                                 paragraph_about=paragraph_about,
#                                 id_template=id_template,
#                                 regulation_dict=regulation_dict,
#                                 definition_list=definition_list,
#                                 article_dict=article_dict
#                             )

#                     else:
#                         regulation_dict, definition_list, article_dict = self.__parse_articles(
#                             text=part,
#                             chapter_num=chapter_num,
#                             part_about=part_about,
#                             paragraph_about='',
#                             id_template=id_template,
#                             regulation_dict=regulation_dict,
#                             definition_list=definition_list,
#                             article_dict=article_dict
#                         )

#             else:
#                 regulation_dict, definition_list, article_dict = self.__parse_articles(
#                     text=chapter,
#                     chapter_num=chapter_num,
#                     part_about='',
#                     paragraph_about='',
#                     id_template=id_template,
#                     regulation_dict=regulation_dict,
#                     definition_list=definition_list,
#                     article_dict=article_dict
#                 )

#         return regulation_dict, definition_list, article_dict


#     def __parse_amendment_regulation(
#             self,
#             text: str,
#             id_template: str,
#             regulation_dict: dict,
#             definition_list: list,
#             article_dict: dict,
#             amend_regulations: list
#     ) -> tuple[dict, list, dict]:
        
#         # MUNGKIN INI BUAT MASALAH? COBA DULU
#         regulation_dict['content']['articles'] = dict()
        
#         # Get all amendment points
#         amendment_points = re.findall(self.REGEX_PATTERNS['amendment_to']['amendment_point_1'], text, re.IGNORECASE)

#         if not amendment_points:
#             amendment_points = re.search(self.REGEX_PATTERNS['amendment_to']['amendment_point_2'], text, re.IGNORECASE)[1].strip()
#             first_sentence = re.search(r'^.*', amendment_points, re.IGNORECASE)[0].strip()  # Dapatkan kalimat pertama
#             amendment_points = [amendment_points.replace(first_sentence, '').strip()]  # Hapus kalimat pertama
        
#         # Iterate for every points
#         for point in amendment_points:
#             # Get all parts
#             parts = re.findall(self.REGEX_PATTERNS['amendment_to']['part'], point.strip() + '\n', re.IGNORECASE)

#             # If the part exists
#             if parts:
#                 # Iterate for every parts
#                 for part in parts:
#                     # Get part about/name
#                     part_about = re.search(self.REGEX_PATTERNS['part']['about'], part, re.IGNORECASE)[1].strip()
#                     part_about = re.sub(r'\n', ': ', part_about, flags=re.IGNORECASE)

#                     # Get all paragraphs
#                     paragraphs = re.findall(self.REGEX_PATTERNS['chapter']['paragraph'], part.strip() + '\n', re.IGNORECASE)
                    
#                     # If the paragraph exists
#                     if paragraphs:
#                         # Iterate for every paragraphs
#                         for paragraph in paragraphs:
#                             # Get paragraph about/name
#                             paragraph_about = re.search(self.REGEX_PATTERNS['paragraph']['about'], paragraph, re.IGNORECASE)[1].strip()
#                             paragraph_about = re.sub(r'\n', ': ', paragraph_about, flags=re.IGNORECASE)
                            
#                             regulation_dict, definition_list, article_dict = self.__parse_articles(
#                                 text=paragraph,
#                                 chapter_num=0,
#                                 part_about=part_about,
#                                 paragraph_about=paragraph_about,
#                                 id_template=id_template,
#                                 regulation_dict=regulation_dict,
#                                 definition_list=definition_list,
#                                 article_dict=article_dict,
#                                 amendment=True,
#                                 amend_regulations=amend_regulations
#                             )

#                     else:
#                         regulation_dict, definition_list, article_dict = self.__parse_articles(
#                             text=part,
#                             chapter_num=0,
#                             part_about=part_about,
#                             paragraph_about='',
#                             id_template=id_template,
#                             regulation_dict=regulation_dict,
#                             definition_list=definition_list,
#                             article_dict=article_dict,
#                             amendment=True,
#                             amend_regulations=amend_regulations
#                         )

#             else:
#                 # Get all paragraphs
#                 paragraphs = re.findall(self.REGEX_PATTERNS['amendment_to']['paragraph'], point.strip() + '\n', re.IGNORECASE)
                
#                 # If the paragraph exists
#                 if paragraphs:
#                     # Iterate for every paragraphs
#                     for paragraph in paragraphs:
#                         # Get paragraph about/name
#                         paragraph_about = re.search(self.REGEX_PATTERNS['paragraph']['about'], paragraph, re.IGNORECASE)[1].strip()
#                         paragraph_about = re.sub(r'\n', ': ', paragraph_about, flags=re.IGNORECASE)

#                         regulation_dict, definition_list, article_dict = self.__parse_articles(
#                             text=paragraph,
#                             chapter_num=0,
#                             part_about='',
#                             paragraph_about=paragraph_about,
#                             id_template=id_template,
#                             regulation_dict=regulation_dict,
#                             definition_list=definition_list,
#                             article_dict=article_dict,
#                             amendment=True,
#                             amend_regulations=amend_regulations
#                         )

#                 else:
#                     regulation_dict, definition_list, article_dict = self.__parse_articles(
#                         text=point,
#                         chapter_num=0,
#                         part_about='',
#                         paragraph_about='',
#                         id_template=id_template,
#                         regulation_dict=regulation_dict,
#                         definition_list=definition_list,
#                         article_dict=article_dict,
#                         amendment=True,
#                         amend_regulations=amend_regulations
#                     )

#         return regulation_dict, definition_list, article_dict


#     def __parse_articles(
#             self,
#             text: str,
#             chapter_num: int,
#             part_about: str,
#             paragraph_about: str,
#             id_template: str,
#             regulation_dict: dict,
#             definition_list: list,
#             article_dict: dict,
#             amendment: bool = False,
#             amend_regulations: list = list() 
#     ) -> tuple[dict, list, dict]:
        
#         # Get all articles
#         articles = re.findall(self.REGEX_PATTERNS['chapter']['article'], text, re.IGNORECASE)

#         # Iterate for every articles
#         for article in articles:
#             article_num = re.search(self.REGEX_PATTERNS['article']['number'], article, re.IGNORECASE)[1]
#             article_text = re.search(self.REGEX_PATTERNS['article']['text'], article, re.IGNORECASE)[1].strip()
#             article_text = re.sub(r'\n+', '\n', article_text)
            
#             # Get definition (Article/Pasal 1)
#             if article_num == '1':
#                 definitions = re.findall(self.REGEX_PATTERNS['article']['definition'], article_text)
#                 for index, definition_data in enumerate(definitions):
#                     definition, name = definition_data
#                     definition_list.append({
#                         'id': id_template.format(
#                             reg_section=self.REGULATION_ENCODING['section']['definition'],
#                             section_num=str(index + 1).zfill(3),
#                             extra_section_number='00'
#                         ),
#                         'name': name.strip(),
#                         'definition': definition.strip()
#                     })
#                 regulation_dict['content']['definitions'] = definition_list

#             # Define article ID
#             article_id = ''

#             if amendment:
#                 # Create article ID
#                 article_alphabet = re.search(r'\d+([A-Z]+)', str(article_num), re.IGNORECASE)

#                 if article_alphabet:
#                     article_id = id_template.format(
#                         reg_section=self.REGULATION_ENCODING['section']['article'],
#                         section_num=re.search(r'\d+', str(article_num), re.IGNORECASE)[0].zfill(3),
#                         extra_section_number=self.ALPHABET_TO_NUMERIC[article_alphabet[1]]
#                     )
#                 else:
#                     article_id = id_template.format(
#                         reg_section=self.REGULATION_ENCODING['section']['article'],
#                         section_num=str(article_num).zfill(3),
#                         extra_section_number='00'
#                     )

#                 # Store article data
#                 regulation_dict['content']['articles'][article_num] = {
#                     'id': article_id,
#                     'part': part_about,
#                     'paragraph': paragraph_about,
#                     'text': article_text
#                 }

#                 # Store article to article_dict
#                 article_dict[article_id] = regulation_dict['content']['articles'][article_num]

#                 # Mendefinisikan daftar pasal yang diamandemen
#                 regulation_dict['content']['articles'][article_num]['amend'] = list()
                
#                 # Ambil daftar peraturan amend
#                 for regulation_id in amend_regulations:
#                     # Buat ulang (tebak) ID Pasal di peraturan sebelum revisi
#                     other_article_id = regulation_id[:-6] + '6' + article_id[-5:]

#                     # Jika ID tersebut ada di article_dict maka ID nya pasti ada, terus simpan 
#                     if other_article_id in article_dict.keys():
#                         regulation_dict['content']['articles'][article_num]['part'] = \
#                             part_about if part_about else article_dict[other_article_id]['part']
#                         regulation_dict['content']['articles'][article_num]['paragraph'] = \
#                             paragraph_about if paragraph_about else article_dict[other_article_id]['paragraph']
#                         regulation_dict['content']['articles'][article_num]['amend'].append(other_article_id)

#             else:
#                 # Create article ID
#                 article_id = id_template.format(
#                     reg_section=self.REGULATION_ENCODING['section']['article'],
#                     section_num=str(article_num).zfill(3),
#                     extra_section_number='00'
#                 )

#                 # Store article
#                 regulation_dict['content']['chapters'][chapter_num]['articles'][article_num] = {
#                     'id': article_id,
#                     'part': part_about,
#                     'paragraph': paragraph_about,
#                     'text': article_text
#                 }

#                 # Store article to article_dict
#                 article_dict[article_id] = regulation_dict['content']['chapters'][chapter_num]['articles'][article_num]

#                 # Get previous article ID
#                 if article_num != '1':
#                     regulation_dict['content']['chapters'][chapter_num]['articles'][article_num]['previous_article'] = \
#                         id_template.format(
#                             reg_section=self.REGULATION_ENCODING['section']['article'],
#                             section_num=str(int(article_num) - 1).zfill(3),
#                             extra_section_number='00'
#                         )
#                 else:
#                     regulation_dict['content']['chapters'][chapter_num]['articles'][article_num]['previous_article'] = ''
                
#                 # Get article reference to other article
#                 regulation_dict['content']['chapters'][chapter_num]['articles'][article_num]['references'] = list()
#                 reference_type_1 = list(set(re.findall(self.REGEX_PATTERNS['article']['reference_1'], article_text, re.IGNORECASE)))
#                 reference_type_2 = list(set(re.findall(self.REGEX_PATTERNS['article']['reference_2'], article_text, re.IGNORECASE)))

#                 article_references = self.__merge_article_references(reference_type_1, reference_type_2)

#                 for article_reference_num in article_references:
#                     regulation_dict['content']['chapters'][chapter_num]['articles'][article_num]['references'].append(
#                         id_template.format(
#                             reg_section=self.REGULATION_ENCODING['section']['article'],
#                             section_num=str(article_reference_num).zfill(3),
#                             extra_section_number='00'
#                         )
#                     )
                        
#         return regulation_dict, definition_list, article_dict


#     def __merge_article_references(self, list1: list, list2: list) -> list:
#         set1 = set(int(x) for x in list1)
#         set2 = set()

#         for start, end in list2:
#             set2.update(range(int(start), int(end) + 1))

#         return sorted(set1.union(set2))

In [None]:
# input_dir = os.path.join('data', 'markdown', 'fix', 'temp')
# json_input = os.path.join('data', 'regulation_data_modified.json')
# json_output = os.path.join('data', 'regulation_data_final.json')

# parser = RegulationParser()
# regulation_data = parser.parse_regulations_content(
#     input_dir=input_dir,
#     json_input=json_input,
#     json_output=json_output,
#     verbose=True
# )

In [None]:
# mapping_alphabet_to_numeric = {chr(i): f'{i - 64:02}' for i in range(65, 91)}  # A-Z -> 01-26

# encode = {
#     'type': {
#         'UU': '01',
#         'PERPPU': '02',
#         'PP': '03',
#         'PERPRES': '04',
#         'PERMENKOMINFO': '05'
#     },
#     'section': {
#         'document': '1',
#         'considering': '2',
#         'observing': '3',
#         'definition': '4',
#         'chapter': '5',
#         'article': '6',
#         'section': '7',
#     }
# }

# patterns = {
#     'document': {
#         'metadata': r'^(\w+)_(\w+)_(\w+)'    # Jenis, tahun, dan nomor peraturan
#     },
#     'main': {
#         'considering': r'(?<=## menimbang)([\S\s]*?)(?=## mengingat)',                                                 # Menimbang
#         'observing': r'(?<=## mengingat)([\S\s]*?)(?=(?:dengan persetujuan bersama|## memperhatikan|## memutuskan))',  # Mengingat
#         'amendment_to': r'^Perubahan',                                                                                 # Cek Peraturan Revisi
#         'chapter': r'(## BAB[\S\s]*?)(?=\s+(?:## BAB|agar setiap orang mengetahuinya|ditetapkan di))'                  # Daftar Bab
#     },
#     'chapter': {
#         # For every chapters
#         'about': r'## (BAB [\w\s\-\/\,]+)##',                            # Nama Bab, ganti [\-\/\,] dengan semua tanda baca kecuali #
#         'part': r'(## Bagian [\S\s]*?)(?=\s+(?:## Bagian|$))',           # Daftar Bagian
#         'paragraph': r'(## Paragraf [\S\s]*?)(?=\s+(?:## Paragraf|$))',  # Daftar Paragraf
#         'article': r'(## Pasal \w+[\S\s]*?)(?=(?:##|$))'                 # Daftar Pasal
#     },
#     'part': {
#         # For every parts
#         'about': r'## (Bagian [\w\s\-\/\,]+)##'    # Nama Bagian
#     },
#     'paragraph': {
#         # For every paragraphs
#         'about': r'## (Paragraf [\w\s\-\/\,]+)##'  # Nama Paragraf
#     },
#     'article': {
#         # For every articles
#         'number': r'## Pasal (\d+\w*)',                                       # Nomor Pasal
#         'text': r'## Pasal \w+\n*([\S\s]*)',                                  # Isi Pasal
#         'definition': r'\(\d+[a-z]?\) (([A-Z][a-z]*(?:\s[A-Z][a-z]*)*) .*)',  # Daftar Definisi
#         'reference_1': r'Pasal (\d+\w*)',                                     # Pasal Referensi Jenis 1
#         'reference_2': r'Pasal (\d+\w*) sampai dengan Pasal (\d+\w*)'         # Pasal Referensi Jenis 2
#     },
#     'amendment_to': {
#         'amendment_point_1': r'(## \d+\.[\S\s]*?)(?=\s+(?:## \d+\.|## Pasal II))',  # Poin Amandemen Jenis 1
#         'amendment_point_2': r'(?<=## Pasal I)([\s\S]*?)(?=## Pasal II)',           # Poin Amandemen Jenis 2
#         'part': r'(## Bagian [\S\s]*?)(?=\s+(?:## Bagian|$))',                      # Daftar Bagian
#         'paragraph': r'(## Paragraf [\S\s]*?)(?=\s+(?:## Paragraf|$))',             # Daftar Paragraf
#         # 'amendment_articles': r'(## Pasal \d+[A-Z]?[\S\s]*?)(?=(?:##|$))',          # Pasal Amandemen
#     }
# }

In [None]:
# def parse_base_regulation(
#         text: str,
#         id_template: str,
#         regulation_dict: dict,
#         definition_list: list,
#         article_dict: dict
# ) -> tuple[dict, list, dict]:

#     regulation_dict['content']['chapters'] = dict()

#     # Get all chapters
#     chapters = re.findall(patterns['main']['chapter'], text, re.IGNORECASE)

#     # Iterate for every chapters
#     for num, chapter in enumerate(chapters):
#         chapter_num = num + 1
#         chapter_about = re.search(patterns['chapter']['about'], chapter, re.IGNORECASE)[1].strip().upper()
#         chapter_about = re.sub(r'\n', ': ', chapter_about, flags=re.IGNORECASE)

#         regulation_dict['content']['chapters'][chapter_num] = {
#             'id': id_template.format(
#                 reg_section=encode['section']['chapter'],
#                 section_num=str(chapter_num).zfill(3),
#                 extra_section_number='00'
#             ),
#             'about': chapter_about
#         }
    
#         regulation_dict['content']['chapters'][chapter_num]['articles'] = dict()

#         # Get all parts
#         parts = re.findall(patterns['chapter']['part'], chapter.strip() + '\n', re.IGNORECASE)

#         # If the part exists
#         if parts:
#             # Iterate for every parts
#             for part in parts:
#                 # Get part about/name
#                 part_about = re.search(patterns['part']['about'], part, re.IGNORECASE)[1].strip()
#                 part_about = re.sub(r'\n', ': ', part_about, flags=re.IGNORECASE)

#                 # Get all paragraphs
#                 paragraphs = re.findall(patterns['chapter']['paragraph'], part.strip() + '\n', re.IGNORECASE)
                
#                 # If the paragraph exists
#                 if paragraphs:
#                     # Iterate for every paragraphs
#                     for paragraph in paragraphs:
#                         # Get paragraph about/name
#                         paragraph_about = re.search(patterns['paragraph']['about'], paragraph, re.IGNORECASE)[1].strip()
#                         paragraph_about = re.sub(r'\n', ': ', paragraph_about, flags=re.IGNORECASE)
                        
#                         regulation_dict, definition_list, article_dict = parse_articles(
#                             text=paragraph,
#                             chapter_num=chapter_num,
#                             part_about=part_about,
#                             paragraph_about=paragraph_about,
#                             id_template=id_template,
#                             regulation_dict=regulation_dict,
#                             definition_list=definition_list,
#                             article_dict=article_dict
#                         )

#                 else:
#                     regulation_dict, definition_list, article_dict = parse_articles(
#                         text=part,
#                         chapter_num=chapter_num,
#                         part_about=part_about,
#                         paragraph_about='',
#                         id_template=id_template,
#                         regulation_dict=regulation_dict,
#                         definition_list=definition_list,
#                         article_dict=article_dict
#                     )

#         else:
#             regulation_dict, definition_list, article_dict = parse_articles(
#                 text=chapter,
#                 chapter_num=chapter_num,
#                 part_about='',
#                 paragraph_about='',
#                 id_template=id_template,
#                 regulation_dict=regulation_dict,
#                 definition_list=definition_list,
#                 article_dict=article_dict
#             )

#     return regulation_dict, definition_list, article_dict


# def parse_amendment_regulation(
#         text: str,
#         id_template: str,
#         regulation_dict: dict,
#         definition_list: list,
#         article_dict: dict,
#         amend_regulations: list
# ) -> tuple[dict, list, dict]:
    
#     # MUNGKIN INI BUAT MASALAH? COBA DULU
#     regulation_dict['content']['articles'] = dict()
    
#     # Get all amendment points
#     amendment_points = re.findall(patterns['amendment_to']['amendment_point_1'], text, re.IGNORECASE)

#     if not amendment_points:
#         amendment_points = re.search(patterns['amendment_to']['amendment_point_2'], text, re.IGNORECASE)[1].strip()
#         first_sentence = re.search(r'^.*', amendment_points, re.IGNORECASE)[0].strip()  # Dapatkan kalimat pertama
#         amendment_points = [amendment_points.replace(first_sentence, '').strip()]  # Hapus kalimat pertama
    
#     # Iterate for every points
#     for point in amendment_points:
#         # Get all parts
#         parts = re.findall(patterns['amendment_to']['part'], point.strip() + '\n', re.IGNORECASE)

#         # If the part exists
#         if parts:
#             # Iterate for every parts
#             for part in parts:
#                 # Get part about/name
#                 part_about = re.search(patterns['part']['about'], part, re.IGNORECASE)[1].strip()
#                 part_about = re.sub(r'\n', ': ', part_about, flags=re.IGNORECASE)

#                 # Get all paragraphs
#                 paragraphs = re.findall(patterns['chapter']['paragraph'], part.strip() + '\n', re.IGNORECASE)
                
#                 # If the paragraph exists
#                 if paragraphs:
#                     # Iterate for every paragraphs
#                     for paragraph in paragraphs:
#                         # Get paragraph about/name
#                         paragraph_about = re.search(patterns['paragraph']['about'], paragraph, re.IGNORECASE)[1].strip()
#                         paragraph_about = re.sub(r'\n', ': ', paragraph_about, flags=re.IGNORECASE)
                        
#                         regulation_dict, definition_list, article_dict = parse_articles(
#                             text=paragraph,
#                             chapter_num=0,
#                             part_about=part_about,
#                             paragraph_about=paragraph_about,
#                             id_template=id_template,
#                             regulation_dict=regulation_dict,
#                             definition_list=definition_list,
#                             article_dict=article_dict,
#                             amendment=True,
#                             amend_regulations=amend_regulations
#                         )

#                 else:
#                     regulation_dict, definition_list, article_dict = parse_articles(
#                         text=part,
#                         chapter_num=0,
#                         part_about=part_about,
#                         paragraph_about='',
#                         id_template=id_template,
#                         regulation_dict=regulation_dict,
#                         definition_list=definition_list,
#                         article_dict=article_dict,
#                         amendment=True,
#                         amend_regulations=amend_regulations
#                     )

#         else:
#             # Get all paragraphs
#             paragraphs = re.findall(patterns['amendment_to']['paragraph'], point.strip() + '\n', re.IGNORECASE)
            
#             # If the paragraph exists
#             if paragraphs:
#                 # Iterate for every paragraphs
#                 for paragraph in paragraphs:
#                     # Get paragraph about/name
#                     paragraph_about = re.search(patterns['paragraph']['about'], paragraph, re.IGNORECASE)[1].strip()
#                     paragraph_about = re.sub(r'\n', ': ', paragraph_about, flags=re.IGNORECASE)

#                     regulation_dict, definition_list, article_dict = parse_articles(
#                         text=paragraph,
#                         chapter_num=0,
#                         part_about='',
#                         paragraph_about=paragraph_about,
#                         id_template=id_template,
#                         regulation_dict=regulation_dict,
#                         definition_list=definition_list,
#                         article_dict=article_dict,
#                         amendment=True,
#                         amend_regulations=amend_regulations
#                     )

#             else:
#                 regulation_dict, definition_list, article_dict = parse_articles(
#                     text=point,
#                     chapter_num=0,
#                     part_about='',
#                     paragraph_about='',
#                     id_template=id_template,
#                     regulation_dict=regulation_dict,
#                     definition_list=definition_list,
#                     article_dict=article_dict,
#                     amendment=True,
#                     amend_regulations=amend_regulations
#                 )

#     return regulation_dict, definition_list, article_dict


# def parse_articles(
#         text: str,
#         chapter_num: int,
#         part_about: str,
#         paragraph_about: str,
#         id_template: str,
#         regulation_dict: dict,
#         definition_list: list,
#         article_dict: dict,
#         amendment: bool = False,
#         amend_regulations: list = list() 
# ) -> tuple[dict, list, dict]:
    
#     # Get all articles
#     articles = re.findall(patterns['chapter']['article'], text, re.IGNORECASE)

#     # Iterate for every articles
#     for article in articles:
#         article_num = re.search(patterns['article']['number'], article, re.IGNORECASE)[1]
#         article_text = re.search(patterns['article']['text'], article, re.IGNORECASE)[1].strip()
#         article_text = re.sub(r'\n+', '\n', article_text)
        
#         # Get definition (Article/Pasal 1)
#         if article_num == '1':
#             definitions = re.findall(patterns['article']['definition'], article_text)
#             for index, definition_data in enumerate(definitions):
#                 definition, name = definition_data
#                 definition_list.append({
#                     'id': id_template.format(
#                         reg_section=encode['section']['definition'],
#                         section_num=str(index + 1).zfill(3),
#                         extra_section_number='00'
#                     ),
#                     'name': name.strip(),
#                     'definition': definition.strip()
#                 })
#             regulation_dict['content']['definitions'] = definition_list

#         # Define article ID
#         article_id = ''

#         if amendment:
#             # Create article ID
#             article_alphabet = re.search(r'\d+([A-Z]+)', str(article_num), re.IGNORECASE)

#             if article_alphabet:
#                 article_id = id_template.format(
#                     reg_section=encode['section']['article'],
#                     section_num=re.search(r'\d+', str(article_num), re.IGNORECASE)[0].zfill(3),
#                     extra_section_number=mapping_alphabet_to_numeric[article_alphabet[1]]
#                 )
#             else:
#                 article_id = id_template.format(
#                     reg_section=encode['section']['article'],
#                     section_num=str(article_num).zfill(3),
#                     extra_section_number='00'
#                 )

#             # Store article data
#             regulation_dict['content']['articles'][article_num] = {
#                 'id': article_id,
#                 'part': part_about,
#                 'paragraph': paragraph_about,
#                 'text': article_text
#             }

#             # Store article to article_dict
#             article_dict[article_id] = regulation_dict['content']['articles'][article_num]

#             # Mendefinisikan daftar pasal yang diamandemen
#             regulation_dict['content']['articles'][article_num]['amend'] = list()
            
#             # Ambil daftar peraturan amend
#             for regulation_id in amend_regulations:
#                 # Buat ulang (tebak) ID Pasal di peraturan sebelum revisi
#                 other_article_id = regulation_id[:-6] + '6' + article_id[-5:]

#                 # Jika ID tersebut ada di article_dict maka ID nya pasti ada, terus simpan 
#                 if other_article_id in article_dict.keys():
#                     regulation_dict['content']['articles'][article_num]['part'] = \
#                         part_about if part_about else article_dict[other_article_id]['part']
#                     regulation_dict['content']['articles'][article_num]['paragraph'] = \
#                         paragraph_about if paragraph_about else article_dict[other_article_id]['paragraph']
#                     regulation_dict['content']['articles'][article_num]['amend'].append(other_article_id)

#         else:
#             # Create article ID
#             article_id = id_template.format(
#                 reg_section=encode['section']['article'],
#                 section_num=str(article_num).zfill(3),
#                 extra_section_number='00'
#             )

#             # Store article
#             regulation_dict['content']['chapters'][chapter_num]['articles'][article_num] = {
#                 'id': article_id,
#                 'part': part_about,
#                 'paragraph': paragraph_about,
#                 'text': article_text
#             }

#             # Store article to article_dict
#             article_dict[article_id] = regulation_dict['content']['chapters'][chapter_num]['articles'][article_num]

#             # Get previous article ID
#             if article_num != '1':
#                 regulation_dict['content']['chapters'][chapter_num]['articles'][article_num]['previous_article'] = \
#                     id_template.format(
#                         reg_section=encode['section']['article'],
#                         section_num=str(int(article_num) - 1).zfill(3),
#                         extra_section_number='00'
#                     )
#             else:
#                 regulation_dict['content']['chapters'][chapter_num]['articles'][article_num]['previous_article'] = ''
            
#             # Get article reference to other article
#             regulation_dict['content']['chapters'][chapter_num]['articles'][article_num]['references'] = list()
#             reference_type_1 = list(set(re.findall(patterns['article']['reference_1'], article_text, re.IGNORECASE)))
#             reference_type_2 = list(set(re.findall(patterns['article']['reference_2'], article_text, re.IGNORECASE)))

#             article_references = merge_article_references(reference_type_1, reference_type_2)

#             for article_reference_num in article_references:
#                 regulation_dict['content']['chapters'][chapter_num]['articles'][article_num]['references'].append(
#                     id_template.format(
#                         reg_section=encode['section']['article'],
#                         section_num=str(article_reference_num).zfill(3),
#                         extra_section_number='00'
#                     )
#                 )
                    
#     return regulation_dict, definition_list, article_dict


# def merge_article_references(list1, list2):
#     set1 = set(int(x) for x in list1)
#     set2 = set()

#     for start, end in list2:
#         set2.update(range(int(start), int(end) + 1))

#     return sorted(set1.union(set2))

In [None]:
# input_dir = os.path.join('data', 'markdown', 'fix', 'temp')
# json_input = os.path.join('data', 'regulation_data_modified.json')
# json_output = os.path.join('data', 'regulation_data_final.json')

# regulation_data = parse_regulations_content(
#     input_dir=input_dir,
#     json_input=json_input,
#     json_output=json_output,
#     verbose=True
# )

In [None]:
# encode = {
#     'type': {
#         'UU': '01',
#         'PERPPU': '02',
#         'PP': '03',
#         'PERPRES': '04',
#         'PERMENKOMINFO': '05'
#     },
#     'section': {
#         'document': '1',
#         'considering': '2',
#         'observing': '3',
#         'definition': '4',
#         'chapter': '5',
#         'article': '6',
#         'section': '7',
#     }
# }

# patterns = {
#     'document': {
#         'metadata': r'^(\w+)_(\w+)_(\w+)'    # Jenis, tahun, dan nomor peraturan
#     },
#     'main': {
#         'considering': r'(?<=## menimbang)([\S\s]*?)(?=## mengingat)',                                                 # Menimbang
#         'observing': r'(?<=## mengingat)([\S\s]*?)(?=(?:dengan persetujuan bersama|## memperhatikan|## memutuskan))',  # Mengingat
#         'amendment_to': r'^Perubahan',                                                                                 # Cek Peraturan Revisi
#         'chapter': r'(## BAB[\S\s]*?)(?=\s+(?:## BAB|agar setiap orang mengetahuinya))'                                # Daftar Bab
#     },
#     'chapter': {
#         # For every chapters
#         'about': r'## (BAB [\w\s\-\/\,]+)##',                            # Nama Bab, ganti [\-\/\,] dengan semua tanda baca kecuali #
#         'part': r'(## Bagian [\S\s]*?)(?=\s+(?:## Bagian|$))',           # Daftar Bagian
#         'paragraph': r'(## Paragraf [\S\s]*?)(?=\s+(?:## Paragraf|$))',  # Daftar Paragraf
#         'article': r'(## Pasal \w+[\S\s]*?)(?=(?:##|$))'                 # Daftar Pasal
#     },
#     'part': {
#         # For every parts
#         'about': r'## (Bagian [\w\s\-\/\,]+)##'    # Nama Bagian
#     },
#     'paragraph': {
#         # For every paragraphs
#         'about': r'## (Paragraf [\w\s\-\/\,]+)##'  # Nama Paragraf
#     },
#     'article': {
#         # For every articles
#         'number': r'## Pasal (\d+\w*)',                                       # Nomor Pasal
#         'text': r'## Pasal \w+\n*([\S\s]*)',                                  # Isi Pasal
#         'definition': r'\(\d+[a-z]?\) (([A-Z][a-z]*(?:\s[A-Z][a-z]*)*) .*)',  # Daftar Definisi
#         'reference_1': r'Pasal (\d+\w*)',                                     # Pasal Referensi Jenis 1
#         'reference_2': r'Pasal (\d+\w*) sampai dengan Pasal (\d+\w*)'         # Pasal Referensi Jenis 2
#     },
#     'amendment_to': {
#         'amendment_point_1': r'(## \d+\.[\S\s]*?)(?=\s+(?:## \d+\.|## Pasal II))',  # Poin Amandemen Jenis 1
#         'amendment_point_2': r'(?<=## Pasal I)([\s\S]*?)(?=## Pasal II)',           # Poin Amandemen Jenis 2
#         'amendment_articles': r'(## Pasal \d+[A-Z]?[\S\s]*?)(?=(?:##|$))',          # Pasal Amandemen
#         'part': r'(## Bagian [\S\s]*?)(?=\s+(?:## Bagian|$))',                      # Daftar Bagian
#         'paragraph': r'(## Paragraf [\S\s]*?)(?=\s+(?:## Paragraf|$))',             # Daftar Paragraf
#         # 'article': r'(## Pasal \w+[\S\s]*?)(?=(?:##|$))'                            # Daftar Pasal
#     }
# }

In [None]:
# file = 'PERMENKOMINFO_2020_002.md'  # Tanpa Poin Revisi
# file = 'PERMENKOMINFO_2021_010.md'  # Tanpa Poin Revisi
# file = 'PERMENKOMINFO_2007_038.md'  # Dengan Poin Revisi Lebih dari 1 Pasal
# file = 'PERMENKOMINFO_2021_001.md'  # Dengan Poin Revisi Paragraf
# with open(f'data/markdown/fix/amendment/{file}', 'r', encoding='utf8') as file:
# 	text = file.read()

# amendment_points = re.findall(r'(## \d+\.[\S\s]*?)(?=\s+(?:## \d+\.|## Pasal II))', text, re.IGNORECASE)
# if not amendment_points:
# 	amendment_points = re.search(r'(?<=## Pasal I)([\s\S]*?)(?=## Pasal II)', text, re.IGNORECASE)[1].strip()
# 	first_sentence = re.search(r'^.*', amendment_points, re.IGNORECASE)[0].strip()  # Dapatkan kalimat pertama
# 	amendment_points = [amendment_points.replace(first_sentence, '').strip()]		   # Hapus kalimat pertama

# for point in amendment_points:
# 	# Get all parts
# 	parts = re.findall(patterns['amendment_to']['part'], point.strip() + '\n', re.IGNORECASE)

# 	# If the part exists
# 	if parts:
# 		# Iterate for every parts
# 		for part in parts:
# 			# Get part about/name
# 			part_about = re.search(patterns['part']['about'], part, re.IGNORECASE)[1].strip()
# 			part_about = re.sub(r'\n', ': ', part_about, flags=re.IGNORECASE)

# 			# Get all paragraphs
# 			paragraphs = re.findall(patterns['chapter']['paragraph'], part.strip() + '\n', re.IGNORECASE)
			
# 			# If the paragraph exists
# 			if paragraphs:
# 				# Iterate for every paragraphs
# 				for paragraph in paragraphs:
# 					# Get paragraph about/name
# 					paragraph_about = re.search(patterns['paragraph']['about'], paragraph, re.IGNORECASE)[1].strip()
# 					paragraph_about = re.sub(r'\n', ': ', paragraph_about, flags=re.IGNORECASE)
# 					pass
# 					# regulation_dict, definition_list, article_id_list = parse_articles(
# 					# 	text=paragraph,
# 					# 	chapter_num=chapter_num,
# 					# 	part_about=part_about,
# 					# 	paragraph_about=paragraph_about,
# 					# 	regulation_dict=regulation_dict,
# 					# 	definition_list=definition_list,
# 					# 	article_id_list=article_id_list,
# 					# 	id_template=id_template
# 					# )

# 			else:
# 				pass
# 				# regulation_dict, definition_list, article_id_list = parse_articles(
# 				# 	text=part,
# 				# 	chapter_num=chapter_num,
# 				# 	part_about=part_about,
# 				# 	paragraph_about='',
# 				# 	regulation_dict=regulation_dict,
# 				# 	definition_list=definition_list,
# 				# 	article_id_list=article_id_list,
# 				# 	id_template=id_template
# 				# )

# 	else:
# 		# Get all paragraphs
# 		paragraphs = re.findall(patterns['amendment_to']['paragraph'], point.strip() + '\n', re.IGNORECASE)
		
# 		# If the paragraph exists
# 		if paragraphs:
# 			# Iterate for every paragraphs
# 			for paragraph in paragraphs:
# 				# Get paragraph about/name
# 				paragraph_about = re.search(patterns['paragraph']['about'], paragraph, re.IGNORECASE)[1].strip()
# 				paragraph_about = re.sub(r'\n', ': ', paragraph_about, flags=re.IGNORECASE)
# 				print(paragraph_about)
# 				print(paragraph)
# 				print('=' * 100)

# 				# regulation_dict, definition_list, article_id_list = parse_articles(
# 				# 	text=paragraph,
# 				# 	chapter_num=chapter_num,
# 				# 	part_about=part_about,
# 				# 	paragraph_about=paragraph_about,
# 				# 	regulation_dict=regulation_dict,
# 				# 	definition_list=definition_list,
# 				# 	article_id_list=article_id_list,
# 				# 	id_template=id_template
# 				# )

# 		else:
# 			print(point)
# 			print('=' * 100)
# 			# regulation_dict, definition_list, article_id_list = parse_articles(
# 			# 	text=part,
# 			# 	chapter_num=chapter_num,
# 			# 	part_about=part_about,
# 			# 	paragraph_about='',
# 			# 	regulation_dict=regulation_dict,
# 			# 	definition_list=definition_list,
# 			# 	article_id_list=article_id_list,
# 			# 	id_template=id_template
# 			# )
			

In [None]:
# def merge_article_references(list1, list2):
#     list1_int = set(int(x) for x in list1)
#     list2_int_expanded = set()
#     for start, end in list2:
#         list2_int_expanded.update(range(int(start), int(end) + 1))
#     return sorted(list1_int.union(list2_int_expanded))


# def parse_amendment_articles(
#         text: str,
#         id_template: str,
#         regulation_dict: dict,
#         definition_list: list,
#         article_dict: dict,
#         amendment: bool = True,
#         amend_regulations: list = list() 
# ):
#     amendment_points = re.findall(r'(## \d+\.[\S\s]*?)(?=\s+(?:## \d+\.|## Pasal II))', text, re.IGNORECASE)
#     if not amendment_points:
#         amendment_points = re.search(r'(?<=## Pasal I)([\s\S]*?)(?=## Pasal II)', text, re.IGNORECASE)[1].strip()
#         first_sentence = re.search(r'^.*', amendment_points, re.IGNORECASE)[0].strip()  # Dapatkan kalimat pertama
#         amendment_points = [amendment_points.replace(first_sentence, '').strip()]		   # Hapus kalimat pertama
    
#     for point in amendment_points:
#         # Get all parts
#         parts = re.findall(patterns['amendment_to']['part'], point.strip() + '\n', re.IGNORECASE)

#         # If the part exists
#         if parts:
#             # Iterate for every parts
#             for part in parts:
#                 # Get part about/name
#                 part_about = re.search(patterns['part']['about'], part, re.IGNORECASE)[1].strip()
#                 part_about = re.sub(r'\n', ': ', part_about, flags=re.IGNORECASE)

#                 # Get all paragraphs
#                 paragraphs = re.findall(patterns['chapter']['paragraph'], part.strip() + '\n', re.IGNORECASE)
                
#                 # If the paragraph exists
#                 if paragraphs:
#                     # Iterate for every paragraphs
#                     for paragraph in paragraphs:
#                         # Get paragraph about/name
#                         paragraph_about = re.search(patterns['paragraph']['about'], paragraph, re.IGNORECASE)[1].strip()
#                         paragraph_about = re.sub(r'\n', ': ', paragraph_about, flags=re.IGNORECASE)
                        
#                         regulation_dict, definition_list, article_dict = parse_articles(
#                             text=paragraph,
#                             chapter_num=0,
#                             part_about=part_about,
#                             paragraph_about=paragraph_about,
#                             id_template=id_template,
#                             regulation_dict=regulation_dict,
#                             definition_list=definition_list,
#                             article_dict=article_dict,
#                             amendment=amendment,
#                             amend_regulations=amend_regulations
#                         )

#                 else:
#                     regulation_dict, definition_list, article_dict = parse_articles(
#                         text=part,
#                         chapter_num=0,
#                         part_about=part_about,
#                         paragraph_about='',
#                         id_template=id_template,
#                         regulation_dict=regulation_dict,
#                         definition_list=definition_list,
#                         article_dict=article_dict,
#                         amendment=amendment,
#                         amend_regulations=amend_regulations
#                     )

#         else:
#             # Get all paragraphs
#             paragraphs = re.findall(patterns['amendment_to']['paragraph'], point.strip() + '\n', re.IGNORECASE)
            
#             # If the paragraph exists
#             if paragraphs:
#                 # Iterate for every paragraphs
#                 for paragraph in paragraphs:
#                     # Get paragraph about/name
#                     paragraph_about = re.search(patterns['paragraph']['about'], paragraph, re.IGNORECASE)[1].strip()
#                     paragraph_about = re.sub(r'\n', ': ', paragraph_about, flags=re.IGNORECASE)

#                     regulation_dict, definition_list, article_dict = parse_articles(
#                         text=paragraph,
#                         chapter_num=0,
#                         part_about='',
#                         paragraph_about=paragraph_about,
#                         id_template=id_template,
#                         regulation_dict=regulation_dict,
#                         definition_list=definition_list,
#                         article_dict=article_dict,
#                         amendment=amendment,
#                         amend_regulations=amend_regulations
#                     )

#             else:
#                 regulation_dict, definition_list, article_dict = parse_articles(
#                 	text=point,
#                 	chapter_num=0,
#                 	part_about='',
#                 	paragraph_about='',
#                 	id_template=id_template,
#                 	regulation_dict=regulation_dict,
#                 	definition_list=definition_list,
#                 	article_dict=article_dict,
#                     amendment=amendment,
#                     amend_regulations=amend_regulations
#                 )

#     return regulation_dict, definition_list, article_dict


# def parse_articles(
#         text: str,
#         chapter_num: int,
#         part_about: str,
#         paragraph_about: str,
#         id_template: str,
#         regulation_dict: dict,
#         definition_list: list,
#         article_dict: dict,
#         amendment: bool = False,
#         amend_regulations: list = list() 
# ) -> tuple[dict, list]:
    
#     mapping_alphabet_to_numeric = {chr(i): f"{i - 64:02}" for i in range(65, 91)}  # A-Z -> 01-26
    
#     # Get all articles
#     articles = re.findall(patterns['chapter']['article'], text, re.IGNORECASE)

#     # Iterate for every articles
#     for article in articles:
#         article_num = re.search(patterns['article']['number'], article, re.IGNORECASE)[1]
#         article_text = re.search(patterns['article']['text'], article, re.IGNORECASE)[1].strip()
#         article_text = re.sub(r'\n+', '\n', article_text)
#         # print(f'Pasal {article_num}')
#         # print(article_text, '\n')
        
#         # Get definition (Article/Pasal 1)
#         if article_num == '1':
#             definitions = re.findall(patterns['article']['definition'], article_text)
#             for index, definition_data in enumerate(definitions):
#                 definition, name = definition_data
#                 definition_list.append({
#                     'id': id_template.format(
#                         reg_section=encode['section']['definition'],
#                         section_num=str(index + 1).zfill(3),
#                         extra_section_number='00'
#                     ),
#                     'name': name.strip(),
#                     'definition': definition.strip()
#                 })
#             regulation_dict['content']['definitions'] = definition_list

#         article_id = ''

#         if amendment:
#             # Create article ID
#             article_alphabet = re.search(r'\d+([A-Z]+)', str(article_num), re.IGNORECASE)

#             if article_alphabet:
#                 article_id = id_template.format(
#                     reg_section=encode['section']['article'],
#                     section_num=re.search(r'\d+', str(article_num), re.IGNORECASE)[0].zfill(3),
#                     extra_section_number=mapping_alphabet_to_numeric[article_alphabet[1]]
#                 )
#             else:
#                 article_id = id_template.format(
#                     reg_section=encode['section']['article'],
#                     section_num=str(article_num).zfill(3),
#                     extra_section_number='00'
#                 )

#             # Store article
#             regulation_dict['content']['articles'][article_num] = {
#                 'id': article_id,
#                 'part': part_about,
#                 'paragraph': paragraph_about,
#                 'text': article_text
#             }

#             # Store article to article_dict
#             article_dict[article_id] = regulation_dict['content']['articles'][article_num]

#         else:
#             # Create article ID
#             article_id = id_template.format(
#                 reg_section=encode['section']['article'],
#                 section_num=str(article_num).zfill(3),
#                 extra_section_number='00'
#             )

#             # Store article
#             regulation_dict['content']['chapters'][chapter_num]['articles'][article_num] = {
#                 'id': article_id,
#                 'part': part_about,
#                 'paragraph': paragraph_about,
#                 'text': article_text
#             }

#             # Store article to article_dict
#             article_dict[article_id] = regulation_dict['content']['chapters'][chapter_num]['articles'][article_num]

#             # Get previous article ID
#             if article_num != '1':
#                 regulation_dict['content']['chapters'][chapter_num]['articles'][article_num]['previous_article'] = \
#                     id_template.format(
#                         reg_section=encode['section']['article'],
#                         section_num=str(int(article_num) - 1).zfill(3),
#                         extra_section_number='00'
#                     )
#             else:
#                 regulation_dict['content']['chapters'][chapter_num]['articles'][article_num]['previous_article'] = ''
            
#             # Get article reference to other article
#             regulation_dict['content']['chapters'][chapter_num]['articles'][article_num]['references'] = list()
#             reference_type_1 = list(set(re.findall(patterns['article']['reference_1'], article_text, re.IGNORECASE)))
#             reference_type_2 = list(set(re.findall(patterns['article']['reference_2'], article_text, re.IGNORECASE)))

#             article_references = merge_article_references(reference_type_1, reference_type_2)

#             for article_reference_num in article_references:
#                 regulation_dict['content']['chapters'][chapter_num]['articles'][article_num]['references'].append(
#                     id_template.format(
#                         reg_section=encode['section']['article'],
#                         section_num=str(article_reference_num).zfill(3),
#                         extra_section_number='00'
#                     )
#                 )
            
#         # TODO 1: Build dedicated parsing for amendment (revision) regulations
#         # TODO 2: Add references to the amended regulation
#         # E.g. if Pasal 5 is an amended article, it must be linked to the same article in the earlier document.
#         # The ID for Pasal 10A must end with extra_section_number='01', for 10B with '02', and so on.
#         # TODO 3: Write each regulation's JSON to a separate file? Keeping everything in one file is getting too complicated
#         if amendment:
#             regulation_dict['content']['articles'][article_num]['amend'] = list()
            
#             # Walk the list of amended-regulation IDs (amend_regulations)
#             for regulation_id in amend_regulations:
#                 # Reconstruct (guess) the ID of the same article in the pre-amendment regulation
#                 other_article_id = regulation_id[:-6] + '6' + article_id[-5:]

#                 # If the guessed ID is present in article_dict, the article exists there, so record the link
#                 if other_article_id in article_dict.keys():
#                     regulation_dict['content']['articles'][article_num]['part'] = \
#                         part_about if part_about else article_dict[other_article_id]['part']
#                     regulation_dict['content']['articles'][article_num]['paragraph'] = \
#                         paragraph_about if paragraph_about else article_dict[other_article_id]['paragraph']
#                     regulation_dict['content']['articles'][article_num]['amend'].append(other_article_id)
                    
#     return regulation_dict, definition_list, article_dict

In [None]:
# DIR_PATH = os.path.join('data', 'markdown', 'fix', 'temp')
# article_dict = dict()
# result = list()
# files = list()

# start_time = time.time()

# for filename in os.listdir(DIR_PATH):
#     if filename.endswith(".md"):
#         files.append((os.path.join(DIR_PATH, filename), filename))

# for index, data in enumerate(files):
#     # Initialize data
#     path, filename = data
#     regulation_dict = dict()
#     definition_list = list()
#     print(f'File: {path}')

#     # Get file metadata
#     metadata = re.search(patterns['document']['metadata'], filename)
#     regulation_type = encode['type'][metadata[1]]
#     regulation_year = metadata[2]
#     regulation_num = int(metadata[3])

#     id_template = f'{regulation_year}{regulation_type}{str(regulation_num).zfill(3)}' + '{reg_section}{section_num}{extra_section_number}'

#     regulation_id = id_template.format(reg_section=encode['section']['document'], section_num='000', extra_section_number='00')

#     with open('data/regulation_data_modified.json') as input_file:
#         for regulation_data in json.load(input_file):
#             if regulation_data['id'] == regulation_id:
#                 regulation_dict = regulation_data

#     with open(path, 'r', encoding='utf8') as file:
#         # Read file
#         text = file.read()

#         # Define main content
#         regulation_dict['content'] = dict()

#         # Get main structure
#         regulation_dict['content']['considering'] = {
#             'id': id_template.format(reg_section=encode['section']['considering'], section_num='000', extra_section_number='00'),
#             'text': re.search(patterns['main']['considering'], text, re.IGNORECASE)[1].strip()
#         }
#         regulation_dict['content']['observing'] = {
#             'id': id_template.format(reg_section=encode['section']['observing'], section_num='000', extra_section_number='00'), 
#             'text': re.search(patterns['main']['observing'], text, re.IGNORECASE)[1].strip()
#         }

#         # If it is an amendment regulation
#         if re.search(patterns['main']['amendment_to'], regulation_dict['about'], re.IGNORECASE):
#             regulation_dict['content']['articles'] = dict()
#             regulation_dict, definition_list, article_dict = parse_amendment_articles(
#                 text=text,
#                 id_template=id_template,
#                 regulation_dict=regulation_dict,
#                 definition_list=definition_list,
#                 article_dict=article_dict,
#                 amendment=True,
#                 amend_regulations=regulation_dict['status']['amend']
#             )
        
#             # text=paragraph,
#             # chapter_num=chapter_num,
#             # part_about=part_about,
#             # paragraph_about=paragraph_about,
#             # regulation_dict=regulation_dict,
#             # definition_list=definition_list,
#             # article_dict=article_dict,
#             # id_template=id_template
        
#         else:
#             # Get all chapters
#             chapters = re.findall(patterns['main']['chapter'], text, re.IGNORECASE)
#             regulation_dict['content']['chapters'] = dict()

#             # Iterate for every chapters
#             for num, chapter in enumerate(chapters):
#                 chapter_num = num + 1
#                 chapter_about = re.search(patterns['chapter']['about'], chapter, re.IGNORECASE)[1].strip().upper()
#                 chapter_about = re.sub(r'\n', ': ', chapter_about, flags=re.IGNORECASE)

#                 regulation_dict['content']['chapters'][chapter_num] = {
#                     'id': id_template.format(
#                         reg_section=encode['section']['chapter'],
#                         section_num=str(chapter_num).zfill(3),
#                         extra_section_number='00'
#                     ),
#                     'about': chapter_about
#                 }
            
#                 regulation_dict['content']['chapters'][chapter_num]['articles'] = dict()

#                 # Get all parts
#                 parts = re.findall(patterns['chapter']['part'], chapter.strip() + '\n', re.IGNORECASE)

#                 # If the part exists
#                 if parts:
#                     # Iterate for every parts
#                     for part in parts:
#                         # Get part about/name
#                         part_about = re.search(patterns['part']['about'], part, re.IGNORECASE)[1].strip()
#                         part_about = re.sub(r'\n', ': ', part_about, flags=re.IGNORECASE)

#                         # Get all paragraphs
#                         paragraphs = re.findall(patterns['chapter']['paragraph'], part.strip() + '\n', re.IGNORECASE)
                        
#                         # If the paragraph exists
#                         if paragraphs:
#                             # Iterate for every paragraphs
#                             for paragraph in paragraphs:
#                                 # Get paragraph about/name
#                                 paragraph_about = re.search(patterns['paragraph']['about'], paragraph, re.IGNORECASE)[1].strip()
#                                 paragraph_about = re.sub(r'\n', ': ', paragraph_about, flags=re.IGNORECASE)
                                
#                                 regulation_dict, definition_list, article_dict = parse_articles(
#                                     text=paragraph,
#                                     chapter_num=chapter_num,
#                                     part_about=part_about,
#                                     paragraph_about=paragraph_about,
#                                     id_template=id_template,
#                                     regulation_dict=regulation_dict,
#                                     definition_list=definition_list,
#                                     article_dict=article_dict,
#                                     amendment=False,
#                                     amend_regulations=list()
#                                 )

#                         else:
#                             regulation_dict, definition_list, article_dict = parse_articles(
#                                 text=part,
#                                 chapter_num=chapter_num,
#                                 part_about=part_about,
#                                 paragraph_about='',
#                                 id_template=id_template,
#                                 regulation_dict=regulation_dict,
#                                 definition_list=definition_list,
#                                 article_dict=article_dict,
#                                 amendment=False,
#                                 amend_regulations=list()
#                             )

#                 else:
#                     regulation_dict, definition_list, article_dict = parse_articles(
#                         text=chapter,
#                         chapter_num=chapter_num,
#                         part_about='',
#                         paragraph_about='',
#                         id_template=id_template,
#                         regulation_dict=regulation_dict,
#                         definition_list=definition_list,
#                         article_dict=article_dict,
#                         amendment=False,
#                         amend_regulations=list()
#                     )
        
#         result.append(regulation_dict)

# end_time = time.time() - start_time
# display(f'{round(end_time * 1000, 2)} milisecond')

# # https://www.freecodecamp.org/news/how-to-pretty-print-json-in-python/
# # https://www.geeksforgeeks.org/how-to-convert-python-dictionary-to-json/
# # Convert the data to a JSON formatted string with 4 spaces of indentation
# with open("output_new_new.json", "w") as outfile: 
#     json.dump(result, outfile, indent=4)
#     output_json_str = json.dumps(result, indent=4)
#     # Print the pretty-printed JSON string
#     # print(output_json_str)
    
#     # display(len(article_dict))
#     display(article_dict)

In [None]:
# encode = {
#     'type': {
#         'UU': '01',
#         'PERPPU': '02',
#         'PP': '03',
#         'PERPRES': '04',
#         'PERMENKOMINFO': '05'
#     },
#     'section': {
#         'document': '1',
#         'considering': '2',
#         'observing': '3',
#         'definition': '4',
#         'chapter': '5',
#         'article': '6',
#         'section': '7',
#     }
# }

# patterns = {
#     'document': {
#         'metadata': r'^(\w+)_(\w+)_(\w+)'    # Jenis, tahun, dan nomor peraturan
#     },
#     'main': {
#         'considering': r'(?<=## menimbang)([\S\s]*?)(?=## mengingat)',                                                 # Menimbang
#         'observing': r'(?<=## mengingat)([\S\s]*?)(?=(?:dengan persetujuan bersama|## memperhatikan|## memutuskan))',  # Mengingat
#         'chapter': r'(## BAB[\S\s]*?)(?=\s+(?:## BAB|agar setiap orang mengetahuinya))'                                # Daftar Bab
#     },
#     'chapter': {
#         # For every chapters
#         'about': r'## (BAB [\w\s\-\/\,]+)##',                              # Nama Bab
#         'part': r'(## Bagian [\S\s]*?)(?=\s+(?:## Bagian|$))',           # Daftar Bagian
#         'paragraph': r'(## Paragraf [\S\s]*?)(?=\s+(?:## Paragraf|$))',  # Daftar Paragraf
#         'article': r'(## Pasal \w+[\S\s]*?)(?=(?:##|$))'                 # Daftar Pasal
#     },
#     'part': {
#         # For every parts
#         'about': r'## (Bagian [\w\s\-\/\,]+)##'    # Nama Bagian
#     },
#     'paragraph': {
#         # For every paragraphs
#         'about': r'## (Paragraf [\w\s\-\/\,]+)##'  # Nama Paragraf
#     },
#     'article': {
#         # For every articles
#         'number': r'## Pasal (\d+\w*)',                                 # Nomor Pasal
#         'text': r'## Pasal \w+\n*([\S\s]*)',                            # Isi Pasal
#         'definition': r'\(\d+\) (([A-Z][a-z]*(?:\s[A-Z][a-z]*)*) .*)',  # Daftar Definisi
#         'reference_1': r'Pasal (\d+\w*)',                               # Pasal Referensi Jenis 1
#         'reference_2': r'Pasal (\d+\w*) sampai dengan Pasal (\d+\w*)'   # Pasal Referensi Jenis 2
#     }
# }

In [None]:
# text = '''## BAB II
# PENGELOLA NAMA DOMAIN

# ## Bagian Kesatu
# Umum

# ## Pasal 5
# (1) Pengelolaan Nama Domain diselenggarakan oleh Pengelola Nama Domain.
# (2) Nama Domain terdiri atas:
# 	a. Nama Domain Tingkat Tinggi Generik;
# 	b. Nama Domain Tingkat Tinggi Indonesia;
# 	c. Nama Domain Indonesia Tingkat Kedua; dan
# 	d. Nama Domain Indonesia Tingkat Turunan.
# (3) Pengelola Nama Domain sebagaimana dimaksud pada ayat (1) terdiri atas:
# 	a. Registri Nama Domain; dan
# 	b. Registrar Nama Domain.

# ## Pasal 6
# Pengelola Nama Domain sebagaimana dimaksud dalam Pasal 5 ayat (3) dapat diselenggarakan oleh Pemerintah dan/atau masyarakat.

# ## Bagian Kedua
# Registri Nama Domain

# ## Paragraf 1
# Umum

# ## Pasal 7
# (1) Registri Nama Domain sebagaimana dimaksud dalam Pasal 5 ayat (3) huruf a melaksanakan pengelolaan Nama Domain Tingkat Tinggi Generik dan Nama Domain Tingkat Tinggi Indonesia.
# (2) Registri Nama Domain dapat memberikan kewenangan pendaftaran Nama Domain Tingkat Tinggi Generik dan Nama Domain Tingkat Tinggi Indonesia kepada Registrar Nama Domain.
# (3) Registri Nama Domain berfungsi:
# 	a. memberikan masukan terhadap rencana pengaturan Nama Domain kepada Menteri;
# 	b. melakukan pengawasan terhadap Registrar Nama Domain; dan
# 	c. menyelesaikan perselisihan Nama Domain.

# ## Pasal 8
# Registri Nama Domain sebagaimana dimaksud dalam Pasal 7 ayat (1) terdiri atas:
# a. Registri Nama Domain Tingkat Tinggi Generik; dan
# b. Registri Nama Domain Tingkat Tinggi Indonesia.

# ## Paragraf 2
# Registri Nama Domain Tingkat Tinggi Generik

# ## Pasal 9
# (1) Registri Nama Domain Tingkat Tinggi Generik sebagaimana dimaksud dalam Pasal 8 huruf a melaksanakan pengelolaan Nama Domain Tingkat Tinggi Generik.
# (2) Registri Nama Domain Tingkat Tinggi Generik sebagaimana dimaksud pada ayat (1) wajib mengikuti ketentuan pengelolaan Nama Domain Internasional serta ketentuan peraturan perundang-undangan.

# ## Pasal 10
# Registri Nama Domain Tingkat Tinggi Generik memiliki tugas:
# a. merumuskan kebijakan di bidang pengelolaan Nama Domain Tingkat Tinggi Generik;
# b. menyiapkan, mengoperasikan, dan memelihara
# infrastruktur yang dibutuhkan serta menyediakan sistem elektronik untuk pengelolaan Nama Domain Tingkat Tinggi Generik; dan
# c. menyelenggarakan pendaftaran Nama Domain Tingkat Tinggi Generik sesuai dengan ketentuan pengelolaan Nama Domain Internasional dan ketentuan peraturan perundang-undangan.

# ## Pasal 11
# Registri Nama Domain Tingkat Tinggi Generik sebagaimana dimaksud dalam Pasal 9 bertanggung jawab dalam melakukan pengelolaan, pengoperasian, dan pemeliharaan penyelenggaraan sistem elektronik Nama Domain Tingkat Tinggi Generik yang berdomisili di Indonesia.

# ## Pasal 12
# (1) Nama Domain Tingkat Tinggi Generik yang akan didaftarkan dengan menggunakan nama wilayah, geografis, budaya, dan/atau situs nasional Indonesia serta bersifat unik harus mendapat persetujuan Menteri.
# (2) Dalam memberikan persetujuan sebagaimana dimaksud pada ayat (1), Menteri melakukan koordinasi dengan instansi terkait.

# ## Paragraf 3
# Registri Nama Domain Tingkat Tinggi Indonesia

# ## Pasal 13
# Registri Nama Domain Tingkat Tinggi Indonesia sebagaimana dimaksud dalam Pasal 8 huruf b melaksanakan pengelolaan Nama Domain Tingkat Tinggi Indonesia.

# ## Pasal 14
# Registri Nama Domain Tingkat Tinggi Indonesia memiliki tugas:
# a. merumuskan kebijakan di bidang pengelolaan Nama Domain Tingkat Tinggi Indonesia paling sedikit memuat:
# 	1) ketentuan Nama Domain yang dipublikasikan; dan
# 	2) daftar Nama Domain yang dibatasi, atas pertimbangan ketentuan peraturan perundang- undangan, kepatutan yang berlaku dalam masyarakat, dan iktikad baik.
# b. menyiapkan, mengoperasikan, dan memelihara infrastruktur yang dibutuhkan serta menyediakan sistem elektronik untuk pengelolaan Nama Domain Tingkat Tinggi Indonesia;
# c. menyelenggarakan pendaftaran Nama Domain Tingkat Tinggi Indonesia sesuai dengan ketentuan peraturan perundang-undangan, kepatutan yang berlaku dalam masyarakat, dan prinsip kehati-hatian;
# d. melaksanakan seleksi Registrar Nama Domain;
# e. memberikan peringatan kepada Registrar Nama Domain jika terindikasi melakukan pelanggaran;
# f. mencabut hak operasional Registrar Nama Domain jika terbukti melakukan pelanggaran; dan
# g. melakukan pengawasan operasional dan teknis Registrar Nama Domain.

# ## Pasal 15
# Registri Nama Domain Tingkat Tinggi Indonesia wajib:
# a. menjamin sistem elektronik Registri Nama Domain Indonesia beroperasi dengan baik, stabil, aman didukung dengan layanan yang dapat diandalkan;
# b. menempatkan pusat data dan pusat pemulihan bencana di wilayah Indonesia;
# c. melakukan pengawasan terhadap Registrar Nama Domain;
# d. memfasilitasi penyelesaian perselisihan Nama Domain Indonesia;
# e. melaporkan daftar Registrar Nama Domain kepada Menteri;
# f. mengikuti ketentuan pengelolaan Nama Domain internasional dan peraturan perundang-undangan di Indonesia; dan
# g. menyampaikan laporan berkala kepada Menteri sekurang-kurangnya 1 (satu) kali dalam setahun.

# ## Pasal 16
# Registri Nama Domain Tingkat Tinggi Indonesia berwenang:
# a. menolak pendaftaran Nama Domain Tingkat Tinggi Indonesia apabila Nama Domain tersebut tidak memenuhi persyaratan;
# b. menonaktifkan sementara penggunaan Nama Domain Tingkat Tinggi Indonesia; dan
# c. menghapus Nama Domain Tingkat Tinggi Indonesia apabila pengguna Nama Domain tersebut melanggar ketentuan peraturan perundang undangan setelah ada putusan pengadilan dan/atau lembaga arbitrase yang berkekuatan hukum tetap.

# ## Pasal 17
# Registri Nama Domain Tingkat Tinggi Indonesia sebagaimana dimaksud dalam Pasal 8 huruf b bertanggung jawab dalam melakukan pengelolaan, pengoperasian, dan pemeliharaan penyelenggaraan sistem elektronik Nama Domain Tingkat Tinggi Indonesia.

# ## Pasal 18
# Dalam hal Registri Nama Domain Tingkat Tinggi Indonesia bermaksud menghentikan kegiatannya, paling lambat 3 (tiga) bulan sebelum penghentian kegiatannya wajib menyerahkan seluruh pengelolaan Nama Domain yang dikelolanya kepada Menteri dengan memperhatikan kelangsungan Nama Domain Indonesia.

# ## Bagian Ketiga
# Registrar Nama Domain

# ## Paragraf 1
# Umum

# ## Pasal 19
# (1) Registrar Nama Domain sebagaimana dimaksud dalam Pasal 5 ayat (3) huruf b melaksanakan pendaftaran Nama Domain Tingkat Kedua dan Tingkat Turunan.
# (2) Pendaftaran Nama Domain sebagaimana dimaksud pada ayat (1) dilakukan atas permohonan Pengguna Nama Domain.

# ## Pasal 20
# Registrar Nama Domain terdiri atas:
# a. Registrar Nama Domain Instansi; dan
# b. Registrar Nama Domain Selain Instansi.

# ## Paragraf 2
# Registrar Nama Domain Instansi

# ## Pasal 21
# (1) Registrar Nama Domain Instansi sebagaimana dimaksud dalam Pasal 20 huruf a melaksanakan pendaftaran Nama Domain untuk kebutuhan Instansi Penyelenggara Negara.
# (2) Nama Domain untuk kebutuhan Instansi Penyelenggara Negara sebagaimana dimaksud pada ayat (1) terdiri atas Nama Domain .go.id dan .mil.id.
# (3) Registrar Nama Domain Instansi sebagaimana dimaksud pada ayat (1) dilaksanakan oleh Menteri.
# (4) Ketentuan lebih lanjut mengenai Registrar Nama Domain Instansi sebagaimana dimaksud pada ayat (1) dan ayat (3) diatur secara terpisah dalam peraturan menteri tersendiri.

# ## Paragraf 3
# Registrar Nama Domain Selain Instansi

# ## Pasal 22
# (1) Registrar Nama Domain Selain Instansi sebagaimana dimaksud dalam Pasal 20 huruf b melakukan pendaftaran Nama Domain Tingkat Kedua dan Nama Domain Tingkat Turunan untuk pengguna komersial dan nonkomersial.
# (2) Registrar Nama Domain Selain Instansi sebagaimana dimaksud pada ayat (1) terdiri atas:
# 	a. Registrar Nama Domain dengan Registri di Indonesia; dan
# 	b. Registrar Nama Domain dengan Registri di luar Indonesia.

# ## Pasal 23
# Registrar Nama Domain dengan Registri di Indonesia menyediakan jasa pendaftaran Nama Domain yang dikelola oleh Registri Nama Domain yang berdomisili di Indonesia.

# ## Pasal 24
# Registrar Nama Domain dengan Registri di Indonesia wajib mengikuti ketentuan Nama Domain yang ditetapkan oleh Registri Nama Domain sebagaimana dimaksud dalam Pasal 14 huruf a.

# ## Pasal 25
# Registrar Nama Domain dengan Registri di Indonesia berhak memperoleh pendapatan dengan memungut biaya pendaftaran dan/atau penggunaan Nama Domain dari Pengguna Nama Domain.

# ## Pasal 26
# Registrar Nama Domain dengan Registri di Indonesia dalam memberikan layanannya tidak bertanggung jawab terhadap segala implikasi hukum yang berkenaan dengan Nama Domain, kecuali yang diakibatkan karena kelalaiannya.

# ## Pasal 27
# (1) Dalam hal Registrar Nama Domain bermaksud menghentikan kegiatannya dan/atau tidak bisa melanjutkan kegiatannya, paling lambat 3 (tiga) bulan sebelum penghentian kegiatan, Registrar Nama Domain wajib menginformasikan kepada Registri Nama Domain dimana registar dimaksud terdaftar dan Pengguna Nama Domain.
# (2) Dalam hal Registrar Nama Domain habis masa berlaku, dicabut haknya, atau dalam keadaan memaksa, maka Registri Nama Domain berhak mengalihkan seluruh pengelolaan Nama Domain kepada Registrar Nama Domain lain yang dipilih oleh Pengguna Nama Domain.

# ## Pasal 28
# (1) Registrar Nama Domain dengan Registri di luar Indonesia menyediakan jasa pendaftaran Nama Domain yang dikelola oleh Registri Nama Domain yang berdomisili di luar Indonesia.
# (2) Registrar Nama Domain dengan Registri di luar Indonesia wajib mengikuti ketentuan Nama Domain yang ditetapkan oleh Registrinya sepanjang tidak bertentangan dengan ketentuan peraturan perundang- undangan.

# ## Pasal 29
# Registrar Nama Domain dengan Registri di luar Indonesia wajib mengikuti ketentuan sebagai pengelola Nama Domain sebagaimana diatur dalam Peraturan Menteri ini.
# '''

# parts = re.findall(r'(## Bagian [\S\s]*?)(?=\s+(?:## Bagian|$))', text, re.IGNORECASE)
# print(len(parts))
# display(parts)

In [None]:
# def merge_article_references(list1, list2):
#     list1_int = set(int(x) for x in list1)
#     list2_int_expanded = set()
#     for start, end in list2:
#         list2_int_expanded.update(range(int(start), int(end) + 1))
#     return sorted(list1_int.union(list2_int_expanded))


# def parse_articles(
#         text: str,
#         chapter_num: int,
#         part_about: str,
#         paragraph_about: str,
#         regulation_dict: dict,
#         definition_list: list,
#         id_template: str
# ) -> tuple[dict, list]:
    
#     # Get all articles
#     articles = re.findall(patterns['chapter']['article'], text, re.IGNORECASE)

#     # Iterate for every articles
#     for article in articles:
#         article_num = int(re.search(patterns['article']['number'], article, re.IGNORECASE)[1])
#         article_text = re.search(patterns['article']['text'], article, re.IGNORECASE)[1].strip()
#         article_text = re.sub(r'\n+', '\n', article_text)
#         # print(f'Pasal {article_num}')
#         # print(article_text, '\n')
        
#         # Get definition (Article/Pasal 1)
#         if article_num == 1:
#             definitions = re.findall(patterns['article']['definition'], article_text)
#             for index, definition_data in enumerate(definitions):
#                 definition, name = definition_data
#                 definition_list.append({
#                     'id': id_template.format(
#                         reg_section=encode['section']['definition'],
#                         section_num=str(index + 1).zfill(3),
#                         extra_section_number='00'
#                     ),
#                     'name': name.strip(),
#                     'definition': definition.strip()
#                 })
#             regulation_dict['content']['definitions'] = definition_list
        
#         # Store article
#         regulation_dict['content']['chapters'][chapter_num]['articles'][article_num] = {
#             'id': id_template.format(
#                 reg_section=encode['section']['article'],
#                 section_num=str(article_num).zfill(3),
#                 extra_section_number='00'
#             ),
#             'part': part_about,
#             'paragraph': paragraph_about,
#             'text': article_text
#         }

#         # Get article reference to other article
#         regulation_dict['content']['chapters'][chapter_num]['articles'][article_num]['references'] = list()
#         reference_type_1 = list(set(re.findall(patterns['article']['reference_1'], article_text, re.IGNORECASE)))
#         reference_type_2 = list(set(re.findall(patterns['article']['reference_2'], article_text, re.IGNORECASE)))

#         article_references = merge_article_references(reference_type_1, reference_type_2)

#         for article_reference_num in article_references:
#             regulation_dict['content']['chapters'][chapter_num]['articles'][article_num]['references'].append(
#                 id_template.format(
#                     reg_section=encode['section']['article'],
#                     section_num=str(article_reference_num).zfill(3),
#                     extra_section_number='00'
#                 )
#             )
        
#         # TODO 1: Add dedicated parsing for amending (revision) regulations
#         # TODO 2: Add references to amendment regulations
#         # E.g. if Pasal 5 is a revised article, it must be linked to the same article in the previous document.
#         # Likewise, the ID for Pasal 10A must end with extra_section_number='01', for 10B with '02', and so on.
#         # TODO 3: Write each regulation's JSON to a separate file? Keeping everything in one file is too complicated
#         regulation_dict['content']['chapters'][chapter_num]['articles'][article_num]['amend'] = list()
    
#     return regulation_dict, definition_list

In [None]:
# DIR_PATH = os.path.join('data', 'markdown', 'fix', 'temp')
# result = list()
# files = list()

# start_time = time.time()

# for filename in os.listdir(DIR_PATH):
#     if filename.endswith(".md"):
#         files.append((os.path.join(DIR_PATH, filename), filename))

# for index, data in enumerate(files):
#     # Initialize data
#     path, filename = data
#     regulation_dict = dict()
#     definition_list = list()
#     # section_index = 0

#     # # Just for filtering
#     # if filename not in  ['PERMENKOMINFO_002_2016.md']:
#     # # if filename not in  ['UU_NO_11_2008.md', 'PP_NO_71_2019.md', 'UU_NO_27_2022.md']:
#     # # if filename not in  ['UU_NO_11_2008.md']:
#     #     continue
#     print(f'File: {path}')

#     # Get file metadata
#     metadata = re.search(patterns['document']['metadata'], filename)
#     regulation_type = encode['type'][metadata[1]]
#     regulation_year = metadata[2]
#     regulation_num = int(metadata[3])

#     id_template = f'{regulation_year}{regulation_type}{str(regulation_num).zfill(3)}' + '{reg_section}{section_num}{extra_section_number}'

#     regulation_id = id_template.format(reg_section=encode['section']['document'], section_num='000', extra_section_number='00')

#     with open('data/regulation_data_modified.json') as input_file:
#         for regulation_data in json.load(input_file):
#             if regulation_data['id'] == regulation_id:
#                 regulation_dict = regulation_data

#     with open(path, 'r', encoding='utf8') as file:
#         # Read file
#         text = file.read()

#         # Define main content
#         regulation_dict['content'] = dict()

#         # Get main structure
#         regulation_dict['content']['considering'] = {
#             'id': id_template.format(reg_section=encode['section']['considering'], section_num='000', extra_section_number='00'),
#             'text': re.search(patterns['main']['considering'], text, re.IGNORECASE)[1].strip()
#         }
#         regulation_dict['content']['observing'] = {
#             'id': id_template.format(reg_section=encode['section']['observing'], section_num='000', extra_section_number='00'), 
#             'text': re.search(patterns['main']['observing'], text, re.IGNORECASE)[1].strip()
#         }

#         # Get all chapters
#         chapters = re.findall(patterns['main']['chapter'], text, re.IGNORECASE)
#         regulation_dict['content']['chapters'] = dict()

#         # Iterate for every chapters
#         for num, chapter in enumerate(chapters):
#             # if num == 1:
#             #     print(chapter)
#             chapter_num = num + 1
#             chapter_about = re.search(patterns['chapter']['about'], chapter, re.IGNORECASE)[1].strip().upper()
#             chapter_about = re.sub(r'\n', ': ', chapter_about, flags=re.IGNORECASE)

#             regulation_dict['content']['chapters'][chapter_num] = {
#                 'id': id_template.format(
#                     reg_section=encode['section']['chapter'],
#                     section_num=str(chapter_num).zfill(3),
#                     extra_section_number='00'
#                 ),
#                 'about': chapter_about
#             }
        
#             regulation_dict['content']['chapters'][chapter_num]['articles'] = dict()

#             # Get all parts
#             parts = re.findall(patterns['chapter']['part'], chapter.strip() + '\n', re.IGNORECASE)

#             # If the part exists
#             if parts:
#                 # Iterate for every parts
#                 for part in parts:
#                     # Get part about/name
#                     part_about = re.search(patterns['part']['about'], part, re.IGNORECASE)[1].strip()
#                     part_about = re.sub(r'\n', ': ', part_about, flags=re.IGNORECASE)

#                     # Get all paragraphs
#                     paragraphs = re.findall(patterns['chapter']['paragraph'], part.strip() + '\n', re.IGNORECASE)
                    
#                     # If the paragraph exists
#                     if paragraphs:
#                         # Iterate for every paragraphs
#                         for paragraph in paragraphs:
#                             # Get paragraph about/name
#                             paragraph_about = re.search(patterns['paragraph']['about'], paragraph, re.IGNORECASE)[1].strip()
#                             paragraph_about = re.sub(r'\n', ': ', paragraph_about, flags=re.IGNORECASE)
                            
#                             regulation_dict, definition_list = parse_articles(
#                                 text=paragraph,
#                                 chapter_num=chapter_num,
#                                 part_about=part_about,
#                                 paragraph_about=paragraph_about,
#                                 regulation_dict=regulation_dict,
#                                 definition_list=definition_list,
#                                 id_template=id_template
#                             )

#                     else:
#                         regulation_dict, definition_list = parse_articles(
#                             text=part,
#                             chapter_num=chapter_num,
#                             part_about=part_about,
#                             paragraph_about='',
#                             regulation_dict=regulation_dict,
#                             definition_list=definition_list,
#                             id_template=id_template
#                         )

#             else:
#                 regulation_dict, definition_list = parse_articles(
#                     text=chapter,
#                     chapter_num=chapter_num,
#                     part_about='',
#                     paragraph_about='',
#                     regulation_dict=regulation_dict,
#                     definition_list=definition_list,
#                     id_template=id_template
#                 )
        
#         result.append(regulation_dict)

# end_time = time.time() - start_time
# display(f'{round(end_time * 1000, 2)} milisecond')

# # https://www.freecodecamp.org/news/how-to-pretty-print-json-in-python/
# # https://www.geeksforgeeks.org/how-to-convert-python-dictionary-to-json/
# # Convert the data to a JSON formatted string with 4 spaces of indentation
# with open("output_new_new.json", "w") as outfile: 
#     json.dump(result, outfile, indent=4)
#     output_json_str = json.dumps(result, indent=4)
#     # Print the pretty-printed JSON string
#     print(output_json_str)

In [None]:
            # # Get all parts
            # parts = re.findall(patterns['chapter']['part'], chapter, re.IGNORECASE)
            # if parts:
            #     for part in parts:
            #         # Get all paragraphs
            #         paragraphs = re.findall(patterns['chapter']['paragraph'], part, re.IGNORECASE)
            # else:
            #     pass


            # def parse_articles(text: str):

In [None]:
# encode = {
#     'type': {
#         'UU': '01',
#         'PERPPU': '02',
#         'PP': '03',
#         'PERPRES': '04',
#         'PERMENKOMINFO': '05'
#     },
#     'section': {
#         'document': '1',
#         'considering': '2',
#         'observing': '3',
#         'definition': '4',
#         'chapter': '5',
#         'article': '6',
#         'section': '7',
#     }
# }

# patterns = {
#     'document': {
#         'metadata': r'^(\w+)_(\w+)_(\w+)'  # Regulation type, year, and number
#     },
#     'main': {
#         'considering': r'(?<=## menimbang)([\S\s]*?)(?=## mengingat)',  # "Considering" section (Menimbang)
#         'observing': r'(?<=## mengingat)([\S\s]*?)(?=(?:dengan persetujuan bersama|## memperhatikan|## memutuskan))',  # "Observing" section (Mengingat)
#         'chapter': r'(## BAB[\S\s]*?)(?=\s+(?:## BAB|agar setiap orang mengetahuinya))',  # List of chapters (Bab)
#     },
#     'chapter': {
#         # For every chapter
#         'about': r'## BAB [IVXLCDM]+\s([\w\s]+)##',  # Chapter title
#         'article': r'(## Pasal \w+[\S\s]*?)(?=(?:##|$))'  # List of articles (Pasal) [\d was replaced with \w because of Pasal 51A]
#     },
#     'article': {
#         # For every article
#         'number': r'## Pasal (\w+)',  # Article number
#         'text': r'## Pasal \w+\n*([\S\s]*)',  # Article body text
#         'definition': r'\(\d+\) (([A-Z][a-z]*(?:\s[A-Z][a-z]*)*) .*)',  # List of definitions
#         # 'definition': r'\(\d+\) ((.*)(?:\badalah\b) .*)',  # List of definitions
#         # \(\d+\) (([A-Z][a-z]*(?:\s[A-Z][a-z]*)*) .*)
#         # \(\d+\) (([\w\s]+) (?:yang selanjutnya|adalah).*)
#         # 'section': r'-\s*(\(\d\)) ([\S\s]*?)(?=(?:- (\(\d+\))|$))',  # List of sections (Ayat) (for splitting the text down to each ayat, but the match is not perfect)
#         'reference_1': r'Pasal (\w+)',  # Article reference, type 1 (single article)
#         'reference_2': r'Pasal (\w+) sampai dengan Pasal (\w+)',  # Article reference, type 2 (range: "Pasal X sampai dengan Pasal Y")
#     }
# }

In [None]:
# def merge_article_references(list1, list2):
#     list1_int = set(int(x) for x in list1)
#     list2_int_expanded = set()
#     for start, end in list2:
#         list2_int_expanded.update(range(int(start), int(end) + 1))
#     return sorted(list1_int.union(list2_int_expanded))

In [None]:
# DIR_PATH = os.path.join('output', 'output')
# result = list()
# files = list()

# start_time = time.time()

# for filename in os.listdir(DIR_PATH):
#     if filename.endswith(".md"):
#         files.append((os.path.join(DIR_PATH, filename), filename))

# for index, data in enumerate(files[:5]):
#     # TODO: Test run on 5 regulations; some regulations still have incomplete content, and amending regulations are not parsed correctly
#     # Initialize data
#     path, filename = data
#     regulation_dict = dict()
#     definition_list = list()
#     # section_index = 0

#     # # Just for filtering
#     # if filename not in  ['PERMENKOMINFO_002_2016.md']:
#     # # if filename not in  ['UU_NO_11_2008.md', 'PP_NO_71_2019.md', 'UU_NO_27_2022.md']:
#     # # if filename not in  ['UU_NO_11_2008.md']:
#     #     continue
#     print(f'File: {path}')

#     # Get file metadata
#     metadata = re.search(patterns['document']['metadata'], filename)
#     regulation_type = encode['type'][metadata[1]]
#     regulation_num = int(metadata[2])
#     regulation_year = metadata[3]

#     id_template = f'{regulation_year}{regulation_type}{str(regulation_num).zfill(3)}' + '{reg_section}{section_num}'

#     regulation_id = id_template.format(reg_section=encode['section']['document'], section_num='000')
#     with open('data/regulation_data_modified.json') as input_file:
#         for regulation_data in json.load(input_file):
#             if regulation_data['id'] == regulation_id:
#                 regulation_dict = regulation_data

#     with open(path, 'r', encoding='utf8') as file:
#         # Read file
#         text = file.read()

#         # Define main content
#         regulation_dict['content'] = dict()

#         # Get main structure
#         regulation_dict['content']['considering'] = {
#             'id': id_template.format(reg_section=encode['section']['considering'], section_num='000'),
#             'text': re.search(patterns['main']['considering'], text, re.IGNORECASE)[1].strip()
#         }
#         regulation_dict['content']['observing'] = {
#             'id': id_template.format(reg_section=encode['section']['observing'], section_num='000'), 
#             'text': re.search(patterns['main']['observing'], text, re.IGNORECASE)[1].strip()
#         }

#         # Get all chapters
#         chapters = re.findall(patterns['main']['chapter'], text, re.IGNORECASE)
#         regulation_dict['content']['chapters'] = dict()

#         # Iterate for every chapters
#         for num, chapter in enumerate(chapters):
#             chapter_num = num + 1
#             regulation_dict['content']['chapters'][chapter_num] = {
#                 'id': id_template.format(reg_section=encode['section']['chapter'], section_num=str(chapter_num).zfill(3)),
#                 'about': re.search(patterns['chapter']['about'], chapter, re.IGNORECASE)[1].strip()
#             }

#             # Get all articles
#             articles = re.findall(patterns['chapter']['article'], chapter, re.IGNORECASE)
#             regulation_dict['content']['chapters'][chapter_num]['articles'] = dict()

#             # Iterate for every articles
#             for article in articles:
#                 article_num = int(re.search(patterns['article']['number'], article, re.IGNORECASE)[1])
#                 article_text = re.search(patterns['article']['text'], article, re.IGNORECASE)[1].strip()
#                 article_text = re.sub(r'\n+', '\n', article_text)
                
#                 # Get definition (special article 1)
#                 if article_num == 1:
#                     definitions = re.findall(patterns['article']['definition'], article_text)
#                     for index, definition_data in enumerate(definitions):
#                         definition, name = definition_data
#                         definition_list.append({
#                             'id': id_template.format(reg_section=encode['section']['definition'], section_num=str(index + 1).zfill(3)),
#                             'name': name.strip(),
#                             'definition': definition.strip()
#                         })
#                     regulation_dict['content']['definitions'] = definition_list
                
#                 # Store article
#                 regulation_dict['content']['chapters'][chapter_num]['articles'][article_num] = {
#                     'id': id_template.format(reg_section=encode['section']['article'], section_num=str(article_num).zfill(3)),
#                     'text': article_text
#                 }

#                 # Get article reference to other article
#                 regulation_dict['content']['chapters'][chapter_num]['articles'][article_num]['references'] = list()
#                 reference_type_1 = list(set(re.findall(patterns['article']['reference_1'], article_text, re.IGNORECASE)))
#                 reference_type_2 = list(set(re.findall(patterns['article']['reference_2'], article_text, re.IGNORECASE)))

#                 article_references = merge_article_references(reference_type_1, reference_type_2)

#                 for article_reference_num in article_references:
#                     regulation_dict['content']['chapters'][chapter_num]['articles'][article_num]['references'].append(
#                         id_template.format(reg_section=encode['section']['article'], section_num=str(article_reference_num).zfill(3))
#                     )
        
#         result.append(regulation_dict)

# end_time = time.time() - start_time
# display(f'{round(end_time * 1000, 2)} milisecond')

# # https://www.freecodecamp.org/news/how-to-pretty-print-json-in-python/
# # https://www.geeksforgeeks.org/how-to-convert-python-dictionary-to-json/
# # Convert the data to a JSON formatted string with 4 spaces of indentation
# with open("output_new.json", "w") as outfile: 
#     json.dump(result, outfile, indent=4)
#     output_json_str = json.dumps(result, indent=4)
#     # Print the pretty-printed JSON string
#     print(output_json_str)

In [None]:
# with open('data/regulation_data_modified.json') as input_file:
#     for regulation_data in json.load(input_file):
#         if regulation_data['id'] == '1999010361000':
#             display(regulation_data)

In [None]:
# test = {
#     'key1': 'value1',
#     'key2': 'value2',
#     'key3': 'value3'
# }

# for i in test.values():
#     print(i)

# print(list(test.values()))
# print(type(list(test.values())))
# # print(test.items())

In [None]:
# encode = {
#     'type': {
#         'UU': '01',
#         'PERPPU': '02',
#         'PP': '03',
#         'PERPRES': '04',
#         'PERMENKOMINFO': '05'
#     },
#     'section': {
#         'document': '1',
#         'considering': '2',
#         'observing': '3',
#         'definition': '4',
#         'chapter': '5',
#         'article': '6',
#         'section': '7',
#     }
# }

# patterns = {
#     'document': {
#         'metadata': r'^(.*)_NO_(\d+)_(\d+)'  # Regulation type, number, and year
#     },
#     'stopword': {
#         'hukumonline': r'(www.hukumonline.com)',  # Header/footer www.hukumonline.com
#         'image': r'(<!-- image -->)',  # Image placeholder marker
#     },
#     'main': {
#         # 'name': r'([\S\s]*?)(?=## DENGAN RAHMAT TUHAN YANG MAHA ESA)',  # Regulation name
#         'about': r'(?<=TENTANG)([\S\s]*?)(?=## DENGAN RAHMAT TUHAN YANG MAHA ESA)',  # Subject of the regulation (text after "TENTANG")
#         'considering': r'(?<=## Menimbang:)([\S\s]*?)(?=## Mengingat)',  # "Considering" section (Menimbang)
#         'observing': r'(?<=## Mengingat:)([\S\s]*?)(?=(?:Dengan Persetujuan Bersama|## MEMUTUSKAN))',  # "Observing" section (Mengingat)
#         'chapter': r'(## BAB[\S\s]*?)(?=\s+(?:## BAB|Agar setiap orang))',  # List of chapters (Bab)
#     },
#     'chapter': {
#         # For every chapter
#         'about': r'## BAB [IVXLCDM]+[\s\S]*?(## .*)',  # Chapter title
#         'article': r'(## Pasal \d+[\S\s]*?)(?=(?:##|$))'  # List of articles (Pasal) [TODO: change \d to \w because of Pasal 51A]
#     },
#     'article': {
#         # For every article
#         'number': r'## Pasal (\d+)',  # Article number
#         'text': r'## Pasal \d+\s*([\S\s]*)',  # Article body text
#         'definition': r'-\s*\d+\. (([A-Z][a-z]*(?:\s[A-Z][a-z]*)*) .*)',  # List of definitions
#         'section': r'-\s*(\(\d\)) ([\S\s]*?)(?=(?:- (\(\d+\))|$))',  # List of sections (Ayat)
#         'reference_1': r'Pasal (\d+)',  # Article reference, type 1 (single article)
#         'reference_2': r'Pasal (\d+) sampai dengan Pasal (\d+)',  # Article reference, type 2 (range: "Pasal X sampai dengan Pasal Y")
#     }
# }

In [None]:
# def merge_article_references(list1, list2):
#     list1_int = set(int(x) for x in list1)
#     list2_int_expanded = set()
#     for start, end in list2:
#         list2_int_expanded.update(range(int(start), int(end) + 1))
#     return sorted(list1_int.union(list2_int_expanded))

In [None]:
# DIRECTORY_PATH = os.path.join('data', 'good')
# output = list()
# files = list()

# start_time = time.time()

# for filename in os.listdir(DIRECTORY_PATH):
#     if filename.endswith(".md"):
#         files.append((os.path.join(DIRECTORY_PATH, filename), filename))

# for index, data in enumerate(files):
#     # Initialize data
#     path, filename = data
#     regulation_dict = dict()
#     definition_dict = dict()
#     # section_index = 0

#     # Just for filtering
#     if filename not in  ['UU_NO_11_2008.md', 'PP_NO_71_2019.md', 'UU_NO_27_2022.md']:
#     # if filename not in  ['UU_NO_11_2008.md']:
#         continue
#     print(f'File: {path}')

#     # Get file metadata
#     metadata = re.search(patterns['document']['metadata'], filename)
#     regulation_type = encode['type'][metadata[1]]
#     regulation_num = metadata[2]
#     regulation_year = metadata[3]

#     id_template = f'{regulation_year}{regulation_type}{str(regulation_num).zfill(3)}' + '{reg_section}{section_num}'

#     regulation_dict['metadata'] = {
#         'id': id_template.format(reg_section=encode['section']['document'], section_num='000'),
#         'type': metadata[1],
#         'num': regulation_num,
#         'year': regulation_year
#     }
#     # print(f"Regulation ID: {regulation_dict['metadata']['id']}")

#     with open(path, 'r', encoding='utf8') as file:
#         # Read file
#         text = file.read()

#         # Remove stopword
#         text = str.strip(re.sub(patterns['stopword']['hukumonline'], '', text))
#         text = str.strip(re.sub(patterns['stopword']['image'], '', text))

#         # Get main structure
#         regulation_dict['metadata']['about'] = str.strip(re.search(patterns['main']['about'], text, re.IGNORECASE)[1]).title()
#         regulation_dict['considering'] = {
#             'id': id_template.format(reg_section=encode['section']['considering'], section_num='000'),
#             'text': str.strip(re.search(patterns['main']['considering'], text, re.IGNORECASE)[1])
#         }
#         regulation_dict['observing'] = {
#             'id': id_template.format(reg_section=encode['section']['observing'], section_num='000'), 
#             'text': str.strip(re.search(patterns['main']['observing'], text, re.IGNORECASE)[1])
#         }
#         chapters = re.findall(patterns['main']['chapter'], text, re.IGNORECASE)

#         # display(f"NAMA: {regulation_dict['name']}")
#         # print(f"MENIMBANG:\n\n{regulation_dict['considering']}")
#         # display(f"MENGINGAT: {regulation_dict['observing']}")
#         # display(f"MENETAPKAN: {regulation_dict['to_enact']}")
#         # display(chapters)

#         # Get chapter structure
#         regulation_dict['chapters'] = dict()
#         for num, chapter in enumerate(chapters):
#             chapter_num = num + 1
#             articles = re.findall(patterns['chapter']['article'], chapter, re.IGNORECASE)
#             regulation_dict['chapters'][chapter_num] = {
#                 'id': id_template.format(reg_section=encode['section']['chapter'], section_num=str(chapter_num).zfill(3)),
#                 'about': re.search(patterns['chapter']['about'], chapter, re.IGNORECASE)[1]
#             }
#             regulation_dict['chapters'][chapter_num]['articles'] = dict()

#             # print(f'\nBAB {chapter_num} {regulation_dict['chapters'][chapter_num]['about']}\n')
#             # display(articles)

#             # Get article structure
#             for article in articles:
#                 article_num = int(re.search(patterns['article']['number'], article)[1])
#                 article_text = re.search(patterns['article']['text'], article)[1]
#                 article_text = str.strip(re.sub(r'\n+', '\n', article_text))
#                 sections = re.findall(patterns['article']['section'], article_text, re.IGNORECASE)
                
#                 if article_num == 1:
#                     definition_ls = re.findall(patterns['article']['definition'], article_text)
#                     for index, definition_data in enumerate(definition_ls):
#                         definition, name = definition_data
#                         definition_dict[index + 1] = {
#                             'id': id_template.format(reg_section=encode['section']['definition'], section_num=str(index + 1).zfill(3)),
#                             'about': name,
#                             'text': str.strip(definition)
#                         }
#                     regulation_dict['definitions'] = definition_dict
#                     # display(definition_dict)
                
#                 regulation_dict['chapters'][chapter_num]['articles'][article_num] = {
#                     'id': id_template.format(reg_section=encode['section']['article'], section_num=str(article_num).zfill(3)),
#                     'text': article_text
#                 }

#                 regulation_dict['chapters'][chapter_num]['articles'][article_num]['references'] = dict()
#                 reference_type_1 = list(set(re.findall(patterns['article']['reference_1'], article_text, re.IGNORECASE)))
#                 reference_type_2 = list(set(re.findall(patterns['article']['reference_2'], article_text, re.IGNORECASE)))
                
#                 # if reference_type_1:
#                     # print(reference_type_1)
#                 # if reference_type_2: 
#                     # print(reference_type_2)
                
#                 article_references = merge_article_references(reference_type_1, reference_type_2)
#                 # print(article_references)

#                 for index, article_reference_num in enumerate(article_references):
#                     regulation_dict['chapters'][chapter_num]['articles'][article_num]['references'][index + 1] = \
#                         id_template.format(reg_section=encode['section']['article'], section_num=str(article_reference_num).zfill(3))

#                 # print(f'Pasal {article_num}')
#                 # if sections: 
#                 #     section_dict = dict()
#                 #     for section in sections:
#                 #         section_index += 1
#                 #         section_dict[section[0]] = {
#                 #             'id': id_template.format(reg_section=encode['section']['section'], section_num=str(section_index).zfill(3)),
#                 #             'text': str.strip(section[1])
#                 #         }
#                 #         regulation_dict['chapters'][chapter_num]['articles'][article_num]['sections'] = section_dict
#                 #         print(section[0], str.strip(section[1]))
#                 #     print()
#                 # else:
#                 #     print(article_text, '\n')
        
#         output.append(regulation_dict)

# end_time = time.time() - start_time
# display(f'{round(end_time * 1000, 2)} milliseconds')

# # https://www.freecodecamp.org/news/how-to-pretty-print-json-in-python/
# # https://www.geeksforgeeks.org/how-to-convert-python-dictionary-to-json/
# # Write the data to output.json as JSON with 4 spaces of indentation, then print the same pretty-printed JSON string
# with open("output.json", "w") as outfile: 
#     json.dump(output, outfile, indent=4)
#     output_json_str = json.dumps(output, indent=4)
#     # Print the pretty-printed JSON string
#     print(output_json_str)

In [None]:
# pattern = r'## BAB [IVXLCDM]+[\s\S]*?\n(## .*)'
# for chapter in chapters:
#     print(re.search(pattern, chapter)[1])