### Import libraries

In [2]:
import re
import json

from pdf2image import convert_from_path
import pytesseract

### Convert PDF to Images and then extract the text using Tesseract-OCR

In [4]:
pdf_file_path = '../Data/91~PMK.01~2017Per.pdf'

# NOTE(review): poppler_path is never passed to convert_from_path, so poppler
# is resolved from the system PATH. The value is also a relative path
# ('usr/bin', no leading slash) -- confirm the intended location before
# wiring it in via convert_from_path(..., poppler_path=poppler_path).
poppler_path = 'usr/bin'

# Render every page of the PDF to an image.
images = convert_from_path(pdf_file_path)

# OCR each page in order and concatenate the results into a single string.
# str.join replaces the repeated `+=` concatenation and the unused loop index.
extracted_text = ''.join(pytesseract.image_to_string(image) for image in images)

print(extracted_text)


MENTERI KEUANGAN
REPUBLIK INDONESIA

SALINAN

PERATURAN MENTERI KEUANGAN REPUBLIK INDONESIA
NOMOR 91. /PMK.01/2017
TENTANG
ORGANISASI DAN TATA KERJA PUSAT INVESTASI PEMERINTAH

DENGAN RAHMAT TUHAN YANG MAHA ESA

MENTERI KEUANGAN REPUBLIK INDONESIA,

Menimbang : a. bahwa berdasarkan Peraturan Menteri Keuangan
. Nomor 52/PMK.01/2007 tentang Organisasi dan Tata
Kerja Pusat Investasi Pemerintah, telah dibentuk Pusat
Investasi Pemerintah sebagai unit khusus yang
bertugas’ melaksanakan kewenangan  operasional
dalam pengelolaan investasi pemerintah pusat dan
bertanggung jawab kepada Menteri Keuangan;

b. bahwa untuk menjamin keberlangsungan program
pembiayaan kepada usaha mikro, kecil, dan menengah
telah dialokasikan dana pada Pusat _ Investasi
Pemerintah di dalam Anggaran Pendapatan dan
Belanja Negara;

c. bahwa agar program pembiayaan sebagaimana
dimaksud dalam huruf b dapat dikelola  secara
transparan dan akuntabel sesuai dengan ketentuan
pengelolaan keuangan negara, perlu dilakukan
penata

### Clean the extracted text

In [5]:
def clean_text(text):
    """
    Normalise OCR output by applying a fixed sequence of regex substitutions.

    The substitutions run strictly in the order listed below; afterwards
    leading and trailing whitespace is removed.

    Args:
        text (str): The raw text to clean.

    Returns:
        str: The cleaned text.
    """
    # (pattern, replacement) pairs -- the order matters, later rules see the
    # output of earlier ones.
    substitutions = (
        (r'\n+', '\n'),                      # collapse runs of newlines
        (r'\s+', ' '),                       # collapse all whitespace to one space
        (r'-\s+', '-'),                      # drop whitespace following a hyphen
        ('_', ''),                           # remove underscores
        (r'-\d+-', ''),                      # remove "-<digits>-" markers
        (r'-[a-zA-Z]+-', ''),                # remove "-<letters>-" markers
        (r'\b\w*\\u\w*\b', ''),              # remove tokens containing a literal "\u"
        (r'www\.peraturan\.go\.id\s+\d{4},\s+No\.\s+\d+', ''),  # site/year/number line
        (r'(www\.\S+\s+)\d{4}\b', r'\1'),    # drop a 4-digit number after a www address
        (r'\b(?:https?://|www\.)\S+\b', ''),  # remove remaining URLs
        (r'\s+', ' '),                       # re-collapse whitespace left by removals
    )

    cleaned = text
    for regex, replacement in substitutions:
        cleaned = re.sub(regex, replacement, cleaned)

    return cleaned.strip()

### Split content into sections

In [6]:
def split_content(content_text):
    """
    Break a chapter body into alternating article headings and article texts.

    A heading is the literal word "Pasal", one space and a run of digits,
    recognised only at the start of the text, directly after a full stop, or
    directly after a newline (optionally followed by whitespace). Anything
    before the first heading is discarded.

    Args:
        content_text (str): The text to be split.

    Returns:
        list: ``[heading, text, heading, text, ...]``; empty when no heading
            is found.
    """
    heading_re = re.compile(r'(?:(?<=\.)|(?<=\n)|^)\s*(Pasal \d+)')
    pieces = heading_re.split(content_text)
    # pieces[0] is whatever precedes the first heading -- drop it.
    return pieces[1:]

### Parsing

In [7]:
def parse_text(cleaned_text):
    """Parse the cleaned text and extract one record per Pasal (article).

    The text is scanned for chapter headings of the form
    ``BAB <roman numeral> <title> Pasal <n>``. For every heading, the text
    between the end of the title and the next literal ``BAB`` (or the end of
    the text) is handed to ``split_content`` and each resulting article
    becomes one dictionary.

    Args:
        cleaned_text (str): The cleaned text to be parsed.

    Returns:
        list: A list of dictionaries containing the parsed information.
            Each dictionary represents one article and has the keys
            ``additional_context``, ``bab``, ``bagian``, ``content``,
            ``context``, ``paragraf``, ``pasal``, ``ref`` and ``type``.
    """
    stop_phrase = "Ditetapkan di Jakarta"
    chapter_end_marker = 'BAB'
    pattern = re.compile(r'BAB\s+([IVXLCDM]+)[\s\n]+([^0-9]+)\s+Pasal\s+(\d+)', re.IGNORECASE)

    parsed_text = []

    for heading in pattern.finditer(cleaned_text):
        bab_number, bab_title = heading.group(1), heading.group(2)

        # Slice the chapter body starting at this heading's own title instead
        # of re-searching the whole text for the title. Re-searching treated
        # the title as a regex (breaking on metacharacters) and picked up the
        # first occurrence of the title anywhere in the document.
        body_start = heading.end(2)
        body_end = cleaned_text.find(chapter_end_marker, body_start)
        if body_end == -1:
            body_end = len(cleaned_text)

        sections = split_content(cleaned_text[body_start:body_end].strip())

        # split_content returns [heading, text, heading, text, ...].
        for content_number, content_text in zip(sections[::2], sections[1::2]):
            pasal_number = re.search(r'(\d+)', content_number).group(1)

            # Cut the closing block first so that it contributes neither to
            # the stored content nor to the collected references.
            if stop_phrase in content_text:
                content_text = content_text.split(stop_phrase)[0].strip()

            # References to other pasal
            references = re.findall(r'Pasal\s+(\d+)', content_text)
            additional_context = [{'Text': f'pasal-{ref}'} for ref in references]

            parsed_text.append({
                'additional_context': additional_context,
                'bab': f'bab-{bab_number.lower()}',
                'bagian': 'none',
                'content': content_text.strip(),
                'context': bab_title,
                'paragraf': 'none',
                'pasal': f'pasal-{pasal_number}',
                'ref': 'none',
                'type': 'CONTENT_PASAL'
            })

    return parsed_text


In [11]:
# Run the pipeline on the OCR output: normalise the text, then extract
# one record per Pasal.
cleaned_text = clean_text(extracted_text)
parsed_content = parse_text(cleaned_text)

# Serialise the records as indented JSON and write them to a file in the
# current working directory.
output_json_file = 'parsed_data.json'
with open(output_json_file, 'w') as json_file:
    json_file.write(json.dumps(parsed_content, indent=4))

print(f"Parsed content saved to {output_json_file}")

Parsed content saved to parsed_data.json
