<a href="https://colab.research.google.com/github/cbi-automation/lk-extraction/blob/main/TLKM3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the paths configured in this notebook
# all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import warnings
import logging

# Only let messages of level ERROR and above through from the "pdfminer" logger
logging.getLogger("pdfminer").setLevel(logging.ERROR)

# (optional) Suppress all Python warnings globally
warnings.filterwarnings("ignore")

import os
!pip install pdfplumber
import pdfplumber
import pandas as pd
from google.colab import files



In [None]:
# Base path of the project folder on the mounted Drive
base_path = "/content/drive/MyDrive/Bank-Indonesia/program_personalized"
# Results are written to <base_path>/output; create the folder if it is missing
output_path = os.path.join(base_path, "output")
os.makedirs(output_path, exist_ok=True)

# Companies to process (append more entries to cover additional ones)
companies = ["TLKM"]

# Report years and quarters to process; both are kept as strings because they
# are interpolated directly into file names.
years = [str(year_number) for year_number in range(2022, 2025)]
quarters = [f"Q{quarter_number}" for quarter_number in (1, 2, 3)]

In [None]:
import re

# Indonesian month names that normalize_bulan repairs.
_MONTH_NAMES = (
    "januari", "februari", "maret", "april", "mei", "juni",
    "juli", "agustus", "september", "oktober", "november", "desember",
)

# One compiled pattern per month. Optional whitespace is allowed between every
# letter so that broken extractions such as "Mar e t" or "Dese m ber" still
# match. The lookarounds stop a match from starting or ending in the middle of
# a longer run of letters; without them "maju nilai" would be rewritten to
# "majunilai" because "ju ni" looks like a split "juni".
_MONTH_PATTERNS = tuple(
    (
        month,
        re.compile(
            r"(?<![A-Za-z])" + r"\s*".join(month) + r"(?![A-Za-z])",
            re.IGNORECASE,
        ),
    )
    for month in _MONTH_NAMES
)


def normalize_bulan(text):
    """Repair Indonesian month names whose letters were split by whitespace.

    Every stand-alone occurrence of a month name (case-insensitive, with
    optional whitespace between its letters) is replaced by the lower-case,
    unbroken month name.

    Args:
        text: The string to clean up.

    Returns:
        ``text`` with the month names rewritten; everything else is unchanged.
    """
    for month, pattern in _MONTH_PATTERNS:
        text = pattern.sub(month, text)
    return text

def normalize(text):
    """Return a canonical form of ``text`` for marker matching.

    Month names are repaired first via ``normalize_bulan``; the result is then
    stripped, lower-cased, and every run of whitespace is collapsed to a single
    space.
    """
    repaired = normalize_bulan(text)  # fix broken month names first
    lowered = repaired.strip().lower()
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", lowered)

def find_paragraphs_by_marker_pairs(text, marker_pairs, kuartal, titles=None):
    """Extract the text located between each (start, end) marker pair.

    NOTE(review): a later cell of this notebook defines a function with the
    same name; on a top-to-bottom run that later definition replaces this one.

    For every pair the start marker is located in the normalized text (see
    ``normalize``), and the end marker is then searched strictly AFTER the end
    of the start marker. Searching from the beginning of the start marker
    would let an end marker that also occurs inside the start marker produce
    an empty snippet.

    Args:
        text: Raw text to search.
        marker_pairs: Iterable of ``(start_marker, end_marker)`` strings.
        kuartal: Label stored in each result and used in progress messages.
        titles: Optional list of titles, matched to ``marker_pairs`` by
            position; missing titles default to ``"Informasi <n>"``.

    Returns:
        A list of dicts with keys ``"Kuartal"``, ``"Sumber"``, ``"Judul"`` and
        ``"Isi"``. ``"Sumber"`` is ``"Marker Pair (Exact)"`` when both markers
        were also found verbatim (case-insensitively) in the raw text, in which
        case ``"Isi"`` keeps the original formatting; otherwise it is
        ``"Marker Pair (Fallback)"`` and ``"Isi"`` comes from the normalized
        text. Pairs whose markers cannot be found are skipped with a message.
    """
    hasil = []
    text_norm = normalize(text)
    text_lower = text.lower()  # computed once, reused for every pair

    for idx, (start_marker, end_marker) in enumerate(marker_pairs):
        title = titles[idx] if titles and idx < len(titles) else f"Informasi {idx+1}"

        start_norm = normalize(start_marker)
        end_norm = normalize(end_marker)

        start_idx = text_norm.find(start_norm)
        if start_idx == -1:
            print(f"[❗] Start marker tidak ditemukan di {kuartal}: {start_marker}")
            continue

        # Nearest end marker that begins after the start marker has ended
        content_start = start_idx + len(start_norm)
        end_idx = text_norm.find(end_norm, content_start)
        if end_idx == -1:
            print(f"[❗] End marker tidak ditemukan setelah start marker di {kuartal}: {end_marker}")
            continue

        # Content between the markers, taken from the normalized text
        content_norm = text_norm[content_start:end_idx]

        # Prefer the raw text (original line breaks and casing) when both
        # markers can be found there verbatim.
        orig_start_idx = text_lower.find(start_marker.lower())
        orig_end_idx = -1
        if orig_start_idx != -1:
            raw_content_start = orig_start_idx + len(start_marker)
            orig_end_idx = text_lower.find(end_marker.lower(), raw_content_start)

        if orig_start_idx != -1 and orig_end_idx != -1:
            content_raw = text[orig_start_idx + len(start_marker): orig_end_idx]
            snippet = content_raw.strip()
            sumber = "Marker Pair (Exact)"
        else:
            snippet = content_norm.strip()
            sumber = "Marker Pair (Fallback)"

        hasil.append({
            "Kuartal": kuartal,
            "Sumber": sumber,
            "Judul": title,
            "Isi": snippet
        })

        print(f"[✅] {title} ditemukan di {kuartal}")

    return hasil

In [None]:
# One row per section to extract: (title, start marker, end marker).
# marker_pairs and titles are derived from this table so that each title stays
# next to the markers it belongs to.
_marker_table = [
    (
        "Risiko nilai tukar mata uang asing",
        "nilai tukar mata uang:",
        "Analisis sensitivitas",
    ),
    (
        "Analisis sensitivitas",
        "pada khususnya tingkat bunga, tidak berubah.",
        "variabel lain tidak berubah.",
    ),
]

marker_pairs = [(start_text, end_text) for _title, start_text, end_text in _marker_table]
titles = [section_title for section_title, _start, _end in _marker_table]

del _marker_table  # temporary helper only; keep the namespace as before

In [None]:
def extract_text(file_path):
    """Return the text of the PDF at ``file_path``, page after page.

    The file is opened with ``pdfplumber``. Each page whose ``extract_text()``
    result is non-empty contributes that text followed by a newline; pages
    that yield nothing (``None`` or an empty string) are skipped.
    """
    with pdfplumber.open(file_path) as doc:
        page_texts = (page.extract_text() for page in doc.pages)
        return "".join(chunk + "\n" for chunk in page_texts if chunk)


In [None]:
import re

def format_marker_output(text_result, kuartal="TLKM-2024-Q3"):
    """Build a two-section report from extracted marker text.

    NOTE(review): a later cell of this notebook defines a function with the
    same name; on a top-to-bottom run that later definition replaces this one.

    Section 1 is whatever lies between 'nilai tukar mata uang:' and
    'Analisis sensitivitas'. Section 2 is whatever follows
    'Ekuitas/ laba (rugi)' up to the first newline that is directly followed by
    a non-whitespace character (or the end of the text). Both searches are
    case-insensitive and span line breaks; a section that cannot be found is
    replaced by a "not found" placeholder.

    Args:
        text_result: Text to pull the two sections from.
        kuartal: Label shown in the report header.

    Returns:
        The formatted report as a single string.
    """
    search_flags = re.DOTALL | re.IGNORECASE
    placeholder = "[❗] Tidak ditemukan"

    def _captured(pattern):
        # First capture group of the first match, stripped; placeholder if none.
        found = re.search(pattern, text_result, search_flags)
        if found is None:
            return placeholder
        return found.group(1).strip()

    # Section 1: between 'nilai tukar mata uang:' and 'Analisis sensitivitas'
    nilai_tukar_section = _captured(r"nilai tukar mata uang:(.*?)Analisis sensitivitas")

    # Section 2: after 'Ekuitas/ laba (rugi)' up to the logical end of the passage
    sensitivitas_section = _captured(r"Ekuitas/ laba \(rugi\)(.*?)(?=\n\S|$)")

    pieces = [
        f"📄 [Hasil Marker - {kuartal}]\n",
        "=" * 40,
        "\n\n",
        "Risiko nilai tukar mata uang asing (dalam miliar):\n\n",
        nilai_tukar_section,
        "\n\n",
        "Analisis sensitivitas:\n\nEkuitas/ laba (rugi)\n",
        sensitivitas_section,
        "\n",
    ]
    return "".join(pieces)


In [None]:
def find_paragraphs_by_marker_pairs(text, marker_pairs, kuartal, titles=None):
    """Return the passages found between each (start, end) marker pair.

    Markers are located in a normalized copy of ``text`` (see ``normalize``).
    The end marker is searched only after the end of the start marker, in the
    normalized text as well as in the raw text, so an end marker that also
    appears inside the start marker can neither produce an empty snippet nor be
    mistaken for a real closing marker.

    Args:
        text: Raw text to search.
        marker_pairs: Iterable of ``(start_marker, end_marker)`` strings.
        kuartal: Label stored in each result and shown in progress messages.
        titles: Optional list of section titles matched to ``marker_pairs`` by
            position; a missing title falls back to ``"Informasi <n>"``.

    Returns:
        A list with one dict per pair that was found, holding ``"Kuartal"``,
        ``"Sumber"``, ``"Judul"`` and ``"Isi"``. When both markers also occur
        verbatim (case-insensitively) in the raw text, ``"Isi"`` is cut from the
        raw text and ``"Sumber"`` is ``"Marker Pair (Exact)"``; otherwise the
        normalized content is used and ``"Sumber"`` is
        ``"Marker Pair (Fallback)"``. Pairs with a missing marker are reported
        via ``print`` and skipped.
    """
    hasil = []
    text_norm = normalize(text)
    text_lower = text.lower()  # lower-cased once instead of once per lookup

    for idx, (start_marker, end_marker) in enumerate(marker_pairs):
        title = titles[idx] if titles and idx < len(titles) else f"Informasi {idx+1}"

        start_norm = normalize(start_marker)
        end_norm = normalize(end_marker)

        start_idx = text_norm.find(start_norm)
        if start_idx == -1:
            print(f"[❗] Start marker tidak ditemukan di {kuartal}: {start_marker}")
            continue

        # Look for the closest end marker that starts after the start marker
        norm_content_start = start_idx + len(start_norm)
        end_idx = text_norm.find(end_norm, norm_content_start)
        if end_idx == -1:
            print(f"[❗] End marker tidak ditemukan setelah start marker di {kuartal}: {end_marker}")
            continue

        # Content between the markers in the normalized text (markers excluded)
        content_norm = text_norm[norm_content_start:end_idx]

        # Try the same lookup in the raw text to keep its original formatting
        start_lower = start_marker.lower()
        orig_start_idx = text_lower.find(start_lower)
        if orig_start_idx == -1:
            orig_end_idx = -1
        else:
            orig_end_idx = text_lower.find(end_marker.lower(), orig_start_idx + len(start_marker))

        if orig_start_idx != -1 and orig_end_idx != -1:
            content_raw = text[orig_start_idx + len(start_marker): orig_end_idx]
            snippet = content_raw.strip()
            sumber = "Marker Pair (Exact)"
        else:
            snippet = content_norm.strip()
            sumber = "Marker Pair (Fallback)"

        hasil.append({
            "Kuartal": kuartal,
            "Sumber": sumber,
            "Judul": title,
            "Isi": snippet
        })

        print(f"[✅] {title} ditemukan di {kuartal}")

    return hasil


In [None]:
# VERSI 2
import re

def format_marker_output(text_result, kuartal="TLKM-2024-Q3"):
    """Format extracted marker text as a numbered two-section report (version 2).

    Section 1 is the text following 'nilai tukar mata uang asing' up to a line
    that looks like a date (1-2 digits, a word, 4 digits). Section 2 starts at
    'Ekuitas/ laba (rugi)' and runs to the end of the first following line that
    contains 'Yen Jepang'. Both searches ignore case and span line breaks; a
    section that is not found is replaced by a "not found" placeholder.

    Args:
        text_result: Text to pull the two sections from.
        kuartal: Label shown in the report header.

    Returns:
        The formatted report as a single string.
    """
    section_patterns = (
        # Section 1: foreign exchange risk
        r"nilai tukar mata uang asing\s*(.*?)\n\s*\d{1,2}\s*\w+\s*\d{4}",
        # Section 2: sensitivity analysis up to the line containing 'Yen Jepang'
        r"(Ekuitas/ laba \(rugi\).*?Yen Jepang.*?)(?:\n|$)",
    )

    sections = []
    for pattern in section_patterns:
        hit = re.search(pattern, text_result, re.DOTALL | re.IGNORECASE)
        sections.append(hit.group(1).strip() if hit else "[❗] Tidak ditemukan")
    nilai_tukar_section, sensitivitas_section = sections

    header = f"📄 [Hasil Marker - {kuartal}]\n" + "=" * 40 + "\n\n"
    body = (
        "1. Risiko nilai tukar mata uang asing\n"
        f"{nilai_tukar_section}\n\n"
        "2. Analisis sensitivitas\n"
        f"{sensitivitas_section}\n"
    )
    return header + body


In [None]:
import time
import os

# Every marker result from every processed report is collected here
all_results = []
execution_times = []  # processing time of each successfully processed report, in seconds

titles = [
    "Risiko nilai tukar mata uang asing",
    "Analisis sensitivitas"
]

for company in companies:
    for year in years:
        # NOTE(review): reset for every year and never read in this cell,
        # so despite its name it is not a per-company total.
        total_time_company = 0
        for i, quarter in enumerate(quarters):
            kuartal = f"{company}-{year}-{quarter}"
            # Build the input path from base_path (like output_txt_file uses
            # output_path) so the project folder is configured in one place only.
            pdf_file = os.path.join(base_path, company, "laporan", f"{kuartal}.pdf")
            output_txt_file = os.path.join(output_path, f"{kuartal}.txt")

            try:
                # Start the timer
                start_time = time.time()

                # Extract the raw text from the PDF
                extracted_text = extract_text(pdf_file)

                # Collect the passages between the markers, with their titles
                hasil_marker = find_paragraphs_by_marker_pairs(extracted_text, marker_pairs, kuartal, titles)

                # Add them to the global results
                all_results.extend(hasil_marker)

                # Render one numbered entry per extracted section
                hasil_format = [f"{j+1}. {item['Judul']}\n{item['Isi'].strip()}" for j, item in enumerate(hasil_marker)]
                filtered_text = "\n\n".join(hasil_format)

                # Save to a TXT file
                with open(output_txt_file, "w", encoding="utf-8") as txt_file:
                    txt_file.write(filtered_text)

                # Processing time for this report
                elapsed_time = time.time() - start_time
                execution_times.append(elapsed_time)
                total_time_company += elapsed_time

                # Running total over all reports processed so far
                total_until_now = sum(execution_times)

                # Show the extracted text and the timings
                print(f"\n📄 [Hasil Marker - {kuartal}]\n{'=' * (20 + len(kuartal))}")
                print(filtered_text)
                print("=" * (20 + len(kuartal)))

                print(f"⏱️ Waktu proses {kuartal}: {elapsed_time:.2f} detik")
                print(f"📊 Total waktu hingga kuartal ke-{i+1}: {total_until_now:.2f} detik\n")

            except Exception as e:
                # Best effort: report the failure (e.g. a missing PDF) and move on
                print(f"[⚠️] Gagal memproses {pdf_file}: {e}")

[⚠️] Gagal memproses /content/drive/MyDrive/Bank-Indonesia/program_personalized/TLKM/laporan/TLKM-2022-Q1.pdf: [Errno 2] No such file or directory: '/content/drive/MyDrive/Bank-Indonesia/program_personalized/TLKM/laporan/TLKM-2022-Q1.pdf'
[⚠️] Gagal memproses /content/drive/MyDrive/Bank-Indonesia/program_personalized/TLKM/laporan/TLKM-2022-Q2.pdf: [Errno 2] No such file or directory: '/content/drive/MyDrive/Bank-Indonesia/program_personalized/TLKM/laporan/TLKM-2022-Q2.pdf'
[⚠️] Gagal memproses /content/drive/MyDrive/Bank-Indonesia/program_personalized/TLKM/laporan/TLKM-2022-Q3.pdf: [Errno 2] No such file or directory: '/content/drive/MyDrive/Bank-Indonesia/program_personalized/TLKM/laporan/TLKM-2022-Q3.pdf'
[✅] Risiko nilai tukar mata uang asing ditemukan di TLKM-2023-Q1
[✅] Analisis sensitivitas ditemukan di TLKM-2023-Q1

📄 [Hasil Marker - TLKM-2023-Q1]
1. Risiko nilai tukar mata uang asing
31 Mar e t 2023 31 Dese m ber 2022
Dolar A.S. Yen Jepang Dolar A.S. Yen Jepang
(dalam miliar) (