In [1]:
from IPython import get_ipython
from IPython.display import display
import argparse
import os
import re
import time
import urllib
from concurrent.futures import ThreadPoolExecutor, wait
from datetime import date
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import logging
import threading

In [2]:
# Count of records credited to the current run; compared against the
# 50-record limit and incremented by extract_data after each saved row.
collected = 0
# Progress bar for the current run; stays None until run_scraper assigns one.
pbar = None
# Serialises access to `collected` and the progress bar across worker threads.
lock = threading.Lock()

def create_path(folder_name):
    """Return the path of ``folder_name`` under the current working directory, creating it if needed.

    Args:
        folder_name (str): Folder path relative to the current working directory.

    Returns:
        str: ``os.getcwd()`` joined with ``folder_name``; the directory exists on return.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    path = os.path.join(os.getcwd(), folder_name)
    # exist_ok=True removes the check-then-create race: several worker threads
    # call this concurrently for the same folder.
    os.makedirs(path, exist_ok=True)
    return path

def open_page(link, retries=3, delay=5, timeout=30):
    """Download ``link`` and parse the response body with BeautifulSoup's lxml parser.

    Args:
        link (str): URL to fetch.
        retries (int): Maximum number of download attempts.
        delay (float): Seconds to sleep between failed attempts.
        timeout (float): Timeout in seconds passed to ``requests.get``; without
            one a stalled connection would block the worker indefinitely.

    Returns:
        BeautifulSoup | None: The parsed page, or None when every attempt
        failed. Callers must handle the None case.
    """
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(link, timeout=timeout)
            return BeautifulSoup(response.text, "lxml")
        except requests.RequestException as error:
            # Only network-level failures are retried; KeyboardInterrupt and
            # programming/configuration errors are allowed to propagate.
            logging.warning(f"Attempt {attempt}/{retries} failed for {link}: {error}")
            if attempt < retries:
                time.sleep(delay)
    return None

def get_detail(soup, keyword):
    """Return the stripped text of the element that follows the ``<td>`` labelled ``keyword``.

    A cell whose stripped text equals ``keyword`` is preferred, so that a short
    label such as "Amar" is not satisfied by a longer one such as
    "Catatan Amar". If no cell matches exactly, the first ``<td>`` whose text
    contains ``keyword`` is used.

    Args:
        soup: Object exposing ``find(callable)`` (a BeautifulSoup tag or document).
        keyword (str): Label text to look for.

    Returns:
        str: Text of the element returned by ``find_next()`` on the label
        cell, stripped; "" when the label or the following element is missing.
    """
    def is_exact_label(tag):
        return tag.name == "td" and tag.text.strip() == keyword

    def contains_label(tag):
        return tag.name == "td" and keyword in tag.text

    try:
        label_cell = soup.find(is_exact_label)
        if label_cell is None:
            label_cell = soup.find(contains_label)
        if label_cell is None:
            return ""
        value_cell = label_cell.find_next()
        if value_cell is None:
            return ""
        return value_cell.get_text().strip()
    except (AttributeError, TypeError):
        # soup is None / not tag-like, or keyword is not a string.
        return ""

def get_pdf(url, path_pdf, timeout=60):
    """Download ``url`` into ``path_pdf`` using the file name announced by the server.

    Args:
        url (str): Address of the file to download.
        path_pdf (str): Existing directory that receives the file.
        timeout (float): Timeout in seconds passed to ``urlopen``.

    Returns:
        str | None: The saved file name (with "/" replaced by a space), or
        None when the download failed or the server supplied no file name.
    """
    # `import urllib` alone does not load the `request` submodule; import it
    # explicitly rather than relying on another library having done so.
    import urllib.request

    try:
        # The context manager closes the connection even if reading fails.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            file_name = response.info().get_filename()
            if not file_name:
                logging.warning(f"No file name provided by server for {url}")
                return None
            file_name = file_name.replace("/", " ")
            file_content = response.read()
        with open(f"{path_pdf}/{file_name}", "wb") as out_file:
            out_file.write(file_content)
        return file_name
    except Exception as error:
        # Best-effort download: report the reason and let the caller decide.
        logging.warning(f"Could not download {url}: {error}")
        return None

def is_url_already_scraped(url, destination):
    """
    Report whether ``url`` is already recorded in the output CSV.

    Args:
        url (str): The URL to look up.
        destination (str): Output path without the ".csv" extension.

    Returns:
        bool: True when the "link" column of the CSV contains ``url``;
        False when the file is missing, empty, or does not list the URL.
    """
    csv_file = f"{destination}.csv"
    if os.path.isfile(csv_file):
        try:
            saved_links = pd.read_csv(csv_file)["link"].values
        except pd.errors.EmptyDataError:
            # A zero-byte file holds no records yet.
            return False
        return url in saved_links
    return False

In [3]:
# (CSV column, label text passed to get_detail), in output column order.
_DETAIL_FIELDS = (
    ("nomor", "Nomor"),
    ("tingkat_proses", "Tingkat Proses"),
    ("klasifikasi", "Klasifikasi"),
    ("kata_kunci", "Kata Kunci"),
    ("tahun", "Tahun"),
    ("tanggal_register", "Tanggal Register"),
    ("lembaga_peradilan", "Lembaga Peradilan"),
    ("jenis_lembaga_peradilan", "Jenis Lembaga Peradilan"),
    ("hakim_ketua", "Hakim Ketua"),
    ("hakim_anggota", "Hakim Anggota"),
    ("panitera", "Panitera"),
    ("amar", "Amar"),
    ("amar_lainnya", "Amar Lainnya"),
    ("catatan_amar", "Catatan Amar"),
    ("tanggal_musyawarah", "Tanggal Musyawarah"),
    ("tanggal_dibacakan", "Tanggal Dibacakan"),
    ("kaidah", "Kaidah"),
    ("abstrak", "Abstrak"),
)


def extract_data(link, keyword_url):
    """Scrape one decision page, download its PDF and append one row to the output CSV.

    The row goes to ``data/CSV/putusan_ma_<keyword>.csv``; when ``keyword_url``
    is a URL the suffix is empty, so every URL-based run shares
    ``putusan_ma_.csv``. Nothing is written when the 50-record limit has been
    reached, the link is already in the CSV, the page or its detail table
    cannot be read, or the PDF cannot be downloaded.

    Args:
        link (str): URL of the decision detail page.
        keyword_url (str): Search keyword or listing URL that produced ``link``.

    Returns:
        None
    """
    global collected
    global pbar
    global lock
    global path_output
    global path_pdf

    path_output = create_path('data/CSV')
    path_pdf = create_path('data/PDF')

    keyword_url = keyword_url.replace("/", " ")
    if keyword_url.startswith("https"):
        keyword_url = ""
    destination = f"{path_output}/putusan_ma_{keyword_url}"
    csv_path = f"{destination}.csv"

    # Cheap early exits before any network traffic.
    with lock:
        if collected >= 50:
            return

    if is_url_already_scraped(link, destination):
        print(f"Skipping duplicate URL: {link}")
        logging.info(f"Skipping duplicate URL: {link}")
        return

    soup = open_page(link)
    if soup is None:
        # open_page returns None after exhausting its retries.
        logging.warning(f"Failed to open page: {link}")
        return

    table = soup.find("table", {"class": "table"})
    if table is None:
        logging.warning(f"No detail table found on page: {link}")
        return

    heading = table.find("h2")
    if heading is None:
        judul = ""
    else:
        judul = heading.text
        # Remove the title so its text cannot be matched as a field label.
        heading.decompose()

    record = {"judul": judul}
    for column, label in _DETAIL_FIELDS:
        record[column] = get_detail(table, label)

    link_pdf = ""
    file_name_pdf = ""
    try:
        link_pdf = soup.find("a", href=re.compile(r"/pdf/"))["href"]
        file_name_pdf = get_pdf(link_pdf, path_pdf)
        if file_name_pdf is None:
            raise Exception("PDF download failed")
    except Exception as e:
        logging.warning(f"Failed to download PDF for {link}: {str(e)}")
        return

    record["link"] = link
    record["link_pdf"] = link_pdf
    record["file_name_pdf"] = file_name_pdf
    result = pd.DataFrame([record], columns=list(record))

    # Check the limit, check for duplicates, write and count in one critical
    # section: otherwise concurrent workers can exceed the limit, clobber the
    # header row, or save the same link twice.
    with lock:
        if collected >= 50:
            return
        if is_url_already_scraped(link, destination):
            logging.info(f"Skipping duplicate URL: {link}")
            return

        print(f"Saving data for {link}")
        # A missing or zero-byte file still needs the header row.
        needs_header = not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0
        result.to_csv(csv_path, mode="a", header=needs_header, index=False)

        logging.info(f"Successfully scraped and saved data and PDF for {link} (PDF: {file_name_pdf})")

        collected += 1
        # pbar is None when no run_scraper call has created a progress bar.
        if pbar is not None:
            pbar.update(1)

In [4]:
def run_scraper(keyword=None, url=None, sort_date=True, download_pdf=True):
    """Scrape listing pages for a keyword or listing URL until 50 records are stored.

    Records already present in the output CSV count toward the limit, so a
    rerun resumes instead of starting over. URL-based runs all share
    ``putusan_ma_.csv`` (the name extract_data writes to), hence one limit.

    Args:
        keyword (str | None): Search term; used when ``url`` is not given.
        url (str | None): Listing URL; takes precedence over ``keyword``.
        sort_date (bool): Forwarded to run_process to request date-descending order.
        download_pdf (bool): Accepted for compatibility; not used by this function.

    Returns:
        None
    """
    global collected
    global pbar

    path_logs = create_path('data/Logs')
    logging.basicConfig(level=logging.INFO,
                        filename=os.path.join(path_logs, f"scraper_log_{date.today().strftime('%Y-%m-%d')}.log"),
                        format='%(asctime)s - %(levelname)s - %(message)s')

    if not keyword and not url:
        print("Please provide a keyword or URL")
        return

    path_output = create_path('data/CSV')
    create_path('data/PDF')

    link = url if url else f"https://putusan3.mahkamahagung.go.id/search.html?q={keyword}&page=1"

    soup = open_page(link)

    try:
        last_page = int(
            soup.find_all("a", {"class": "page-link"})[-1].get("data-ci-pagination-page")
        )
    except (AttributeError, IndexError, TypeError, ValueError):
        # Page failed to load (soup is None), no pagination links, or the
        # attribute is missing / not a number.
        print("Could not find pagination. Perhaps no results or different structure.")
        return

    keyword_url = url if url else keyword

    # Build the file name exactly as extract_data does (empty suffix for URLs),
    # otherwise the resume count is read from a file that is never written.
    file_suffix = keyword_url.replace("/", " ")
    if file_suffix.startswith("https"):
        file_suffix = ""
    csv_path = f"{path_output}/putusan_ma_{file_suffix}.csv"

    collected = 0
    if os.path.isfile(csv_path):
        try:
            collected = len(pd.read_csv(csv_path))
        except pd.errors.EmptyDataError:
            collected = 0

    pbar = tqdm(total=50, desc="Scraping progress", initial=collected)
    try:
        if collected >= 50:
            print("Already reached or exceeded 50 data entries.")
            return

        if url:
            print(f"Scraping with url: {url} - approx {20 * last_page} data - {last_page} pages")
        else:
            print(f"Scraping with keyword: {keyword} - approx {20 * last_page} data - {last_page} pages")

        # Leaving the with-block waits for every submitted page to finish.
        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = [
                executor.submit(run_process, keyword_url, page, sort_date)
                for page in range(1, last_page + 1)
            ]

        # Surface worker failures instead of dropping them silently.
        for future in futures:
            error = future.exception()
            if error is not None:
                logging.error(f"Page worker failed: {error!r}")
    finally:
        pbar.close()

In [None]:
def run_process(keyword_url, page, sort_date):
    """Open one listing page and pass every decision link on it to extract_data.

    Args:
        keyword_url (str): Search keyword, or a listing URL starting with "https".
        page (int): 1-based page number of the listing.
        sort_date (bool): When true, append the ``obf=TANGGAL_PUTUS&obm=desc``
            query parameters to the listing URL.

    Returns:
        None
    """
    if keyword_url.startswith("https"):
        link = keyword_url
        # The base URL itself serves page 1, so only later pages get a parameter.
        if page > 1:
            separator = "&" if "?" in link else "?"
            link += f"{separator}page={page}"
    else:
        link = f"https://putusan3.mahkamahagung.go.id/search.html?q={keyword_url}&page={page}"
    if sort_date:
        separator = "&" if "?" in link else "?"
        link += f"{separator}obf=TANGGAL_PUTUS&obm=desc"

    print(link)
    logging.info(f"Processing page: {link}")

    soup = open_page(link)
    if soup is None:
        # open_page gives up after its retries; skip this page rather than
        # failing the worker with an AttributeError.
        logging.warning(f"Skipping page that could not be loaded: {link}")
        return

    for anchor in soup.find_all("a", {"href": re.compile("/direktori/putusan")}):
        extract_data(anchor["href"], keyword_url)

def scrape_specific_url(url, download_pdf=True):
    """Scrape a single decision page given its full ``https://`` address.

    Args:
        url (str): Address of the decision page; must start with "https://".
        download_pdf (bool): Accepted but not used by this function.

    Returns:
        None. Prints a message and does nothing else when ``url`` is empty or
        does not start with "https://".
    """
    if url and url.startswith("https://"):
        # Ensure the output folders exist before handing over to extract_data.
        create_path('data/CSV')
        create_path('data/PDF')
        extract_data(url, url)
    else:
        print("Please provide a valid URL")

# Decisions of the Surabaya District Court (PN Surabaya) in the
# "Narkotika dan Psikotropika" (narcotics and psychotropics) category, decided in 2025.
run_scraper(url="https://putusan3.mahkamahagung.go.id/direktori/index/pengadilan/pn-surabaya/kategori/narkotika-dan-psikotropika-1/tahunjenis/putus/tahun/2025.html")

Scraping progress:   0%|          | 0/50 [00:00<?, ?it/s]

Scraping with url: https://putusan3.mahkamahagung.go.id/direktori/index/pengadilan/pn-surabaya/kategori/narkotika-dan-psikotropika-1/tahunjenis/putus/tahun/2025.html - approx 640 data - 32 pages
https://putusan3.mahkamahagung.go.id/direktori/index/pengadilan/pn-surabaya/kategori/narkotika-dan-psikotropika-1/tahunjenis/putus/tahun/2025.html?obf=TANGGAL_PUTUS&obm=desc
https://putusan3.mahkamahagung.go.id/direktori/index/pengadilan/pn-surabaya/kategori/narkotika-dan-psikotropika-1/tahunjenis/putus/tahun/2025.html?page=2&obf=TANGGAL_PUTUS&obm=desc
https://putusan3.mahkamahagung.go.id/direktori/index/pengadilan/pn-surabaya/kategori/narkotika-dan-psikotropika-1/tahunjenis/putus/tahun/2025.html?page=3&obf=TANGGAL_PUTUS&obm=desc
https://putusan3.mahkamahagung.go.id/direktori/index/pengadilan/pn-surabaya/kategori/narkotika-dan-psikotropika-1/tahunjenis/putus/tahun/2025.html?page=4&obf=TANGGAL_PUTUS&obm=desc
