# BPK Regulation Scraping Prototype

This notebook scrapes regulation data from the BPK website (peraturan.bpk.go.id) and updates the database.


In [1]:
import json
import re
from typing import Optional, Dict, Any
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [3]:
class BPKRegulationScraper:
    """Scrape regulation detail pages from peraturan.bpk.go.id.

    One ``requests.Session`` carrying a browser-like User-Agent header is
    reused for every request made by an instance.

    Args:
        timeout: Seconds to wait on the server before a request is aborted.
            Without a timeout ``requests`` would wait indefinitely.
    """

    # Indonesian month names -> two-digit month numbers.
    _MONTHS = {
        'januari': '01', 'februari': '02', 'maret': '03', 'april': '04',
        'mei': '05', 'juni': '06', 'juli': '07', 'agustus': '08',
        'september': '09', 'oktober': '10', 'november': '11', 'desember': '12'
    }

    # (label keyword, result key, kind) in priority order: the first keyword
    # found inside a row label decides which field the row value goes to.
    #   'text'     -> stored as scraped
    #   'date'     -> passed through _parse_date
    #   'optional' -> empty string stored as None
    _LABEL_FIELDS = (
        ('Judul', 'judul', 'text'),
        ('Nomor', 'nomor', 'text'),
        ('Tahun', 'tahun', 'text'),
        ('Tempat Penetapan', 'tempat_penetapan', 'text'),
        ('Tanggal Penetapan', 'tanggal_penetapan', 'date'),
        ('Tanggal Pengundangan', 'tanggal_pengundangan', 'date'),
        ('Tanggal Berlaku', 'tanggal_berlaku', 'date'),
        ('Sumber', 'sumber', 'text'),
        ('Status', 'status', 'text'),
        ('Bahasa', 'bahasa', 'text'),
        ('Subjek', 'subjek', 'optional'),
        ('Bidang', 'bidang', 'optional'),
    )

    # (lower-cased header phrase, result key) in priority order.
    _RELATIONSHIP_HEADERS = (
        ('diubah dengan', 'diubah_dengan'),
        ('mencabut', 'mencabut'),
        ('mengubah', 'mengubah'),
        ('dicabut dengan', 'dicabut_dengan'),
    )

    def __init__(self, timeout: float = 30.0):
        self.base_url = "https://peraturan.bpk.go.id"
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def scrape_regulation_detail(self, url: str) -> Dict[str, Any]:
        """Fetch one regulation detail page and return its fields as a dict.

        Args:
            url: Absolute URL of the regulation detail page.

        Returns:
            A dict with a fixed set of keys. Scalar fields that could not be
            found are ``None``; relationship fields are lists of
            ``{'text': ..., 'link': ...}`` dicts (empty when absent).
            ``ujimateri_mk`` and ``file_peraturan`` are initialised but not
            populated by this method. ``teu`` and ``lokasi`` are always set
            to fixed central-government defaults.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.RequestException: On connection problems or timeout.
        """
        print(f"Scraping URL: {url}")

        response = self.session.get(url, timeout=self.timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        result = self._empty_result(url)

        # Short name of the regulation from the page header
        title_element = soup.find('h4', class_='mb-8')
        if title_element:
            result['nama_peraturan'] = title_element.get_text(strip=True)

        # Full subject line
        subject_element = soup.find('h1', class_='text-white')
        if subject_element:
            result['materi_pokok'] = subject_element.get_text(strip=True)

        self._extract_detail_rows(soup, result)

        # Derive the document type (and its abbreviation) from the short name
        if result['nama_peraturan']:
            doc_type, short_form = self._split_document_type(result['nama_peraturan'])
            if doc_type is not None:
                result['tipe_dokumen'] = doc_type
                result['bentuk'] = doc_type
                if short_form is not None:
                    result['bentuk_singkat'] = short_form

        # Fixed defaults
        result['teu'] = 'Indonesia, Pemerintah Pusat'
        result['lokasi'] = 'Pemerintah Pusat'

        # The PDF is taken from the second 'download-file' anchor on the page
        pdf_links = soup.find_all('a', class_='download-file')
        if len(pdf_links) >= 2:
            pdf_href = pdf_links[1].get('href')
            if pdf_href:
                result['file_pdf'] = self._absolute_url(pdf_href)

        # Relationships (mencabut, diubah_dengan, etc.)
        self._extract_relationships(soup, result)

        return result

    @staticmethod
    def _empty_result(url: str) -> Dict[str, Any]:
        """Return the result skeleton with every field at its empty value."""
        return {
            'nama_peraturan': None,
            'link_peraturan': url,
            'tipe_dokumen': None,
            'materi_pokok': None,
            'judul': None,
            'teu': None,
            'nomor': None,
            'bentuk': None,
            'bentuk_singkat': None,
            'tahun': None,
            'tempat_penetapan': None,
            'tanggal_penetapan': None,
            'tanggal_pengundangan': None,
            'tanggal_berlaku': None,
            'sumber': None,
            'status': None,
            'bahasa': None,
            'lokasi': None,
            'bidang': None,
            'subjek': None,
            'dicabut_dengan': [],
            'mencabut': [],
            'diubah_dengan': [],
            'mengubah': [],
            'ujimateri_mk': [],
            'file_peraturan': [],
            'file_pdf': None
        }

    def _extract_detail_rows(self, soup: "BeautifulSoup", result: Dict[str, Any]) -> None:
        """Copy label/value rows of the detail table into ``result`` in place."""
        for row in soup.find_all('div', class_=['py-4', 'bg-light-primary']):
            label_element = row.find('div', class_='fw-bold')
            if not label_element:
                continue

            value_element = label_element.find_next_sibling('div')
            if not value_element:
                continue

            label = label_element.get_text(strip=True)
            value = value_element.get_text(strip=True)

            for keyword, field, kind in self._LABEL_FIELDS:
                if keyword not in label:
                    continue
                if kind == 'date':
                    result[field] = self._parse_date(value)
                elif kind == 'optional':
                    result[field] = value if value else None
                else:
                    result[field] = value
                break  # first matching keyword wins

    @staticmethod
    def _split_document_type(nama_peraturan: str):
        """Split a regulation name into ``(type, abbreviation)``.

        The type is everything before the first "Nomor" / "No." token, e.g.
        ``"Peraturan Pemerintah (PP)"``; the abbreviation is the text inside
        the first pair of parentheses of the type, e.g. ``"PP"``. Either
        element is ``None`` when it cannot be found.
        """
        # Lazy match up to the first number marker. A character class that
        # excludes "N" would reject any type name containing a capital N.
        type_match = re.search(r'^(.+?)(?=\s+(?:Nomor|No\.))', nama_peraturan, re.DOTALL)
        if not type_match:
            return None, None

        doc_type = type_match.group(1).strip()
        short_match = re.search(r'\(([^)]+)\)', doc_type)
        return doc_type, (short_match.group(1) if short_match else None)

    def _absolute_url(self, href: str) -> str:
        """Resolve ``href`` against ``base_url``; absolute URLs pass through."""
        if not href:
            return href
        return urljoin(self.base_url + '/', href)

    def _parse_date(self, date_text: str) -> Optional[str]:
        """Convert a scraped date into ``YYYY-MM-DD``.

        Recognised forms, tried in this order:
            * Indonesian long form, e.g. ``30 Juni 1961``
            * ISO, e.g. ``1961-06-30``
            * day-first slashes, e.g. ``30/06/1961``

        Returns:
            The ISO date string, or ``None`` when the text is empty or no
            known form is found in it.
        """
        if not date_text or date_text.strip() == '':
            return None

        date_text = date_text.strip()

        # Indonesian long form: day, month name, year
        match = re.search(r'(\d{1,2})\s+(\w+)\s+(\d{4})', date_text)
        if match:
            day, month_name, year = match.groups()
            month_num = self._MONTHS.get(month_name.lower())
            if month_num:
                return f"{year}-{month_num}-{day.zfill(2)}"

        # ISO: return only the matched date, not any surrounding text
        match = re.search(r'\d{4}-\d{2}-\d{2}', date_text)
        if match:
            return match.group(0)

        # Day-first slash form
        match = re.search(r'(\d{1,2})/(\d{1,2})/(\d{4})', date_text)
        if match:
            day, month, year = match.groups()
            return f"{year}-{month.zfill(2)}-{day.zfill(2)}"

        return None

    def _extract_relationships(self, soup: "BeautifulSoup", result: Dict[str, Any]):
        """Fill the relationship lists of ``result`` in place.

        Only ``div.container`` elements that also carry the ``fs-6`` class are
        inspected. Inside each, a header row selects the relationship type and
        the next row holding an ``<ol>`` supplies its entries. Each entry that
        contains a link becomes ``{'text': ..., 'link': ...}``.
        """
        for container in soup.find_all('div', class_='container'):
            if 'fs-6' not in container.get('class', []):
                continue

            current_relationship_type = None

            for row in container.find_all('div', class_='row'):
                # A header row switches the active relationship type
                header_div = row.find('div', class_=['fw-semibold', 'bg-light-primary'])
                if header_div:
                    header_text = header_div.get_text(strip=True).lower()
                    current_relationship_type = None
                    for phrase, field in self._RELATIONSHIP_HEADERS:
                        if phrase in header_text:
                            current_relationship_type = field
                            break
                    continue

                if not current_relationship_type:
                    continue

                ol_element = row.find('ol')
                if not ol_element:
                    continue

                items = []
                for li in ol_element.find_all('li'):
                    link = li.find('a')
                    if not link:
                        continue

                    # Join text nodes with a space so the link text is not
                    # glued to the text that follows it, then collapse runs
                    # of whitespace.
                    full_text = li.get_text(' ', strip=True)
                    full_text = re.sub(r'\s+', ' ', full_text)

                    items.append({
                        'text': full_text,
                        'link': self._absolute_url(link.get('href', ''))
                    })

                if items:
                    result[current_relationship_type] = items

                # One list per header: wait for the next header row
                current_relationship_type = None

In [5]:
# Smoke-test the scraper against one known regulation page
test_url = "https://peraturan.bpk.go.id/Details/49482/pp-no-34-tahun-2005"

scraper = BPKRegulationScraper()

# Fetch and parse the page (this performs a live HTTP request)
print("Starting scraping...")
scraped_data = scraper.scrape_regulation_detail(test_url)

# Dump every field: non-empty lists as indented JSON, None / '' as "(empty)"
print("\n=== SCRAPED DATA ===")
for key in scraped_data:
    value = scraped_data[key]
    if isinstance(value, list) and value:
        rendered = json.dumps(value, indent=2, ensure_ascii=False)
    elif value is None or value == '':
        rendered = "(empty)"
    else:
        rendered = value
    print(f"{key}: {rendered}")

Starting scraping...
Scraping URL: https://peraturan.bpk.go.id/Details/49482/pp-no-34-tahun-2005

=== SCRAPED DATA ===
nama_peraturan: Peraturan Pemerintah (PP) No. 34 Tahun 2005
link_peraturan: https://peraturan.bpk.go.id/Details/49482/pp-no-34-tahun-2005
tipe_dokumen: Peraturan Pemerintah (PP)
materi_pokok: Perubahan Atas Peraturan Pemerintah Nomor 35 Tahun 2004 Tentang Kegiatan Usaha Hulu Minyak Dan Gas Bumi
judul: Peraturan Pemerintah (PP)  Nomor 34 Tahun 2005 tentang Perubahan Atas Peraturan Pemerintah Nomor 35 Tahun 2004 Tentang Kegiatan Usaha Hulu Minyak Dan Gas Bumi
teu: Indonesia, Pemerintah Pusat
nomor: 34
bentuk: Peraturan Pemerintah (PP)
bentuk_singkat: PP
tahun: 2005
tempat_penetapan: Jakarta
tanggal_penetapan: 2005-09-10
tanggal_pengundangan: 2005-09-10
tanggal_berlaku: 2005-09-10
sumber: LN. 2005 No. 81, TLN No.  4530, LL SETNEG : 4 HLM
status: Berlaku
bahasa: Bahasa Indonesia
lokasi: Pemerintah Pusat
bidang: (empty)
subjek: PERTAMBANGAN MIGAS, MINERAL DAN ENERGI
dicabut