# Web Scraping SSCASN BKN: From Interactive Elements to Meaningful Data

Okay, so here’s the deal: I’m using this notebook to scrape data from the SSCASN BKN website. Why? Because I’m trying to figure out which job formations have a better chance of getting me through the CPNS 2025 selection process. Instead of manually sifting through pages and pages of information, I thought, "Why not let Python do the boring work?"

This project is super personal: I’m basically building my own cheat sheet to make smarter decisions during the application process. Plus, I get to brush up on my web scraping skills while I’m at it. Win-win, right?

## What’s the Plan?
The SSCASN website has everything I need, but it’s not exactly user-friendly for data nerds like me. The challenge is to deal with:

- Dropdown menus that dynamically load options.
- Pagination, because of course, all the good stuff is spread across multiple pages.
- Dynamic tables that make scraping a bit tricky.

The goal? Automate the whole process so I can get clean data in one go.

In [3]:
import pandas as pd
import math
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

In [2]:

# No extra flags are added, so Chrome runs with its default options.
chrome_options = Options()

# The chromedriver binary is looked up in ./chromedriver-win64/ relative to
# the current working directory.
path_chromedriver = os.path.join(os.getcwd(), "chromedriver-win64", "chromedriver.exe")
service = Service(executable_path=path_chromedriver)

# Launch the Chrome session used by the rest of the notebook.
driver = webdriver.Chrome(service=service, options=chrome_options)

In [3]:
# Reuse the Chrome session started in the previous cell. Creating another
# webdriver.Chrome here would open a second browser window and leave the first
# one (and its chromedriver process) running with nothing referencing it.
url = "https://sscasn.bkn.go.id/"
driver.get(url)

In [4]:
def reset_scrap(url="https://sscasn.bkn.go.id/"):
    """Point the shared ``driver`` back at the site to reset the page.

    Args:
        url: Page to load. Defaults to the SSCASN landing page, so existing
            zero-argument calls behave exactly as before.
    """
    driver.get(url)


reset_scrap()

In [5]:
# Filter values used when filling in the search form.
pengadaan = "CPNS"            # procurement type
prodi = "S-1 ILMU STATISTIK"  # study program; rebound later by the loop over prodi_list
jenjang = "S-1/Sarjana"       # education level

In [6]:
# Study-program labels to search for, one per line. Each label is compared
# for exact equality against the dropdown option text, so spelling matters.
prodi_list = """\
S-1 ILMU STATISTIK
S-1 ILMU STATISTIKA
S-1 KEPENDUDUKAN DAN STATISTIK
S-1 KOMPUTASI STATISTIKA
S-1 MATEMATIKA STATISTIKA
S-1 STATISTIK
S-1 STATISTIK (DATA ANALISIS)
S-1 STATISTIK KEUANGAN
S-1 STATISTIK KOMPUTASI
S-1 STATISTIKA
S-1 STATISTIKA BISNIS DAN INDUSTRI
S-1 STATISTIKA DAN SAINS DATA
S-1 STATISTIKA TERAPAN
S-1/D-IV STATISTIK
""".splitlines()

In [7]:
# Accumulates one dict per scraped table row, across every study program.
all_data = []


def _pick_dropdown_option(input_xpath, label):
    """Open the dropdown behind ``input_xpath`` and click the option reading exactly ``label``.

    Nothing is clicked (and no error is raised) when no option matches.
    """
    driver.find_element(By.XPATH, input_xpath).click()
    candidates = driver.find_elements(By.XPATH, f"//li[contains(text(), '{label}')]")
    for candidate in candidates:
        # contains() also matches longer labels, so require exact equality.
        if candidate.text == label:
            candidate.click()
            # Stop at the first hit so the remaining elements are not touched
            # after the click (presumably the list is re-rendered/closed by
            # then and those references may be stale).
            break
    # Fixed pause before the next form interaction.
    time.sleep(1)


def _next_button_index(current_page, total_page):
    """Return the ``li`` position of the button that advances from ``current_page``.

    The position depends on where the current page sits within the
    pagination bar; the values are specific to the site's markup.
    """
    if current_page <= 3:
        return 10
    if current_page == total_page - 3 or current_page == 4:
        return 11
    if current_page >= total_page - 2:
        return 10
    return 12


def extract_data_prodi(prodi):
    """Scrape every result page for one study program into ``all_data``.

    Fills the search form with the module-level ``jenjang`` (education
    level), the given ``prodi`` (study program) and the module-level
    ``pengadaan`` (procurement type), submits it, collects the rows of every
    result page via ``extract_data()``, and finally reloads the site with
    ``reset_scrap()``.

    Args:
        prodi: Exact text of the study-program dropdown option to select.
    """
    # Select the education level.
    _pick_dropdown_option(
        '//*[@id="pencarian"]/div/div/form/div[1]/div[1]/div/div/div/input', jenjang
    )
    # Select the study program.
    _pick_dropdown_option(
        '//*[@id="pencarian"]/div/div/form/div[1]/div[2]/div/div/div/input', prodi
    )
    # Select the procurement type.
    _pick_dropdown_option(
        '//*[@id="pencarian"]/div/div/form/div[1]/div[4]/div/div/div/input', pengadaan
    )

    # Click the search button and give the results time to load.
    driver.find_element(By.XPATH, '//*[@id="pencarian"]/div/div/form/div[1]/div[5]/a').click()
    time.sleep(5)

    # Work out the number of pages. The first pagination item carries the
    # total count as the first token after ": ".
    summary = driver.find_element(
        By.XPATH, '//*[@id="daftarFormasi"]/div[2]/div/div/div[3]/ul/li[1]'
    )
    total_formasi = int(summary.text.split(': ')[1].split()[0])
    rows_per_page = 10
    total_page = math.ceil(total_formasi / rows_per_page)

    # Visit pages 1..total_page inclusive. Scrape first, then advance, and
    # never advance past the last page, so the final page is collected too.
    for current_page in range(1, total_page + 1):
        all_data.extend(extract_data())
        if current_page < total_page:
            # NOTE(review): there is no explicit wait for the table to refresh
            # after this click; extract_data() only waits for a table to be
            # present. Confirm the rows read next belong to the new page.
            klik(_next_button_index(current_page, total_page))

    reset_scrap()
    time.sleep(3)

In [8]:
def extract_data():
    """Read the result table currently shown in the browser.

    Waits up to 10 seconds for a ``<table>`` element to be present, then
    converts each ``tbody`` row into a dict.

    Returns:
        A list with one dict per row: the text of the first eight cells under
        their column names, plus ``"Link"`` holding the ``href`` of the anchor
        inside the ninth cell.
    """
    # Column names for cells 0..7, in table order.
    text_columns = (
        "Jabatan",
        "Instansi",
        "Unit Kerja",
        "Formasi",
        "(PPPK) Khusus disabilitas? (CPNS) Dapat Diisi Disabilitas?",
        "Penghasilan (juta)",
        "Jumlah Kebutuhan",
        "Jumlah Lulus verifikasi",
    )
    link_position = len(text_columns)  # the cell right after the text columns

    result_table = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//table"))
    )

    records = []
    for tr in result_table.find_elements(By.XPATH, ".//tbody/tr"):
        tds = tr.find_elements(By.TAG_NAME, "td")
        # Index explicitly so a row with too few cells still raises IndexError.
        record = {name: tds[position].text for position, name in enumerate(text_columns)}
        anchor = tds[link_position].find_element(By.TAG_NAME, "a")
        record["Link"] = anchor.get_attribute('href')
        records.append(record)
    return records

In [9]:
def klik(index):
    """Click the button inside the ``index``-th item of the pagination list.

    Args:
        index: 1-based ``li`` position within the pagination ``ul``.
    """
    pagination_button = f'//*[@id="daftarFormasi"]/div[2]/div/div/div[3]/ul/li[{index}]/button'
    driver.find_element(By.XPATH, pagination_button).click()


In [10]:
# Run one search-and-scrape cycle per study program; the scraped rows are
# collected in all_data. Note that this loop rebinds the module-level `prodi`.
for prodi in prodi_list:
    extract_data_prodi(prodi)

In [14]:
# Build a table from the collected row dicts and save it as an Excel file,
# leaving the DataFrame index out of the sheet.
df = pd.DataFrame(all_data)
df.to_excel("extracted_data.xlsx", index=False) 