# Preparation

There are a few things to prepare before using Selenium to obtain data from a website. First, you need to download ChromeDriver, which Selenium uses as the web driver. You can download it from the following site; choose the build that matches the device you have.

https://developer.chrome.com/docs/chromedriver/downloads

In [1]:
import pandas as pd
import requests
import codecs
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.by import By

# Web Scraping

In [19]:
# Start a Chrome browser session that Selenium will control.
# DRIVER_PATH is the location of the downloaded chromedriver executable on this
# machine; change it to wherever chromedriver is stored on your system.
DRIVER_PATH = "C:/Users/Annisa Sekar Tierra/Downloads/chromedriver-win32/chromedriver-win32/chromedriver.exe"
service = Service(DRIVER_PATH)
driver = webdriver.Chrome(service=service)

In [16]:
# Select the years and province IDs for which you want to retrieve data.
# The scraping loop visits every (year, province) combination and passes each
# value, converted to a string, to the corresponding dropdown on the page.
# The province names behind these IDs are listed in provinsi_mapping.
tahun = [2023, 2022]
prov = [11,12,13,14,15,16,17,18,19,21,31,32,33,34,35,36,51,52,53,61,62,63,64,65,71,72,73,74,75,76,81,82,91,94]

In [11]:
# Function to retrieve numbers from an HTML string
def extract_number(html_string):
    """Return the first integer found in the text of an HTML fragment.

    Markup tags are removed before searching, so digits that appear inside
    tag attributes (for example ``width="20"``) are never mistaken for the
    cell value. Commas are treated as thousands separators and dropped, so
    ``"1,354"`` is read as ``1354``.

    Args:
        html_string: HTML fragment as a string, for example
            ``'<td align="right">1,354</td>'``.

    Returns:
        The first run of digits as an ``int``, or ``None`` when the input is
        empty/``None`` or its text contains no digits.
    """
    if not html_string:
        return None

    # Replace tags with a space (not an empty string) so the contents of
    # adjacent elements are not glued together into one number.
    text = re.sub(r'<[^>]*>', ' ', html_string)
    text = text.replace(',', '')

    match = re.search(r'\d+', text)
    return int(match.group()) if match else None

In [20]:
# Columns of the result table: year, province ID, then the loss figures in the
# order they are collected below (right-aligned cells of the first table on
# the page, followed by those of the second table).
loss_columns = ['Tahun', 'Provinsi', 'Jumlah Bencana Banjir', 'Korban Meninggal', 'Korban Hilang', 'Korban Terluka',
                'Korban Menderita', 'Korban Mengungsi', 'Rumah Terdampak', 'Fasilitas Pendidikan Terdampak',
                'Fasilitas Kesehatan Terdampak', 'Fasilitas Peribadatan Terdampak', 'Fasilitas Umum Terdampak']

# Number of figures expected per page: every column except year and province
n_figures = len(loss_columns) - 2

# One entry per (year, province). The data frame is built once after the loop
# instead of being concatenated on every iteration, which would re-copy the
# whole accumulated table each time.
rows = []

# Loop to retrieve data for each year and province
for year in tahun:
    for region in prov:
        # Navigating to Web Page
        driver.get('https://dibi.bnpb.go.id/')

        # If needed, you can ask Webdriver to wait a while for the dropdown to load
        # WebDriverWait(driver, 3).until(
        #     EC.presence_of_element_located((By.ID, 'th'))
        # )

        # Choose the year ("th"), the province ("pr") and the disaster type ("jn")
        # in the page's dropdown menus.
        # NOTE(review): "101" is presumably the flood category (the result
        # columns refer to "Banjir") - confirm against the site's dropdown.
        Select(driver.find_element("id", "th")).select_by_value(str(year))
        Select(driver.find_element("id", "pr")).select_by_value(str(region))
        Select(driver.find_element("id", "jn")).select_by_value("101")

        # Retrieve data using Beautiful Soup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        try:
            tables = soup.find_all('table')
            victim = tables[0].find('tbody').find_all('td', attrs={'align': "right"})
            building = tables[1].find('tbody').find_all('td', attrs={'align': "right"})
        except (AttributeError, IndexError):
            # Fewer than two tables, or a table without <tbody>: record the
            # combination with empty figures and keep going.
            print(f"No data tables found for year {year}, province {region}; storing empty values")
            asnumber = [None] * n_figures
        else:
            asnumber = [extract_number(str(cell)) for cell in victim + building]
            if len(asnumber) != n_figures:
                # An unexpected page layout would otherwise abort the whole run
                # when the row is put into the data frame.
                print(f"Expected {n_figures} values but found {len(asnumber)} "
                      f"for year {year}, province {region}; storing empty values")
                asnumber = [None] * n_figures

        rows.append([year, region] + asnumber)

# Saves all data from the loop results
all_data = pd.DataFrame(rows, columns=loss_columns)
        

In [21]:
# Show all data (as the last expression of the cell, the data frame is rendered as the cell's output)
all_data

Unnamed: 0,Tahun,Provinsi,Jumlah Bencana Banjir,Korban Meninggal,Korban Hilang,Korban Terluka,Korban Menderita,Korban Mengungsi,Rumah Terdampak,Fasilitas Pendidikan Terdampak,Fasilitas Kesehatan Terdampak,Fasilitas Peribadatan Terdampak,Fasilitas Umum Terdampak
0,2023,11,97,5,0,7,329350,25240,1354,1,0,0,0
1,2023,12,112,10,11,102,157929,3193,101,9,3,3,0
2,2023,13,72,7,2,4593,65459,4175,702,12,3,11,1
3,2023,14,79,1,0,0,433175,8784,11202,73,10,107,84
4,2023,15,24,5,0,4,59018,6400,490,101,6,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,2022,76,27,1,1,0,89806,1595,630,1,0,3,13
64,2022,81,19,7,0,3,19451,340,43,0,0,0,1
65,2022,82,33,1,0,0,18467,291,62,0,0,0,5
66,2022,91,18,3,0,10,5473,2061,256,0,0,0,0


In [22]:
# Dictionary used to change each province ID to its name.
# Keys are the numeric province IDs listed in `prov`; values are the
# corresponding province names.
provinsi_mapping = {
    11: 'Aceh',
    12: 'Sumatera Utara',
    13: 'Sumatera Barat',
    14: 'Riau',
    15: 'Jambi',
    16: 'Sumatera Selatan',
    17: 'Bengkulu',
    18: 'Lampung',
    19: 'Kepulauan Bangka Belitung',
    21: 'Kepulauan Riau',
    31: 'DKI Jakarta',
    32: 'Jawa Barat',
    33: 'Jawa Tengah',
    34: 'DI Yogyakarta',
    35: 'Jawa Timur',
    36: 'Banten',
    51: 'Bali',
    52: 'Nusa Tenggara Barat',
    53: 'Nusa Tenggara Timur',
    61: 'Kalimantan Barat',
    62: 'Kalimantan Tengah',
    63: 'Kalimantan Selatan',
    64: 'Kalimantan Timur',
    65: 'Kalimantan Utara',
    71: 'Sulawesi Utara',
    72: 'Sulawesi Tengah',
    73: 'Sulawesi Selatan',
    74: 'Sulawesi Tenggara',
    75: 'Gorontalo',
    76: 'Sulawesi Barat',
    81: 'Maluku',
    82: 'Maluku Utara',
    91: 'Papua',
    94: 'Papua Barat'
}

In [23]:
# Replace each province ID in the 'Provinsi' column with its name from
# provinsi_mapping, then display the updated data frame.
all_data['Provinsi'] = all_data['Provinsi'].replace(provinsi_mapping)
all_data

Unnamed: 0,Tahun,Provinsi,Jumlah Bencana Banjir,Korban Meninggal,Korban Hilang,Korban Terluka,Korban Menderita,Korban Mengungsi,Rumah Terdampak,Fasilitas Pendidikan Terdampak,Fasilitas Kesehatan Terdampak,Fasilitas Peribadatan Terdampak,Fasilitas Umum Terdampak
0,2023,Aceh,97,5,0,7,329350,25240,1354,1,0,0,0
1,2023,Sumatera Utara,112,10,11,102,157929,3193,101,9,3,3,0
2,2023,Sumatera Barat,72,7,2,4593,65459,4175,702,12,3,11,1
3,2023,Riau,79,1,0,0,433175,8784,11202,73,10,107,84
4,2023,Jambi,24,5,0,4,59018,6400,490,101,6,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,2022,Sulawesi Barat,27,1,1,0,89806,1595,630,1,0,3,13
64,2022,Maluku,19,7,0,3,19451,340,43,0,0,0,1
65,2022,Maluku Utara,33,1,0,0,18467,291,62,0,0,0,5
66,2022,Papua,18,3,0,10,5473,2061,256,0,0,0,0


In [24]:
# Save the data frame to a local Excel file. The path is relative, so the file
# is written to the current working directory; index=False leaves the row
# index out of the sheet.
path = "Dampak Banjir 2023-2022.xlsx"
all_data.to_excel(path, index=False)