# BNPB Data Scraping

---

For an introduction to Selenium, see [**this guide**](https://www.scrapingbee.com/blog/selenium-python/).

## Import modules

`%pip freeze > requirements.txt`

In [67]:
# Regular expression
import re

# Data manipulation
import pandas as pd

# Web scraping
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

## Load the Chromedriver

Read how to download the webdriver for Chrome [**here**](https://chromedriver.chromium.org/downloads).

In [236]:
# Main link
# The URL is assembled from named parts so the page offset can be changed in one place.
BASE_URL = 'https://dibi.bnpb.go.id/xdibi'
# Filter portion of the query string, kept exactly as in the original request
QUERY_FILTERS = 'pr=&kb=&jn=&th=&bl=&tb=2&st=3&kf=0'
# Offset of the first record on the page (start=3000 returns the rows numbered 3001-3010)
START_OFFSET = 3000

main_link = '{}?{}&start={}'.format(BASE_URL, QUERY_FILTERS, START_OFFSET)

In [220]:
# Open the driver
# Path to the chromedriver binary, relative to this notebook
DRIVER_PATH = '../bin/chromedriver'

try:
    # Selenium 4: the binary location is supplied through a Service object
    # (the `executable_path` keyword was removed from webdriver.Chrome in 4.10).
    driver = webdriver.Chrome(service=Service(DRIVER_PATH))
except TypeError:
    # Selenium 3: Chrome() does not accept a `service` keyword, so fall back
    # to the legacy argument.
    driver = webdriver.Chrome(executable_path=DRIVER_PATH)

In [237]:
# Access to main link
# Load the listing page in the browser session opened above; the later cells
# read their elements from whatever page `driver` currently shows.
driver.get(main_link)

## Core Procedure

### 1 Get information from BNPB

#### Get column names

In [238]:
# Column data collections
# First element on the page with class "col-md-12"; the header and body lookups
# in the following cells are scoped to this element.
# find_element(By...) is used because the find_element_by_* helpers were
# removed in Selenium 4.3; this form works on Selenium 3 and 4 alike.
dataCollection = driver.find_element(By.CLASS_NAME, 'col-md-12')

In [239]:
# Get the column elements
# All <th> header cells inside the first <thead> of the container.
# find_element(By...) replaces the find_element_by_* helpers removed in Selenium 4.3.
columnCollection = (
    dataCollection
    .find_element(By.TAG_NAME, 'thead')
    .find_elements(By.TAG_NAME, 'th')
)

In [240]:
# Get column names
# Visible text of every header cell, in table order
listColNames = [elemCol.text for elemCol in columnCollection]

# Result
# Strip every character that is not a letter (a-z, A-Z) or a digit so the
# headers can serve as plain dictionary keys. The pattern is a raw string:
# '\d' inside a normal string literal is an invalid escape sequence.
listCols = [re.sub(r'[^a-zA-Z\d]', '', x) for x in listColNames]
# The last column's header is overwritten with a fixed name
listCols[-1] = 'Links'

In [241]:
# One independent empty list per column name; the table values are appended later
dict_init = dict((colName, []) for colName in listCols)
dict_init

{'No': [],
 'KIB': [],
 'Wilayah': [],
 'Bencana': [],
 'Kejadian': [],
 'Detail': [],
 'Links': []}

#### Get the values in table for non-span object

In [242]:
# Data collections
# Every <tr> inside the first <tbody> of the container, i.e. one element per data row.
# find_element(By...) replaces the find_element_by_* helpers removed in Selenium 4.3.
valueCollection = (
    dataCollection
    .find_element(By.TAG_NAME, 'tbody')
    .find_elements(By.TAG_NAME, 'tr')
)
print('Length of data in one page: {} rows'.format(len(valueCollection)))

Length of data in one page: 10 rows


In [243]:
# Get data
# NOTE: this cell appends to the lists already stored in dict_init, so running
# it a second time without rebuilding dict_init duplicates every row.

# Column names in table order; built once because the keys do not change in the loop
columnKeys = list(dict_init)

for row in valueCollection:
    # find_elements(By...) replaces the find_elements_by_* helpers removed in Selenium 4.3
    elemValues = row.find_elements(By.TAG_NAME, 'td')
    for col, elem in enumerate(elemValues):
        try:
            value = elem.text
        except Exception:
            # Best effort: a cell whose text cannot be read is stored as None.
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
            value = None
        # Append values: the n-th cell of a row goes into the n-th column's list
        dict_init[columnKeys[col]].append(value)

In [244]:
# Display the dictionary after the table cells have been appended to it
dict_init

{'No': ['3001',
  '3002',
  '3003',
  '3004',
  '3005',
  '3006',
  '3007',
  '3008',
  '3009',
  '3010'],
 'KIB': ['3514101202103131',
  '1308101202103131',
  '3301105202103131',
  '3603102202103131',
  '3202102202103131',
  '1213999202103131',
  '3315101202103131',
  '7308999202103131',
  '3316101202103121',
  '7104101202103121'],
 'Wilayah': ['Kab. Pasuruan, Jawa Timur',
  'Kab. Lima Puluh Kota, Sumatera Barat',
  'Kab. Cilacap, Jawa Tengah',
  'Kab. Tangerang, Banten',
  'Kab. Sukabumi, Jawa Barat',
  'Kab. Langkat, Sumatera Utara',
  'Kab. Grobogan, Jawa Tengah',
  'Kab. Maros, Sulawesi Selatan',
  'Kab. Blora, Jawa Tengah',
  'Kab. Kepulauan Talaud, Sulawesi Utara'],
 'Bencana': ['BANJIR',
  'BANJIR',
  'PUTING BELIUNG',
  'TANAH LONGSOR',
  'TANAH LONGSOR',
  'Lainnya',
  'Banjir',
  'Lainnya',
  'BANJIR',
  'BANJIR'],
 'Kejadian': ['BANJIR',
  'BANJIR',
  'ANGIN KENCANG',
  'TANAH LONGSOR',
  'PERGESERAN TANAH',
  'Kebakaran Pemungkiman',
  'Banjir',
  'orang jatuh',
  'BANJIR'

#### Get the links

In [245]:
# Get links
# Elements carrying all three classes btn, btn-info and btn-xs, searched across
# the whole page. An explicit CSS selector is used because the value names three
# classes, not a single class name; find_elements(By...) also replaces the
# find_elements_by_* helpers removed in Selenium 4.3.
links = driver.find_elements(By.CSS_SELECTOR, '.btn.btn-info.btn-xs')
# href of each matched element, in document order
listLink = [link.get_attribute('href') for link in links]

In [246]:
# Display the collected href values
listLink

['https://dibi.bnpb.go.id/xdibi/read/39037//////2//3001',
 'https://dibi.bnpb.go.id/xdibi/read/39038//////2//3002',
 'https://dibi.bnpb.go.id/xdibi/read/39039//////2//3003',
 'https://dibi.bnpb.go.id/xdibi/read/39040//////2//3004',
 'https://dibi.bnpb.go.id/xdibi/read/39041//////2//3005',
 'https://dibi.bnpb.go.id/xdibi/read/52907//////2//3006',
 'https://dibi.bnpb.go.id/xdibi/read/55748//////2//3007',
 'https://dibi.bnpb.go.id/xdibi/read/55294//////2//3008',
 'https://dibi.bnpb.go.id/xdibi/read/39020//////2//3009',
 'https://dibi.bnpb.go.id/xdibi/read/39021//////2//3010']

#### Get the values in table for span object

In [247]:
# Get the span detailed
# Popover titles (after stripping non-alphanumeric characters) that are kept
DETAIL_TITLES = ['Keterangan', 'Korban', 'Kerusakan']

spanDetailed = []
# find_elements(By...) replaces the find_elements_by_* helpers removed in Selenium 4.3
for i in dataCollection.find_elements(By.TAG_NAME, 'span'):
    # Get the span type
    spanType = i.get_attribute('data-toggle')

    # Select only popover
    if spanType != 'popover':
        continue

    # get_attribute returns None for a missing attribute; fall back to an empty
    # title so re.sub does not raise TypeError on such a span.
    classValue = i.get_attribute('data-original-title') or ''
    # Raw-string pattern: '\d' in a normal string literal is an invalid escape sequence
    if re.sub(r'[^a-zA-Z\d]', '', classValue) in DETAIL_TITLES:
        spanDetailed.append(i)

In [248]:
# Build one dictionary per group of three consecutive popover spans.
# Assumes every table row contributes exactly three spans, in a fixed order;
# verify against the page if the row count and len(listDetailed) ever disagree.
SPANS_PER_ROW = 3

# Fail up front with a clear message instead of an IndexError halfway through
if len(spanDetailed) % SPANS_PER_ROW != 0:
    raise ValueError(
        'Expected the number of detail spans to be a multiple of {}, got {}'.format(
            SPANS_PER_ROW, len(spanDetailed)
        )
    )

listDetailed = []
for start in range(0, len(spanDetailed), SPANS_PER_ROW):
    # Dictionary for storing data
    dictDetailed = {}

    # Looping per 3 object
    for span in spanDetailed[start:start + SPANS_PER_ROW]:
        # Popover title with non-alphanumeric characters removed, used as the key
        classValue = re.sub(r'[^a-zA-Z\d]', '', span.get_attribute('data-original-title'))
        # Get the values: get_attribute already yields None when data-content is
        # absent, so no separate None branch is needed.
        dictDetailed[classValue] = span.get_attribute('data-content')

    # Append the dictionary
    listDetailed.append(dictDetailed)

In [249]:
# Display the per-row detail dictionaries (values are raw data-content strings)
listDetailed

[{'Keterangan': '',
  'Korban': 'Meninggal : 0<br>Hilang : 0<br>Terluka : 0<br>Menderita : 156<br>Mengungsi : 0<br>',
  'Kerusakan': 'Rumah : 0<br>Fas.Pendidikan : 0<br>Fas.Kesehatan : 0<br>Fas.Peribadatan : 0<br>Fas.Umum : 0<br>Perkantoran : 0<br>Jembatan : 0<br>Pabrik : 0<br>Pertokoan : 0<br>'},
 {'Keterangan': '',
  'Korban': 'Meninggal : 0<br>Hilang : 0<br>Terluka : 0<br>Menderita : 1,223<br>Mengungsi : 0<br>',
  'Kerusakan': 'Rumah : 1<br>Fas.Pendidikan : 0<br>Fas.Kesehatan : 0<br>Fas.Peribadatan : 0<br>Fas.Umum : 0<br>Perkantoran : 0<br>Jembatan : 1<br>Pabrik : 0<br>Pertokoan : 0<br>'},
 {'Keterangan': '',
  'Korban': 'Meninggal : 0<br>Hilang : 0<br>Terluka : 0<br>Menderita : 0<br>Mengungsi : 0<br>',
  'Kerusakan': 'Rumah : 1<br>Fas.Pendidikan : 0<br>Fas.Kesehatan : 0<br>Fas.Peribadatan : 0<br>Fas.Umum : 0<br>Perkantoran : 0<br>Jembatan : 0<br>Pabrik : 0<br>Pertokoan : 0<br>'},
 {'Keterangan': '',
  'Korban': 'Meninggal : 0<br>Hilang : 0<br>Terluka : 0<br>Menderita : 500<br>Mengu

### 2 Update the dictionary

In [250]:
# Replace the 'Detail' and 'Links' columns with the separately collected values
dict_init['Detail'] = listDetailed
dict_init['Links'] = listLink

In [251]:
# Display the dictionary after 'Detail' and 'Links' have been replaced
dict_init

{'No': ['3001',
  '3002',
  '3003',
  '3004',
  '3005',
  '3006',
  '3007',
  '3008',
  '3009',
  '3010'],
 'KIB': ['3514101202103131',
  '1308101202103131',
  '3301105202103131',
  '3603102202103131',
  '3202102202103131',
  '1213999202103131',
  '3315101202103131',
  '7308999202103131',
  '3316101202103121',
  '7104101202103121'],
 'Wilayah': ['Kab. Pasuruan, Jawa Timur',
  'Kab. Lima Puluh Kota, Sumatera Barat',
  'Kab. Cilacap, Jawa Tengah',
  'Kab. Tangerang, Banten',
  'Kab. Sukabumi, Jawa Barat',
  'Kab. Langkat, Sumatera Utara',
  'Kab. Grobogan, Jawa Tengah',
  'Kab. Maros, Sulawesi Selatan',
  'Kab. Blora, Jawa Tengah',
  'Kab. Kepulauan Talaud, Sulawesi Utara'],
 'Bencana': ['BANJIR',
  'BANJIR',
  'PUTING BELIUNG',
  'TANAH LONGSOR',
  'TANAH LONGSOR',
  'Lainnya',
  'Banjir',
  'Lainnya',
  'BANJIR',
  'BANJIR'],
 'Kejadian': ['BANJIR',
  'BANJIR',
  'ANGIN KENCANG',
  'TANAH LONGSOR',
  'PERGESERAN TANAH',
  'Kebakaran Pemungkiman',
  'Banjir',
  'orang jatuh',
  'BANJIR'