# Web Scraping LPSE - Blacklist Data

---

For an introduction to Selenium, please see [**this site**](https://www.scrapingbee.com/blog/selenium-python/).

## Import modules

`%pip freeze > requirements.txt`

In [1]:
# Module for regular expression
import re

# Module for data manipulation
import pandas as pd
from bs4 import BeautifulSoup
# Module for web scraping
from selenium import webdriver
from selenium.webdriver.common.by import By

## Load the Chromedriver

Read how to download the webdriver for Chrome [**here**](https://chromedriver.chromium.org/downloads).

In [2]:
# Entry URLs (page 1) of the two blacklist listings, keyed by status label:
# 'Aktif' and 'Tidak Aktif' (Indonesian for "active" / "not active")
main_link = {
    'Aktif': 'https://inaproc.id/daftar-hitam?page=1',
    'Tidak Aktif': 'https://inaproc.id/daftar-hitam/non-aktif?page=1'
}

In [4]:
# Start a Chrome session through the chromedriver binary at DRIVER_PATH
# (a relative path) and open the 'Aktif' listing from main_link.
# NOTE(review): newer Selenium releases reportedly drop the `executable_path`
# argument - confirm against the Selenium version pinned in requirements.txt.
DRIVER_PATH = '../bin/chromedriver'
driver = webdriver.Chrome(executable_path = DRIVER_PATH)
driver.get(main_link['Aktif'])

## Core Procedure

### 1 Get column names

#### Column 1

In [5]:
# Header cells (<th>) inside the first <thead> on the page: the top-level column names.
# The generic find_element(By...) form is used because the find_element_by_*
# helpers are not available in recent Selenium releases.
colNamesfirst = driver.find_element(By.TAG_NAME, 'thead').find_elements(By.TAG_NAME, 'th')
print('Length of data in one page: {} columns'.format(len(colNamesfirst)))

Length of data in one page: 3 columns


In [6]:
# Text of every header cell, with line breaks inside a name flattened to spaces
listColsFirst = [header.text.replace('\n', ' ') for header in colNamesfirst]
# Show the resulting names
listColsFirst

['Penyedia', 'Alamat', 'Daftar Hitam']

#### Column 2

In [7]:
# Cells (<td>) of the first <tbody> nested inside the page's outer <tbody>.
# The generic find_element(By...) form is used because the find_element_by_*
# helpers are not available in recent Selenium releases.
colNamesElem = driver.find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tbody')[0]
colNamesSecond = colNamesElem.find_elements(By.TAG_NAME, 'td')
print('Length of data in one page: {} columns'.format(len(colNamesSecond)))

Length of data in one page: 8 columns


In [8]:
# Nested <tbody> at index 9 (the tenth one inside the outer <tbody>).
# NOTE(review): it is unclear why index 9 is used here rather than index 0 - confirm.
# The generic find_element(By...) form is used because the find_element_by_*
# helpers are not available in recent Selenium releases.
colNamesElem = driver.find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tbody')[9]
colNamesSecond = colNamesElem.find_elements(By.TAG_NAME, 'td')
# Every other <td>, starting with the first, is taken as a column name;
# line breaks inside a name are flattened to spaces.
listColsSecond = [cell.text.replace('\n', ' ') for cell in colNamesSecond[0::2]]
# Show the resulting names
listColsSecond

['Status', 'Tanggal Penayangan', 'Masa Berlaku Sanksi', 'SK Penetapan']

### 2 Get the data

In [42]:
# First <tbody> on the page; the row lookups that follow are scoped to it.
# find_element(By...) is used because the find_element_by_* helpers are not
# available in recent Selenium releases.
dataCollection = driver.find_element(By.TAG_NAME, 'tbody')

In [43]:
# Scraped rows of the current page, keyed by the 'data-id' attribute of each row's link
dict_init = {}

# Query every kind of element once for the whole page instead of once per row:
# each find_elements call is a round trip to the browser.
# One <h5> is treated as one row, so its count drives the loop.
lengthRows = dataCollection.find_elements(By.TAG_NAME, 'h5')
npwpElems = dataCollection.find_elements(By.CLASS_NAME, 'npwp')
addressElems = dataCollection.find_elements(By.CLASS_NAME, 'header')
addressDetailElems = dataCollection.find_elements(By.CLASS_NAME, 'description')
linkElems = dataCollection.find_elements(By.TAG_NAME, 'a')
detailTables = dataCollection.find_elements(By.TAG_NAME, 'tbody')

# All lists are indexed by the same row position; a list shorter than the
# <h5> list raises IndexError rather than silently misaligning rows.
# NOTE(review): this assumes the n-th match of every selector belongs to the
# n-th row - confirm against the page markup.
for row in range(len(lengthRows)):
    valId = linkElems[row].get_attribute('data-id')

    # Key-value for first column
    dict_val_first = {
        'Data ID': valId,
        'Penyedia': lengthRows[row].text,
        'NPWP': npwpElems[row].text,
        'Alamat': addressElems[row].text,
        'Alamat Lengkap': addressDetailElems[row].text
    }

    # Key-value for second column: the nested <tbody> at the same position
    detailTable = detailTables[row]
    # Every other <td>, starting with the first, is taken as a column name
    detailCells = detailTable.find_elements(By.TAG_NAME, 'td')
    detailLabels = [cell.text.replace('\n', ' ') for cell in detailCells[0::2]]
    # Each value is the full text of one <tr>. In the captured output these
    # values appear to include the label as a prefix (e.g. 'Status Tayang');
    # that is kept as-is here.
    detailValues = [tr.text for tr in detailTable.find_elements(By.TAG_NAME, 'tr')]

    # Merge both column groups into a single flat record
    data_row = {**dict_val_first, **dict(zip(detailLabels, detailValues))}

    # Store the record under its data ID (a repeated ID overwrites the earlier record)
    dict_init[valId] = data_row

## Convert into JSON

In [46]:
# Display the scraped rows: a dict keyed by data ID, each value a dict of that row's fields
dict_init

{'4541': {'Data ID': '4541',
  'Penyedia': 'PT. Esti Yasagama',
  'NPWP': 'NPWP: 01.210.912.0-016.000',
  'Alamat': 'Jakarta Selatan (Kota), DKI Jakarta',
  'Alamat Lengkap': 'Kios Taman Pondok Labu Lt. 1 Blok B No. 26 Jl. RS. Fatmawati Raya 27 Kelurahan Pondok Labu Kecamatan Cilandak Jakarta Selatan',
  'Status': 'Status Tayang',
  'Tanggal Penayangan': 'Tanggal Penayangan 20 Sep 2021',
  'Masa Berlaku Sanksi': 'Masa Berlaku Sanksi 20 Sep 2021 s/d 20 Sep 2023',
  'SK Penetapan': 'SK Penetapan PA/KPA Balai Pelaksanaan Jalan Nasional Nusa Tenggara Barat Kementerian Pekerjaan Umum dan Perumahan Rakyat No: 1086/KPTS/BPJN-NTB/2021'},
 '4537': {'Data ID': '4537',
  'Penyedia': 'CV. CAHAYA KATANGKA LESTARI',
  'NPWP': 'NPWP: 02.325.106.9-807.000',
  'Alamat': 'Gowa (Kab.), Sulawesi Selatan',
  'Alamat Lengkap': 'Jl. Mesjid Raya No.77, Tombolo, Sombu Upu.',
  'Status': 'Status Tayang',
  'Tanggal Penayangan': 'Tanggal Penayangan 17 Sep 2021',
  'Masa Berlaku Sanksi': 'Masa Berlaku Sanksi 10 S

## 3 Loop

In [25]:
# Text of the last link (<a>) inside the element with class 'pagination'.
# NOTE(review): assumed to be the number of the last page; if the final link is
# a "next"-style control its text is not a page number - confirm.
# find_element(By...) is used because the find_element_by_* helpers are not
# available in recent Selenium releases.
pageElem = driver.find_element(By.CLASS_NAME, 'pagination').find_elements(By.TAG_NAME, 'a')
lastPage = pageElem[-1].text