# Web Scraping LPSE - Blacklist Data

---

For an introduction to Selenium, see [**this guide**](https://www.scrapingbee.com/blog/selenium-python/).

## Import modules

`%pip freeze > requirements.txt`

In [10]:
# Module for regular expression
import re

# Module for data manipulation
import pandas as pd
from bs4 import BeautifulSoup
# Module for web scraping
from selenium import webdriver
from selenium.webdriver.common.by import By

## Load the Chromedriver

Read how to download the WebDriver for Chrome [**here**](https://chromedriver.chromium.org/downloads).

In [2]:
# Entry URLs of the two blacklist listings, keyed by the listing's status label.
# Both URLs carry a `page` query parameter and start at page 1.
main_link = {
    'Aktif': 'https://inaproc.id/daftar-hitam?page=1',
    'Tidak Aktif': 'https://inaproc.id/daftar-hitam/non-aktif?page=1'
}

In [3]:
# Start a Chrome session through the local chromedriver binary and open the
# 'Aktif' listing; every later cell reads from this `driver` session.
# NOTE(review): passing `executable_path` is the older Selenium calling
# convention; newer Selenium releases expect a Service object instead —
# confirm against the version pinned in requirements.txt.
DRIVER_PATH = '../bin/chromedriver'
driver = webdriver.Chrome(executable_path = DRIVER_PATH)
driver.get(main_link['Aktif'])

## Core Procedure

### 1 Get column names

#### Column 1

In [4]:
# element for column names - 1
# Collect the <th> cells of the first <thead> on the page. The generic
# find_element(By...) locator API is used because the find_element_by_*
# helpers are no longer available in current Selenium releases.
colNamesfirst = driver.find_element(By.TAG_NAME, 'thead').find_elements(By.TAG_NAME, 'th')
print('Length of data in one page: {} columns'.format(len(colNamesfirst)))

Length of data in one page: 3 columns


In [5]:
# Header labels as plain text; line breaks inside a label become spaces
listColsFirst = [header.text.replace('\n', ' ') for header in colNamesfirst]
# Result
listColsFirst

['Penyedia', 'Alamat', 'Daftar Hitam']

#### Column 2

In [6]:
# element for column names - 2
# The outer <tbody> contains nested <tbody> elements; the first nested one is
# used as the sample from which the field labels are read. The generic
# find_element(By...) locator API replaces the find_element_by_* helpers,
# which are no longer available in current Selenium releases.
colNamesElem = driver.find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tbody')[0]
colNamesSecond = colNamesElem.find_elements(By.TAG_NAME, 'td')
print('Length of data in one page: {} columns'.format(len(colNamesSecond)))

Length of data in one page: 8 columns


In [7]:
# Keep every other <td>, starting with the first one (these hold the field
# labels), and flatten line breaks inside a label into spaces
listColsSecond = [cell.text.replace('\n', ' ') for cell in colNamesSecond[0::2]]
# Result
listColsSecond

['Status', 'Tanggal Penayangan', 'Masa Berlaku Sanksi', 'SK Penetapan']

### 2 Get the data

In [8]:
# Data collections
# First <tbody> of the page; all row-level lookups below are scoped to it.
# Uses the generic find_element(By...) locator API, since the
# find_element_by_* helpers are no longer available in current Selenium releases.
dataCollection = driver.find_element(By.TAG_NAME, 'tbody')

#### Column 1

In [47]:
# Empty accumulator for the first table column: one independent list per field
first_column = {
    key: []
    for key in ('Data ID', 'Penyedia', 'NPWP', 'Alamat', 'Alamat Lengkap')
}

first_column

{'Data ID': [], 'Penyedia': [], 'NPWP': [], 'Alamat': [], 'Alamat Lengkap': []}

In [48]:
# Query every element list once, up front. Each find_elements call is a round
# trip to the browser, so repeating the five lookups inside the row loop made
# the number of WebDriver calls grow with the square of the row count.
vendorElems = dataCollection.find_elements(By.TAG_NAME, 'h5')
npwpElems = dataCollection.find_elements(By.CLASS_NAME, 'npwp')
addressGenElems = dataCollection.find_elements(By.CLASS_NAME, 'header')
addressDescElems = dataCollection.find_elements(By.CLASS_NAME, 'description')
# NOTE(review): assumes the n-th <a> inside the table body belongs to the
# n-th row — confirm against the page markup.
linkElems = dataCollection.find_elements(By.TAG_NAME, 'a')

# Length of rows in page (one <h5> per listed vendor)
lengthRows = vendorElems

for row in range(len(lengthRows)):
    # Map each value to its target key explicitly instead of pairing two
    # dictionaries by key position. Indexing by `row` still raises IndexError
    # if one of the element lists is shorter than the <h5> list.
    row_values = {
        'Data ID': linkElems[row].get_attribute('data-id'),
        'Penyedia': vendorElems[row].text,
        'NPWP': npwpElems[row].text,
        'Alamat': addressGenElems[row].text,
        'Alamat Lengkap': addressDescElems[row].text
    }
    # Parse into list
    for key, value in row_values.items():
        first_column[key].append(value)

In [49]:
# Inspect the values appended to each list of `first_column`
first_column

{'Data ID': ['4541',
  '4537',
  '4538',
  '4530',
  '4536',
  '4534',
  '4535',
  '4533',
  '4532',
  '4528'],
 'Penyedia': ['PT. Esti Yasagama',
  'CV. CAHAYA KATANGKA LESTARI',
  'CV. GURUH',
  'CV Iswara Danadyaksa',
  'CV. SATRIA JANUAR PRATAMA',
  'PT.PUTRA KUBU KONSTRUKSI',
  'CV. LIBRA ABADI',
  'CV. WADAH KREASI UTAMA',
  'PT. MIRTADA SEJAHTERA',
  'PT. SINDANG MULTI MEGATAMA'],
 'NPWP': ['NPWP: 01.210.912.0-016.000',
  'NPWP: 02.325.106.9-807.000',
  'NPWP: 02.996.729.6-121.000',
  'NPWP: 94.157.082.2-543.000',
  'NPWP: 83.963.088.6-438.000',
  'NPWP: 83.097.404.4-907.000',
  'NPWP: 71.576.498.1-831.000',
  'NPWP: 03.320.242.5-411.000',
  'NPWP: 02.335.744.5-101.000',
  'NPWP: 03.093.722.1-031.000'],
 'Alamat': ['Jakarta Selatan (Kota), DKI Jakarta',
  'Gowa (Kab.), Sulawesi Selatan',
  'Medan (Kota), Sumatera Utara',
  'Bantul (Kab.), DI Yogyakarta',
  'Kuningan (Kab.), Jawa Barat',
  'Karang Asem (Kab.), Bali',
  'Dongala (Kab.), Sulawesi Tengah',
  'Tangerang Selatan (Kota

#### Column 2

In [50]:
# Second column: start every scraped field label with its own empty list
second_column = {}
for col_name in listColsSecond:
    second_column[col_name] = []
second_column

{'Status': [],
 'Tanggal Penayangan': [],
 'Masa Berlaku Sanksi': [],
 'SK Penetapan': []}

In [51]:
# Resolve the key order once; the <tr> rows of each nested <tbody> are matched
# to these keys by position. Indexing (rather than zip) keeps the IndexError
# when a nested table has more rows than there are keys.
secondKeys = list(second_column.keys())
for elem in dataCollection.find_elements(By.TAG_NAME, 'tbody'):
    elemValues = elem.find_elements(By.TAG_NAME, 'tr')
    for col, rowElem in enumerate(elemValues):
        # Append values (the full row text, i.e. label and value together)
        second_column[secondKeys[col]].append(rowElem.text)

In [52]:
# Inspect the values appended to each list of `second_column`
second_column

{'Status': ['Status Tayang',
  'Status Tayang',
  'Status Tayang',
  'Status Tayang',
  'Status Tayang',
  'Status Tayang',
  'Status Tayang',
  'Status Tayang',
  'Status Tayang',
  'Status Tayang'],
 'Tanggal Penayangan': ['Tanggal Penayangan 20 Sep 2021',
  'Tanggal Penayangan 17 Sep 2021',
  'Tanggal Penayangan 15 Sep 2021',
  'Tanggal Penayangan 8 Sep 2021',
  'Tanggal Penayangan 8 Sep 2021',
  'Tanggal Penayangan 3 Sep 2021',
  'Tanggal Penayangan 3 Sep 2021',
  'Tanggal Penayangan 2 Sep 2021',
  'Tanggal Penayangan 2 Sep 2021',
  'Tanggal Penayangan 2 Sep 2021'],
 'Masa Berlaku Sanksi': ['Masa Berlaku Sanksi 20 Sep 2021 s/d 20 Sep 2023',
  'Masa Berlaku Sanksi 10 Sep 2021 s/d 10 Sep 2022',
  'Masa Berlaku Sanksi 13 Sep 2021 s/d 13 Sep 2022',
  'Masa Berlaku Sanksi 26 Agu 2021 s/d 26 Agu 2022',
  'Masa Berlaku Sanksi 27 Agu 2021 s/d 27 Agu 2023',
  'Masa Berlaku Sanksi 31 Agu 2021 s/d 31 Agu 2022',
  'Masa Berlaku Sanksi 3 Mei 2021 s/d 3 Mei 2022',
  'Masa Berlaku Sanksi 31 Agu 2

## Convert into JSON

In [63]:
# Dictionary for data: merge both column dictionaries under the page label.
# Keys of second_column override equal keys of first_column.
current_page = '1'
page_columns = {}
page_columns.update(first_column)
page_columns.update(second_column)
dict_full = {current_page: page_columns}

In [64]:
# Show the nested dictionary: page label -> column name -> list of values
dict_full

{'1': {'Data ID': ['4541',
   '4537',
   '4538',
   '4530',
   '4536',
   '4534',
   '4535',
   '4533',
   '4532',
   '4528'],
  'Penyedia': ['PT. Esti Yasagama',
   'CV. CAHAYA KATANGKA LESTARI',
   'CV. GURUH',
   'CV Iswara Danadyaksa',
   'CV. SATRIA JANUAR PRATAMA',
   'PT.PUTRA KUBU KONSTRUKSI',
   'CV. LIBRA ABADI',
   'CV. WADAH KREASI UTAMA',
   'PT. MIRTADA SEJAHTERA',
   'PT. SINDANG MULTI MEGATAMA'],
  'NPWP': ['NPWP: 01.210.912.0-016.000',
   'NPWP: 02.325.106.9-807.000',
   'NPWP: 02.996.729.6-121.000',
   'NPWP: 94.157.082.2-543.000',
   'NPWP: 83.963.088.6-438.000',
   'NPWP: 83.097.404.4-907.000',
   'NPWP: 71.576.498.1-831.000',
   'NPWP: 03.320.242.5-411.000',
   'NPWP: 02.335.744.5-101.000',
   'NPWP: 03.093.722.1-031.000'],
  'Alamat': ['Jakarta Selatan (Kota), DKI Jakarta',
   'Gowa (Kab.), Sulawesi Selatan',
   'Medan (Kota), Sumatera Utara',
   'Bantul (Kab.), DI Yogyakarta',
   'Kuningan (Kab.), Jawa Barat',
   'Karang Asem (Kab.), Bali',
   'Dongala (Kab.), Su

## Convert into data frame

In [65]:
# Create a data frame from the columns stored for the current page.
# The page is looked up through `current_page` instead of a repeated literal,
# so changing the page label in one place cannot cause a KeyError here.
df = pd.DataFrame(
        data = dict_full[current_page]
)

In [66]:
# Report the frame's size, then preview its first rows
n_rows, n_cols = df.shape
print('Dimension: {} rows and {} columns'.format(n_rows, n_cols))
df.head()

Dimension: 10 rows and 9 columns


Unnamed: 0,Data ID,Penyedia,NPWP,Alamat,Alamat Lengkap,Status,Tanggal Penayangan,Masa Berlaku Sanksi,SK Penetapan
0,4541,PT. Esti Yasagama,NPWP: 01.210.912.0-016.000,"Jakarta Selatan (Kota), DKI Jakarta",Kios Taman Pondok Labu Lt. 1 Blok B No. 26 Jl....,Status Tayang,Tanggal Penayangan 20 Sep 2021,Masa Berlaku Sanksi 20 Sep 2021 s/d 20 Sep 2023,SK Penetapan PA/KPA Balai Pelaksanaan Jalan Na...
1,4537,CV. CAHAYA KATANGKA LESTARI,NPWP: 02.325.106.9-807.000,"Gowa (Kab.), Sulawesi Selatan","Jl. Mesjid Raya No.77, Tombolo, Sombu Upu.",Status Tayang,Tanggal Penayangan 17 Sep 2021,Masa Berlaku Sanksi 10 Sep 2021 s/d 10 Sep 2022,SK Penetapan PA/KPA KANWIL DITJEN PERBENDAHARA...
2,4538,CV. GURUH,NPWP: 02.996.729.6-121.000,"Medan (Kota), Sumatera Utara",JL. SMA II NO. 8 MEDAN,Status Tayang,Tanggal Penayangan 15 Sep 2021,Masa Berlaku Sanksi 13 Sep 2021 s/d 13 Sep 2022,SK Penetapan 38/KPTS/Bb2-wil1.S/2021
3,4530,CV Iswara Danadyaksa,NPWP: 94.157.082.2-543.000,"Bantul (Kab.), DI Yogyakarta","Perumahan Puri Gardenia Ruko No.09, Jl. Karang...",Status Tayang,Tanggal Penayangan 8 Sep 2021,Masa Berlaku Sanksi 26 Agu 2021 s/d 26 Agu 2022,SK Penetapan PA/KPA RUMAH SAKIT UMUM DR CIPTO ...
4,4536,CV. SATRIA JANUAR PRATAMA,NPWP: 83.963.088.6-438.000,"Kuningan (Kab.), Jawa Barat",Jl. RE Martadinata Rt. 01 Rw. 03,Status Tayang,Tanggal Penayangan 8 Sep 2021,Masa Berlaku Sanksi 27 Agu 2021 s/d 27 Agu 2023,SK Penetapan PA/KPA DINAS PEMBERDAYAAN MASYARA...


## 3 Loop

In [25]:
# Get last page
# Read the links inside the element with class 'pagination' and keep the text
# of the final one (a string). Uses the generic find_element(By...) locator
# API, since the find_element_by_* helpers are no longer available in current
# Selenium releases.
# NOTE(review): assumes the final <a> shows the highest page number rather
# than a "next"-style control — confirm against the page markup.
pageElem = driver.find_element(By.CLASS_NAME, 'pagination').find_elements(By.TAG_NAME, 'a')
lastPage = pageElem[-1].text