# Dewan Pers - Data Scrapper

---

For introduction of Selenium, please be kind to open [**this site**](https://www.scrapingbee.com/blog/selenium-python/)

## Import modules

In [None]:
%pip freeze > requirements.txt

In [1]:
# Modules for web scraping
from selenium import webdriver
# URL encoding
from requests.utils import requote_uri
# Module for data manipulation
import pandas as pd
# Module for regular expression
import re
# Module for file management
import os
# Module for timing
import time

## Load the Chromedriver

Read how to download webdriver for Chrome [**here**]('https://chromedriver.chromium.org/downloads')

**Options**

In [2]:
# Options for Chrome windows
options = webdriver.ChromeOptions()
options.add_argument('--start-maximized')
options.add_argument('window-size=2560,1440')

**Chromedriver path**

In [22]:
DRIVER_PATH = '../bin/chromedriver'
driver = webdriver.Chrome(executable_path = DRIVER_PATH, options = options)

## Core Procedure

**URL and Query**

In [42]:
# Main URL
url_query = 'https://dewanpers.or.id/data/perusahaanpers'
url_query

'https://dewanpers.or.id/data/perusahaanpers'

In [24]:
# Root directory
dir_path = os.getcwd()
dir_path

'E:\\AUDHI\\PROJECT\\DEWAN PERS\\notebook'

**Access website**

In [43]:
driver.get(url_query)
driver.implicitly_wait(20)

**Switch to iframe**

In [54]:
driver.switch_to.frame(driver.find_element_by_tag_name('iframe'))

**List all product and its characteristics**

In [57]:
driver

<selenium.webdriver.chrome.webdriver.WebDriver (session="e5285be4628951cba5b16cc9f3bc8fad")>

In [87]:
looping_index = driver.find_elements_by_class_name('kv-panel-pager')[0]
looping_index

<selenium.webdriver.remote.webelement.WebElement (session="e5285be4628951cba5b16cc9f3bc8fad", element="e395457e-c547-450f-afb0-db5bb09167dc")>

In [92]:
# url
url_idx = {}
index = looping_index.find_elements_by_tag_name('a')
for elem in index:
    url_elem = elem.get_attribute('href')
    index_elem = elem.get_attribute('text')
    # Update
    url_idx.update({url_elem: index_elem})

In [93]:
url_idx

{'https://datapers.dewanpers.or.id/site/iframe-verified?page=1&per-page=10': '1',
 'https://datapers.dewanpers.or.id/site/iframe-verified?page=2&per-page=10': '»',
 'https://datapers.dewanpers.or.id/site/iframe-verified?page=3&per-page=10': '3',
 'https://datapers.dewanpers.or.id/site/iframe-verified?page=4&per-page=10': '4',
 'https://datapers.dewanpers.or.id/site/iframe-verified?page=5&per-page=10': '5',
 'https://datapers.dewanpers.or.id/site/iframe-verified?page=6&per-page=10': '6',
 'https://datapers.dewanpers.or.id/site/iframe-verified?page=7&per-page=10': '7',
 'https://datapers.dewanpers.or.id/site/iframe-verified?page=8&per-page=10': '8',
 'https://datapers.dewanpers.or.id/site/iframe-verified?page=9&per-page=10': '9',
 'https://datapers.dewanpers.or.id/site/iframe-verified?page=10&per-page=10': '10'}

**Access each website**

In [100]:
url_selected = list(url_idx)[0]
url_selected

'https://datapers.dewanpers.or.id/site/iframe-verified?page=1&per-page=10'

In [101]:
driver.get(url_selected)
driver.implicitly_wait(20)

**Get the tables**

In [118]:
# Column length
cols = []
for elem in driver.find_elements_by_tag_name('tr')[0].find_elements_by_tag_name('a'):
    col = elem.get_attribute('text')
    cols.append(col)
    print(col)

print('\nColumn length is about {len_col}'.format(len_col = len(cols)))

Nama Media
Jenis Media
Penanggung Jawab
Pemimpin Redaksi
Badan Hukum
Provinsi
Alamat
Telp
Email
Website
Status
Tgl Approve

Column length is about 12


In [140]:
# Number of rows
rows = driver.find_elements_by_tag_name('tbody')[0].find_elements_by_tag_name('tr')
len(rows)

10

In [144]:
# Elements
first_row = rows[0].find_elements_by_tag_name('td')
len(first_row)

13

In [181]:
# Dictionary for saving the data
dic = {
    'number': [],
    'nama_media': [],
    'jenis_media': [],
    'penanggung_jawab': [],
    'pemimpin_redaksi': [],
    'badan_hukum': [],
    'provinsi': [],
    'alamat': [],
    'telp': [],
    'email': [],
    'website': [],
    'status': [],
    'tgl_approve': []
}

In [182]:
for elem in rows:
    values = elem.find_elements_by_tag_name('td')
    for i in range(len(values)):
        dic[list(dic)[i]].append(values[i].text)

In [183]:
dic

{'number': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'],
 'nama_media': ['kaltimtoday.co',
  'Tekape.co',
  'digtara.com',
  'CNBC Indonesia',
  'Swara Kaltim',
  'viralsumsel.com',
  'Wideazone.com',
  'Pedoman Media',
  'mediaadvokasi.com',
  'metrobali.com'],
 'jenis_media': ['Siber',
  'Siber',
  'Siber',
  'Televisi',
  'Cetak',
  'Siber',
  'Siber',
  'Cetak',
  'Siber',
  'Siber'],
 'penanggung_jawab': ['Maturidi',
  'Indar Ismail Jamaluddin',
  'Irwansyah Putra Nasution',
  'Wahyu Daniel Panjaitan',
  'Muhammad Syafranuddin',
  'Darwin Syarkowi',
  'Oktap Riyadi',
  'Marselius Gusti Palumpun',
  'Saidiono',
  'I Wayan Suana'],
 'pemimpin_redaksi': ['Maturidi',
  'Indar Ismail Jamaluddin',
  'Irwansyah Putra Nasution',
  'Wahyu Daniel Panjaitan',
  'Mohammad Abdun Kuddu',
  'Darwin Syarkowi',
  'Oktap Riyadi',
  'Marselius Gusti Palumpun',
  'Saidiono',
  'I Wayan Suarna'],
 'badan_hukum': ['Media mandiri perkasa',
  'Tempat Kreatifitas Anak Muda Pore',
  'MEDIA IBE NUSAN

**Create a dataframe**

In [193]:
# Create a dataframe
df = pd.DataFrame.from_dict(
    data = dic
)
# Rename columns
df.columns = ['Number'] + cols
df.head()

Unnamed: 0,Number,Nama Media,Jenis Media,Penanggung Jawab,Pemimpin Redaksi,Badan Hukum,Provinsi,Alamat,Telp,Email,Website,Status,Tgl Approve
0,1,kaltimtoday.co,Siber,Maturidi,Maturidi,Media mandiri perkasa,Kalimantan Timur,"Jalan a wahab syahrani gang walet 1 No 2, sama...",082216557666,kaltimtoday99@gmail.com,www.kaltimtoday.co,Terverifikasi Administratif,2021-07-05
1,2,Tekape.co,Siber,Indar Ismail Jamaluddin,Indar Ismail Jamaluddin,Tempat Kreatifitas Anak Muda Pore,Sulawesi Selatan,"Perumahan New Graha Jannah Zarindah, Blok B3/3...",0853 9696 8189,redaksi@tekape.co | redaksitekape@gmail.com,https://tekape.co/,Terverifikasi Administratif,2021-07-05
2,3,digtara.com,Siber,Irwansyah Putra Nasution,Irwansyah Putra Nasution,MEDIA IBE NUSANTARA,Sumatera Utara,Komplek Taman Setia Budi Indah Blok JJ 10,0821-6648-2003/0852-6109-5279,redaksidigtara@gmail.com,digtara.com,Terverifikasi Administratif dan Faktual,2021-06-30
3,4,CNBC Indonesia,Televisi,Wahyu Daniel Panjaitan,Wahyu Daniel Panjaitan,Trans Berita Bisnis,DKI Jakarta,"Gedung Transmedia Lantai 3A, Jl. Kapten P. Ten...",021-79177000,legaltbb@gmail.com,www.cnbcindonesia.com,Terverifikasi Administratif dan Faktual,2021-06-30
4,5,Swara Kaltim,Cetak,Muhammad Syafranuddin,Mohammad Abdun Kuddu,Media Swara Kaltim,Kalimantan Timur,JL. Gerilya Gg. Keluarga Rt. 102 No. 027 Kelur...,0541 731240,mediaswarakaltim@yahoo.com,https://swarakaltim.com,Terverifikasi Administratif dan Faktual,2021-06-30


---

## Full version

In [2]:
# Options for Chrome windows
options = webdriver.ChromeOptions()
options.add_argument('--start-maximized')
options.add_argument('window-size=2560,1440')

In [3]:
DRIVER_PATH = '../bin/chromedriver'
driver = webdriver.Chrome(executable_path = DRIVER_PATH, options = options)

In [4]:
# Main URL
url_query = 'https://dewanpers.or.id/data/perusahaanpers'
url_query

'https://dewanpers.or.id/data/perusahaanpers'

In [5]:
driver.get(url_query)
driver.implicitly_wait(20)

In [6]:
# irframe
driver.switch_to.frame(driver.find_element_by_tag_name('iframe'))

In [7]:
# Loop url
looping_index = driver.find_elements_by_class_name('kv-panel-pager')[0]

In [8]:
# url
url_idx = {}
index = looping_index.find_elements_by_tag_name('a')
for elem in index:
    url_elem = elem.get_attribute('href')
    index_elem = elem.get_attribute('text')
    # Update
    url_idx.update({url_elem: index_elem})

In [9]:
url_idx

{'https://datapers.dewanpers.or.id/site/iframe-verified?page=1&per-page=10': '1',
 'https://datapers.dewanpers.or.id/site/iframe-verified?page=2&per-page=10': '»',
 'https://datapers.dewanpers.or.id/site/iframe-verified?page=3&per-page=10': '3',
 'https://datapers.dewanpers.or.id/site/iframe-verified?page=4&per-page=10': '4',
 'https://datapers.dewanpers.or.id/site/iframe-verified?page=5&per-page=10': '5',
 'https://datapers.dewanpers.or.id/site/iframe-verified?page=6&per-page=10': '6',
 'https://datapers.dewanpers.or.id/site/iframe-verified?page=7&per-page=10': '7',
 'https://datapers.dewanpers.or.id/site/iframe-verified?page=8&per-page=10': '8',
 'https://datapers.dewanpers.or.id/site/iframe-verified?page=9&per-page=10': '9',
 'https://datapers.dewanpers.or.id/site/iframe-verified?page=10&per-page=10': '10'}

In [10]:
# Dictionary for saving the data
dic = {
    'number': [],
    'nama_media': [],
    'jenis_media': [],
    'penanggung_jawab': [],
    'pemimpin_redaksi': [],
    'badan_hukum': [],
    'provinsi': [],
    'alamat': [],
    'telp': [],
    'email': [],
    'website': [],
    'status': [],
    'tgl_approve': []
}

In [11]:
loop = 1
len_rows = 10
while len_rows == 10:
    url_selected = 'https://datapers.dewanpers.or.id/site/iframe-verified?page={}&per-page=10'.format(loop)
    # Access the website
    driver.get(url_selected)
    driver.implicitly_wait(20)    
    # Get the rows
    rows = driver.find_elements_by_tag_name('tbody')[0].find_elements_by_tag_name('tr')
    for elem in rows:
        values = elem.find_elements_by_tag_name('td')
        for i in range(len(values)):
            dic[list(dic)[i]].append(values[i].text)
    # Save the column name
    if loop == 1:
        cols = []
        for elem in driver.find_elements_by_tag_name('tr')[0].find_elements_by_tag_name('a'):
            col = elem.get_attribute('text')
            cols.append(col)
    else:
        pass
    loop += 1
    len_rows = len(rows)

In [12]:
len(dic['number'])

1621

In [13]:
# Save to csv
# Create a dataframe
df = pd.DataFrame.from_dict(
    data = dic
)
# Rename columns
df.columns = ['Number'] + cols
df.head()

Unnamed: 0,Number,Nama Media,Jenis Media,Penanggung Jawab,Pemimpin Redaksi,Badan Hukum,Provinsi,Alamat,Telp,Email,Website,Status,Tgl Approve
0,1,kaltimtoday.co,Siber,Maturidi,Maturidi,Media mandiri perkasa,Kalimantan Timur,"Jalan a wahab syahrani gang walet 1 No 2, sama...",082216557666,kaltimtoday99@gmail.com,www.kaltimtoday.co,Terverifikasi Administratif,2021-07-05
1,2,Tekape.co,Siber,Indar Ismail Jamaluddin,Indar Ismail Jamaluddin,Tempat Kreatifitas Anak Muda Pore,Sulawesi Selatan,"Perumahan New Graha Jannah Zarindah, Blok B3/3...",0853 9696 8189,redaksi@tekape.co | redaksitekape@gmail.com,https://tekape.co/,Terverifikasi Administratif,2021-07-05
2,3,digtara.com,Siber,Irwansyah Putra Nasution,Irwansyah Putra Nasution,MEDIA IBE NUSANTARA,Sumatera Utara,Komplek Taman Setia Budi Indah Blok JJ 10,0821-6648-2003/0852-6109-5279,redaksidigtara@gmail.com,digtara.com,Terverifikasi Administratif dan Faktual,2021-06-30
3,4,CNBC Indonesia,Televisi,Wahyu Daniel Panjaitan,Wahyu Daniel Panjaitan,Trans Berita Bisnis,DKI Jakarta,"Gedung Transmedia Lantai 3A, Jl. Kapten P. Ten...",021-79177000,legaltbb@gmail.com,www.cnbcindonesia.com,Terverifikasi Administratif dan Faktual,2021-06-30
4,5,Swara Kaltim,Cetak,Muhammad Syafranuddin,Mohammad Abdun Kuddu,Media Swara Kaltim,Kalimantan Timur,JL. Gerilya Gg. Keluarga Rt. 102 No. 027 Kelur...,0541 731240,mediaswarakaltim@yahoo.com,https://swarakaltim.com,Terverifikasi Administratif dan Faktual,2021-06-30


In [15]:
# Save into local computer
df.to_csv('../data/raw/raw_dewan_pers.csv', sep = ';', index = False)