# Scraping Pilpres 2024 Data

## Import Libraries & Setup

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from datetime import datetime

In [2]:
# setup chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")  # ensure GUI is off
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.page_load_strategy = 'normal'

## Functions

In [3]:
def get_data(province, url):
    # set up the webdriver
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(url)
    
    table_xpath = "/html/body/div/div[1]/div/div[3]/div[2]/div[2]/div[2]/div/div/table"
    table_element = WebDriverWait(driver, 100).until(
        EC.presence_of_element_located((By.XPATH, table_xpath))
    )

    # Now that the table is loaded, extract the data
    # This example assumes the table rows are direct children of the table_element located by the XPath
    rows = table_element.find_elements(By.XPATH, ".//tr")
    data = []
    for row in rows:
        # Extract text from each cell in the row
        cells = row.find_elements(By.XPATH, ".//td")
        if not cells:  # If the first row consists of headers <th>
            cells = row.find_elements(By.XPATH, ".//th")
        data.append([cell.text for cell in cells])

    # Assuming the first row contains headers
    headers = data[0] if data else []
    data_rows = data[1:] if len(data) > 1 else []

    # Convert to DataFrame
    df = pd.DataFrame(data_rows, columns=headers)
    df['province'] = province
    df = df[df['Wilayah'] != 'Total\nProgress']
    # Clean Wilayah name
    df['Wilayah'] = df['Wilayah'].str.split('\n').str[0]
    
    return df

In [4]:
def process_data(df):
    # Rename columns
    new_column_names = ['kabupaten_kota','candidate_1_votes','candidate_2_votes','candidate_3_votes', 'province']
    df.columns = new_column_names
    
    # Add refreshed time
    current_time = datetime.now()
    df['refreshed_time'] = current_time
    
    # Remove DATA SEDANG DALAM PROSES to be null and correct the datatype
    columns_to_convert = ['candidate_1_votes', 'candidate_2_votes', 'candidate_3_votes']
    for col in columns_to_convert:
        df[col] = df[col].apply(lambda x: None if x == "Data sedang dalam proses" else x)
        df[col] = df[col].apply(lambda x: int(x.replace('.', '')) if x else None)

    # Add total votes column
    df['total_votes'] = df['candidate_1_votes'] + df['candidate_2_votes'] + df['candidate_3_votes']
    
    # Move refreshed_time & province to the left
    refreshed_column = df.pop('refreshed_time')
    province_column = df.pop('province')
    df.insert(0, 'province', province_column)
    df.insert(0, 'refreshed_time', refreshed_column)
    
    return df

## Dataset

In [5]:
# Link dataset for scraping kabupaten kota data
link_scraping = pd.read_csv('link_scraping.csv')
link_scraping['province'] = link_scraping['province'].str.split('\r').str[0]
# Replace spaces with empty string only for 'P A P U A'
link_scraping['province'] = link_scraping['province'].apply(lambda x: x.replace(' ', '') if x == 'P A P U A' else x)
link_scraping.head()

Unnamed: 0,province,link
0,ACEH,https://pemilu2024.kpu.go.id/pilpres/hitung-su...
1,BALI,https://pemilu2024.kpu.go.id/pilpres/hitung-su...
2,BANTEN,https://pemilu2024.kpu.go.id/pilpres/hitung-su...
3,BENGKULU,https://pemilu2024.kpu.go.id/pilpres/hitung-su...
4,DAERAH ISTIMEWA YOGYAKARTA,https://pemilu2024.kpu.go.id/pilpres/hitung-su...


## Main

In [6]:
# Table xpath national and per province are different
#national_table_xpath = "/html/body/div/div[1]/div/div[2]/div/div[2]/div[3]/div/div/table"
#province_table_xpath = "/html/body/div/div[1]/div/div[3]/div[2]/div[2]/div[2]/div/div/table"

To get the province data, sometimes we encounter some errors, so we wrote this code below

In [7]:
# Initialize an empty DataFrame
df = pd.DataFrame()

# to save if there is error rows
error_rows = []

for index, row in link_scraping.iterrows():
    try:
        temp_df = get_data(row['province'], row['link'])
        # Check if temp_df is not empty before concatenating
        if not temp_df.empty:
            df = pd.concat([df, temp_df], ignore_index=True)
            print(row['province'], "success")
        else:
            print(row['province'], "has no data")
    except Exception as e:
        print(f"Error processing {row['province']}: {str(e)}")
        error_rows.append(row)

# Retry processing the rows that encountered errors
for row in error_rows:
    try:
        temp_df = get_data(row['province'], row['link'])
        if not temp_df.empty:
            df = pd.concat([df, temp_df], ignore_index=True)
            print(row['province'], "success on retry")
        else:
            print(row['province'], "has no data on retry")
    except Exception as e:
        print(f"Error processing {row['province']} on retry: {str(e)}")

ACEH success
BALI success
BANTEN success
BENGKULU success
DAERAH ISTIMEWA YOGYAKARTA success
DKI JAKARTA success
GORONTALO success
JAMBI success
JAWA BARAT success
JAWA TENGAH success
JAWA TIMUR success
KALIMANTAN BARAT success
KALIMANTAN SELATAN success
KALIMANTAN TENGAH success
KALIMANTAN TIMUR success
KALIMANTAN UTARA success
KEPULAUAN BANGKA BELITUNG success
Error processing KEPULAUAN RIAU: Message: unknown error: net::ERR_CONNECTION_RESET
  (Session info: chrome-headless-shell=121.0.6167.185)
Stacktrace:
	GetHandleVerifier [0x00007FF715947012+3522402]
	(No symbol) [0x00007FF715568352]
	(No symbol) [0x00007FF715415ABB]
	(No symbol) [0x00007FF715411760]
	(No symbol) [0x00007FF715404C7D]
	(No symbol) [0x00007FF715403C8D]
	(No symbol) [0x00007FF715402EDC]
	(No symbol) [0x00007FF715402E7F]
	(No symbol) [0x00007FF715401385]
	(No symbol) [0x00007FF715401C6C]
	(No symbol) [0x00007FF715418149]
	(No symbol) [0x00007FF71549C717]
	(No symbol) [0x00007FF71547F05A]
	(No symbol) [0x00007FF71549B

In [8]:
df = df[df['province'].isin(link_scraping['province'])]

In [9]:
link_scraping.head()

Unnamed: 0,province,link
0,ACEH,https://pemilu2024.kpu.go.id/pilpres/hitung-su...
1,BALI,https://pemilu2024.kpu.go.id/pilpres/hitung-su...
2,BANTEN,https://pemilu2024.kpu.go.id/pilpres/hitung-su...
3,BENGKULU,https://pemilu2024.kpu.go.id/pilpres/hitung-su...
4,DAERAH ISTIMEWA YOGYAKARTA,https://pemilu2024.kpu.go.id/pilpres/hitung-su...


In [10]:
df.head()

Unnamed: 0,Wilayah,"H. ANIES RASYID BASWEDAN, Ph.D. - Dr. (H.C.) H. A. MUHAIMIN ISKANDAR",H. PRABOWO SUBIANTO - GIBRAN RAKABUMING RAKA,"H. GANJAR PRANOWO, S.H., M.I.P. - Prof. Dr. H. M. MAHFUD MD",province
0,ACEH BARAT,57.655,18.065,1.523,ACEH
1,ACEH BARAT DAYA,40.696,7.064,880.0,ACEH
2,ACEH BESAR,164.506,22.023,2.829,ACEH
3,ACEH JAYA,35.88,6.151,766.0,ACEH
4,ACEH SELATAN,90.274,16.478,1.857,ACEH


In [11]:
df = process_data(df)

In [12]:
df.head()

Unnamed: 0,refreshed_time,province,kabupaten_kota,candidate_1_votes,candidate_2_votes,candidate_3_votes,total_votes
0,2024-02-20 23:15:39.355539,ACEH,ACEH BARAT,57655.0,18065.0,1523.0,77243.0
1,2024-02-20 23:15:39.355539,ACEH,ACEH BARAT DAYA,40696.0,7064.0,880.0,48640.0
2,2024-02-20 23:15:39.355539,ACEH,ACEH BESAR,164506.0,22023.0,2829.0,189358.0
3,2024-02-20 23:15:39.355539,ACEH,ACEH JAYA,35880.0,6151.0,766.0,42797.0
4,2024-02-20 23:15:39.355539,ACEH,ACEH SELATAN,90274.0,16478.0,1857.0,108609.0


In [13]:
# Export to CSV
df.to_csv("data_pilpres_perprovince.csv", index=False)