# Scraping Pilpres 2024 Data

## Import Libraries & Setup

In [55]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from datetime import datetime

In [56]:
# Browser configuration shared by every scraping call
chrome_options = webdriver.ChromeOptions()

# 'normal' page-load strategy is set explicitly here
chrome_options.page_load_strategy = 'normal'

# Command-line switches, applied in this order:
#   --headless               run without a visible browser window
#   --no-sandbox             passed straight through to Chrome
#   --disable-dev-shm-usage  passed straight through to Chrome
for chrome_flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(chrome_flag)

## Functions

In [57]:
def get_data(url, timeout=30, table_xpath=None):
    """Open ``url`` in Chrome and return the results table as a DataFrame.

    A fresh browser session is started for every call and is always shut
    down before returning, even if loading or waiting fails.

    Parameters
    ----------
    url : str
        Page to load.
    timeout : int or float, optional
        Seconds to wait for the table element to be present in the DOM.
        Defaults to 30.
    table_xpath : str, optional
        XPath of the ``<table>`` element to read. When omitted, the absolute
        path below is used; it is tied to the page's current layout.

    Returns
    -------
    pandas.DataFrame
        The first table row supplies the column names; each following row
        becomes one record. All values are the cells' visible text.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the table is not present within ``timeout`` seconds.
    """
    if table_xpath is None:
        table_xpath = "/html/body/div/div[1]/div/div[2]/div/div[2]/div[3]/div/div/table"

    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)

        table_element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, table_xpath))
        )

        # Cell text must be read while the session is still alive, so the
        # whole extraction happens inside the try block.
        data = []
        for row in table_element.find_elements(By.XPATH, ".//tr"):
            # Data rows use <td>; fall back to <th> for header rows.
            cells = row.find_elements(By.XPATH, ".//td")
            if not cells:
                cells = row.find_elements(By.XPATH, ".//th")
            data.append([cell.text for cell in cells])
    finally:
        # Always release the browser process, including on timeout/errors.
        driver.quit()

    # The first extracted row is treated as the header row.
    headers = data[0] if data else []
    return pd.DataFrame(data[1:], columns=headers)

In [58]:
def process_data(df):
    """Return a cleaned copy of the scraped table; the input is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with exactly four columns, in order: a province label followed
        by three vote counts. Vote counts are strings that may contain ``.``
        separators (e.g. ``"1.234.567"``).

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``province``, ``candidate_1_votes``,
        ``candidate_2_votes``, ``candidate_3_votes``, ``refreshed_time``
        and ``total_votes``.

    Raises
    ------
    ValueError
        If ``df`` does not have exactly four columns, or a vote cell cannot
        be parsed as an integer.
    """
    vote_columns = ['candidate_1_votes', 'candidate_2_votes', 'candidate_3_votes']
    new_column_names = ['province'] + vote_columns

    if df.shape[1] != len(new_column_names):
        raise ValueError(
            f"expected {len(new_column_names)} columns, got {df.shape[1]}: {list(df.columns)}"
        )

    # Work on a copy so the caller's raw frame survives and re-running is safe.
    out = df.copy()
    out.columns = new_column_names

    # Single timestamp shared by every row of this refresh.
    out['refreshed_time'] = datetime.now()

    # Strip the '.' separators, then convert to integers.
    for col in vote_columns:
        out[col] = out[col].str.replace('.', '', regex=False).astype('int64')

    out['total_votes'] = (
        out['candidate_1_votes'] + out['candidate_2_votes'] + out['candidate_3_votes']
    )

    # Keep only the first line of the province cell text.
    out['province'] = out['province'].str.split('\n').str[0]

    return out

## Main

In [59]:
# Target page to scrape: the site root of pemilu2024.kpu.go.id.
# The commented-out URL below is an alternative pilpres/hitung-suara page kept for reference.
#url = "https://pemilu2024.kpu.go.id/pilpres/hitung-suara/35"
url = "https://pemilu2024.kpu.go.id/"

# Scrape the raw table, then rename/convert its columns; `df` is rebound to the processed frame.
df = get_data(url)
df = process_data(df)

In [60]:
# Sanity check: column dtypes and non-null counts of the processed frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   province           40 non-null     object        
 1   candidate_1_votes  40 non-null     int64         
 2   candidate_2_votes  40 non-null     int64         
 3   candidate_3_votes  40 non-null     int64         
 4   refreshed_time     40 non-null     datetime64[ns]
 5   total_votes        40 non-null     int64         
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 2.0+ KB


In [61]:
# Preview the first rows of the processed frame
df.head()

Unnamed: 0,province,candidate_1_votes,candidate_2_votes,candidate_3_votes,refreshed_time,total_votes
0,Total,23623901,57085817,16667381,2024-02-20 11:00:12.525155,97377099
1,ACEH,1431911,400311,42865,2024-02-20 11:00:12.525155,1875087
2,BALI,31648,520553,449590,2024-02-20 11:00:12.525155,1001791
3,BANTEN,1265143,2123523,357708,2024-02-20 11:00:12.525155,3746374
4,BENGKULU,198441,769378,128666,2024-02-20 11:00:12.525155,1096485


In [62]:
# `driver` is created as a local variable inside get_data(), so no
# notebook-level `driver` is guaranteed to exist; calling driver.quit()
# unconditionally raises NameError on a fresh kernel. Only close a browser
# if one was actually created at notebook scope.
if "driver" in globals():
    driver.quit()