# Scraping Pilpres 2024 Data

## Import Libraries & Setup

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from datetime import datetime

In [2]:
# setup chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")  # ensure GUI is off
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.page_load_strategy = 'normal'

## Functions

In [3]:
def get_data(province, url):
    # set up the webdriver
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(url)
    
    table_xpath = "/html/body/div/div[1]/div/div[3]/div[2]/div[2]/div[2]/div/div/table"
    table_element = WebDriverWait(driver, 100).until(
        EC.presence_of_element_located((By.XPATH, table_xpath))
    )

    # Now that the table is loaded, extract the data
    # This example assumes the table rows are direct children of the table_element located by the XPath
    rows = table_element.find_elements(By.XPATH, ".//tr")
    data = []
    for row in rows:
        # Extract text from each cell in the row
        cells = row.find_elements(By.XPATH, ".//td")
        if not cells:  # If the first row consists of headers <th>
            cells = row.find_elements(By.XPATH, ".//th")
        data.append([cell.text for cell in cells])

    # Assuming the first row contains headers
    headers = data[0] if data else []
    data_rows = data[1:] if len(data) > 1 else []

    # Convert to DataFrame
    df = pd.DataFrame(data_rows, columns=headers)
    df['province'] = province
    df = df[df['Wilayah'] != 'Total\nProgress']
    # Clean Wilayah name
    df['Wilayah'] = df['Wilayah'].str.split('\n').str[0]
    
    return df

In [4]:
def process_data(df):
    # Rename columns
    new_column_names = ['province','kabupaten_kota','candidate_1_votes','candidate_2_votes','candidate_3_votes']
    df.columns = new_column_names
    
    # Add refreshed time
    current_time = datetime.now()
    df['refreshed_time'] = current_time
    
    # Correct the datatype
    columns_to_convert = ['candidate_1_votes', 'candidate_2_votes', 'candidate_3_votes']
    for col in columns_to_convert:
        df[col] = df[col].apply(lambda x: int(x.replace('.', '')) if x else None)

    # Add total votes column
    df['total_votes'] = df['candidate_1_votes'] + df['candidate_2_votes'] + df['candidate_3_votes']
    
    # Move refreshed_time to the left
    refreshed_column = df.pop('refreshed_time')
    df.insert(0, 'refreshed_time', province_column)
    
    return df

## Dataset

In [5]:
# Link dataset for scraping kabupaten kota data
link_scraping = pd.read_csv('link_scraping.csv')
link_scraping['province'] = link_scraping['province'].str.split('\r').str[0]
# Replace spaces with empty string only for 'P A P U A'
link_scraping['province'] = link_scraping['province'].apply(lambda x: x.replace(' ', '') if x == 'P A P U A' else x)
link_scraping.head()

Unnamed: 0,province,link
0,ACEH,https://pemilu2024.kpu.go.id/pilpres/hitung-su...
1,BALI,https://pemilu2024.kpu.go.id/pilpres/hitung-su...
2,BANTEN,https://pemilu2024.kpu.go.id/pilpres/hitung-su...
3,BENGKULU,https://pemilu2024.kpu.go.id/pilpres/hitung-su...
4,DAERAH ISTIMEWA YOGYAKARTA,https://pemilu2024.kpu.go.id/pilpres/hitung-su...


## Main

In [6]:
# Table xpath national and per province are different
#national_table_xpath = "/html/body/div/div[1]/div/div[2]/div/div[2]/div[3]/div/div/table"
#province_table_xpath = "/html/body/div/div[1]/div/div[3]/div[2]/div[2]/div[2]/div/div/table"

To get the province data, sometimes we encounter some errors, so we wrote this code below

In [7]:
# Initialize an empty DataFrame
df = pd.DataFrame()

# to save if there is error rows
error_rows = []

for index, row in link_scraping.iterrows():
    try:
        temp_df = get_data(row['province'], row['link'])
        # Check if temp_df is not empty before concatenating
        if not temp_df.empty:
            df = pd.concat([df, temp_df], ignore_index=True)
            print(row['province'], "success")
        else:
            print(row['province'], "has no data")
    except Exception as e:
        print(f"Error processing {row['province']}: {str(e)}")
        error_rows.append(row)

# Retry processing the rows that encountered errors
for row in error_rows:
    try:
        temp_df = get_data(row['province'], row['link'])
        if not temp_df.empty:
            df = pd.concat([df, temp_df], ignore_index=True)
            print(row['province'], "success on retry")
        else:
            print(row['province'], "has no data on retry")
    except Exception as e:
        print(f"Error processing {row['province']} on retry: {str(e)}")

ACEH success
BALI success
BANTEN success
BENGKULU success
DAERAH ISTIMEWA YOGYAKARTA success
DKI JAKARTA success
GORONTALO success
JAMBI success
JAWA BARAT success
JAWA TENGAH success
JAWA TIMUR has no data
KALIMANTAN BARAT success
KALIMANTAN SELATAN success
KALIMANTAN TENGAH success
KALIMANTAN TIMUR success
KALIMANTAN UTARA success
KEPULAUAN BANGKA BELITUNG success
KEPULAUAN RIAU success
LAMPUNG success
MALUKU success
MALUKU UTARA success
NUSA TENGGARA BARAT success
NUSA TENGGARA TIMUR success
PAPUA success
PAPUA BARAT success
PAPUA BARAT DAYA success
PAPUA PEGUNUNGAN success
PAPUA SELATAN success
PAPUA TENGAH success
RIAU success
SULAWESI BARAT success
SULAWESI SELATAN success
SULAWESI TENGAH success
SULAWESI TENGGARA has no data
SULAWESI UTARA success
SUMATERA BARAT success
SUMATERA SELATAN success
SUMATERA UTARA success
Luar Negeri success


In [8]:
df = df[df['province'].isin(link_scraping['province'])]

In [9]:
link_scraping.head()

Unnamed: 0,province,link
0,ACEH,https://pemilu2024.kpu.go.id/pilpres/hitung-su...
1,BALI,https://pemilu2024.kpu.go.id/pilpres/hitung-su...
2,BANTEN,https://pemilu2024.kpu.go.id/pilpres/hitung-su...
3,BENGKULU,https://pemilu2024.kpu.go.id/pilpres/hitung-su...
4,DAERAH ISTIMEWA YOGYAKARTA,https://pemilu2024.kpu.go.id/pilpres/hitung-su...


In [10]:
df.head()

Unnamed: 0,Wilayah,"H. ANIES RASYID BASWEDAN, Ph.D. - Dr. (H.C.) H. A. MUHAIMIN ISKANDAR",H. PRABOWO SUBIANTO - GIBRAN RAKABUMING RAKA,"H. GANJAR PRANOWO, S.H., M.I.P. - Prof. Dr. H. M. MAHFUD MD",province
0,ACEH BARAT,57.915,18.061,1.528,ACEH
1,ACEH BARAT DAYA,40.72,7.079,886.0,ACEH
2,ACEH BESAR,163.366,21.93,2.808,ACEH
3,ACEH JAYA,36.324,6.165,771.0,ACEH
4,ACEH SELATAN,89.183,16.232,1.774,ACEH


In [11]:
df = process_data(df)

ValueError: invalid literal for int() with base 10: 'ACEH'

In [None]:
df.info()

In [None]:
df.head()

In [None]:
# Don't forget to close the browser
driver.quit()