In [None]:
# import the library
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Fetch the article page and parse its HTML; every later cell scrapes `soup`.
page = requests.get(
    'https://www.kompas.com/tren/read/2022/11/30/083000165/daftar-lengkap-ump-2023-di-34-provinsi-seluruh-indonesia?page=all',
    timeout=30,  # requests has no default timeout; without one a stalled connection hangs the kernel
)
# Stop here on a 4xx/5xx answer instead of silently parsing an error page
# and failing with a confusing error several cells later.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

In [None]:
# check the response status
# meaning of the status codes: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
print(page)

A. SCRAPE PROVINCE NAME

In [None]:
# every <p> element of the parsed page; the next cell reads the <strong> text inside them
lists = soup.find_all("p")

In [None]:
# Text of the first <strong> tag of every paragraph that contains one.
provinces = []

# The loop variable is not called `list`, so the builtin stays usable in later cells.
for paragraph in lists:
    strong_tag = paragraph.find("strong")
    # find() returns None when the paragraph has no <strong>. Skip such paragraphs:
    # the previous bare `except: pass` followed by an unconditional append
    # re-appended the value of the preceding iteration, and raised NameError
    # when the very first paragraph had no <strong>.
    if strong_tag is None:
        continue
    provinces.append(strong_tag.text)

In [None]:
# province dataframe: the scraped texts, de-duplicated, in a single column 'provinsi'
df_provinces = (
    pd.DataFrame(provinces)
    .drop_duplicates()
    .set_axis(['provinsi'], axis=1)
)

In [None]:
# Keep only the numbered entries: rows whose first space-separated token
# ends with '.' (the "1.", "2.", ... of the numbered list).
df_provinces["number"] = df_provinces['provinsi'].apply(lambda x: x.split(' ')[0].endswith('.'))
# astype(bool) keeps the mask boolean even for an empty frame; .copy() makes the
# filtered frame independent, so adding a column below does not trigger
# pandas' SettingWithCopyWarning.
df_provinces = df_provinces[df_provinces["number"].astype(bool)].copy()

# Clean the province name: drop the ' (NAD)' suffix and the leading "<n>. " prefix.
# maxsplit=1 keeps everything after the list number, so a name that itself
# contains ". " (e.g. "Kep. Riau") is no longer cut at its first abbreviation.
df_provinces['provinsi2'] = df_provinces['provinsi'].apply(
    lambda x: x.replace(' (NAD)', '').split('. ', 1)[1]
)
df_provinces

B. SCRAPE UMP 2022

In [None]:
# every <ul> element of the parsed page; the next cell reads the first <li> of each
lists2 = soup.find_all("ul")
# NOTE(review): this prints every matched <ul> in full — consider len(lists2) or a short slice instead
print(lists2)

In [None]:
# Text of the first <li> of every <ul> on the page; rows that are not
# UMP 2022 entries are filtered out in a later cell.
l_ump2022 = []
for list2 in lists2:
    first_item = list2.find("li")
    # find() returns None for a <ul> without any <li>; skip it instead of
    # crashing with AttributeError on `.text`.
    if first_item is None:
        continue
    l_ump2022.append(first_item.text)
print(l_ump2022)

In [None]:
# one-column frame holding the scraped <li> texts, column named 'ump_2022'
df_ump2022 = pd.DataFrame(l_ump2022).set_axis(['ump_2022'], axis=1)

In [None]:
# Keep only the rows whose text contains 'UMP 2022'.
# na=False turns missing values into False, so the mask is always boolean
# and can be used directly (no `== True` comparison needed).
is_ump2022 = df_ump2022['ump_2022'].str.contains('UMP 2022', na=False)
df_ump2022["remark_ump"] = is_ump2022
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column added in the next cell does not raise SettingWithCopyWarning.
df_ump2022 = df_ump2022[is_ump2022].copy()
df_ump2022

In [None]:
def _ump2022_digits(text):
    """Return the part of `text` after the first 'Rp', stripped and with every '.' removed."""
    after_rp = text.split("Rp")[1]
    return after_rp.strip().replace(".", "")


# store the cleaned digit strings first, then cast that same column to int
df_ump2022["ump_2022_adj"] = df_ump2022["ump_2022"].apply(_ump2022_digits)
df_ump2022["ump_2022_adj"] = df_ump2022["ump_2022_adj"].astype(int)
df_ump2022["ump_2022_adj"]

C. SCRAPE UMP 2023

In [None]:
# every <ul> element of the parsed page (same query as `lists2`); the next cell reads the last <li> of each
lists3 = soup.find_all("ul")
# NOTE(review): this prints every matched <ul> in full — consider len(lists3) or a short slice instead
print(lists3)

In [None]:
# Text of the last <li> of every <ul> on the page; rows that are not
# UMP 2023 entries are filtered out in a later cell.
l_ump2023 = []
for list3 in lists3:
    items = list3.find_all("li")
    # A <ul> without any <li> yields an empty result; skip it instead of
    # crashing with IndexError on `[-1]`.
    if not items:
        continue
    l_ump2023.append(items[-1].get_text())

In [None]:
# one-column frame holding the scraped <li> texts, column named 'ump_2023'
df_ump2023 = pd.DataFrame(l_ump2023).set_axis(['ump_2023'], axis=1)

In [None]:
def _ump2023_digits(text):
    """Extract the whole-rupiah digits of the amount in `text` as a string.

    Steps, in order: keep what precedes the first '(', take the last
    space-separated token, remove 'Rp', drop everything from the first ','
    (the decimal part), and remove the '.' thousands separators.
    """
    before_paren = text.split("(")[0].strip()
    last_token = before_paren.split(" ")[-1]
    whole_part = last_token.replace("Rp", "").split(",")[0]
    return whole_part.replace(".", "")


# Keep only the rows whose text contains 'UMP 2023: Rp'.
# na=False keeps the mask strictly boolean, so no `== True` comparison is needed.
is_ump2023 = df_ump2023['ump_2023'].str.contains('UMP 2023: Rp', na=False)
df_ump2023["remark_ump"] = is_ump2023
# .copy() detaches the filtered rows from the unfiltered frame, so adding a
# column below does not raise SettingWithCopyWarning.
df_ump2023 = df_ump2023[is_ump2023].copy()

df_ump2023['ump_2023_adj'] = df_ump2023['ump_2023'].apply(_ump2023_digits).astype(int)
df_ump2023['ump_2023_adj']

D. COMPILE DATA FRAME

In [None]:
# Re-number the three cleaned columns from 0 so they can be paired by position.
# reset_index(drop=True) replaces the former squeeze().reset_index().drop(...)
# chain, which failed on a one-row column (squeeze() returns a scalar there).
prv_series = df_provinces['provinsi2'].reset_index(drop=True)
ump2022_series = df_ump2022['ump_2022_adj'].reset_index(drop=True)
ump2023_series = df_ump2023['ump_2023_adj'].reset_index(drop=True)

# The columns are matched row by row, so they must have the same length.
# Otherwise pandas pads the shorter ones with NaN and the provinces end up
# paired with the wrong wages without any error.
row_counts = {
    "provinsi": len(prv_series),
    "ump_2022": len(ump2022_series),
    "ump_2023": len(ump2023_series),
}
if len(set(row_counts.values())) != 1:
    raise ValueError(
        "scraped columns have different lengths and cannot be paired by position: {}".format(row_counts)
    )

df_ump_indonesia = pd.DataFrame({"provinsi": prv_series,
                                 "ump_2022": ump2022_series,
                                 "ump_2023": ump2023_series})

# absolute increase in rupiah
df_ump_indonesia["rp_kenaikan"] = df_ump_indonesia["ump_2023"] - df_ump_indonesia["ump_2022"]
# Increase as a percentage of the 2022 wage. The previous formula
# (ump_2023 / ump_2022) stored the ratio, e.g. 1.07, under a column named "%".
df_ump_indonesia["%_kenaikan"] = df_ump_indonesia["rp_kenaikan"] / df_ump_indonesia["ump_2022"] * 100
df_ump_indonesia

E. SAVE DATAFRAME TO CSV

In [29]:
# Write the compiled table to a CSV file (relative path), without the index column.
# NOTE(review): the file name spells 'Indoensia' — presumably a typo for 'Indonesia';
# confirm nothing reads this exact name before renaming it.
df_ump_indonesia.to_csv('Data Kenaikan UMP Indoensia 2023.csv', index=False)