In [65]:
# import the library
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [66]:
# Download the Kompas article that lists the provincial minimum wages (UMP).
# A timeout keeps the cell from hanging forever if the server never answers,
# and raise_for_status() stops the notebook on a 4xx/5xx reply instead of
# letting an error page be parsed as if it were the article.
page = requests.get(
    'https://www.kompas.com/tren/read/2022/11/30/083000165/daftar-lengkap-ump-2023-di-34-provinsi-seluruh-indonesia?page=all',
    timeout=30,
)
page.raise_for_status()

# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(page.content, 'html.parser')

In [67]:
# check the response status
# for the meaning of each status code, see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
print(page)

<Response [200]>


A. SCRAPE PROVINCE NAME

In [68]:
# collect every <p> element of the parsed page; the next cell looks for a
# <strong> tag inside each of them
lists = soup.find_all("p")

In [69]:
# For every <p> element, record the text of its first <strong> tag, or the
# placeholder 'none text' when the paragraph has no <strong> tag.
provinces = []

# The loop variable is deliberately not called `list`: that name would shadow
# the builtin for every cell executed afterwards in the same kernel.
for paragraph in lists:
    strong_tag = paragraph.find("strong")
    # find() returns None when nothing matches; test for that explicitly
    # rather than hiding every possible error behind a bare `except`.
    province = strong_tag.text if strong_tag is not None else 'none text'
    provinces.append(province)

In [70]:
# Turn the collected <strong> texts into a one-column frame: drop repeated
# values, then label the single column 'provinsi'.
df_provinces = (
    pd.DataFrame(provinces)
    .drop_duplicates()
    .set_axis(['provinsi'], axis=1)
)

In [71]:
# Keep only the rows whose first space-delimited token ends with '.', i.e.
# the token looks like the number of a numbered list item ("1.", "2.", ...).
df_provinces["number"] = (
    df_provinces["provinsi"]
    .apply(lambda text: text.split(" ")[0].endswith("."))
    .astype(bool)
)
# Index with the boolean column directly (no `== True`) and take an explicit
# copy so the column added below is written to an independent frame.
df_provinces = df_provinces.loc[df_provinces["number"]].copy()

# Clean the province name: remove the ' (NAD)' suffix and the leading
# "<number>. " prefix. maxsplit=1 splits on the FIRST ". " only, so a name
# that itself contains ". " is kept whole instead of being truncated.
df_provinces["provinsi2"] = df_provinces["provinsi"].apply(
    lambda text: text.replace(" (NAD)", "").split(". ", 1)[1]
)

B. SCRAPE UMP 2022

In [72]:
# collect every <ul> element of the parsed page; the next cell reads the
# first <li> of each one
lists2 = soup.find_all("ul")

In [73]:
# Text of the first <li> inside each <ul> collected in lists2.
l_ump2022 = [ul_tag.find("li").text for ul_tag in lists2]

In [74]:
# One-column frame of the first-<li> texts, with the column named 'ump_2022'.
df_ump2022 = pd.DataFrame(l_ump2022).set_axis(['ump_2022'], axis=1)

In [75]:
# Flag the rows whose text contains 'UMP 2022', store the flag in the
# 'remark_ump' column, and keep only the flagged rows.
mentions_ump_2022 = df_ump2022['ump_2022'].str.contains('UMP 2022')
df_ump2022["remark_ump"] = mentions_ump_2022
df_ump2022 = df_ump2022.loc[mentions_ump_2022]

In [76]:
# Take the text that follows the first "Rp", trim surrounding whitespace and
# remove the "." separators, then store the result as an integer column.
ump_2022_digits = df_ump2022["ump_2022"].map(
    lambda text: text.split("Rp")[1].strip().replace(".", "")
)
df_ump2022["ump_2022_adj"] = ump_2022_digits.astype(int)

C. SCRAPE UMP 2023

In [77]:
# collect every <ul> element of the parsed page (the same query that produced
# lists2); the next cell reads the last <li> of each one
lists3 = soup.find_all("ul")

In [78]:
# Text of the last <li> inside each <ul> collected in lists3.
l_ump2023 = [ul_tag.find_all("li")[-1].get_text() for ul_tag in lists3]

In [79]:
# One-column frame of the last-<li> texts, with the column named 'ump_2023'.
df_ump2023 = pd.DataFrame(l_ump2023).set_axis(['ump_2023'], axis=1)

In [80]:
# Flag the rows whose text contains 'UMP 2023: Rp', store the flag in the
# 'remark_ump' column, and keep only the flagged rows.
mentions_ump_2023 = df_ump2023['ump_2023'].str.contains('UMP 2023: Rp')
df_ump2023["remark_ump"] = mentions_ump_2023
df_ump2023 = df_ump2023.loc[mentions_ump_2023]


def _ump_2023_digits(text):
    """Return the digits of the amount found in one 'ump_2023' text."""
    # Drop everything from the first "(" onwards, then trim whitespace.
    before_parenthesis = text.split("(")[0].strip()
    # The amount is taken from the last space-separated token.
    amount_token = before_parenthesis.split(" ")[-1]
    # Remove the "Rp" marker and anything after the first ",".
    whole_part = amount_token.replace("Rp", "").split(",")[0]
    # Remove the "." separators so only digits remain.
    return whole_part.replace(".", "")


df_ump2023['ump_2023_adj'] = df_ump2023['ump_2023'].map(_ump_2023_digits).astype(int)

D. COMPILE DATA FRAME

In [81]:
# Renumber each cleaned column from 0 so the three can be paired row by row.
# reset_index(drop=True) yields a real Series directly; the previous
# squeeze()/reset_index()/drop() chain failed on a one-row column because
# squeeze() then returns a scalar.
prv_series = df_provinces['provinsi2'].reset_index(drop=True)
ump2022_series = df_ump2022['ump_2022_adj'].reset_index(drop=True)
ump2023_series = df_ump2023['ump_2023_adj'].reset_index(drop=True)

# The three columns were filtered independently and are matched purely by
# position, so a length mismatch would silently produce NaN rows or pair a
# province with the wrong figures. Fail loudly instead.
row_counts = {
    "provinsi": len(prv_series),
    "ump_2022": len(ump2022_series),
    "ump_2023": len(ump2023_series),
}
if len(set(row_counts.values())) != 1:
    raise ValueError(
        f"scraped columns differ in length and cannot be paired by position: {row_counts}"
    )

df_ump_indonesia = pd.DataFrame({
    "provinsi": prv_series,
    "ump_2022": ump2022_series,
    "ump_2023": ump2023_series,
})

# Absolute increase and relative increase (as a fraction, e.g. 0.078) from
# the 2022 figure to the 2023 figure.
df_ump_indonesia["rp_kenaikan"] = df_ump_indonesia["ump_2023"] - df_ump_indonesia["ump_2022"]
df_ump_indonesia["%_kenaikan"] = df_ump_indonesia["ump_2023"] / df_ump_indonesia["ump_2022"] - 1
df_ump_indonesia

Unnamed: 0,provinsi,ump_2022,ump_2023,rp_kenaikan,%_kenaikan
0,Nanggroe Aceh Darussalam,3166460,3413666,247206,0.07807
1,Sumatera Utara,2522609,2710493,187884,0.07448
2,Sumatera Barat,2512539,2742476,229937,0.091516
3,Riau,2938564,3191662,253098,0.08613
4,Jambi,2649034,2943000,293966,0.110971
5,Sumatera Selatan,3144446,3404177,259731,0.0826
6,Bengkulu,2238094,2418280,180186,0.080509
7,Lampung,2440486,2633284,192798,0.079
8,Bangka Belitung,3264884,3498479,233595,0.071548
9,Kepulauan Riau,3050172,3279194,229022,0.075085


E. SAVE DATAFRAME TO CSV

In [82]:
# write the compiled table to a CSV file in the working directory, without
# the row index
# NOTE(review): the file name spells "Indoensia" -- confirm whether anything
# depends on this exact name before correcting it
df_ump_indonesia.to_csv('Data Kenaikan UMP Indoensia 2023.csv', index=False)