In [1]:
#import required libraries
import csv #to name the CSV quoting mode
from io import StringIO #to hand HTML markup to pandas as a file-like object

import pandas as pd #to analyze data
import requests #to handle  requests
from bs4 import BeautifulSoup #to parse HTML documents

In [2]:
#download the Wikipedia page; the HTML is carried in the response body
url = 'https://en.wikipedia.org/wiki/List_of_Indonesian_cities_by_population'
#a timeout stops this cell from hanging forever if the server never answers
response = requests.get(url, timeout=30)
#stop here on a 4xx/5xx reply instead of parsing an error page in later cells
response.raise_for_status()

In [3]:
#build a BeautifulSoup tree from the raw response bytes with the 'html.parser' backend
soup = BeautifulSoup(markup=response.content, features='html.parser')

In [4]:
#grab the first <table> that carries the CSS class 'wikitable'
#(a tag with several classes, e.g. 'wikitable sortable', still matches)
table = soup.find(name='table', class_='wikitable')
#echo the matched markup as the cell output
table

<table class="wikitable sortable static-row-numbers">
<tbody><tr>
<th>City</th>
<th>Province</th>
<th>Region</th>
<th><a href="/wiki/2020_Indonesian_census" title="2020 Indonesian census">2020<br/>census</a><br/><sup class="reference" id="cite_ref-citypopulation_2-0"><a href="#cite_note-citypopulation-2">[2]</a></sup></th>
<th><a href="/wiki/2010_Indonesian_census" title="2010 Indonesian census">2010<br/>census</a><br/><sup class="reference" id="cite_ref-citypopulation_2-1"><a href="#cite_note-citypopulation-2">[2]</a></sup></th>
<th>Change</th>
<th>Notes
</th></tr>
<tr>
<td><i><b><a href="/wiki/Jakarta" title="Jakarta">Jakarta</a></b></i></td>
<td><a class="mw-redirect" href="/wiki/Special_Capital_Region_of_Jakarta" title="Special Capital Region of Jakarta">Special Capital Region of Jakarta</a></td>
<td><a href="/wiki/Java" title="Java">Java</a></td>
<td style="text-align:right;">10,562,088
</td>
<td style="text-align:right;">9,607,787
</td>
<td style="text-align:right;"><span data-so

In [5]:
#read_html returns a list of DataFrames, one per <table> found in the markup;
#only a single table is passed in, so keep the first entry
#the markup is wrapped in StringIO because passing a literal HTML string to
#read_html is deprecated in recent pandas releases
df = pd.read_html(StringIO(str(table)))[0]

In [6]:
#show the first five scraped rows, before any cleaning
print(df.head(n=5))

       City                           Province   Region  2020census[2]  \
0   Jakarta  Special Capital Region of Jakarta     Java       10562088   
1  Surabaya                          East Java     Java        2874314   
2    Bekasi                          West Java     Java        2543676   
3   Bandung                          West Java     Java        2444160   
4     Medan                      North Sumatra  Sumatra        2435252   

   2010census[2]   Change                      Notes  
0        9607787   +9.93%                        NaN  
1        2765487   +3.94%                        NaN  
2        2334871   +8.94%  Satellite city of Jakarta  
3        2394873   +2.06%                        NaN  
4        2097610  +16.10%                        NaN  


In [7]:
#drop the "Notes" column, then give the two census columns readable names
#(the scraped headers carry the "[2]" footnote marker)
#fixes the earlier misspelling "Pupulation (2020)" so both names are consistent
df_clean = (
    df
    .drop(columns=["Notes"])
    .rename(columns={
        "2020census[2]": "Population (2020)",
        "2010census[2]": "Population (2010)",
    })
)

In [8]:
#show the first five rows of the cleaned frame
print(df_clean.head(n=5))

       City                           Province   Region  Pupulation (2020)  \
0   Jakarta  Special Capital Region of Jakarta     Java           10562088   
1  Surabaya                          East Java     Java            2874314   
2    Bekasi                          West Java     Java            2543676   
3   Bandung                          West Java     Java            2444160   
4     Medan                      North Sumatra  Sumatra            2435252   

   Population (2010)   Change  
0            9607787   +9.93%  
1            2765487   +3.94%  
2            2334871   +8.94%  
3            2394873   +2.06%  
4            2097610  +16.10%  


In [9]:
#write the cleaned frame to a CSV file without the row index
#csv.QUOTE_ALL (the value previously passed as the bare number 1) quotes every field
df_clean.to_csv(
    'List_of_Indonesian_cities_by_population.csv',
    index=False,
    encoding='utf-8',
    quoting=csv.QUOTE_ALL,
)