### Import Libraries

In [1]:
import pandas as pd
from bs4 import BeautifulSoup as bs4
import requests

### Begin Scraping

In [2]:
# Source page for the scrape.
url = 'https://en.wikipedia.org/wiki/Statistics_of_the_COVID-19_pandemic_in_Indonesia'
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors (4xx/5xx) here instead of
# letting an error page be parsed as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [3]:
# Parse the downloaded HTML and locate the first table that carries the
# "wikitable" class.
soup = bs4(response.text, 'html.parser')
table = soup.find('table', {'class':'wikitable'})
# soup.find() returns None when nothing matches; fail with a clear message
# rather than an AttributeError on the .find_all() call below.
if table is None:
    raise ValueError("No table with class 'wikitable' was found in the response.")
# Every <tr> inside the table, including the title and header rows.
table_rows = table.find_all('tr')
print(table_rows)

[<tr>
<th colspan="9" style="text-align:center;">COVID-19 cases in Indonesia<sup class="reference" id="cite_ref-fn0_1-0"><a href="#cite_note-fn0-1">[a]</a></sup><sup class="reference" id="cite_ref-fn1_2-0"><a href="#cite_note-fn1-2">[b]</a></sup>
</th></tr>, <tr>
<th>Province
</th>
<th>Cases
</th>
<th>Recoveries
</th>
<th>Deaths
</th>
<th>Active<sup class="reference" id="cite_ref-fn2_3-0"><a href="#cite_note-fn2-3">[c]</a></sup>
</th>
<th>Cases per<br/>100,000<br/> population<sup class="reference" id="cite_ref-fn3_4-0"><a href="#cite_note-fn3-4">[d]</a></sup>
</th>
<th>Recovery<br/>rate
</th>
<th>Fatality<br/>rate
</th>
<th>Official website
</th></tr>, <tr>
<th style="text-align:left;"><span typeof="mw:File"><a class="mw-file-description" href="/wiki/File:Coat_of_arms_of_Aceh.svg"><img class="mw-file-element" data-file-height="516" data-file-width="506" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/commons/thumb/4/41/Coat_of_arms_of_Aceh.svg/20px-Coat_of_arms_of_Ac

### Extracting text from the `td` and `a` tags only

In [4]:
# One list of stripped cell texts per table row. Both <td> and <a> elements
# are collected, so a link nested inside a <td> contributes a second entry
# with the same text. The first <tr> (the table's title row) is skipped.
data = [
    [node.get_text(strip=True) for node in tr.find_all(['td', 'a'])]
    for tr in table_rows[1:]
]

# Echo the raw rows, one per line, for visual inspection.
for tr in table_rows:
    print(tr)

<tr>
<th colspan="9" style="text-align:center;">COVID-19 cases in Indonesia<sup class="reference" id="cite_ref-fn0_1-0"><a href="#cite_note-fn0-1">[a]</a></sup><sup class="reference" id="cite_ref-fn1_2-0"><a href="#cite_note-fn1-2">[b]</a></sup>
</th></tr>
<tr>
<th>Province
</th>
<th>Cases
</th>
<th>Recoveries
</th>
<th>Deaths
</th>
<th>Active<sup class="reference" id="cite_ref-fn2_3-0"><a href="#cite_note-fn2-3">[c]</a></sup>
</th>
<th>Cases per<br/>100,000<br/> population<sup class="reference" id="cite_ref-fn3_4-0"><a href="#cite_note-fn3-4">[d]</a></sup>
</th>
<th>Recovery<br/>rate
</th>
<th>Fatality<br/>rate
</th>
<th>Official website
</th></tr>
<tr>
<th style="text-align:left;"><span typeof="mw:File"><a class="mw-file-description" href="/wiki/File:Coat_of_arms_of_Aceh.svg"><img class="mw-file-element" data-file-height="516" data-file-width="506" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/commons/thumb/4/41/Coat_of_arms_of_Aceh.svg/20px-Coat_of_arms_of_Aceh.

### Converting the scraped data into a DataFrame for further cleaning

In [5]:
# Build the raw frame from the per-row lists. The rows have different lengths,
# so shorter rows end up with missing values in the trailing columns (see the
# non-null counts reported by df.info()). Columns get integer labels 0..10.
df = pd.DataFrame(data, dtype=str)
# Display the raw frame to see which columns/rows need cleaning.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,[c],[d],,,,,,,,,
1,,Aceh,44842,42515,2263,64,850,94.81%,5.05%,covid19.acehprov.go.id,covid19.acehprov.go.id
2,,Bali,172387,167356,4866,165,3993,97.08%,2.82%,infocorona.baliprov.go.id,infocorona.baliprov.go.id
3,,Bangka Belitung Islands,67135,65405,1651,79,4612,97.42%,2.46%,covid19.babelprov.go.id,covid19.babelprov.go.id
4,,Banten,364679,361040,2986,653,3063,99%,0.82%,infocorona.bantenprov.go.id,infocorona.bantenprov.go.id
5,,Bengkulu,29838,29257,528,53,1484,98.05%,1.77%,covid19.bengkuluprov.go.id,covid19.bengkuluprov.go.id
6,,Central Java,655026,617919,33956,3151,1794,94.34%,5.18%,corona.jatengprov.go.id,corona.jatengprov.go.id
7,,Central Kalimantan,59291,57683,1555,53,2221,97.29%,2.62%,corona.kalteng.go.id,corona.kalteng.go.id
8,,Central Sulawesi,62894,60983,1752,159,2106,96.96%,2.79%,dinkes.sultengprov.go.id/category/covid-19,dinkes.sultengprov.go.id/category/covid-19
9,,East Java,636284,603328,32224,732,1565,94.82%,5.06%,infocovid19.jatimprov.go.id,infocovid19.jatimprov.go.id


In [6]:
# Inspect column dtypes and non-null counts of the raw scraped frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       37 non-null     object
 1   1       36 non-null     object
 2   2       35 non-null     object
 3   3       35 non-null     object
 4   4       35 non-null     object
 5   5       35 non-null     object
 6   6       35 non-null     object
 7   7       35 non-null     object
 8   8       34 non-null     object
 9   9       34 non-null     object
 10  10      34 non-null     object
dtypes: object(11)
memory usage: 3.3+ KB


In [7]:
# (rows, columns) of the raw scraped frame.
df.shape

(37, 11)

### Dropping unused columns

In [8]:
# In the displayed rows, column 0 holds only a footnote marker or an empty
# string, and columns 9 and 10 both repeat the official-website link text,
# so these three positional columns are discarded.
df = df.drop([0, 9, 10], axis='columns')
df

Unnamed: 0,1,2,3,4,5,6,7,8
0,[d],,,,,,,
1,Aceh,44842,42515,2263,64,850,94.81%,5.05%
2,Bali,172387,167356,4866,165,3993,97.08%,2.82%
3,Bangka Belitung Islands,67135,65405,1651,79,4612,97.42%,2.46%
4,Banten,364679,361040,2986,653,3063,99%,0.82%
5,Bengkulu,29838,29257,528,53,1484,98.05%,1.77%
6,Central Java,655026,617919,33956,3151,1794,94.34%,5.18%
7,Central Kalimantan,59291,57683,1555,53,2221,97.29%,2.62%
8,Central Sulawesi,62894,60983,1752,159,2106,96.96%,2.79%
9,East Java,636284,603328,32224,732,1565,94.82%,5.06%


### Dropping unused rows

In [9]:
# Row 0 contains only a header footnote marker. Rows 35 and 36 are removed
# as well -- NOTE(review): their contents are not visible in the displayed
# output; presumably trailing non-province rows, confirm before relying on it.
df = df.drop(index=[0, 35, 36])
df

Unnamed: 0,1,2,3,4,5,6,7,8
1,Aceh,44842,42515,2263,64,850,94.81%,5.05%
2,Bali,172387,167356,4866,165,3993,97.08%,2.82%
3,Bangka Belitung Islands,67135,65405,1651,79,4612,97.42%,2.46%
4,Banten,364679,361040,2986,653,3063,99%,0.82%
5,Bengkulu,29838,29257,528,53,1484,98.05%,1.77%
6,Central Java,655026,617919,33956,3151,1794,94.34%,5.18%
7,Central Kalimantan,59291,57683,1555,53,2221,97.29%,2.62%
8,Central Sulawesi,62894,60983,1752,159,2106,96.96%,2.79%
9,East Java,636284,603328,32224,732,1565,94.82%,5.06%
10,East Kalimantan,214380,208517,5792,71,5692,97.27%,2.7%


### Column Mapping

In [10]:
# Indonesian header names for the remaining positional columns, listed in
# column order; enumerate(start=1) pairs them with the integer labels 1..8.
column_mapping = dict(enumerate(
    [
        'Provinsi',
        'Kasus',
        'Sembuh',
        'Meninggal',
        'Aktif',
        'Kasus Per 100.000 Populasi',
        'Tingkat Kesembuhan',
        'Tingkat Kematian',
    ],
    start=1,
))

In [11]:
# Replace the integer column labels with the descriptive names from
# column_mapping. Reassigning (instead of inplace=True) matches the
# `df = df.drop(...)` style used by the earlier cleaning cells and keeps the
# step chainable; labels missing from the frame are silently ignored.
df = df.rename(columns=column_mapping)

In [12]:
# Display the frame to confirm the renamed column headers.
df

Unnamed: 0,Provinsi,Kasus,Sembuh,Meninggal,Aktif,Kasus Per 100.000 Populasi,Tingkat Kesembuhan,Tingkat Kematian
1,Aceh,44842,42515,2263,64,850,94.81%,5.05%
2,Bali,172387,167356,4866,165,3993,97.08%,2.82%
3,Bangka Belitung Islands,67135,65405,1651,79,4612,97.42%,2.46%
4,Banten,364679,361040,2986,653,3063,99%,0.82%
5,Bengkulu,29838,29257,528,53,1484,98.05%,1.77%
6,Central Java,655026,617919,33956,3151,1794,94.34%,5.18%
7,Central Kalimantan,59291,57683,1555,53,2221,97.29%,2.62%
8,Central Sulawesi,62894,60983,1752,159,2106,96.96%,2.79%
9,East Java,636284,603328,32224,732,1565,94.82%,5.06%
10,East Kalimantan,214380,208517,5792,71,5692,97.27%,2.7%


### Removing %

In [17]:
# Strip the percent sign from both rate columns so they are stored
# consistently; the values remain strings (no numeric conversion here).
# The vectorised .str accessor passes missing values through unchanged,
# whereas apply(lambda x: x.replace(...)) raises on a non-string value.
# regex=False treats '%' as a literal; re-running the cell is a no-op.
for rate_column in ('Tingkat Kesembuhan', 'Tingkat Kematian'):
    df[rate_column] = df[rate_column].str.replace('%', '', regex=False)
df['Tingkat Kesembuhan']

1     94.81
2     97.08
3     97.42
4        99
5     98.05
6     94.34
7     97.29
8     96.96
9     94.82
10    97.27
11    98.27
12    95.63
13    98.83
14    97.56
15    94.14
16    98.32
17    98.08
18    97.43
19    97.29
20    97.76
21    98.64
22    97.01
23    97.22
24    96.94
25    98.22
26    95.75
27    97.62
28    97.05
29    97.89
30    98.11
31    97.33
32    98.61
33    97.39
34    97.55
Name: Tingkat Kesembuhan, dtype: object

### Export to CSV

In [14]:
# Destination of the cleaned dataset, relative to the notebook's working
# directory.
file_path = './covid19_stats_indonesia.csv'
# index=False: the frame's index is just the leftover scrape row number, and
# writing it would add an unnamed first column to the CSV that shows up as
# "Unnamed: 0" when the file is read back.
df.to_csv(file_path, index=False)
print(f"The cleaned dataset has been exported to '{file_path}'.")

The cleaned dataset has been exported to './covid19_stats_indonesia.csv'.
