In [15]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [16]:
url = 'https://geographyfieldwork.com/WorldCapitalCities.htm'
# A timeout keeps this cell from hanging indefinitely if the server never answers.
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the data.
page.raise_for_status()
# NOTE(review): 'html' lets BeautifulSoup choose among the installed HTML parsers,
# so the parse tree may differ between environments -- consider pinning one
# (e.g. 'html.parser') after confirming the table is still parsed the same way.
soup = BeautifulSoup(page.text, 'html')

In [17]:
# Locate the first <table> element whose id attribute is 'anyid'.
table = soup.find('table', attrs={'id': 'anyid'})

In [18]:
# Collect the text of every <th> cell in the table, in document order.
column_names = [header_cell.text for header_cell in table.find_all('th')]

Let's check which column names we have.

In [19]:
# Show the header labels exactly as scraped from the page.
column_names

['Country', 'Capital City']

Let's rename the columns to follow our preferred snake_case naming convention.

In [20]:
# Replace the scraped header labels with snake_case names for the DataFrame columns.
column_names = [
    'country',
    'capital_city',
]

In [21]:
# Build one list per <tr>, holding the text of each of its <td> cells.
# A row without any <td> cells (such as a header row made of <th>) yields
# an empty list, which shows up later as an empty row in the DataFrame.
table_values = [
    [cell.text for cell in table_row.find_all('td')]
    for table_row in table.find_all('tr')
]

In [32]:
# Assemble the scraped rows into a DataFrame labelled with the chosen column names.
df_capitals = pd.DataFrame(data=table_values, columns=column_names)

In [33]:
# Inspect the raw scraped frame (pandas abbreviates the middle rows in the display).
df_capitals

Unnamed: 0,country,capital_city
0,,
1,Afghanistan,Kabul
2,Albania,Tirana (Tirane)
3,Algeria,Algiers
4,Andorra,Andorra la Vella
...,...,...
197,Wales[22],Cardiff
198,Yemen,Sana'a[23]
199,Zambia,Lusaka
200,Zimbabwe,Harare


We can see that the first and last rows contain no valid data. In addition, some names carry bracketed footnote markers such as `[22]`. Let's drop those rows and strip the markers.

In [34]:
# Remove the two rows without valid data (index labels 0 and 201), then
# renumber the index from 0.
# NOTE: this cell is not re-runnable on its own -- after the reset, label 201
# no longer exists and drop() raises KeyError. Re-create df_capitals first.
df_capitals = df_capitals.drop([0, 201]).reset_index(drop=True)
# Strip footnote markers such as "[22]" from every cell. The pattern is a raw
# string so that "\[" and "\]" reach the regex engine unchanged instead of
# being treated as (invalid) Python string escapes.
df_capitals = df_capitals.replace(r"\[[0-9]{1,2}\]", "", regex=True)

In [35]:
# Inspect the cleaned frame to confirm the empty rows and footnote markers are gone.
df_capitals

Unnamed: 0,country,capital_city
0,Afghanistan,Kabul
1,Albania,Tirana (Tirane)
2,Algeria,Algiers
3,Andorra,Andorra la Vella
4,Angola,Luanda
...,...,...
195,Vietnam,Hanoi
196,Wales,Cardiff
197,Yemen,Sana'a
198,Zambia,Lusaka


In [36]:
# Write the cleaned table to disk; index=False leaves the row index out of the file.
df_capitals.to_csv(path_or_buf='data/capitals.csv', index=False)