In [662]:
from io import StringIO

import pandas as pd

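Method 1: Read the tables directly with `pandas.read_html`. Although convenient, this method comes with its own set of limitations.
Firstly, a web page may store content as HTML tables that do not appear as tables when the page is rendered, so the table you want is not always the first one returned.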

In [663]:
# Wikipedia page used for the first example (list of largest banks).
URL = 'https://en.wikipedia.org/wiki/List_of_largest_banks'

In [664]:
# pd.read_html fetches the page and returns a list of DataFrames, one per HTML table it finds.
tables = pd.read_html(URL)
# Keep the first table on the page.
df = tables[0]


In [665]:
# Preview the first rows of the largest-banks table.
df.head()

Unnamed: 0,Rank,Bank name,Total assets (2023) (US$ billion)
0,1,Industrial and Commercial Bank of China,6303.44
1,2,Agricultural Bank of China,5623.12
2,3,China Construction Bank,5400.28
3,4,Bank of China,4578.28
4,5,JPMorgan Chase,3875.39


In the following example we can see that the table also stores unwanted 
information, such as hyperlinks and citation/footnote markers (e.g. `[n 1]` and `[1][13]`).

In [666]:
# Wikipedia page used for the second example (countries by nominal GDP).
# Note: this reassigns URL, replacing the banks page used earlier.
URL = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)'

In [667]:
# Parse every HTML table on the GDP page into a list of DataFrames.
tables = pd.read_html(URL)
# NOTE(review): the positional index depends on the page's current layout —
# re-check it if the page is restructured.
df = tables[2] # the required table will have index 2


In [668]:
# Preview the raw GDP table; note the footnote markers such as "[n 1]" in the Year columns
# and the citation markers such as "[1][13]" in the column headers.
df.head()

Unnamed: 0_level_0,Country/Territory,UN region,IMF[1][13],IMF[1][13],World Bank[14],World Bank[14],United Nations[15],United Nations[15]
Unnamed: 0_level_1,Country/Territory,UN region,Forecast,Year,Estimate,Year,Estimate,Year
0,World,—,109529216,2024,100562011,2022,96698005,2021
1,United States,Americas,28781083,2024,25462700,2022,23315081,2021
2,China,Asia,18532633,[n 1]2024,17963171,[n 3]2022,17734131,[n 1]2021
3,Germany,Europe,4591100,2024,4072192,2022,4259935,2021
4,Japan,Asia,4110452,2024,4231141,2022,4940878,2021


Method 2: Use BeautifulSoup to extract the table and clean up the denotations with regular expressions.

In [669]:
import requests
from bs4 import BeautifulSoup


In this case I inspected the page with the browser's inspect tool to find the CSS selector that identifies the table of interest. 
The problem with this table is that the denotations (citation and footnote markers) are saved along with the data. 

To clean the data we use regular expressions to target the denotations found in the table. 

In [670]:
# Request the page that contains the desired table.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error (4xx/5xx) instead of
# silently parsing an error page.
response = requests.get(URL, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Select every <table> element that carries the "wikitable" CSS class
tables = soup.select('table.wikitable')

# Print the number of tables found
print("Number of tables:", len(tables))


Number of tables: 1


In [671]:
# Convert the selected table markup into a DataFrame.
# Passing a literal HTML string to pd.read_html is deprecated since pandas 2.1,
# so the markup is wrapped in a file-like StringIO object.
df = pd.read_html(StringIO(str(tables)))[0]


In [672]:
# Preview the table parsed from the BeautifulSoup selection; the footnote markers are still present.
df.head()

Unnamed: 0_level_0,Country/Territory,UN region,IMF[1][13],IMF[1][13],World Bank[14],World Bank[14],United Nations[15],United Nations[15]
Unnamed: 0_level_1,Country/Territory,UN region,Forecast,Year,Estimate,Year,Estimate,Year
0,World,—,109529216,2024,100562011,2022,96698005,2021
1,United States,Americas,28781083,2024,25462700,2022,23315081,2021
2,China,Asia,18532633,[n 1]2024,17963171,[n 3]2022,17734131,[n 1]2021
3,Germany,Europe,4591100,2024,4072192,2022,4259935,2021
4,Japan,Asia,4110452,2024,4231141,2022,4940878,2021


In [673]:
import re
# Function to remove citation denotations like [1], [2], etc.
def remove_citations(text):
    """Strip bracketed markers such as ``[1]`` or ``[n 1]`` from a string.

    Values that are not strings are handed back unchanged.
    """
    if not isinstance(text, str):
        return text
    # Non-greedy match so that each bracket pair is removed on its own.
    return re.sub(r'\[.*?\]', '', text)


In [674]:
# remove_citations works on one value at a time, so it is applied element-wise
# to every cell rather than column-wise with `apply`.
# `DataFrame.applymap` was renamed to `DataFrame.map` in pandas 2.1 (the old
# name is deprecated), so prefer `map` and fall back to `applymap` on older pandas.
if hasattr(pd.DataFrame, "map"):
    df = df.map(remove_citations)
else:
    df = df.applymap(remove_citations)

In [675]:
# The denotations are now removed from the cell values; the column headers
# still carry them (e.g. "IMF[1][13]").
df.head()

Unnamed: 0_level_0,Country/Territory,UN region,IMF[1][13],IMF[1][13],World Bank[14],World Bank[14],United Nations[15],United Nations[15]
Unnamed: 0_level_1,Country/Territory,UN region,Forecast,Year,Estimate,Year,Estimate,Year
0,World,—,109529216,2024,100562011,2022,96698005,2021
1,United States,Americas,28781083,2024,25462700,2022,23315081,2021
2,China,Asia,18532633,2024,17963171,2022,17734131,2021
3,Germany,Europe,4591100,2024,4072192,2022,4259935,2021
4,Japan,Asia,4110452,2024,4231141,2022,4940878,2021


In [676]:

# Extract the column headers (a two-level MultiIndex for this table)
cols = df.columns

# Apply the cleaning function to every level of every header label.
# Handling all levels (and a flat, single-level header) avoids silently
# truncating or mangling headers that are not exactly two levels deep,
# and the level names are carried over to the rebuilt index.
if isinstance(cols, pd.MultiIndex):
    cleaned_cols = pd.MultiIndex.from_tuples(
        [tuple(remove_citations(level) for level in col) for col in cols],
        names=cols.names,
    )
else:
    cleaned_cols = pd.Index([remove_citations(col) for col in cols], name=cols.name)

# Set the cleaned headers back on the DataFrame
df.columns = cleaned_cols

In [677]:
# Preview the frame after the column headers have been cleaned as well.
df.head()

Unnamed: 0_level_0,Country/Territory,UN region,IMF,IMF,World Bank,World Bank,United Nations,United Nations
Unnamed: 0_level_1,Country/Territory,UN region,Forecast,Year,Estimate,Year,Estimate,Year
0,World,—,109529216,2024,100562011,2022,96698005,2021
1,United States,Americas,28781083,2024,25462700,2022,23315081,2021
2,China,Asia,18532633,2024,17963171,2022,17734131,2021
3,Germany,Europe,4591100,2024,4072192,2022,4259935,2021
4,Japan,Asia,4110452,2024,4231141,2022,4940878,2021
