In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd

url="https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
page = urlopen(url)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")

In [2]:
# Print the actual HTML code of the page

print (soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of countries by GDP (nominal) - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"988525b8-fcbf-4d24-bf96-5d7732a62373","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_countries_by_GDP_(nominal)","wgTitle":"List of countries by GDP (nominal)","wgCurRevisionId":990278580,"wgRevisionId":990278580,"wgArticleId":380845,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pages using the EasyTimeline extension","Wikipedia indefinitely semi-protected pages","Ar

In [3]:
# Find all the images in the page (this could be useful maybe)

soup.find_all("img")

[<img alt="Page semi-protected" data-file-height="512" data-file-width="512" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/30px-Semi-protection-shackle.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/40px-Semi-protection-shackle.svg.png 2x" width="20"/>,
 <img alt="" data-file-height="40" data-file-width="40" decoding="async" height="40" src="//upload.wikimedia.org/wikipedia/en/thumb/b/b4/Ambox_important.svg/40px-Ambox_important.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/b/b4/Ambox_important.svg/60px-Ambox_important.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/b/b4/Ambox_important.svg/80px-Ambox_important.svg.png 2x" width="40"/>,
 <img alt="" data-file-height="48" data-file-width="48" decoding="async" height="40" src="//upload.wikimedia.

In [4]:
# Extract the title of the page

print (soup.title)
print (soup.title.text)

<title>List of countries by GDP (nominal) - Wikipedia</title>
List of countries by GDP (nominal) - Wikipedia


In [5]:
# The data that we are interested in appears to be in some sort of
# table.  Look for all of the table elements on the page, and print
# out their attributes.

for element in soup.find_all('table'):
    print (element.attrs)

{'class': ['box-Multiple_issues', 'plainlinks', 'metadata', 'ambox', 'ambox-content', 'ambox-multiple_issues', 'compact-ambox'], 'role': 'presentation'}
{'class': ['box-Confusing', 'plainlinks', 'metadata', 'ambox', 'ambox-style', 'ambox-confusing'], 'role': 'presentation'}
{'class': ['box-Original_research', 'plainlinks', 'metadata', 'ambox', 'ambox-content', 'ambox-Original_research'], 'role': 'presentation'}
{'class': ['box-POV-check', 'plainlinks', 'metadata', 'ambox', 'ambox-content'], 'role': 'presentation'}
{'border': '0', 'cellpadding': '2', 'cellspacing': '0', 'style': 'float:right;'}
{'width': '100%'}
{'class': ['wikitable'], 'style': 'margin:auto; width:100%;'}
{'class': ['wikitable', 'sortable'], 'style': 'margin-left:auto; margin-right:auto; margin-top:0;'}
{'class': ['wikitable', 'sortable'], 'style': 'margin-left:auto; margin-right:auto; margin-top:0;'}
{'class': ['wikitable', 'sortable'], 'style': 'margin-left:auto; margin-right:auto; margin-top:0;'}
{'class': ['nowrapl

In [6]:
# We see that there are some elements known as 'wikitables'
#
# Let's extract them from the page and store them in a new object

gdp_table = soup.find("table", attrs={"class": "wikitable"})

print (gdp_table.prettify())


<table class="wikitable" style="margin:auto; width:100%;">
 <tbody>
  <tr>
   <td style="width:33%; text-align:center">
    <b>
     Per the
     <a href="/wiki/International_Monetary_Fund" title="International Monetary Fund">
      International Monetary Fund
     </a>
     (2020 estimates)
    </b>
    <sup class="reference" id="cite_ref-GDP_IMF_1-2">
     <a href="#cite_note-GDP_IMF-1">
      [1]
     </a>
    </sup>
   </td>
   <td style="width:33%; text-align:center;">
    <b>
     Per the
     <a href="/wiki/World_Bank" title="World Bank">
      World Bank
     </a>
     (2019)
    </b>
    <sup class="reference" id="cite_ref-worldbank_21-0">
     <a href="#cite_note-worldbank-21">
      [20]
     </a>
    </sup>
   </td>
   <td style="width:33%; text-align:center;">
    <b>
     Per the
     <a href="/wiki/United_Nations" title="United Nations">
      United Nations
     </a>
     (2018)
    </b>
    <sup class="reference" id="cite_ref-22">
     <a href="#cite_note-22">
      [2

In [18]:
# We see that the data in the tables are contained within a <tr> tag

gdp_table_data = gdp_table.tbody.find_all("tr")  # contains 2 rows

print (gdp_table_data)

[<tr>
<td style="width:33%; text-align:center"><b>Per the <a href="/wiki/International_Monetary_Fund" title="International Monetary Fund">International Monetary Fund</a> (2020 estimates)</b><sup class="reference" id="cite_ref-GDP_IMF_1-2"><a href="#cite_note-GDP_IMF-1">[1]</a></sup>
</td>
<td style="width:33%; text-align:center;"><b>Per the <a href="/wiki/World_Bank" title="World Bank">World Bank</a> (2019)</b><sup class="reference" id="cite_ref-worldbank_21-0"><a href="#cite_note-worldbank-21">[20]</a></sup>
</td>
<td style="width:33%; text-align:center;"><b>Per the <a href="/wiki/United_Nations" title="United Nations">United Nations</a> (2018)</b><sup class="reference" id="cite_ref-22"><a href="#cite_note-22">[21]</a></sup>
</td></tr>, <tr valign="top">
<td>
<table class="wikitable sortable" style="margin-left:auto; margin-right:auto; margin-top:0;">
<tbody><tr>
<th data-sort-type="number" style="width:2em;">Rank</th>
<th>Country/Territory</th>
<th>GDP<br/>(US$million)
</th></tr>
<tr

In [8]:
# Get all the headings of Lists
headings = []
for td in gdp_table_data[0].find_all("td"):
    # remove any newlines and extra spaces from left and right
    headings.append(td.b.text.replace('\n', ' ').strip())

print(headings)

['Per the International Monetary Fund (2020 estimates)', 'Per the World Bank (2019)', 'Per the United Nations (2018)']


In [36]:
data = []
for table, heading in zip(gdp_table_data[1].find_all("table"), headings):
    # Get headers of table i.e., Rank, Country, GDP.
    t_headers = []
    for th in table.find_all("th"):
        # remove any newlines and extra spaces from left and right
        t_headers.append(th.text.replace('\n', ' ').strip())
    # Get all the rows of table
    table_data = []
    for tr in table.tbody.find_all("tr"): # find all tr's from table's tbody
        t_row = []
        # Each table row is stored in the form of
        # t_row = {'Rank': '', 'Country/Territory': '', 'GDP(US$million)': ''}

        # find all td's(3) in tr and zip it with t_header
        for td, th in zip(tr.find_all("td"), t_headers): 
            t_row.append(td.text.replace('\n', '').strip())
        table_data.append(t_row)

    # Put the data for the table with his heading.
    data.append(table_data)

print (data)

[[[], ['', 'World[19]', '83,844,988'], ['1', 'United States', '20,807,269'], ['—', 'European Union[22][n 1]', '14,926,538'], ['2', 'China[n 2][a]', '14,860,775'], ['3', 'Japan', '4,910,580'], ['4', 'Germany', '3,780,553'], ['5', 'United Kingdom', '2,638,296'], ['6', 'India', '2,592,583'], ['7', 'France', '2,551,451'], ['8', 'Italy', '1,848,222'], ['9', 'Canada', '1,600,264'], ['10', 'South Korea', '1,586,786'], ['11', 'Russia[n 3]', '1,464,078'], ['12', 'Brazil', '1,363,767'], ['13', 'Australia', '1,334,688'], ['14', 'Spain', '1,247,464'], ['15', 'Indonesia', '1,088,768'], ['16', 'Mexico', '1,040,372'], ['17', 'Netherlands', '886,339'], ['18', 'Switzerland', '707,868'], ['19', 'Saudi Arabia', '680,897'], ['20', 'Turkey', '649,436'], ['—', 'Taiwan', '635,547'], ['21', 'Iran', '610,662'], ['22', 'Poland', '580,894'], ['23', 'Sweden', '529,054'], ['24', 'Thailand', '509,200'], ['25', 'Belgium', '503,416'], ['26', 'Nigeria', '442,976'], ['27', 'Austria', '432,894'], ['28', 'Ireland', '399,

In [39]:
country = []
gdp = []
rank = []

for table in data:
    print (topic)
    # Each 3 table has headers as following

    for row in table:
        if row:
            print(row)
            country.append(row[1])
            rank.append(row[0])
            gdp.append(row[2])
            
        
df = pd.DataFrame({'Country':country, 'GDP':gdp, 'Rank':rank})

Per the United Nations (2018)
['', 'World[19]', '83,844,988']
['1', 'United States', '20,807,269']
['—', 'European Union[22][n 1]', '14,926,538']
['2', 'China[n 2][a]', '14,860,775']
['3', 'Japan', '4,910,580']
['4', 'Germany', '3,780,553']
['5', 'United Kingdom', '2,638,296']
['6', 'India', '2,592,583']
['7', 'France', '2,551,451']
['8', 'Italy', '1,848,222']
['9', 'Canada', '1,600,264']
['10', 'South Korea', '1,586,786']
['11', 'Russia[n 3]', '1,464,078']
['12', 'Brazil', '1,363,767']
['13', 'Australia', '1,334,688']
['14', 'Spain', '1,247,464']
['15', 'Indonesia', '1,088,768']
['16', 'Mexico', '1,040,372']
['17', 'Netherlands', '886,339']
['18', 'Switzerland', '707,868']
['19', 'Saudi Arabia', '680,897']
['20', 'Turkey', '649,436']
['—', 'Taiwan', '635,547']
['21', 'Iran', '610,662']
['22', 'Poland', '580,894']
['23', 'Sweden', '529,054']
['24', 'Thailand', '509,200']
['25', 'Belgium', '503,416']
['26', 'Nigeria', '442,976']
['27', 'Austria', '432,894']
['28', 'Ireland', '399,064']


In [40]:
df.head(1000)

Unnamed: 0,Country,GDP,Rank
0,World[19],83844988,
1,United States,20807269,1
2,European Union[22][n 1],14926538,—
3,China[n 2][a],14860775,2
4,Japan,4910580,3
...,...,...,...
598,Marshall Islands,214,190
599,Kiribati,189,191
600,Nauru,127,192
601,Montserrat,64,—
