# Activity 5.01: Reading Tabular Data from a Web Page and Creating DataFrames

In [1]:
import urllib
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Download the Wikipedia GDP page and decode it as UTF-8 text.
# The context manager closes the HTTP response even if reading or
# decoding raises, instead of leaving the connection open.
with urllib.request.urlopen("https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)") as page:
    page_content = page.read().decode('utf-8')

In [3]:
# Parse the downloaded HTML with Python's built-in parser backend.
soup = BeautifulSoup(markup=page_content, features='html.parser')

In [17]:
# Collect the GDP tables by their CSS class string.
# `find_all` is the current Beautiful Soup 4 name; `findAll` is a
# deprecated BS3-era alias.
tables = soup.find_all('table', class_="wikitable sortable static-row-numbers mw-datatable")
print(len(tables))


3


In [43]:
def get_dataframe(table):
    """Convert a parsed HTML table into a DataFrame of raw cell text.

    The first ``<tr>`` supplies the column names (text of its ``<th>``
    cells); each later ``<tr>`` contributes one row made of the text of
    its ``<td>`` cells. Cell text is returned unmodified (no stripping or
    type conversion).

    Parameters
    ----------
    table : object
        A Beautiful Soup element (or anything exposing ``find_all`` and
        whose children expose ``find_all`` / ``get_text``).

    Returns
    -------
    pandas.DataFrame
        One row per data row that has at least one ``<td>`` cell. An
        empty DataFrame is returned when the table has no ``<tr>`` at all.
    """
    rows = table.find_all('tr')
    if not rows:
        # No header and no data: return an empty frame rather than
        # failing with IndexError on rows[0].
        return pd.DataFrame()

    header_row, *data_rows = rows
    columns = [th.get_text() for th in header_row.find_all('th')]

    data = []
    for tr in data_rows:
        cells = [td.get_text() for td in tr.find_all('td')]
        # Rows without any <td> (e.g. extra header rows made only of <th>)
        # would otherwise become all-None rows in the frame.
        if cells:
            data.append(cells)

    return pd.DataFrame(data, columns=columns)

In [7]:
import re

def clean_brackets(s):
    """Return ``s`` with every square-bracketed span removed.

    Each ``[...]`` span (brackets included) is deleted. The match is
    non-greedy, so separate bracketed spans are removed individually, and
    ``[\\s\\S]`` lets a span extend across line breaks.
    """
    # Raw string: "\[" and "\s" are invalid escapes in a normal string
    # literal and trigger a SyntaxWarning on recent Python versions.
    pattern = re.compile(r"\[[\s\S]*?\]")
    return pattern.sub('', s)

def clean_crosses(s):
    """Return ``s`` with every dagger character (``†``) removed."""
    return re.sub("†", '', s)

def clean_line_breaks(s):
    """Return ``s`` with every newline character removed."""
    # The regex engine interprets the two-character pattern \n as a newline.
    return re.sub("\\n", '', s)

def clean_commas(s):
    """Return ``s`` with every comma removed (e.g. thousands separators)."""
    return re.sub(",", '', s)

def tidy_up_dataframe(input_df):
    """Return a cleaned copy of a scraped two-column table.

    Steps, applied to a copy so ``input_df`` is left untouched:

    * first column: bracketed spans and dagger marks are removed;
    * second column: newlines and commas are removed, then the values
      are converted to integers;
    * column labels: newlines are removed.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame whose first two columns hold strings. Column labels are
        assumed to be unique, since columns are assigned by label.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame; the second column has an integer dtype.

    Raises
    ------
    ValueError
        If a value in the second column is not an integer literal after
        cleaning.
    """
    df = input_df.copy()
    first_col, second_col = df.columns[0], df.columns[1]

    df[first_col] = df[first_col].apply(clean_brackets).apply(clean_crosses)

    # Assign by label rather than `df.iloc[:, 1] = ...`: positional
    # assignment writes into the existing object column in place on
    # pandas 2.x, so the integer dtype from astype() would be lost.
    df[second_col] = (
        df[second_col]
        .apply(clean_line_breaks)
        .apply(clean_commas)
        .astype('int')
    )

    return df.rename(clean_line_breaks, axis='columns')

In [45]:
# Build one tidied DataFrame per scraped table, keeping the tables' order.
dfs = []
for table in tables:
    df = tidy_up_dataframe(get_dataframe(table))
    dfs.append(df)

In [47]:
# Preview the first and last rows of every frame.
for df in dfs:
    print(df.head(), df.tail(), sep="\n")

  Country/Territory  GDP(US$ M)
0             World    93889577
1     United States    22675271
2             China    16642318
3             Japan     5378136
4           Germany     4319286
       Country/Territory  GDP(US$ M)
191     Marshall Islands         234
192             Kiribati         231
193                Palau         229
194                Nauru         133
195               Tuvalu          57
  Country/Territory  GDP(US$ M)
0             World    87813420
1     United States    21433226
2             China    14342903
3             Japan     5081770
4           Germany     3861124
              Country/Territory  GDP(US$ M)
185                       Palau         268
186     Marshall Islands (2018)         221
187                    Kiribati         195
188                       Nauru         118
189                      Tuvalu          47
  Country/Territory  GDP(US$ M)
0             World    87461674
1     United States    21433226
2             China    14342933
3 