## Pandas Input and Output - HTML Tables

In [2]:
# NOTE: Websites display tables through <table> tags. A page often contains more than one table, and they may be in an unwanted format, so you usually have to clean the tables you need.

# NOTE: You also need to install the lxml library, which pandas uses to parse HTML and convert tables into DataFrames.

# NOTE: 1. Not every table on a website is available as an HTML table.
    # 2. Some websites may block your computer from scraping the HTML of the site through pandas.
    # 3. It may be more efficient to use an API.

import pandas as pd

In [3]:
# NOTE: We will work with the Wikipedia page on World Population. Keep in mind that the page must contain at least one table.

# NOTE: If the site blocks the request, open the page source in a browser, save it as a local file, and pass that saved file to pandas instead of the URL.

# Address of the page whose <table> elements we want to load.
url = 'https://en.wikipedia.org/wiki/World_population'

# read_html parses the page and returns one DataFrame per table it finds.
tables = pd.read_html(io=url)

In [4]:
len(tables)

25

In [5]:
tables[0]

Unnamed: 0,#,Most populous countries,2000,2015,2030[A]
0,1,China[B],1270,1376,1416
1,2,India,1053,1311,1528
2,3,United States,283,322,356
3,4,Indonesia,212,258,295
4,5,Pakistan,136,208,245
5,6,Brazil,176,206,228
6,7,Nigeria,123,182,263
7,8,Bangladesh,131,161,186
8,9,Russia,146,146,149
9,10,Mexico,103,127,148


In [6]:
# NOTE: As you can see, the table contains some data that is not useful or not needed, so we use what we have learned about pandas to clean it up. Also keep in mind that tables sometimes come back with a multi-index.

# First store the table in a working variable, then clean the data.
# This only binds a second name to tables[0]; no copy is made here.

world_pop = tables[0]

In [7]:
world_pop

Unnamed: 0,#,Most populous countries,2000,2015,2030[A]
0,1,China[B],1270,1376,1416
1,2,India,1053,1311,1528
2,3,United States,283,322,356
3,4,Indonesia,212,258,295
4,5,Pakistan,136,208,245
5,6,Brazil,176,206,228
6,7,Nigeria,123,182,263
7,8,Bangladesh,131,161,186
8,9,Russia,146,146,149
9,10,Mexico,103,127,148


In [8]:
# Promote the '#' column to be the row index.
world_pop = world_pop.set_index(keys='#')

In [9]:
world_pop

Unnamed: 0_level_0,Most populous countries,2000,2015,2030[A]
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,China[B],1270,1376,1416
2,India,1053,1311,1528
3,United States,283,322,356
4,Indonesia,212,258,295
5,Pakistan,136,208,245
6,Brazil,176,206,228
7,Nigeria,123,182,263
8,Bangladesh,131,161,186
9,Russia,146,146,149
10,Mexico,103,127,148


In [10]:
# Move the index back into an ordinary '#' column and restore the default integer row index.
world_pop = world_pop.reset_index(drop=False)

In [11]:
world_pop

Unnamed: 0,#,Most populous countries,2000,2015,2030[A]
0,1,China[B],1270,1376,1416
1,2,India,1053,1311,1528
2,3,United States,283,322,356
3,4,Indonesia,212,258,295
4,5,Pakistan,136,208,245
5,6,Brazil,176,206,228
6,7,Nigeria,123,182,263
7,8,Bangladesh,131,161,186
8,9,Russia,146,146,149
9,10,Mexico,103,127,148


In [12]:
# Remove the '#' column from the frame.
world_pop = world_pop.drop(columns='#')

In [13]:
world_pop

Unnamed: 0,Most populous countries,2000,2015,2030[A]
0,China[B],1270,1376,1416
1,India,1053,1311,1528
2,United States,283,322,356
3,Indonesia,212,258,295
4,Pakistan,136,208,245
5,Brazil,176,206,228
6,Nigeria,123,182,263
7,Bangladesh,131,161,186
8,Russia,146,146,149
9,Mexico,103,127,148


In [14]:
world_pop.columns

Index(['Most populous countries', '2000', '2015', '2030[A]'], dtype='object')

In [15]:
# Relabel the four columns positionally: a shorter country header, and the
# '2030[A]' footnote marker replaced by 'Est.'. set_axis returns a relabelled
# frame, so this step rebinds world_pop like the other cleaning cells instead
# of mutating the frame object that the earlier display cell still references.
world_pop = world_pop.set_axis(['Country', '2000', '2015', '2030 Est.'], axis=1)

In [16]:
world_pop

Unnamed: 0,Country,2000,2015,2030 Est.
0,China[B],1270,1376,1416
1,India,1053,1311,1528
2,United States,283,322,356
3,Indonesia,212,258,295
4,Pakistan,136,208,245
5,Brazil,176,206,228
6,Nigeria,123,182,263
7,Bangladesh,131,161,186
8,Russia,146,146,149
9,Mexico,103,127,148


In [18]:
# Drop the row labelled 11 if the scraped table has one. On the ten-row frame
# displayed in this notebook (labels 0-9) that label is absent and a plain
# drop() raises KeyError, so errors='ignore' keeps this cell safe to run on a
# fresh kernel and safe to re-run.
world_pop = world_pop.drop(11, axis=0, errors='ignore')

In [19]:
world_pop

Unnamed: 0,Country,2000,2015,2030 Est.
0,China[B],1270,1376,1416
1,India,1053,1311,1528
2,United States,283,322,356
3,Indonesia,212,258,295
4,Pakistan,136,208,245
5,Brazil,176,206,228
6,Nigeria,123,182,263
7,Bangladesh,131,161,186
8,Russia,146,146,149
9,Mexico,103,127,148


In [20]:
# NOTE: You can also save the cleaned data in HTML format.

# Write the cleaned table to an HTML file, leaving out the row index.
world_pop.to_html(buf='world_pop_table.html', index=False)

In [21]:
# Looking for a cleaner table

tables[5]

Unnamed: 0,Rank,Country,Population,Area(km2),Density(pop/km2)
0,1,Singapore,5704000,710,8033
1,2,Bangladesh,173540000,143998,1205
2,3,Palestine,5266785,6020,847
3,4,Lebanon,6856000,10452,656
4,5,Taiwan,23604000,36193,652
5,6,South Korea,51781000,99538,520
6,7,Rwanda,12374000,26338,470
7,8,Haiti,11578000,27065,428
8,9,Netherlands,17750000,41526,427
9,10,Israel,9590000,22072,434


In [22]:
# As you can see above, this table is already clean, so just give it a working name.

top_10 = tables[5]

In [23]:
top_10

Unnamed: 0,Rank,Country,Population,Area(km2),Density(pop/km2)
0,1,Singapore,5704000,710,8033
1,2,Bangladesh,173540000,143998,1205
2,3,Palestine,5266785,6020,847
3,4,Lebanon,6856000,10452,656
4,5,Taiwan,23604000,36193,652
5,6,South Korea,51781000,99538,520
6,7,Rwanda,12374000,26338,470
7,8,Haiti,11578000,27065,428
8,9,Netherlands,17750000,41526,427
9,10,Israel,9590000,22072,434


In [24]:
# Use the 'Rank' column as the row index.
top_10 = top_10.set_index(keys='Rank')

In [25]:
top_10

Unnamed: 0_level_0,Country,Population,Area(km2),Density(pop/km2)
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Singapore,5704000,710,8033
2,Bangladesh,173540000,143998,1205
3,Palestine,5266785,6020,847
4,Lebanon,6856000,10452,656
5,Taiwan,23604000,36193,652
6,South Korea,51781000,99538,520
7,Rwanda,12374000,26338,470
8,Haiti,11578000,27065,428
9,Netherlands,17750000,41526,427
10,Israel,9590000,22072,434


In [26]:
# Export the table to an HTML file, this time writing the row index as well.
top_10.to_html(buf='Top_10.html', index=True)