## Scrape data from HTML tables into a DataFrame using BeautifulSoup and Pandas

In [5]:
!pip install bs4
!pip install lxml
!pip install html5lib
!pip install pandas
!pip install requests

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [6]:
import warnings
warnings.simplefilter("ignore")

In [7]:
from io import StringIO  # wraps HTML text in a file-like object for pd.read_html

from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests  # this module helps us to download a web page

import pandas as pd

In [8]:
# The URL below points to a web page containing HTML tables with data about world population.
url = "https://en.wikipedia.org/wiki/World_population"

In [9]:
# Download the web page and keep the response in a variable called `data`
# (the page text is available afterwards as `data.text`).
# A browser-like User-Agent header is sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/91.0.4472.124 Safari/537.36"
}

# Without a timeout, requests can wait forever on an unresponsive server.
data = requests.get(url, headers=headers, timeout=30)

# Fail loudly on 4xx/5xx responses instead of parsing an error page below.
data.raise_for_status()

In [10]:
# Parse the downloaded page text with the "html.parser" backend.
soup = BeautifulSoup(data.text, features="html.parser")

In [13]:
# Collect every <table> element found in the parsed page.
tables = soup.find_all(name="table")

# Number of tables found.
len(tables)

26

In [14]:
# Find the position, within `tables`, of the table whose markup contains the
# caption text. Start from None so that a miss is detected explicitly instead
# of raising NameError (fresh kernel) or reusing a stale value (re-run).
table_index = None
for index, table in enumerate(tables):
    if "10 most densely populated countries" in str(table):
        # No break: as before, the last matching table wins.
        table_index = index

if table_index is None:
    raise ValueError(
        "No table containing '10 most densely populated countries' was found; "
        "the page layout may have changed."
    )
print(table_index)

5


In [15]:
# Pretty-print the markup of the selected table to inspect its structure.
density_table_markup = tables[table_index].prettify()
print(density_table_markup)

<table class="wikitable sortable" style="text-align:right">
 <caption>
  10 most densely populated countries
  <small>
   (with population above 5 million)
  </small>
  <sup class="reference" id="cite_ref-:10_106-0">
   <a href="#cite_note-:10-106">
    <span class="cite-bracket">
     [
    </span>
    101
    <span class="cite-bracket">
     ]
    </span>
   </a>
  </sup>
 </caption>
 <tbody>
  <tr>
   <th scope="col">
    Rank
   </th>
   <th scope="col">
    Country
   </th>
   <th scope="col">
    Population
   </th>
   <th scope="col">
    Area
    <br/>
    <small>
     (km
     <sup>
      2
     </sup>
     )
    </small>
   </th>
   <th scope="col">
    Density
    <br/>
    <small>
     (pop/km
     <sup>
      2
     </sup>
     )
    </small>
   </th>
  </tr>
  <tr>
   <td>
    1
   </td>
   <td align="left">
    <span class="flagicon nowrap">
     <span class="mw-image-border" typeof="mw:File">
      <span>
       <img alt="" class="mw-file-element" data-file-height="600"

In [16]:
# Column labels, in the same order as the <td> cells of each data row.
columns = ["Rank", "Country", "Population", "Area", "Density"]

# Rows are normally nested in <tbody>; fall back to the table itself if the
# parser did not produce one.
density_table = tables[table_index]
table_body = density_table.tbody
if table_body is None:
    table_body = density_table

# Collect one dict per data row and build the DataFrame once at the end;
# concatenating inside the loop copies the whole frame on every iteration.
records = []
for row in table_body.find_all("tr"):
    col = row.find_all("td")
    # The header row has only <th> cells (no <td>); also skip any row that
    # has fewer cells than expected instead of failing with IndexError.
    if len(col) < len(columns):
        continue
    # zip stops after the first five cells, mirroring col[0] .. col[4].
    records.append({name: cell.text.strip() for name, cell in zip(columns, col)})

population_data = pd.DataFrame(records, columns=columns)

population_data

Unnamed: 0,Rank,Country,Population,Area,Density
0,1,Singapore,5921231,719,8235
1,2,Bangladesh,165650475,148460,1116
2,3,Palestine[note 3][102],5223000,6025,867
3,4,Taiwan[note 4],23580712,35980,655
4,5,South Korea,51844834,99720,520
5,6,Lebanon,5296814,10400,509
6,7,Rwanda,13173730,26338,500
7,8,Burundi,12696478,27830,456
8,9,Israel,9402617,21937,429
9,10,India,1389637446,3287263,423


## Scrape data from HTML tables into a DataFrame using BeautifulSoup and read_html


In [17]:
# Let pandas parse the markup of the table located earlier (table_index)
# rather than a hard-coded position. The HTML is wrapped in StringIO because
# passing a literal HTML string to read_html is deprecated since pandas 2.1.
# read_html always returns a list of DataFrames.
pd.read_html(StringIO(str(tables[table_index])), flavor='bs4')

[   Rank                 Country  Population  Area (km2)  Density (pop/km2)
 0     1               Singapore     5921231         719               8235
 1     2              Bangladesh   165650475      148460               1116
 2     3  Palestine[note 3][102]     5223000        6025                867
 3     4          Taiwan[note 4]    23580712       35980                655
 4     5             South Korea    51844834       99720                520
 5     6                 Lebanon     5296814       10400                509
 6     7                  Rwanda    13173730       26338                500
 7     8                 Burundi    12696478       27830                456
 8     9                  Israel     9402617       21937                429
 9    10                   India  1389637446     3287263                423]

In [18]:
# read_html returns a list of DataFrames; only one table is passed in, so take
# the first element. The table is selected through table_index instead of a
# hard-coded position, and the HTML is wrapped in StringIO because passing a
# literal HTML string to read_html is deprecated since pandas 2.1.
population_data_read_html = pd.read_html(
    StringIO(str(tables[table_index])), flavor='bs4'
)[0]

population_data_read_html

Unnamed: 0,Rank,Country,Population,Area (km2),Density (pop/km2)
0,1,Singapore,5921231,719,8235
1,2,Bangladesh,165650475,148460,1116
2,3,Palestine[note 3][102],5223000,6025,867
3,4,Taiwan[note 4],23580712,35980,655
4,5,South Korea,51844834,99720,520
5,6,Lebanon,5296814,10400,509
6,7,Rwanda,13173730,26338,500
7,8,Burundi,12696478,27830,456
8,9,Israel,9402617,21937,429
9,10,India,1389637446,3287263,423


## Scrape data from HTML tables into a DataFrame using read_html


In [19]:
# Parse every table of the downloaded page in one call. The page text is
# wrapped in StringIO because passing a literal HTML string to read_html is
# deprecated since pandas 2.1.
dataframe_list = pd.read_html(StringIO(data.text), flavor='bs4')

# Number of DataFrames extracted from the page.
len(dataframe_list)

26

In [20]:
# Display the DataFrame stored at position 5 of dataframe_list.
# NOTE(review): 5 is a hard-coded position; presumably it mirrors the
# table_index found earlier with BeautifulSoup — confirm that both lists
# stay aligned before relying on it.
dataframe_list[5]

Unnamed: 0,Rank,Country,Population,Area (km2),Density (pop/km2)
0,1,Singapore,5921231,719,8235
1,2,Bangladesh,165650475,148460,1116
2,3,Palestine[note 3][102],5223000,6025,867
3,4,Taiwan[note 4],23580712,35980,655
4,5,South Korea,51844834,99720,520
5,6,Lebanon,5296814,10400,509
6,7,Rwanda,13173730,26338,500
7,8,Burundi,12696478,27830,456
8,9,Israel,9402617,21937,429
9,10,India,1389637446,3287263,423


In [21]:
# Locate the table through the section heading that precedes it rather than
# through its position in the page.
heading = soup.find("h3", {"id": "Most_densely_populated_countries"})
if heading is None:
    # soup.find returns None on a miss; fail with a clear message instead of
    # an AttributeError on the next line.
    raise ValueError(
        "Heading with id 'Most_densely_populated_countries' was not found; "
        "the page layout may have changed."
    )

# Get the next table after this heading
table = heading.find_next("table")
if table is None:
    raise ValueError("No table follows the 'Most_densely_populated_countries' heading.")

# Convert to DataFrame. The HTML is wrapped in StringIO because passing a
# literal HTML string to read_html is deprecated since pandas 2.1.
df = pd.read_html(StringIO(str(table)))[0]
print(df.head())

   Rank                 Country  Population  Area (km2)  Density (pop/km2)
0     1               Singapore     5921231         719               8235
1     2              Bangladesh   165650475      148460               1116
2     3  Palestine[note 3][102]     5223000        6025                867
3     4          Taiwan[note 4]    23580712       35980                655
4     5             South Korea    51844834       99720                520
