# Finding out population data of countries using Web Scraping.

## Scrape data from HTML tables into a DataFrame using BeautifulSoup and Pandas.

In [1]:
!pip install bs4
!pip install html5lib

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Address of the Wikipedia page whose HTML tables are scraped in the cells below.
url = "https://en.wikipedia.org/wiki/World_population"

In [4]:
# Download the page. The timeout keeps the cell from hanging indefinitely, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

In [5]:
# Parse the downloaded markup with Python's built-in HTML parser.
soup = BeautifulSoup(markup=data, features="html.parser")

In [6]:
# Collect every <table> element on the page; the cell displays how many were found.
tables = soup.find_all(name="table")
len(tables)

28

In [7]:
# Locate the density table by searching each table's HTML for its caption text.
TABLE_CAPTION = "10 most densely populated countries"

matching_indices = [
    index for index, table in enumerate(tables) if TABLE_CAPTION in str(table)
]
if not matching_indices:
    # Fail here with a clear message rather than letting a placeholder value
    # be used as a list index in the following cells.
    raise ValueError(f"No table containing {TABLE_CAPTION!r} was found on the page.")

# If several tables contain the caption text, the last one wins.
table_index_density = matching_indices[-1]
print(table_index_density)

6


In [8]:
# Pretty-print the raw HTML of the selected table to inspect its structure.
density_table = tables[table_index_density]
print(density_table.prettify())

<table class="wikitable sortable" style="text-align:right">
 <caption>
  10 most densely populated countries
  <small>
   (with population above 5 million)
  </small>
  <sup class="reference" id="cite_ref-:10_105-0">
   <a href="#cite_note-:10-105">
    [101]
   </a>
  </sup>
 </caption>
 <tbody>
  <tr>
   <th scope="col">
    Rank
   </th>
   <th scope="col">
    Country
   </th>
   <th scope="col">
    Population
   </th>
   <th scope="col">
    Area
    <br/>
    <small>
     (km
     <sup>
      2
     </sup>
     )
    </small>
   </th>
   <th scope="col">
    Density
    <br/>
    <small>
     (pop/km
     <sup>
      2
     </sup>
     )
    </small>
   </th>
  </tr>
  <tr>
   <td>
    1
   </td>
   <td align="left">
    <span class="flagicon">
     <img alt="" class="thumbborder" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/4/48/Flag_of_Singapore.svg/23px-Flag_of_Singapore.svg.png" srcset="//upload

In [9]:
# Build the density DataFrame by walking the table rows manually.
DENSITY_COLUMNS = ["Rank", "Country", "Population", "Area", "Density"]

# Collect one dict per data row and build the DataFrame once at the end;
# concatenating inside the loop copies the whole frame on every iteration.
density_records = []
for row in tables[table_index_density].tbody.find_all("tr"):
    cells = row.find_all("td")
    # The header row contains only <th> cells; skip it and any row that has
    # fewer cells than expected columns.
    if len(cells) < len(DENSITY_COLUMNS):
        continue
    # strip=True is applied to every column (including Rank) so that no value
    # carries surrounding whitespace from the markup.
    density_records.append(
        {name: cell.get_text(strip=True) for name, cell in zip(DENSITY_COLUMNS, cells)}
    )

# Passing columns= keeps the column order and yields an empty, correctly
# labelled frame when no data rows were found.
population_density = pd.DataFrame(density_records, columns=DENSITY_COLUMNS)
population_density

Unnamed: 0,Rank,Country,Population,Area,Density
0,1,Singapore,5921231,719,8235
1,2,Bangladesh,165650475,148460,1116
2,3,Palestine[102],5223000,6025,867
3,4,Taiwan,23580712,35980,655
4,5,South Korea,51844834,99720,520
5,6,Lebanon,5296814,10400,509
6,7,Rwanda,13173730,26338,500
7,8,Burundi,12696478,27830,456
8,9,India,1389637446,3287263,423
9,10,Netherlands,17400824,41543,419


## Scrape data from HTML tables into a DataFrame using BeautifulSoup and read_html


In [10]:
# Parse a single table with pandas' read_html.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
# NOTE(review): index 3 is position-dependent and will select a different
# table if the page layout changes.
population_wp = pd.read_html(StringIO(str(tables[3])), flavor="bs4")[0]
population_wp

Unnamed: 0,Rank,Country / Dependency,Population,Percentage of the world,Date,Source (official or from the United Nations)
0,1,India,1425775850,17.8%,14 Apr 2023,UN projection[91]
1,2,China,1412600000,17.6%,31 Dec 2021,National annual estimate[92]
2,3,United States,334719694,4.17%,9 May 2023,National population clock[93]
3,4,Indonesia,275773800,3.43%,1 Jul 2022,National annual estimate[94]
4,5,Pakistan,229488994,2.86%,1 Jul 2022,UN projection[95]
5,6,Nigeria,216746934,2.70%,1 Jul 2022,UN projection[95]
6,7,Brazil,216119742,2.69%,9 May 2023,National population clock[96]
7,8,Bangladesh,168220000,2.10%,1 Jul 2020,Annual Population Estimate[97]
8,9,Russia,147190000,1.83%,1 Oct 2021,2021 preliminary census results[98]
9,10,Mexico,128271248,1.60%,31 Mar 2022,


## Scrape all the HTML tables into a list of DataFrames using read_html

In [11]:
# Parse every table from the HTML that was already downloaded instead of
# fetching the URL a second time, so both approaches work on the same
# snapshot of the page.
population_df_list = pd.read_html(StringIO(data), flavor="bs4")
len(population_df_list)

25

In [12]:
# Display the DataFrame at position 6 of the read_html result.
# NOTE(review): read_html returned 25 tables while soup.find_all found 28, so
# this index is not interchangeable with an index into `tables` — confirm that
# this is the table you intend to show.
population_df_list[6]

Unnamed: 0,Rank,Country,Population,Area (km2),Density (pop/km2),Population trend[citation needed]
0,1,India,1389637446,3287263,423,Growing
1,2,Pakistan,242923845,796095,305,Rapidly growing
2,3,Bangladesh,165650475,148460,1116,Growing
3,4,Japan,124214766,377915,329,Declining[103]
4,5,Philippines,114597229,300000,382,Growing
5,6,Vietnam,103808319,331210,313,Growing
6,7,United Kingdom,67791400,243610,278,Growing
7,8,South Korea,51844834,99720,520,Steady
8,9,Taiwan,23580712,35980,655,Steady
9,10,Sri Lanka,23187516,65610,353,Growing
