# Python pandas library for webscraping

In [13]:
# step 1: Import the required libraries
import pandas as pd

# step 2: Load the URL for world population on wikipedia
url = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"
# step 3: Read the HTML tables from the URL
tables = pd.read_html(url)
# step 4: Check the number of tables found
print(f"Number of tables found: {len(tables)}")
# step 5: Display the first few rows of the first table
tables[0].head()
# save this into csv
tables[0].to_csv("world_population_11052025.csv", index=False)

Number of tables found: 3


In [10]:
import pandas as pd
url = "http://worldometers.info/world-population/population-by-country/"
tables = pd.read_html(url)
tables[0].head()

HTTPError: HTTP Error 403: Forbidden

# Why do we need requests library?

In [11]:
# step 1: Import the required libraries
import pandas as pd

# step 2: Load the URL
url = 'https://www.worldometers.info/world-population/population-by-country/'

# step 3: Read the HTML content
tables = pd.read_html(url)

# step 4: Check the number of tables
print(f"Number of tables found: {len(tables)}")

HTTPError: HTTP Error 403: Forbidden

> to resolve this problem, we need to use the requests library to download the HTML content of the page. 
> The requests library allows us to send HTTP requests and receive responses from web servers. 
> It is a powerful and easy-to-use library that simplifies the process of making HTTP requests in Python.

In [7]:
# step 1: Import the required libraries
import pandas as pd

# step 2: Load the URL
url = 'https://www.worldometers.info/world-population/population-by-country/'

# step 3: Fetch the HTML content with headers and read the HTML table into a DataFrame
import requests
# headers to avoid being blocked
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
# headers are important to avoid being blocked by the server
response = requests.get(url, headers=headers)
response.raise_for_status()  # Raise an error for bad status codes

tables = pd.read_html(response.text)

# step 4: Check the number of tables
print(f"Number of tables found: {len(tables)}")
# step 5: Display the first few rows of the first table
tables[0].head()

Number of tables found: 1


  tables = pd.read_html(response.text)


Unnamed: 0,#,Country (or dependency),Population (2025),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Median Age,Urban Pop %,World Share
0,1,India,1463865525,0.89%,12929734,492,2973190,"−495,753",1.94,28.8,37.1%,17.78%
1,2,China,1416096094,−0.23%,"−3,225,184",151,9388211,"−268,126",1.02,40.1,67.5%,17.20%
2,3,United States,347275807,0.54%,1849236,38,9147420,1230663,1.62,38.5,82.8%,4.22%
3,4,Indonesia,285721236,0.79%,2233305,158,1811570,"−39,509",2.1,30.4,59.6%,3.47%
4,5,Pakistan,255219554,1.57%,3950390,331,770880,"−1,235,336",3.5,20.6,34.4%,3.10%
