In [1]:
!pip install requests beautifulsoup4 pandas


[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip




In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [None]:

# Page to scrape; every later cell reads the address from this variable.
url = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"
print(f"Target URL selected: {url}")

Target URL selected: https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population


In [None]:

# Browser-like User-Agent sent with the request. It has its own name so it is
# not overwritten by the table `headers` list that a later cell builds.
request_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Without an explicit timeout, requests can wait indefinitely on a stalled
# connection, which would hang the notebook.
response = requests.get(url, headers=request_headers, timeout=30)

if response.status_code == 200:
    print("Success! Webpage fetched.")
    print(f"Server responded with status code: {response.status_code}")
    print("\n--- Raw HTML Snippet ---")
    # Show only the first 500 characters to keep the cell output short.
    print(response.text[:500])
else:
    print(f"Error: Failed to fetch page. Status code: {response.status_code}")

Success! Webpage fetched.
Server responded with status code: 200

--- Raw HTML Snippet ---
<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vect


In [None]:

# Turn the downloaded markup into a navigable tree using the 'html.parser' backend.
soup = BeautifulSoup(markup=response.text, features='html.parser')

print("HTML parsed into a BeautifulSoup object.")
# The <title> text is a quick sanity check that the expected page was parsed.
print(f"Page Title: {soup.title.string}")

HTML parsed into a BeautifulSoup object.
Page Title: List of countries and dependencies by population - Wikipedia


In [None]:

# Take the first <table class="wikitable"> on the page.
table = soup.find('table', {'class': 'wikitable'})

# Stop here with a clear message if the table is missing; otherwise the
# header extraction below would fail with an AttributeError on None.
if table is None:
    raise ValueError("Table not found. Check the class name or URL.")
print("Table found.")

# Collect the stripped text of every <th> cell in the table as the column names.
headers = [th.text.strip() for th in table.find_all('th')]

print(f"\nHeaders detected: {headers}")

Table found.

Headers detected: ['Location', 'Population', '% ofworld', 'Date', 'Source (official or fromthe United Nations)', 'Notes']


In [6]:
# Every <tr> in the table; the first one is skipped below as the header row.
rows = table.find_all('tr')

table_data = []
for row in rows[1:]:
    # Read both <td> and <th> cells so no cell of a body row is dropped.
    row_data = [cell.text.strip() for cell in row.find_all(['td', 'th'])]

    # Ignore rows that contain no cells at all.
    if row_data:
        table_data.append(row_data)

print(f"Extracted {len(table_data)} rows of data.")
# Guard the preview: indexing an empty list would raise IndexError.
if table_data:
    print("First row example:", table_data[0])
else:
    print("First row example: <none - no data rows were extracted>")

Extracted 240 rows of data.
First row example: ['World', '8,232,000,000', '100%', '13 Jun 2025', 'UN projection[1][3]', '']


In [7]:

# One DataFrame row per extracted table row; columns start out as integer positions.
df = pd.DataFrame(table_data)

# Label the columns with the first len(df.columns) header names. When the
# counts match, this slice is simply the full header list.
df.columns = headers[:len(df.columns)]

print("--- Raw DataFrame Preview ---")
# Remove the two columns that are not needed downstream.
df = df.drop(columns=['Source (official or fromthe United Nations)', 'Notes'])
display(df.head())

--- Raw DataFrame Preview ---


Unnamed: 0,Location,Population,% ofworld,Date
0,World,8232000000,100%,13 Jun 2025
1,India,1417492000,17.3%,1 Jul 2025
2,China,1408280000,17.1%,31 Dec 2024
3,United States,340110988,4.1%,1 Jul 2024
4,Indonesia,284438782,3.5%,30 Jun 2025


In [8]:

print(f"Current Columns: {df.columns.tolist()}")

# The country/territory names are in the first column ('Location').
# Index 1 is 'Population', so using it would leave the names uncleaned.
country_col = df.columns[0]

def clean_text(text):
    """Return `text` as a string with bracketed markers removed and ends trimmed.

    The value is first converted with ``str()``. Every ``[...]`` group
    (matched non-greedily, e.g. ``[1]`` or ``[a]``) is deleted, and leading and
    trailing whitespace is stripped from the result.
    """
    without_markers = re.sub(r'\[.*?\]', '', str(text))
    return without_markers.strip()

# Run clean_text over each value of the selected column and store the result back.
df[country_col] = df[country_col].map(clean_text)

# Preview the first rows after cleaning.
display(df.head())

Current Columns: ['Location', 'Population', '% ofworld', 'Date']


Unnamed: 0,Location,Population,% ofworld,Date
0,World,8232000000,100%,13 Jun 2025
1,India,1417492000,17.3%,1 Jul 2025
2,China,1408280000,17.1%,31 Dec 2024
3,United States,340110988,4.1%,1 Jul 2024
4,Indonesia,284438782,3.5%,30 Jun 2025


In [9]:
# Output file, written relative to the notebook's working directory.
filename = "wikipedia_population_data.csv"

# index=False leaves the DataFrame's integer row index out of the file.
df.to_csv(path_or_buf=filename, index=False, encoding='utf-8')

print(f"Data saved successfully to '{filename}'")

Data saved successfully to 'wikipedia_population_data.csv'
