In [5]:
# U.S. States Population Data Scraper
# Import required libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup

# URL of the Wikipedia page with U.S. states population data
URL = "https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population"

def fetch_wikipedia_page(url, timeout=30):
    """Fetch a web page and return its body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server (connect and read) before
            giving up. Without a timeout ``requests`` waits indefinitely,
            so a stalled connection would hang the caller forever.

    Returns:
        The decoded response body as a string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    # A browser-like User-Agent is sent explicitly instead of the default
    # python-requests one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Turn HTTP error statuses into exceptions instead of returning an
    # error page that would later fail to parse.
    response.raise_for_status()
    return response.text

def parse_population_data(html_content):
    """Extract the first sortable wikitable from the page as a DataFrame.

    Args:
        html_content: Full HTML text of the page.

    Returns:
        A pandas DataFrame built from the first matching table.

    Raises:
        ValueError: If no matching table is present in the HTML.
    """
    # Local import so this function does not depend on a new file-level import.
    from io import StringIO

    soup = BeautifulSoup(html_content, 'html.parser')
    # Passing a space-separated class string only matches a table whose class
    # attribute is exactly that string, so try it first (original behaviour)...
    table = soup.find('table', {'class': 'wikitable sortable'})
    if table is None:
        # ...then fall back to a CSS selector, which also matches tables that
        # carry additional classes or list them in a different order.
        table = soup.select_one('table.wikitable.sortable')
    if table is None:
        raise ValueError("Could not find the population data table on the page")
    # read_html expects a path, URL or file-like object; passing literal HTML
    # as a plain string is deprecated, so wrap the markup in StringIO.
    return pd.read_html(StringIO(str(table)))[0]

def _column_label(col):
    """Return a stripped string label for a column key of any type."""
    if isinstance(col, tuple):
        # MultiIndex headers arrive as tuples; join the distinct, non-empty
        # levels into a single flat label.
        parts = []
        for part in col:
            text = str(part).strip()
            if text and text not in parts:
                parts.append(text)
        return ' '.join(parts)
    return str(col).strip()


def clean_data(df):
    """Clean and process the population data.

    Works on a copy; the input frame is left untouched. Steps:

    1. Drop rows in which every cell is missing.
    2. Strip whitespace from column labels (non-string and tuple labels are
       converted to strings first).
    3. Remove bracketed footnote markers such as ``[a]`` from the column at
       position 1 and strip surrounding whitespace.
    4. Convert the column at position 2 to integers.

    NOTE(review): columns are selected by position (1 and 2), not by name.
    Confirm that position 1 really is the state-name column for the table
    being scraped.

    Args:
        df: Raw DataFrame as returned by the table parser.

    Returns:
        A new DataFrame. The position-2 column has dtype ``int64`` when every
        value parses, or nullable ``Int64`` (with ``<NA>``) when some values
        are missing or contain no digits.

    Raises:
        IndexError: If the frame has fewer than three columns.
    """
    footnote_pattern = r'\[.*?\]'

    df_clean = df.dropna(how='all').copy()
    df_clean.columns = [_column_label(col) for col in df_clean.columns]

    state_col = df_clean.columns[1]
    state_values = df_clean[state_col]
    # The .str accessor raises on numeric columns, so only text columns are
    # scrubbed; a numeric column cannot contain footnote markers anyway.
    if (pd.api.types.is_object_dtype(state_values)
            or pd.api.types.is_string_dtype(state_values)):
        df_clean[state_col] = (
            state_values.str.replace(footnote_pattern, '', regex=True).str.strip()
        )

    pop_col = df_clean.columns[2]
    population = df_clean[pop_col]
    if not pd.api.types.is_numeric_dtype(population):
        # Remove footnote markers BEFORE dropping non-digits; otherwise the
        # digits inside a marker like "[2]" would be glued onto the number.
        digits_only = (
            population.astype(str)
            .str.replace(footnote_pattern, '', regex=True)
            .str.replace(r'[^\d]', '', regex=True)
        )
        # Cells with no digits left (missing values, dashes) become NaN
        # instead of crashing the integer conversion.
        population = pd.to_numeric(digits_only, errors='coerce')
    # Columns that are already numeric are used as-is: round-tripping a float
    # through str would turn 7029917.0 into "70299170".
    if population.isna().any():
        df_clean[pop_col] = population.round().astype('Int64')
    else:
        df_clean[pop_col] = population.round().astype('int64')
    return df_clean

# Main execution: fetch the page, parse the table, clean it, preview it, and
# write it to CSV. These statements run at top level, so when the steps succeed
# the names assigned below (html_content, df, df_clean, csv_filename) stay
# defined after this block finishes.
print("Fetching U.S. states population data from Wikipedia...")
try:
    # Fetch and process the data
    html_content = fetch_wikipedia_page(URL)
    df = parse_population_data(html_content)
    df_clean = clean_data(df)
    
    # Display first 10 rows
    # NOTE(review): `display` is not imported in the visible code; presumably
    # this runs under IPython/Jupyter, which provides it - confirm.
    print("\nFirst 10 rows of the scraped data:")
    display(df_clean.head(10))
    
    # Save to CSV with the required filename
    # index=False keeps the DataFrame's row index out of the written file.
    csv_filename = 'CSC221-webscrape-data.csv'
    df_clean.to_csv(csv_filename, index=False)
    print(f"\nData successfully saved to {csv_filename}")
    
except Exception as e:
    # Any failure in the steps above (download, parsing, cleaning, display or
    # file write) is reported as a single message instead of a traceback.
    print(f"An error occurred: {e}")

Fetching U.S. states population data from Wikipedia...


  return pd.read_html(str(table))[0]



First 10 rows of the scraped data:


Unnamed: 0,State/federal district/territory/ division/region,#,2020 pop.,#.1,2010 pop.,#.2,2000 pop.,#.3,2010– 2020 change,Geo. sort
0,Massachusetts,15,7029917,14,6547629,13,6349097,21,7.4%,NEng
1,Connecticut,29,3605944,29,3574097,29,3405565,47,0.9%,NEng
2,New Hampshire,41,1377529,42,1316470,41,1235786,30,4.6%,NEng
3,Maine,42,1362359,41,1328361,40,1274923,42,2.6%,NEng
4,Rhode Island,43,1097379,43,1052567,43,1048319,31,4.3%,NEng
5,Vermont,49,643077,49,625741,49,608827,40,2.8%,NEng
6,New England,9,15116205,9,14444865,9,13922517,7,4.6%,NEast
7,New York,4,20201249,3,19378102,3,18976457,32,4.2%,MAtl
8,Pennsylvania,5,13002700,6,12702379,6,12281054,43,2.4%,MAtl
9,New Jersey,11,9288994,11,8791894,9,8414350,25,5.7%,MAtl



Data successfully saved to CSC221-webscrape-data.csv
