In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Wikipedia URL
url = "https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States"

In [3]:
# Send request and parse HTML
headers = {"User-Agent": "Mozilla/5.0"}  # Avoid 403 error
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

In [5]:
# Find the table
table = soup.find("table", {"class": "wikitable"})

# Extract column headers
headers = [header.text.strip() for header in table.find_all("th")]


In [12]:
# Extract data rows
data = []
for row in table.find_all("tr")[1:]:  # Skip header row
    cols = row.find_all(["th", "td"])  # Include both 'th' (for state names) and 'td'
    row_data = [col.text.strip().replace(",", "") for col in cols]  # Clean text
    if len(row_data) > 1:  # Avoid empty rows
        data.append(row_data)

In [13]:
# Ensure correct number of columns
max_cols = max(len(row) for row in data)  # Find max column count in any row
headers = headers[:max_cols]  # Trim headers if extra ones exist

In [14]:
# Create DataFrame
df = pd.DataFrame(data, columns=headers)

In [16]:
#drop 1st row
df = df.iloc[1:]
df

Unnamed: 0,"Flag, name andpostal abbreviation[8]",Cities,Ratification oradmission[A],Population(2020)[10],Total area[11],Reps.,Capital,Largest[12],mi2
1,Alabama,AL,Montgomery,Huntsville,Dec 14 1819,5024279,52420,135767,7.0
2,Alaska,AK,Juneau,Anchorage,Jan 3 1959,733391,665384,1723337,1.0
3,Arizona,AZ,Phoenix,Feb 14 1912,7151502,113990,295234,9,
4,Arkansas,AR,Little Rock,Jun 15 1836,3011524,53179,137732,4,
5,California,CA,Sacramento,Los Angeles,Sep 9 1850,39538223,163695,423967,52.0
6,Colorado,CO,Denver,Aug 1 1876,5773714,104094,269601,8,
7,Connecticut,CT,Hartford,Bridgeport,Jan 9 1788,3605944,5543,14357,5.0
8,Delaware,DE,Dover,Wilmington,Dec 7 1787,989948,2489,6446,1.0
9,Florida,FL,Tallahassee,Jacksonville,Mar 3 1845,21538187,65758,170312,28.0
10,Georgia,GA,Atlanta,Jan 2 1788,10711908,59425,153910,14,


In [18]:
#rename 1st column and population column
df.rename(columns={df.columns[0]: 'State', df.columns[5]: 'Population'}, inplace=True)
df


Unnamed: 0,State,Cities,Ratification oradmission[A],Population(2020)[10],Total area[11],Population,Capital,Largest[12],mi2
1,Alabama,AL,Montgomery,Huntsville,Dec 14 1819,5024279,52420,135767,7.0
2,Alaska,AK,Juneau,Anchorage,Jan 3 1959,733391,665384,1723337,1.0
3,Arizona,AZ,Phoenix,Feb 14 1912,7151502,113990,295234,9,
4,Arkansas,AR,Little Rock,Jun 15 1836,3011524,53179,137732,4,
5,California,CA,Sacramento,Los Angeles,Sep 9 1850,39538223,163695,423967,52.0
6,Colorado,CO,Denver,Aug 1 1876,5773714,104094,269601,8,
7,Connecticut,CT,Hartford,Bridgeport,Jan 9 1788,3605944,5543,14357,5.0
8,Delaware,DE,Dover,Wilmington,Dec 7 1787,989948,2489,6446,1.0
9,Florida,FL,Tallahassee,Jacksonville,Mar 3 1845,21538187,65758,170312,28.0
10,Georgia,GA,Atlanta,Jan 2 1788,10711908,59425,153910,14,


In [19]:
#keeping only the state name and population columns
df = df[['State', 'Population']]
df

Unnamed: 0,State,Population
1,Alabama,5024279
2,Alaska,733391
3,Arizona,113990
4,Arkansas,53179
5,California,39538223
6,Colorado,104094
7,Connecticut,3605944
8,Delaware,989948
9,Florida,21538187
10,Georgia,59425


In [20]:
df.to_csv('us_states_population.csv', index=False)
print("File saved to 'us_states_population.csv'")

File saved to 'us_states_population.csv'
