<a href="https://colab.research.google.com/github/arloera01-blip/AshlynL_DTSC3020_Fall2025/blob/main/WebScraping_Demos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DTSC 3020 — Web Scraping






## Setup — install required libraries




In [None]:
# Install lightweight scraping libraries
!pip install --quiet requests beautifulsoup4 pandas lxml



---
## Demo 1 — Books to Scrape (static site)

This demo shows how to extract the **Title, Price, Availability, and Rating** of each book from a static educational site using `requests` + `BeautifulSoup`, and how to save the results to CSV.


In [None]:
# Demo 1: scrape first page of books.toscrape.com and build a DataFrame
import requests  # make HTTP GET requests
from bs4 import BeautifulSoup  # parse HTML
import pandas as pd  # work with tabular data

url = 'http://books.toscrape.com/catalogue/page-1.html'  # target page (static educational site)
resp = requests.get(url, timeout=10)  # download HTML content with a short timeout
resp.raise_for_status()  # stop here on a 4xx/5xx answer instead of parsing an error page

# Parse the raw bytes as UTF-8 instead of using resp.text: with resp.text the
# prices came out as 'Â£51.77', i.e. the UTF-8 pound sign was decoded with the
# wrong charset. Decoding the bytes as UTF-8 yields '£51.77'.
soup = BeautifulSoup(resp.content, 'html.parser', from_encoding='utf-8')

books = []  # container for scraped book records

for item in soup.select('article.product_pod'):
    title = item.h3.a['title'].strip()  # full title lives in the anchor's title attribute
    price = item.select_one('p.price_color').text.strip()  # price text, e.g. '£51.77'
    availability = item.select_one('p.instock.availability').text.strip()  # e.g. 'In stock'
    rating_classes = item.select_one('p.star-rating')['class']  # e.g. ['star-rating', 'Three']
    # The rating word (One..Five) is the class that is not 'star-rating';
    # fall back to None rather than raising if it is ever absent.
    rating = next((c for c in rating_classes if c != 'star-rating'), None)
    books.append({'Title': title, 'Price': price, 'Availability': availability, 'Rating': rating})

df_books = pd.DataFrame(books)  # convert list of dicts into a DataFrame
df_books.to_csv('books_page1.csv', index=False)  # save results to CSV
print('Saved books_page1.csv — rows:', len(df_books))
df_books.head(10)


Saved books_page1.csv — rows: 20


Unnamed: 0,Title,Price,Availability,Rating
0,A Light in the Attic,Â£51.77,In stock,Three
1,Tipping the Velvet,Â£53.74,In stock,One
2,Soumission,Â£50.10,In stock,One
3,Sharp Objects,Â£47.82,In stock,Four
4,Sapiens: A Brief History of Humankind,Â£54.23,In stock,Five
5,The Requiem Red,Â£22.65,In stock,One
6,The Dirty Little Secrets of Getting Your Dream...,Â£33.34,In stock,Four
7,The Coming Woman: A Novel Based on the Life of...,Â£17.93,In stock,Three
8,The Boys in the Boat: Nine Americans and Their...,Â£22.60,In stock,Four
9,The Black Maria,Â£52.15,In stock,One


---
## Demo 2 — Quotes to Scrape (textual scraping)

This demo extracts textual content (quote text + author) from a static site designed for scraping practice.


In [None]:
# Demo 2: scrape quotes and authors from quotes.toscrape.com
import requests
from bs4 import BeautifulSoup
import pandas as pd

quotes_url = 'http://quotes.toscrape.com/page/1/'  # practice site for textual scraping
r = requests.get(quotes_url, timeout=10)  # download page (give up after 10 seconds)
r.raise_for_status()  # fail loudly on an HTTP error instead of saving an empty CSV
soup = BeautifulSoup(r.text, 'html.parser')  # parse HTML

quotes = []  # one dict per quote block found on the page
for block in soup.select('div.quote'):
    text = block.select_one('span.text').text.strip()  # quote text
    author = block.select_one('small.author').text.strip()  # quote author
    tags = [t.text for t in block.select('div.tags a.tag')]  # associated tags
    # Tags are stored as a single comma-separated string so they fit in one CSV cell.
    quotes.append({'Quote': text, 'Author': author, 'Tags': ','.join(tags)})

df_quotes = pd.DataFrame(quotes)
df_quotes.to_csv('quotes_page1.csv', index=False)
print('Saved quotes_page1.csv — rows:', len(df_quotes))
df_quotes.head()


Saved quotes_page1.csv — rows: 10


Unnamed: 0,Quote,Author,Tags
0,“The world as we have created it is a process ...,Albert Einstein,"change,deep-thoughts,thinking,world"
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"abilities,choices"
2,“There are only two ways to live your life. On...,Albert Einstein,"inspirational,life,live,miracle,miracles"
3,"“The person, be it gentleman or lady, who has ...",Jane Austen,"aliteracy,books,classic,humor"
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"be-yourself,inspirational"


In [None]:
# Demo 3: Wikipedia Table
# Downloads a Wikipedia page, parses its HTML tables into DataFrames, picks one
# table, normalizes its column headers and chooses the column to use as text label.

# Imports
import re
from io import StringIO

import requests
import pandas as pd
from bs4 import BeautifulSoup
from IPython.display import display
import matplotlib.pyplot as plt

# The web page we want to scrape.
URL = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"

# Send a browser-like User-Agent with the request.
# Many sites block "bot-looking" requests; identifying as a regular browser
# helps avoid simple blocks and returns the same HTML a human visitor would see.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/122.0 Safari/537.36"
}

# Download the page; timeout=20 prevents hanging forever.
r = requests.get(URL, headers=HEADERS, timeout=20)

# If the server returned an error status, stop here with an exception.
r.raise_for_status()

# Keep the page HTML as text.
html = r.text

# Turn the HTML text into a parsed object we can search.
soup = BeautifulSoup(html, "lxml")

# Print the page title so we know we got the right page.
print("PAGE TITLE:", soup.title.get_text(strip=True))

# Parse every table on the page into a list of DataFrames.
# The HTML is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated and emits a warning.
tables = pd.read_html(StringIO(html))

# Show how many tables were found.
print("Number of tables found:", len(tables))

# Pick the table to work with: the first one that has at least 2 columns
# (the text and numeric columns are auto-picked from it afterwards).
raw_df = None
for t in tables:
    if t.shape[1] >= 2:
        raw_df = t
        break
# If nothing matched, just take the first table.
if raw_df is None:
    raw_df = tables[0]


# Normalize headers (clean column names).
# Some tables have multi-row headers (MultiIndex); flatten each header tuple
# into one string, drop "nan" parts, and trim spaces.
if isinstance(raw_df.columns, pd.MultiIndex):
    raw_df.columns = [' '.join([str(x) for x in tup if str(x) != 'nan']).strip()
                      for tup in raw_df.columns.values]
# Single-level headers: just convert to string and trim spaces.
else:
    raw_df.columns = [str(c).strip() for c in raw_df.columns]

# Show the cleaned column names.
print("Columns ->", list(raw_df.columns))

# Print the table size (rows, columns).
print("Raw shape:", raw_df.shape)

# Show the first few rows so we can see the raw data.
display(raw_df.head())


# Auto-pick the text column.
# Take the first column that holds strings (e.g., country names): either the
# classic 'object' dtype or a dedicated pandas string dtype.
# next(generator, default) returns the first match, otherwise the default,
# which here is the first column of the table.
text_col = next(
    (c for c in raw_df.columns
     if raw_df[c].dtype == 'object' or pd.api.types.is_string_dtype(raw_df[c].dtype)),
    raw_df.columns[0],
)

def numeric_score(series):
    """Count how many cells of ``series`` contain at least one digit.

    Every cell is converted to its string form and all non-digit characters
    are stripped; a cell counts when something is left afterwards. The count
    serves as a rough "numeric-ness" score for a column.
    """
    as_text = series.astype(str)
    digits_only = as_text.str.replace(r"[^\d]", "", regex=True)
    has_digit = digits_only.ne("")  # True where at least one digit survived
    return has_digit.sum()

# Auto-pick the numeric column.
# numeric_score(...) is evaluated for each candidate column and
# max(..., key=...) returns the column with the largest score (on ties, the
# first such column) — the best guess for the population column.
# The text column is excluded so the same column is not used for both roles;
# if that leaves nothing (single-column table), all columns are considered.
candidate_cols = [c for c in raw_df.columns if c != text_col] or list(raw_df.columns)
num_col = max(candidate_cols, key=lambda c: numeric_score(raw_df[c]))

# Create a smaller dataframe with just the chosen text and numeric columns.
df = raw_df[[text_col, num_col]].copy()

# Rename them to standard names.
df.columns = ["Country", "Population"]


# Clean the data.
# r"\[.*?\]" matches footnote markers like [1], [a], [note] (non-greedy).
footnote_pattern = r"\[.*?\]"

df["Country"] = (df["Country"].astype(str)
                 .str.replace(footnote_pattern, "", regex=True)
                 .str.strip())

population_raw = df["Population"]
if pd.api.types.is_float_dtype(population_raw):
    # Already parsed as floats (e.g. because of empty cells). Going through
    # str would turn 1234.0 into "1234.0" and then into the digits "12340",
    # so convert directly; NaN becomes <NA> in the nullable Int64 dtype.
    df["Population"] = population_raw.round().astype("Int64")
else:
    # Remove footnote markers first so the digit in e.g. "[1]" is not glued
    # onto the number, then keep digits only (drops commas, spaces, ...).
    # Cells left empty are marked as missing before casting to nullable Int64.
    df["Population"] = (population_raw.astype(str)
                        .str.replace(footnote_pattern, "", regex=True)
                        .str.replace(r"[^\d]", "", regex=True)
                        .replace("", pd.NA)
                        .astype("Int64"))


# Drop rows whose Population is missing, then sort from largest to smallest.
# NOTE(review): aggregate rows are not filtered out — the recorded run lists
# "World" as the first entry — so they end up in the ranking and in the CSVs.
df = df.dropna(subset=["Population"]).sort_values("Population", ascending=False)

# Show top 20.
top20 = df.head(20)
display(top20)


# Save CSVs.
df.to_csv("countries_population_clean.csv", index=False)
top20.to_csv("countries_population_top20.csv", index=False)
print("Saved -> countries_population_clean.csv, countries_population_top20.csv")

PAGE TITLE: List of countries and dependencies by population - Wikipedia
Number of tables found: 3
Columns -> ['Location', 'Population', '% of world', 'Date', 'Source (official or from the United Nations)', 'Notes']
Raw shape: (240, 6)


  tables = pd.read_html(html)


Unnamed: 0,Location,Population,% of world,Date,Source (official or from the United Nations),Notes
0,World,8232000000,100%,13 Jun 2025,UN projection[1][3],
1,India,1417492000,17.3%,1 Jul 2025,Official projection[4],[b]
2,China,1408280000,17.2%,31 Dec 2024,Official estimate[5],[c]
3,United States,340110988,4.1%,1 Jul 2024,Official estimate[6],[d]
4,Indonesia,284438782,3.5%,30 Jun 2025,National annual projection[7],


Unnamed: 0,Country,Population
0,World,8232000000
1,India,1417492000
2,China,1408280000
3,United States,340110988
4,Indonesia,284438782
5,Pakistan,241499431
6,Nigeria,223800000
7,Brazil,213421037
8,Bangladesh,169828911
9,Russia,146028325


Saved -> countries_population_clean.csv, countries_population_top20.csv
