<a href="https://colab.research.google.com/github/amylynnn/Revision-Notebook/blob/main/Week8_web_scraping_notebook_ipynb_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re

# HTTP request headers sent with every page fetch so the scraper
# identifies itself to the server and asks for HTML content.
headers = {
    "User-Agent": "Mozilla/5.0 (Educational purpose scraper)",
    "Accept": "text/html,application/xhtml+xml",
}

# Parallel accumulators: index i of each list describes the same book.
all_titles, all_prices, all_ratings = [], [], []

# The site encodes a book's star rating as an English word in a CSS class;
# this lookup turns that word into its integer value (One -> 1 ... Five -> 5).
rating_mapping = {
    word: stars
    for stars, word in enumerate(("One", "Two", "Three", "Four", "Five"), start=1)
}

# Loop through pages 1 to 3, appending each book's title, raw price text and
# rating word to the all_titles / all_prices / all_ratings lists.
for page_num in range(1, 4):
    # Be polite: wait a second between consecutive requests. Pausing before
    # every request after the first means the delay also applies after a
    # failed page, and no time is wasted once the last page has been fetched.
    if page_num > 1:
        time.sleep(1)

    # Construct URL for each page (page 1 is the site root)
    if page_num == 1:
        url = 'http://books.toscrape.com/'
    else:
        url = f'http://books.toscrape.com/catalogue/page-{page_num}.html'

    print(f"Scraping page {page_num} - {url}")

    # Request the page. The timeout stops a stalled connection from hanging
    # the script indefinitely; connection-level errors are reported and the
    # page is skipped, just like a non-200 response.
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"Failed to scrape page {page_num}: {exc}")
        continue
    if response.status_code != 200:
        print(f"Failed to scrape page {page_num}")
        continue

    # Parse the raw bytes instead of response.text. When the server's
    # Content-Type header carries no charset, requests decodes text as
    # ISO-8859-1, which turns a UTF-8 '£' into the mojibake 'Â£'. Given bytes,
    # BeautifulSoup detects the encoding itself (e.g. from the page's meta tag).
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all book containers
    book_containers = soup.find_all('article', class_='product_pod')
    print(f"Found {len(book_containers)} books on page {page_num}")

    # Extract data from each book
    for book in book_containers:
        # Title: the full title is in the link's 'title' attribute
        title = book.h3.a['title']
        all_titles.append(title)

        # Price (raw string, still including the currency symbol)
        price = book.find('p', class_='price_color').text
        all_prices.append(price)

        # Rating: the second CSS class on the star-rating element is the
        # rating word (the first is 'star-rating' itself)
        star_rating = book.find('p', class_='star-rating')['class'][1]
        all_ratings.append(star_rating)

# Create DataFrame: one row per scraped book
books_df = pd.DataFrame({
    'Title': all_titles,
    'Price': all_prices,
    'Rating': all_ratings
})

# Clean 'Price' column and convert to float. Rather than deleting specific
# known characters ('Â', '£'), drop everything that is not a digit or a
# decimal point, so any currency symbol, mis-decoded byte or stray
# whitespace around the number is handled the same way.
books_df['Price'] = (
    books_df['Price']
    .str.replace(r'[^0-9.]', '', regex=True)
    .astype(float)
)

# Convert rating words to numbers (words missing from rating_mapping
# become NaN)
books_df['Rating'] = books_df['Rating'].map(rating_mapping)

# Show cleaned DataFrame head
print(books_df.head())

# Save to CSV without the DataFrame index column
books_df.to_csv('scraped_books.csv', index=False)
print("Data saved to 'scraped_books.csv'")


Scraping page 1 - http://books.toscrape.com/
Found 20 books on page 1
Scraping page 2 - http://books.toscrape.com/catalogue/page-2.html
Found 20 books on page 2
Scraping page 3 - http://books.toscrape.com/catalogue/page-3.html
Found 20 books on page 3
                                   Title  Price  Rating
0                   A Light in the Attic  51.77       3
1                     Tipping the Velvet  53.74       1
2                             Soumission  50.10       1
3                          Sharp Objects  47.82       4
4  Sapiens: A Brief History of Humankind  54.23       5
Data saved to 'scraped_books.csv'
