In [None]:
# Web scraping -1
- Prerequisite: Little knowledge of HTML

In [1]:
!pip install requests beautifulsoup4 pandas




[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


We’ll scrape example data from Books to Scrape (a legal practice website made for learning scraping).

In [18]:
import requests
from bs4 import BeautifulSoup
import csv

# Website URL
url = "http://books.toscrape.com/catalogue/page-1.html"

# Send request
response = requests.get(url)

# Parse HTML
soup = BeautifulSoup(response.text, "html.parser")

# Find all book containers
items = soup.find_all("article", class_="product_pod")

In [19]:
items[:2]

[<article class="product_pod">
 <div class="image_container">
 <a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
 </div>
 <p class="star-rating Three">
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 </p>
 <h3><a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
 <div class="product_price">
 <p class="price_color">Â£51.77</p>
 <p class="instock availability">
 <i class="icon-ok"></i>
     
         In stock
     
 </p>
 <form>
 <button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
 </form>
 </div>
 </article>,
 <article class="product_pod">
 <div class="image_container">
 <a href="tipping-the-velvet_999/index.html"><img alt="Tipping the Velvet" class="thumbnail" src="../media/cach

In [24]:
# Now we define a function that performs web scraping:

def scrape_books():
    url = "http://books.toscrape.com/catalogue/page-1.html"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Find all book containers
    items = soup.find_all("article", class_="product_pod")

    # Store data
    books = []
    
    for item in items:
        title = item.h3.a["title"]
        price = item.find("p", class_="price_color").text
        availability = item.find("p", class_="instock availability").text.strip()
    
        books.append({
            "Title": title,
            "Price": price,
            "Availability": availability
        })
        

    # Save correctly
    with open("books_data.csv", mode="w", newline="", encoding="utf-8") as file:
        fieldnames = ["Title", "Price", "Availability"]

        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(books)

    print("Data saved successfully!")
    

In [25]:
scrape_books()

[{'Title': 'A Light in the Attic', 'Price': 'Â£51.77', 'Availability': 'In stock'}, {'Title': 'Tipping the Velvet', 'Price': 'Â£53.74', 'Availability': 'In stock'}, {'Title': 'Soumission', 'Price': 'Â£50.10', 'Availability': 'In stock'}, {'Title': 'Sharp Objects', 'Price': 'Â£47.82', 'Availability': 'In stock'}, {'Title': 'Sapiens: A Brief History of Humankind', 'Price': 'Â£54.23', 'Availability': 'In stock'}, {'Title': 'The Requiem Red', 'Price': 'Â£22.65', 'Availability': 'In stock'}, {'Title': 'The Dirty Little Secrets of Getting Your Dream Job', 'Price': 'Â£33.34', 'Availability': 'In stock'}, {'Title': 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull', 'Price': 'Â£17.93', 'Availability': 'In stock'}, {'Title': 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics', 'Price': 'Â£22.60', 'Availability': 'In stock'}, {'Title': 'The Black Maria', 'Price': 'Â£52.15', 'Availability': 'In stock'}, {'Title': 

## Make It Automated (Daily Run)
Python collects data every day automatically.

In [None]:
import schedule
import time

def job():
    print("Running scraper...")
    # call your scraping function here
    scrape_books()

schedule.every().day.at("09:00").do(job)

while True:
    schedule.run_pending()
    time.sleep(60)

In [None]:
# Final Python File: Industry standard

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import schedule
import time

# -----------------------------
# Web Scraping Function
# -----------------------------
def scrape_books():
    url = "http://books.toscrape.com/catalogue/page-1.html"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Find all book containers
    items = soup.find_all("article", class_="product_pod")

    # Store data
    books = []
    
    for item in items:
        title = item.h3.a["title"]
        price = item.find("p", class_="price_color").text
        availability = item.find("p", class_="instock availability").text.strip()
    
        books.append({
            "Title": title,
            "Price": price,
            "Availability": availability
        })
        

    # Save correctly
    with open("books_data.csv", mode="w", newline="", encoding="utf-8") as file:
        fieldnames = ["Title", "Price", "Availability"]

        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(books)

    print("Data saved successfully!")
        

# -----------------------------
# Scheduled Job
# -----------------------------
def job():
    print("Running scraper...")
    scrape_books()


# -----------------------------
# Main Function
# -----------------------------
def main():
    # Schedule task
    schedule.every().day.at("09:00").do(job)

    print("Scheduler started... Waiting for next run.")

    while True:
        schedule.run_pending()
        time.sleep(60)


# -----------------------------
# Entry Point (BEST PRACTICE)
# -----------------------------
if __name__ == "__main__":
    main()
