Importing necessary Libraries

In [1]:
import json
import random
import time
from collections import defaultdict
from pprint import pprint
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

SCRAPING QUOTES

Initialising variables

In [None]:
# Entry point of the quotes sandbox site; every other URL is resolved against it.
base_url = "https://quotes.toscrape.com/"
url = base_url
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"}      # Chrome browser string

# Send the browser-like header and a timeout, matching the later requests in this
# notebook; without a timeout a stalled connection would block the kernel forever.
response = requests.get(url, timeout=5, headers=header)
soup = BeautifulSoup(response.text, "html.parser")
data = defaultdict(lambda: defaultdict(list))       # author -> tag -> list of quote texts

# Displayed as the cell output: 200 means the first page was fetched successfully.
response.status_code

Scraping a single page

In [None]:
# Every quote on the page sits in its own <div class="quote"> container.
quotes = soup.find_all("div", class_="quote")

for quote in quotes:
    text_node = quote.find("span", class_="text")
    text = text_node.get_text(strip=True)

    author_node = quote.find("small", class_="author")
    author = author_node.get_text(strip=True)

    # Collect the label of each tag link attached to this quote.
    tags = []
    for tag_link in quote.find_all("a", class_="tag"):
        tags.append(tag_link.get_text(strip=True))

    # File the quote under its author once per tag.
    for tag in tags:
        data[author][tag].append(text)

pprint(data)

In [24]:
# Locate the pager's "next" item and show the relative link it points to.
next_button = soup.find("li", class_="next")
next_href = next_button.a       # attribute access is BeautifulSoup shorthand for .find("a")
next_href["href"]

'/page/2/'

Scraping Multiple Pages

In [None]:
# Walk every page of the quotes site, collecting quotes grouped by author and tag,
# and fetch each author's birth details the first time that author is seen.
data = defaultdict(lambda: defaultdict(list))       # Quote data: author -> tag -> list of quote texts
author_details = {}      # Author details: author -> {"Born On": ..., "Location": ...}
page_count = 0
url = base_url      # Always restart from the first page so the cell can be re-run safely

while url:
    try:
        response = requests.get(url, timeout=5, headers=header)     # Getting response from website
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        break       # No usable response: stop instead of re-parsing a stale (or undefined) one

    soup = BeautifulSoup(response.text, "html.parser")
    quotes = soup.find_all("div", class_="quote")       # Find all quotes in 1 page

    page_count += 1
    print(page_count)

    for quote in quotes:
        text = quote.find("span", class_="text").get_text(strip=True)       # quote_text
        author = quote.find("small", class_="author").get_text(strip=True)      # author
        tags = [tag.get_text(strip=True) for tag in quote.find_all("a", class_="tag")]      # tags associated with the quote

        for tag in tags:
            data[author][tag].append(text)      # Listing all quotes by author and tag

        if author in author_details:        # Author details already scraped
            continue

        print(author)
        about_href = quote.find("a")["href"]        # First link inside the quote block
        author_url = urljoin(base_url, about_href)      # urljoin avoids the "//" that plain concatenation produces

        try:
            author_response = requests.get(author_url, timeout=5, headers=header)
            author_response.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Skip this author for now; it is retried if another quote by them appears.
            print(f"Error fetching {author_url}: {e}")
            continue

        author_soup = BeautifulSoup(author_response.text, "html.parser")

        born_date = author_soup.find("span", class_="author-born-date").get_text(strip=True)
        born_location = author_soup.find("span", class_="author-born-location").get_text(strip=True)
        if born_location.startswith("in "):     # Drop the leading "in " so only the place name remains
            born_location = born_location[3:]

        author_details[author] = {"Born On": born_date, "Location": born_location}
        time.sleep(random.uniform(1, 3))        # Delay requests to reduce traffic on website

    next_button = soup.find("li", class_="next")        # Next button at the end of the page

    if next_button:     # If next_button is available
        next_href = next_button.find("a")["href"]
        url = urljoin(base_url, next_href)      # url for next page
    else:
        url = None      # Last page reached: ends the while loop

    print()
    if url:
        time.sleep(random.uniform(1, 3))        # Delay requests to reduce traffic on website for next_page

1
Albert Einstein
J.K. Rowling
Jane Austen
Marilyn Monroe
André Gide
Thomas A. Edison
Eleanor Roosevelt
Steve Martin

2
Bob Marley
Dr. Seuss
Douglas Adams
Elie Wiesel
Friedrich Nietzsche
Mark Twain
Allen Saunders

3
Pablo Neruda
Ralph Waldo Emerson
Mother Teresa
Garrison Keillor
Jim Henson

4
Charles M. Schulz
William Nicholson
Jorge Luis Borges
George Eliot

5
George R.R. Martin
C.S. Lewis
Martin Luther King Jr.
James Baldwin

6
Haruki Murakami
Alexandre Dumas fils
Stephenie Meyer
Ernest Hemingway
Helen Keller
George Bernard Shaw

7
Charles Bukowski
Suzanne Collins
J.R.R. Tolkien

8
Alfred Tennyson
Terry Pratchett
J.D. Salinger
George Carlin
John Lennon
W.C. Fields
Ayn Rand

9

10
Jimi Hendrix
J.M. Barrie
E.E. Cummings
Khaled Hosseini
Harper Lee
Madeleine L'Engle



Writing data to JSON files

In [7]:
# Persist both scraped structures as pretty-printed UTF-8 JSON, one file each.
# ensure_ascii=False keeps accented characters readable instead of \u-escaped.
for filename, payload in (("quotes.json", data), ("author_details.json", author_details)):
    with open(filename, mode="w", encoding='utf-8') as out_file:
        json.dump(payload, out_file, indent=4, ensure_ascii=False)

SCRAPING BOOKS

In [11]:
# Fetch the books site's landing page and extract the genre links from its sidebar.
base_url = "https://books.toscrape.com/"
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"}      # Chrome browser string
response = requests.get(base_url, timeout=5, headers=header)
response.raise_for_status()     # Surface HTTP errors here rather than as an AttributeError on a missing element below

# Parse the raw bytes so BeautifulSoup detects the encoding from the document itself;
# response.text depends on the charset requests infers from the HTTP headers.
soup = BeautifulSoup(response.content, "html.parser")

side_panel = soup.find("ul", class_="nav nav-list")
genres = side_panel.find("ul").find_all("li")       # Nested <ul> holds one <li> per genre
genres

[<li>
 <a href="catalogue/category/books/travel_2/index.html">
                             
                                 Travel
                             
                         </a>
 </li>,
 <li>
 <a href="catalogue/category/books/mystery_3/index.html">
                             
                                 Mystery
                             
                         </a>
 </li>,
 <li>
 <a href="catalogue/category/books/historical-fiction_4/index.html">
                             
                                 Historical Fiction
                             
                         </a>
 </li>,
 <li>
 <a href="catalogue/category/books/sequential-art_5/index.html">
                             
                                 Sequential Art
                             
                         </a>
 </li>,
 <li>
 <a href="catalogue/category/books/classics_6/index.html">
                             
                                 Classics
                 

In [None]:
# For every genre in the sidebar, walk all of its listing pages and record each
# book's price and availability.
book_data = defaultdict(dict)       # genre -> title -> {"Price": ..., "Availability": ...}

for genre in genres:
    genre_text = genre.get_text(strip=True)
    genre_href = genre.find("a")["href"]

    url = urljoin(base_url, genre_href)     # First listing page of this genre

    while url:
        try:
            genre_response = requests.get(url, timeout=5, headers=header)
            genre_response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {url}: {e}")
            break       # Give up on this genre and move on to the next one

        # Parse raw bytes so the currency symbol in prices is decoded using the
        # encoding declared by the page rather than one guessed from HTTP headers.
        genre_soup = BeautifulSoup(genre_response.content, "html.parser")

        # NOTE(review): the site's book cards use the class "product_pod" (underscore);
        # the previous "product-pod" matched nothing — confirm against the live markup.
        books = genre_soup.find_all("article", class_="product_pod")

        for book in books:
            # The first <a> in a card wraps the cover image and has no text; the
            # title link lives under <h3> and carries the full name in its "title" attribute.
            title_link = book.find("h3").find("a")
            title = title_link.get("title") or title_link.get_text(strip=True)
            price = book.find("p", class_="price_color").get_text(strip=True)      # Store text, not the Tag object
            availability = book.find("p", class_="instock availability").get_text(strip=True)

            book_data[genre_text][title] = {"Price": price, "Availability": availability}

        # The pager belongs to the fetched page, not to the sidebar <li> in `genre`.
        next_button = genre_soup.find("li", class_="next")

        if next_button:
            next_href = next_button.find("a")["href"]
            url = urljoin(url, next_href)       # href is relative to the current listing page
            time.sleep(random.uniform(1, 3))        # Delay requests to reduce traffic on website
        else:
            url = None      # No further pages for this genre




['Travel', 'Mystery', 'Historical Fiction', 'Sequential Art', 'Classics', 'Philosophy', 'Romance', 'Womens Fiction', 'Fiction', 'Childrens', 'Religion', 'Nonfiction', 'Music', 'Default', 'Science Fiction', 'Sports and Games', 'Add a comment', 'Fantasy', 'New Adult', 'Young Adult', 'Science', 'Poetry', 'Paranormal', 'Art', 'Psychology', 'Autobiography', 'Parenting', 'Adult Fiction', 'Humor', 'Horror', 'History', 'Food and Drink', 'Christian Fiction', 'Business', 'Biography', 'Thriller', 'Contemporary', 'Spirituality', 'Academic', 'Self Help', 'Historical', 'Christian', 'Suspense', 'Short Stories', 'Novels', 'Health', 'Politics', 'Cultural', 'Erotica', 'Crime']
