Importing necessary Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
from pprint import pprint
from typing import Dict, List, Any, Tuple
import json
import random
import time
from colorama import Fore, init

SCRAPING QUOTES

Initialising variables

In [None]:
# Fetch the first listing page of the quotes site and prepare the containers
# used by the exploratory cells below.
base_url = "https://quotes.toscrape.com/"
url = base_url
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"}      # Chrome browser string

# Actually send the browser-like header defined above, and bound the wait so a
# stalled connection cannot hang the kernel (same settings as the later cells).
response = requests.get(url, timeout=5, headers=header)
soup = BeautifulSoup(response.text, "html.parser")

# author -> tag -> list of quote texts; missing keys are created on first access.
data = defaultdict(lambda: defaultdict(list))

response.status_code

200

Scraping a single page

In [None]:
# Collect every quote block on the page already parsed into `soup` and file
# each quote text under data[author][tag], once per tag it carries.
quotes = soup.select("div.quote")

for quote in quotes:
    text_node = quote.select_one("span.text")
    author_node = quote.select_one("small.author")

    text = text_node.get_text(strip=True)
    author = author_node.get_text(strip=True)
    tags = []
    for tag_link in quote.select("a.tag"):
        tags.append(tag_link.get_text(strip=True))

    quotes_by_tag = data[author]
    for tag in tags:
        quotes_by_tag[tag].append(text)

pprint(data)

In [24]:
# Locate the pagination control: the <li class="next"> element and the <a> inside it.
next_button = soup.find("li", class_="next")
# Note: despite its name, `next_href` holds the <a> tag itself, not the URL string.
next_href = next_button.find("a")
# The href is a site-absolute path (displayed as the cell output).
next_href["href"]

'/page/2/'

In [3]:
class QuoteScraping:
    """Scraper for the paginated quote listing at https://quotes.toscrape.com/.

    Every public scraping method starts at ``base_url`` and follows the
    "next" link until the last listing page, sleeping a random
    ``delay[0]``..``delay[1]`` seconds before each further request.
    Request failures (including non-2xx responses) are re-raised as
    ``Exception("Error fetching <url>: ...")`` with the original error chained.
    """

    base_url = "https://quotes.toscrape.com/"
    init(autoreset=True)        # colorama setup, executed once when the class body runs

    def __init__(self) -> None:
        self.timeout = 5                        # seconds, applied to every request
        self.session = requests.Session()       # one session reused for all requests
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"}      # Chrome browser string
        self.delay = [1.5, 3]                   # [min, max] seconds to sleep between requests

    # ------------------------------------------------------------------ helpers

    def _fetch_soup(self, url: str) -> "BeautifulSoup":
        """GET ``url`` through the shared session and return the parsed HTML."""
        try:
            response = self.session.get(url, timeout=self.timeout, headers=self.header)
            response.raise_for_status()     # an error page must not be parsed as data
        except requests.exceptions.RequestException as e:
            raise Exception(f"Error fetching {url}: {e}") from e

        return BeautifulSoup(response.text, "html.parser")

    @staticmethod
    def _absolute_url(href: str) -> str:
        """Join a site-absolute href such as ``/page/2/`` onto the quotes site root.

        Plain concatenation with ``base_url`` would yield a double slash
        because ``base_url`` already ends with ``/``.
        """
        return QuoteScraping.base_url.rstrip("/") + "/" + href.lstrip("/")

    def _next_page_url(self, soup):
        """Return the URL behind the page's "next" button, or None on the last page."""
        next_button = soup.find("li", class_="next")
        if not next_button:
            return None
        return self._absolute_url(next_button.find("a")["href"])

    def _pause(self) -> None:
        """Sleep a random interval within ``self.delay`` to reduce load on the site."""
        time.sleep(random.uniform(self.delay[0], self.delay[1]))

    # --------------------------------------------------------------- public API

    def author_list(self) -> List[str]:
        """Return the distinct author names found across all listing pages.

        The names are returned sorted so repeated runs give the same order.

        Raises:
            Exception: if any page cannot be fetched.
        """
        url = QuoteScraping.base_url
        author_set = set()
        page_count = 0

        while url:
            soup = self._fetch_soup(url)

            page_count += 1
            print(f"Scraping page {page_count}...")

            author_set.update(a.get_text(strip=True) for a in soup.select("small.author"))

            url = self._next_page_url(soup)
            if url:
                self._pause()

        return sorted(author_set)

    def scrape_author_quotes(self, author: str) -> Dict[str, List[str]]:
        """Collect (and print) every quote by ``author`` across all pages.

        Args:
            author: author name; compared case-insensitively.

        Returns:
            Mapping of quote text to the list of its tags.

        Raises:
            TypeError: if ``author`` is not a string.
            ValueError: if no quote by that author exists on any page.
            Exception: if any page cannot be fetched.
        """
        if not isinstance(author, str):
            raise TypeError("Author name must be a string.")

        page_count = 0
        author_quotes = {}
        author = author.lower()
        url = QuoteScraping.base_url

        while url:
            soup = self._fetch_soup(url)

            page_count += 1
            print(Fore.GREEN + f"Searching page {page_count}...")
            print()

            for auth in soup.find_all("small", class_="author"):
                if auth.get_text(strip=True).lower() != author:
                    continue

                quote = auth.find_parent("div", class_="quote")
                text = quote.select_one("span.text").get_text(strip=True)
                tags = [tag.get_text(strip=True) for tag in quote.select("a.tag")]

                print(Fore.MAGENTA + f"📜 Quote: {text}")
                print(Fore.CYAN + f"🏷️  Tags: {', '.join(tags)}")
                print("-" * 60)

                author_quotes[text] = tags

            print()
            url = self._next_page_url(soup)
            if url:
                self._pause()

        if not author_quotes:
            raise ValueError(f"No quotes found for author {author}.")

        return author_quotes

    def scrape_author_info(self, author: str) -> Dict[str, str]:
        """Find ``author`` on the listing pages and scrape their "about" page.

        Pages are searched in order and the search stops at the first match.

        Args:
            author: author name; compared case-insensitively.

        Returns:
            ``{"Born": ..., "Location": ..., "Bio": ...}`` where the bio is
            cut to its first five "."-separated pieces.

        Raises:
            TypeError: if ``author`` is not a string.
            ValueError: if the author does not appear on any listing page.
            Exception: if a listing page or the author page cannot be fetched.
        """
        if not isinstance(author, str):
            raise TypeError("Author name must be a string.")

        page_count = 0
        author = author.lower()
        url = QuoteScraping.base_url

        while url:
            soup = self._fetch_soup(url)

            page_count += 1
            print(Fore.GREEN + f"Searching page {page_count}...")

            for auth in soup.find_all("small", class_="author"):
                name = auth.get_text(strip=True)
                if name.lower() != author:
                    continue

                quote = auth.find_parent("div", class_="quote")
                # The class-less <a> inside the quote block is the link to the author page.
                about_href = quote.find("a", class_=None)["href"]
                author_soup = self._fetch_soup(self._absolute_url(about_href))

                born = author_soup.select_one("span.author-born-date").get_text(strip=True)
                location = author_soup.select_one("span.author-born-location").get_text(strip=True)
                description = author_soup.select_one("div.author-description").get_text(strip=True)
                description = ".".join(description.split('.', maxsplit=6)[:5])

                print()
                print(f"👤 Author: {name}")
                print(f"🎂 Born: {born}")
                print(f"📍 Location: {location}")
                print(f"📝 Bio: {description}")
                print("-" * 70)

                return {"Born": born, "Location": location, "Bio": description}

            url = self._next_page_url(soup)
            if url:
                self._pause()

        # Reached only after every page was searched. Raising inside the loop
        # would give up after page 1 even though later pages were unsearched.
        raise ValueError(f"Author {author} not found.")

    def scrape_all_quotes(self) -> Dict[str, Dict[str, List[str]]]:
        """Scrape every quote on the site, grouped by author and then by tag.

        A quote with several tags is listed under each of them; a quote
        without tags is not recorded.

        Returns:
            ``{author: {tag: [quote text, ...]}}`` as plain dicts.

        Raises:
            Exception: if any page cannot be fetched.
        """
        data = defaultdict(lambda: defaultdict(list))       # Quote data is stored here
        page_count = 0
        url = QuoteScraping.base_url

        while url:
            soup = self._fetch_soup(url)
            quotes = soup.find_all("div", class_="quote")       # Find all quotes in 1 page

            page_count += 1
            print(f"Scraping page {page_count}...")

            for quote in quotes:
                text = quote.find("span", class_="text").get_text(strip=True)       # quote text
                author = quote.find("small", class_="author").get_text(strip=True)      # author
                tags = [tag.get_text(strip=True) for tag in quote.find_all("a", class_="tag")]      # tags associated with the quote

                for tag in tags:
                    data[author][tag].append(text)      # Listing all quotes by author and tag

            url = self._next_page_url(soup)
            if url:
                self._pause()       # Delay requests to reduce traffic on the website

        # Return plain dicts so a lookup of a missing key cannot silently create it.
        return {author: dict(by_tag) for author, by_tag in data.items()}

    def scrape_all_authors(self) -> Dict[str, Dict[str, str]]:
        """Scrape the "about" page of every distinct author on the site.

        Each author page is fetched once, the first time the author is seen,
        followed by a pause.

        Returns:
            ``{author: {"Born": ..., "Location": ..., "Bio": ...}}``.

        Raises:
            Exception: if a listing page or an author page cannot be fetched.
        """
        author_details = {}      # Author details are stored here
        page_count = 0
        url = QuoteScraping.base_url

        while url:
            soup = self._fetch_soup(url)
            authors = soup.select("small.author")

            page_count += 1
            print(Fore.GREEN + f"Scraping page {page_count}...")
            print(Fore.CYAN + "Reading authors: ")

            for auth in authors:
                name = auth.get_text(strip=True)      # author name
                if name in author_details:            # details already scraped
                    continue

                print(name)
                quote = auth.find_parent("div", class_="quote")
                about_href = quote.find("a", class_=None)["href"]
                author_soup = self._fetch_soup(self._absolute_url(about_href))

                born = author_soup.find("span", class_="author-born-date").get_text(strip=True)
                # [3:] drops the first three characters of the location text
                # (presumably a leading "in " - confirm against the page).
                location = author_soup.find("span", class_="author-born-location").get_text(strip=True)[3:]
                description = author_soup.select_one("div.author-description").get_text(strip=True)

                author_details[name] = {"Born": born, "Location": location, "Bio": description}
                self._pause()       # Delay requests to reduce traffic on the website

            print()
            url = self._next_page_url(soup)
            if url:
                self._pause()

        return author_details

    @staticmethod
    def write_to_json(data: Dict[str, Any], filename: str) -> None:
        """Write ``data`` to ``filename`` as UTF-8 JSON, indented, non-ASCII kept as-is.

        Raises:
            TypeError: if ``filename`` is not a string or ``data`` is not a dict.
        """
        if not isinstance(filename, str):
            raise TypeError("Filename must be a string")
        if not isinstance(data, dict):
            raise TypeError("Data must be a dictionary")

        with open(file=filename, mode='w', encoding='utf-8') as f:
            json.dump(data, f, indent=4, ensure_ascii=False)

In [None]:
class BookScraping(QuoteScraping):
    """Scraper for https://books.toscrape.com/ (genres, book listings, book details).

    Reuses the session, header, timeout and delay settings set up by
    ``QuoteScraping.__init__``. Request failures (including non-2xx
    responses) are re-raised as ``Exception("Error fetching <url>: ...")``.
    """

    base_url = "https://books.toscrape.com/"
    rating_map = {"one": 1, "two": 2, "three": 3, "four": 4, "five": 5}
    init(autoreset=True)

    # ------------------------------------------------------------------ helpers

    def _book_soup(self, url: str) -> "BeautifulSoup":
        """GET ``url`` through the shared session and return the parsed HTML."""
        try:
            response = self.session.get(url=url, timeout=self.timeout, headers=self.header)
            response.raise_for_status()     # an error page must not be parsed as data
        except requests.exceptions.RequestException as e:
            raise Exception(f"Error fetching {url}: {e}") from e

        return BeautifulSoup(response.text, "html.parser")

    def _genre_links(self) -> List[Tuple[str, str]]:
        """Return ``(genre name, href)`` for each genre in the home page side panel."""
        soup = self._book_soup(BookScraping.base_url)
        side_panel = soup.find("ul", class_="nav nav-list")
        anchors = [item.find("a") for item in side_panel.select("ul ul li")]
        return [(anchor.get_text(strip=True), anchor["href"]) for anchor in anchors]

    @staticmethod
    def _catalogue_path(href: str) -> str:
        """Reduce a listing href to its path below ``catalogue/``.

        Strips any leading ``../`` segments and a leading ``catalogue/``
        prefix. For a ``../../../<slug>/index.html`` href this equals
        ``href[9:]``, and hrefs in other relative forms are handled too.
        """
        path = href
        while path.startswith("../"):
            path = path[3:]
        if path.startswith("catalogue/"):
            path = path[len("catalogue/"):]
        return path

    # --------------------------------------------------------------- public API

    def genre_list(self) -> List[str]:
        """Return the genre names listed in the home page side panel.

        Raises:
            Exception: if the home page cannot be fetched.
        """
        return [name for name, _ in self._genre_links()]

    def scrape_books_from_genre(self, genre_name: str) -> Tuple[List[str], List[str]]:
        """List every book of one genre, following the genre's pagination.

        Args:
            genre_name: genre as shown by ``genre_list()``; case-insensitive.

        Returns:
            ``(titles, hrefs)`` as parallel lists; each href can be passed to
            ``scrape_book_info``.

        Raises:
            TypeError: if ``genre_name`` is not a string.
            ValueError: if the genre is not in the side panel.
            Exception: if any page cannot be fetched.
        """
        if not isinstance(genre_name, str):
            raise TypeError("genre_name must be a string")

        genre_name = genre_name.lower()
        genre_hrefs = {name.lower(): href for name, href in self._genre_links()}

        if genre_name not in genre_hrefs:
            raise ValueError("genre_name not present")

        book_list = []
        book_href = []

        # Use the link the site itself publishes. Rebuilding the URL from the
        # display name and list position breaks for names containing spaces.
        base_url = BookScraping.base_url + genre_hrefs[genre_name]
        url = base_url
        page_count = 0

        # Scraping a Genre
        while url:
            page_count += 1
            soup = self._book_soup(url)
            print(Fore.GREEN + f"Scraping page {page_count}...")

            # All the books in the current page of the genre
            for book in soup.select("article.product_pod"):
                link = book.h3.select_one("a")
                book_list.append(link["title"])
                book_href.append(link["href"])

            # Looking for next button in the same genre
            next_button = soup.find("li", class_="next")
            if not next_button:
                break

            # The next href is a sibling file name, so swap it in for index.html.
            next_href = next_button.find("a")["href"]
            url = base_url.replace("index.html", next_href)
            time.sleep(random.uniform(self.delay[0], self.delay[1]))

        return (book_list, book_href)

    def scrape_book_info(self, book_href: str) -> Dict[str, Any]:
        """Scrape UPC, price, rating and availability from one book page.

        Args:
            book_href: href as returned by ``scrape_books_from_genre`` or
                ``scrape_all_books``.

        Returns:
            ``{"UPC", "Price", "Rating", "Availability", "URL"}``; ``Rating``
            is an int from ``rating_map`` and ``UPC`` is None when the
            product table has no UPC row.

        Raises:
            TypeError: if ``book_href`` is not a string.
            Exception: if the page cannot be fetched.
        """
        if not isinstance(book_href, str):
            raise TypeError("book_href must be a string")

        url = BookScraping.base_url + "catalogue/" + self._catalogue_path(book_href)
        soup = self._book_soup(url)

        # Put the class in the selector: select_one() takes CSS, and a
        # find()-style class_ keyword does not narrow a bare "p" selector.
        availability = soup.select_one("p.instock.availability").get_text(strip=True)      # availability
        price = soup.select_one("p.price_color").get_text(strip=True)       # price
        rating_text = soup.select_one("p.star-rating")["class"][-1].lower()
        rating = BookScraping.rating_map[rating_text]

        table = soup.find("table", class_="table table-striped")
        upc = None      # stays None when the table has no UPC row
        for row in table.select("tr"):
            if row.select_one("th").get_text(strip=True) == 'UPC':
                upc = row.select_one("td").get_text(strip=True)     # UPC
                break

        book_info = {"UPC": upc, "Price": price, "Rating": rating, "Availability": availability, "URL": url}     # Recording data
        return book_info

    def scrape_all_books(self) -> Tuple[List[str], List[str]]:
        """List every book on the site by walking the main paginated listing.

        Returns:
            ``(titles, hrefs)`` as parallel lists, hrefs exactly as they
            appear in the page.

        Raises:
            Exception: if any page cannot be fetched.
        """
        url = BookScraping.base_url
        book_list = []
        book_href = []
        page_count = 0

        while url:
            page_count += 1
            soup = self._book_soup(url)
            print(Fore.GREEN + f"Scraping page {page_count}...")

            for book in soup.select("article.product_pod"):
                link = book.h3.select_one("a")
                book_list.append(link["title"])
                book_href.append(link["href"])

            # Looking for next button in the listing
            next_button = soup.find("li", class_="next")
            if not next_button:
                break

            # The first page's next href already carries "catalogue/" while
            # later pages' do not; normalising removes the page-1 special case.
            next_href = next_button.find("a")["href"]
            url = BookScraping.base_url + "catalogue/" + self._catalogue_path(next_href)
            time.sleep(random.uniform(self.delay[0], self.delay[1]))

        return (book_list, book_href)

In [None]:
# Build the quotes scraper; __init__ only creates a requests.Session and stores settings.
scraper = QuoteScraping()

In [None]:
# The author name is matched case-insensitively (both sides are lower-cased before comparing).
scraper.scrape_author_quotes("albert einstein")

In [None]:
# Prints the author's details and returns them as a dict with "Born", "Location" and "Bio".
scraper.scrape_author_info("albert einstein")

In [None]:
# Slow: one extra request per distinct author, each followed by a random 1.5-3 s sleep.
scraper.scrape_all_authors()

In [None]:
# Walks every listing page; the result is nested as author -> tag -> list of quote texts.
scraper.scrape_all_quotes()

In [13]:
# Rebinds `scraper`: from this cell on it is the books scraper, not the quotes scraper.
scraper = BookScraping()

In [6]:
# Genre names read from the side panel of the books home page (one request).
scraper.genre_list()

['Travel',
 'Mystery',
 'Historical Fiction',
 'Sequential Art',
 'Classics',
 'Philosophy',
 'Romance',
 'Womens Fiction',
 'Fiction',
 'Childrens',
 'Religion',
 'Nonfiction',
 'Music',
 'Default',
 'Science Fiction',
 'Sports and Games',
 'Add a comment',
 'Fantasy',
 'New Adult',
 'Young Adult',
 'Science',
 'Poetry',
 'Paranormal',
 'Art',
 'Psychology',
 'Autobiography',
 'Parenting',
 'Adult Fiction',
 'Humor',
 'Horror',
 'History',
 'Food and Drink',
 'Christian Fiction',
 'Business',
 'Biography',
 'Thriller',
 'Contemporary',
 'Spirituality',
 'Academic',
 'Self Help',
 'Historical',
 'Christian',
 'Suspense',
 'Short Stories',
 'Novels',
 'Health',
 'Politics',
 'Cultural',
 'Erotica',
 'Crime']

In [8]:
# Returns a (titles, hrefs) tuple for the genre; the genre name is lower-cased before lookup.
scraper.scrape_books_from_genre("romance")

Scraping page 1...
Scraping page 2...


(['Chase Me (Paris Nights #2)',
  'Black Dust',
  'Her Backup Boyfriend (The Sorensen Family #1)',
  'First and First (Five Boroughs #3)',
  'Fifty Shades Darker (Fifty Shades #2)',
  'The Wedding Dress',
  'Suddenly in Love (Lake Haven #1)',
  'Something More Than This',
  'Doing It Over (Most Likely To #1)',
  "The Wedding Pact (The O'Malleys #2)",
  'Hold Your Breath (Search and Rescue #1)',
  'Dirty (Dive Bar #1)',
  'Take Me Home Tonight (Rock Star Romance #3)',
  'Off the Hook (Fishing for Trouble #1)',
  "A Gentleman's Position (Society of Gentlemen #3)",
  'Sit, Stay, Love',
  "A Girl's Guide to Moving On (New Beginnings #2)",
  'The Perfect Play (Play by Play #1)',
  'Dark Lover (Black Dagger Brotherhood #1)',
  'Changing the Game (Play by Play #2)',
  'A Walk to Remember',
  'The Purest Hook (Second Circle Tattoos #3)',
  'The Obsession',
  'Reservations for Two',
  "Best of My Love (Fool's Gold #20)",
  'Where Lightning Strikes (Bleeding Stars #3)',
  'This One Moment (Pushi

In [None]:
# Walks the whole paginated catalogue with a random sleep between pages; returns (titles, hrefs).
scraper.scrape_all_books()

Scraping Multiple Pages

In [None]:
# Crawl every listing page of the quotes site, collecting quotes grouped by
# author and tag, plus one birth-details record per distinct author.
# Requires `header` from the setup cell.
quotes_root = "https://quotes.toscrape.com"     # no trailing slash: the site's hrefs start with "/"

data = defaultdict(lambda: defaultdict(list))       # Quote data is stored here
author_details = {}      # Author details are stored here
page_count = 0
url = quotes_root + "/"     # always restart from page 1 so the cell can be re-run

while url:
    try:
        response = requests.get(url, timeout=5, headers=header)     # Getting response from website
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        break       # no usable response: stop rather than parse a stale or undefined one

    soup = BeautifulSoup(response.text, "html.parser")
    quotes = soup.find_all("div", class_="quote")       # Find all quotes in 1 page

    page_count += 1
    print(page_count)

    for quote in quotes:
        text = quote.find("span", class_="text").get_text(strip=True)       # quote text
        author = quote.find("small", class_="author").get_text(strip=True)      # author
        tags = [tag.get_text(strip=True) for tag in quote.find_all("a", class_="tag")]      # tags associated with the quote

        for tag in tags:
            data[author][tag].append(text)      # Listing all quotes by author and tag

        if author not in author_details:        # Scraping author details if not scraped
            print(author)
            about_href = quote.find("a")["href"]     # first link in the quote block
            author_url = quotes_root + about_href

            try:
                author_response = requests.get(author_url, timeout=5, headers=header)
                author_response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error fetching {author_url}: {e}")
                continue        # skip this author; a later quote by them will retry

            author_soup = BeautifulSoup(author_response.text, "html.parser")

            born_date = author_soup.find("span", class_="author-born-date").get_text(strip=True)
            # [3:] drops the first three characters of the location text.
            born_location = author_soup.find("span", class_="author-born-location").get_text(strip=True)[3:]

            author_details[author] = {"Born On": born_date, "Location": born_location}
            time.sleep(random.uniform(1, 3))        # Delay requests to reduce traffic on website

    next_button = soup.find("li", class_="next")        # Next button at the end of the page

    print()

    if next_button:     # If the next button is available
        next_href = next_button.find("a")["href"]
        url = quotes_root + next_href      # url for next page
        time.sleep(random.uniform(1, 3))        # Delay before requesting the next page
    else:
        url = None

Writing data to JSON files

In [7]:
# Persist both scraped structures as indented UTF-8 JSON, keeping non-ASCII
# characters readable instead of escaping them.
json_outputs = (
    ("quotes.json", data),
    ("author_details.json", author_details),
)

for out_name, payload in json_outputs:
    with open(out_name, mode="w", encoding='utf-8') as out_file:
        json.dump(payload, out_file, indent=4, ensure_ascii=False)

SCRAPING BOOKS

In [5]:
# Setup for the books section. This REBINDS `base_url`, `header`, `response`
# and `soup`, which the quotes section above also uses, so cells from the two
# sections must not be interleaved.
base_url = "https://books.toscrape.com/"
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"}      # Chrome browser string
response = requests.get(base_url, timeout=5, headers=header)
print(response.status_code)
soup = BeautifulSoup(response.text, "html.parser")

# Genre entries from the side navigation panel; each <li> holds one genre <a> link.
side_panel = soup.find("ul", class_="nav nav-list")
genres = side_panel.select("ul ul li")

200


In [14]:
def scrape_books_from_genre(genre_url):
    """Scrape every book of one genre into the global ``book_data``.

    Follows the genre's pagination starting at ``genre_url`` and, for each
    book, also fetches the book's own page for title, availability and UPC.

    Reads the globals ``base_url``, ``header``, ``rating_map`` and
    ``genre_text`` (the genre name used as the key into ``book_data``), so
    the caller must set them before calling. ``page_count`` is incremented
    once per page and reset to 0 when the genre is finished.

    Args:
        genre_url: URL of the genre's first page (ending in ``index.html``).

    Raises:
        requests.exceptions.RequestException: if a request fails or returns
            an HTTP error status.
    """
    global rating_map, page_count, book_data
    base_genre_url = genre_url

    # Scraping a Genre
    while genre_url:
        page_count += 1
        print(page_count)

        genre_response = requests.get(genre_url, timeout=5, headers=header)
        genre_response.raise_for_status()
        genre_soup = BeautifulSoup(genre_response.text, "html.parser")

        # All the books in the current page of the genre
        books = genre_soup.select("article.product_pod")

        # Scraping details of each book
        for book in books:
            price = book.select_one("p.price_color").get_text(strip=True)       # price
            rating_text = book.select_one("p.star-rating")["class"][-1].lower()
            rating = rating_map[rating_text]        # rating

            # Scraping details from individual book pages
            book_href = book.h3.select_one("a")["href"]
            # [9:] drops the leading "../../../" of hrefs found on genre pages.
            book_url = base_url + "catalogue/" + book_href[9:]
            book_response = requests.get(book_url, timeout=5, headers=header)
            book_response.raise_for_status()
            book_soup = BeautifulSoup(book_response.text, "html.parser")

            title = book_soup.h1.get_text(strip=True)       # title
            print(title)
            availability = book_soup.find("p", class_="instock availability").get_text(strip=True)      # availability
            table = book_soup.find("table", class_="table table-striped")

            # Look the UPC up per book. A variable assigned only when the row
            # is found would keep the PREVIOUS book's UPC (or be unbound on
            # the first book) whenever a table has no UPC row.
            table_fields = {
                row.select_one("th").get_text(strip=True): row.select_one("td").get_text(strip=True)
                for row in table.select("tr")
            }
            upc = table_fields.get("UPC")     # UPC, or None if the row is missing

            book_data[genre_text][title] = {"UPC": upc, "Price": price, "Rating": rating, "Availability": availability}     # Recording data

        # Looking for next button in the same genre
        next_button = genre_soup.find("li", class_="next")
        print()

        if next_button:
            # The next href is a sibling file name, so swap it in for index.html.
            next_href = next_button.find("a")["href"]
            genre_url = base_genre_url.replace("index.html", next_href)
        else:
            page_count = 0      # restart page numbering for the next genre
            print()
            break

In [None]:
# Globals consumed by scrape_books_from_genre(): the page counter, the
# rating-word -> number table, and the result store (genre -> title -> details).
page_count = 0
rating_map = {"one": 1, "two": 2, "three": 3, "four": 4, "five": 5}
# The inner defaultdict() has no default factory, so it behaves like a plain dict.
book_data = defaultdict(lambda: defaultdict())

for genre in genres:
    # `genre_text` must keep this exact name: the function reads it as a global
    # to decide which genre key the scraped books are stored under.
    genre_text = genre.get_text(strip=True)
    genre_href = genre.find("a")["href"]
    print(genre_text)

    genre_url = base_url + genre_href
    scrape_books_from_genre(genre_url)


In [11]:
# Exploration: fetch the first page of a single genre (the second side-panel entry).
genre = genres[1]

genre_href = genre.find("a")["href"]
genre_url = base_url + genre_href
print(genre_url)

genre_response = requests.get(genre_url, timeout=5, headers=header)
genre_soup = BeautifulSoup(genre_response.text, "html.parser")

https://books.toscrape.com/catalogue/category/books/mystery_3/index.html


In [12]:
# Exploration: build the absolute URL of the first book on the genre page.
books = genre_soup.select("article.product_pod")
book = books[0]
book_href = book.h3.select_one("a")["href"]
# [9:] drops the first nine characters of the href before joining it onto catalogue/.
book_url = base_url + "catalogue/" + book_href[9:]
book_url

'https://books.toscrape.com/catalogue/sharp-objects_997/index.html'

In [13]:
# Exploration: derive the URL of the genre's second page from its "next" link.
next_button = genre_soup.find("li", class_="next")
print(next_button)
next_href = next_button.find("a")["href"]
# Overwrites `genre_url`: after this cell it points at the next page, not the first.
genre_url = genre_url.replace("index.html", next_href)
genre_url

<li class="next"><a href="page-2.html">next</a></li>


'https://books.toscrape.com/catalogue/category/books/mystery_3/page-2.html'

In [31]:
# Exploration: the rating is encoded as a word in the element's class list.
rating = book.select_one("p.star-rating")
rating["class"]

['star-rating', 'Two']

In [45]:
# Exploration: fetch and parse one book's own page. Uses whatever `book`
# currently is in the kernel, so the result depends on which cells ran before.
book_href = book.h3.select_one("a")["href"]
book_url = base_url + "catalogue/" + book_href[9:]
print(book_url)
book_response = requests.get(book_url, timeout=5, headers=header)
book_soup = BeautifulSoup(book_response.text, "html.parser")

# Displays the entire text of the page; large output.
book_soup.text

https://books.toscrape.com/catalogue/its-only-the-himalayas_981/index.html




In [52]:
# Exploration: read the header cell of the first row of the product table.
table = book_soup.select_one("table.table.table-striped")
rows = table.select("tr")
row = rows[0]
row.select_one("th").get_text(strip=True)

'UPC'

In [None]:
# Scratch check unrelated to the scraping: list(dict.values()) yields the values in insertion order.
d = {"Name": "Arya", "Class": 12}
list(d.values())

['Arya', 12]