Importing necessary libraries

In [1]:
import difflib
import json
import random
import time
from collections import defaultdict
from pprint import pprint
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from Class_Scraping import QuoteScraping, BookScraping

SCRAPING QUOTES

Initialising variables

In [None]:
# Entry point of the quotes site and the request headers used when fetching it.
base_url = "https://quotes.toscrape.com/"
url = base_url
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"}      # Chrome browser string

# Send the browser-like header that was defined above (it was previously unused here)
# and bound the wait so a stalled connection cannot hang the kernel.
response = requests.get(url, headers=header, timeout=5)
soup = BeautifulSoup(response.text, "html.parser")
data = defaultdict(lambda: defaultdict(list))      # author -> tag -> list of quote texts

response.status_code

200

Scraping a single page

In [None]:
# Walk every quote block of the page already parsed into `soup` and file each
# quote text under data[author][tag].
quotes = soup.find_all("div", class_="quote")

for quote in quotes:
    author = quote.find("small", class_="author").get_text(strip=True)
    text = quote.find("span", class_="text").get_text(strip=True)

    # Collect the tag labels attached to this quote.
    tags = []
    for tag_link in quote.find_all("a", class_="tag"):
        tags.append(tag_link.get_text(strip=True))

    # One entry per (author, tag) pair, so a quote with several tags is listed several times.
    for tag in tags:
        data[author][tag].append(text)

pprint(data)

In [24]:
# Locate the pagination "next" item on the parsed page and show the link it points to.
next_button = soup.find("li", class_="next")
next_href = next_button.a      # first <a> inside the item, same element as next_button.find("a")
next_href["href"]

'/page/2/'

In [2]:
# Create the quote scraper helper imported from the local Class_Scraping module.
quote_scraper = QuoteScraping()

In [None]:
# Ask the helper for one author's quotes, passing the name in lowercase.
# NOTE(review): the exact lookup/return behaviour lives in Class_Scraping and is not visible here.
quote_scraper.scrape_author_quotes("albert einstein")

In [13]:
# Resolve the author's page URL, then scrape that page.
# A dedicated name is used on purpose: the global `url` is the start page of the
# `while url:` pagination loop in "Scraping Multiple Pages", so it must not be clobbered here.
author_page_url = quote_scraper.get_author_url("stephanie meyer")
quote_scraper.scrape_author_info(author_page_url)

Successfully scraped details of author Stephenie Meyer

👤 Author: Stephenie Meyer
🎂 Born: December 24, 1973
📍 Location: in Connecticut, The United States
📝 Bio: I was born in Connecticut in 1973, during a brief blip in my family's otherwise western U.S. existence. We were settled in Phoenix by the time I was four, and I think of myself as a native. The unusual spelling of my name was a gift from my father, Stephen (+ ie = me)
URL: https://quotes.toscrape.com//author/Stephenie-Meyer
------------------------------------------------------------


{'Born': 'December 24, 1973',
 'Location': 'in Connecticut, The United States',
 'Bio': "I was born in Connecticut in 1973, during a brief blip in my family's otherwise western U.S. existence. We were settled in Phoenix by the time I was four, and I think of myself as a native. The unusual spelling of my name was a gift from my father, Stephen (+ ie = me)",
 'URL': 'https://quotes.toscrape.com//author/Stephenie-Meyer'}

In [14]:
# Show what the scraper has cached: author name -> author page URL, alongside the
# bookkeeping keys 'last page' and 'next href' (see the output below).
quote_scraper.author_urls

{'last page': 10,
 'next href': None,
 'Albert Einstein': 'https://quotes.toscrape.com//author/Albert-Einstein',
 'J.K. Rowling': 'https://quotes.toscrape.com//author/J-K-Rowling',
 'Jane Austen': 'https://quotes.toscrape.com//author/Jane-Austen',
 'Marilyn Monroe': 'https://quotes.toscrape.com//author/Marilyn-Monroe',
 'André Gide': 'https://quotes.toscrape.com//author/Andre-Gide',
 'Thomas A. Edison': 'https://quotes.toscrape.com//author/Thomas-A-Edison',
 'Eleanor Roosevelt': 'https://quotes.toscrape.com//author/Eleanor-Roosevelt',
 'Steve Martin': 'https://quotes.toscrape.com//author/Steve-Martin',
 'Bob Marley': 'https://quotes.toscrape.com//author/Bob-Marley',
 'Dr. Seuss': 'https://quotes.toscrape.com//author/Dr-Seuss',
 'Douglas Adams': 'https://quotes.toscrape.com//author/Douglas-Adams',
 'Elie Wiesel': 'https://quotes.toscrape.com//author/Elie-Wiesel',
 'Friedrich Nietzsche': 'https://quotes.toscrape.com//author/Friedrich-Nietzsche',
 'Mark Twain': 'https://quotes.toscrape.co

In [None]:
# Run the helper's bulk author scrape.
# NOTE(review): what it stores/returns is defined in Class_Scraping — confirm there.
quote_scraper.scrape_all_authors()

In [None]:
# Run the helper's bulk quote scrape.
# NOTE(review): what it stores/returns is defined in Class_Scraping — confirm there.
quote_scraper.scrape_all_quotes()

In [None]:
# Create the book scraper helper imported from the local Class_Scraping module.
book_scraper = BookScraping()

In [None]:
# Presumably lists the genres known to the scraper — behaviour is defined in Class_Scraping; TODO confirm.
book_scraper.genre_list()

In [None]:
# Scrape the books of a single genre, selected by a lowercase name.
# NOTE(review): name matching and the result format are defined in Class_Scraping — confirm there.
book_scraper.scrape_books_from_genre("romance")

In [None]:
# Run the helper's bulk book scrape.
# NOTE(review): what it stores/returns is defined in Class_Scraping — confirm there.
book_scraper.scrape_all_books()

In [None]:
# Called without arguments here; which book(s) it covers is decided inside Class_Scraping — TODO confirm.
book_scraper.scrape_book_info()

Scraping Multiple Pages

In [None]:
# Crawl the quotes site page by page, collecting quotes per author/tag and,
# the first time an author is seen, that author's birth details.
data = defaultdict(lambda: defaultdict(list))       # Quote data: author -> tag -> list of quote texts
author_details = {}      # Author details: author -> {"Born On": ..., "Location": ...}
page_count = 0
url = base_url      # Always start at the first page, whatever earlier cells left in `url`

while url:
    try:
        response = requests.get(url, timeout=5, headers=header)     # Fetch the current listing page
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        break       # Stop: parsing a stale/undefined response would retry the same URL forever

    soup = BeautifulSoup(response.text, "html.parser")
    quotes = soup.find_all("div", class_="quote")       # All quote blocks on this page

    page_count += 1
    print(page_count)

    for quote in quotes:
        text = quote.find("span", class_="text").get_text(strip=True)       # quote text
        author = quote.find("small", class_="author").get_text(strip=True)      # author name
        tags = [tag.get_text(strip=True) for tag in quote.find_all("a", class_="tag")]      # tags attached to the quote

        for tag in tags:
            data[author][tag].append(text)      # File the quote under every one of its tags

        if author in author_details:
            continue        # Author page already scraped

        print(author)
        # First link inside the quote block — expected to be the author's page link.
        about_href = quote.find("a")["href"]
        # urljoin avoids the doubled slash ("//author/...") that plain concatenation produced.
        author_url = urljoin(base_url, about_href)

        try:
            author_response = requests.get(author_url, timeout=5, headers=header)
            author_response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {author_url}: {e}")
            continue        # Leave the author unrecorded so a later quote by them retries

        author_soup = BeautifulSoup(author_response.text, "html.parser")

        born_date = author_soup.find("span", class_="author-born-date").get_text(strip=True)
        # [3:] drops the first three characters of the location text (the leading "in ").
        born_location = author_soup.find("span", class_="author-born-location").get_text(strip=True)[3:]

        author_details[author] = {"Born On": born_date, "Location": born_location}
        time.sleep(random.uniform(1, 3))        # Delay requests to reduce traffic on the website

    next_button = soup.find("li", class_="next")        # "Next" link at the bottom of the page, absent on the last page

    if next_button:     # If a next button is available
        next_href = next_button.find("a")["href"]
        url = urljoin(base_url, next_href)      # URL of the next page
    else:
        url = None      # Last page reached: ends the loop

    print()
    if url:
        time.sleep(random.uniform(1, 3))        # Delay before requesting the next page

Writing data to JSON files

In [7]:
# Persist both result structures as pretty-printed UTF-8 JSON, keeping
# non-ASCII characters (e.g. accented author names) readable in the files.
for filename, payload in (("quotes.json", data), ("author_details.json", author_details)):
    with open(filename, mode="w", encoding='utf-8') as out_file:
        json.dump(payload, out_file, indent=4, ensure_ascii=False)

SCRAPING BOOKS

In [5]:
# Set up the books site.
# NOTE: this rebinds `base_url`, `header`, `response` and `soup` used by the quotes section above;
# scrape_books_from_genre and the cells below read these books values.
base_url = "https://books.toscrape.com/"
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"}      # Chrome browser string
response = requests.get(base_url, timeout=5, headers=header)
print(response.status_code)
soup = BeautifulSoup(response.text, "html.parser")

# The navigation list on the home page; its nested <li> items are treated as one genre each below.
side_panel = soup.find("ul", class_="nav nav-list")
genres = side_panel.select("ul ul li")

200


In [14]:
def scrape_books_from_genre(genre_url):
    """Scrape every book of one genre, following pagination, into ``book_data``.

    Relies on module-level state prepared by the driver cell:
    ``header`` (request headers), ``rating_map`` (rating word -> number),
    ``book_data`` (result store, filled in place), ``page_count`` (printed
    page counter, reset to 0 once the genre is finished) and ``genre_text``
    (name of the genre being scraped, used as the key in ``book_data``).

    Args:
        genre_url: URL of the genre's first listing page.

    Raises:
        requests.exceptions.RequestException: if a listing page or a book page
            cannot be fetched (network error, timeout or HTTP error status).
    """
    global page_count       # Only name rebound here; the other globals are read or mutated in place

    # Scraping a genre, one listing page per iteration
    while genre_url:
        page_count += 1
        print(page_count)

        genre_response = requests.get(genre_url, timeout=5, headers=header)
        genre_response.raise_for_status()       # Fail loudly instead of parsing an error page
        genre_soup = BeautifulSoup(genre_response.text, "html.parser")

        # Scraping details of each book on the current page of the genre
        for book in genre_soup.select("article.product_pod"):
            price = book.select_one("p.price_color").get_text(strip=True)       # price
            # The last CSS class of the star-rating element is the rating word (e.g. "Two").
            rating_text = book.select_one("p.star-rating")["class"][-1].lower()
            rating = rating_map[rating_text]        # rating

            # Scraping details from the individual book page; the href is relative
            # to the listing page, so resolve it against the page it came from.
            book_href = book.h3.select_one("a")["href"]
            book_url = urljoin(genre_url, book_href)
            book_response = requests.get(book_url, timeout=5, headers=header)
            book_response.raise_for_status()
            book_soup = BeautifulSoup(book_response.text, "html.parser")

            title = book_soup.h1.get_text(strip=True)       # title
            print(title)
            availability = book_soup.find("p", class_="instock availability").get_text(strip=True)      # availability
            table = book_soup.find("table", class_="table table-striped")

            # Reset per book so a missing UPC row cannot reuse the previous book's value.
            upc = None
            for row in table.select("tr"):
                if row.select_one("th").get_text(strip=True) == 'UPC':
                    upc = row.select_one("td").get_text(strip=True)     # UPC
                    break

            book_data[genre_text][title] = {"UPC": upc, "Price": price, "Rating": rating, "Availability": availability}     # Recording data

        # Looking for the next button in the same genre
        next_button = genre_soup.find("li", class_="next")
        print()

        if next_button:
            next_href = next_button.find("a")["href"]
            genre_url = urljoin(genre_url, next_href)       # e.g. ".../index.html" -> ".../page-2.html"
        else:
            page_count = 0      # Ready for the next genre
            print()
            break

In [None]:
# Module-level state read by scrape_books_from_genre.
rating_map = dict(one=1, two=2, three=3, four=4, five=5)      # rating word -> numeric rating
book_data = defaultdict(lambda: defaultdict())      # genre -> title -> book record
page_count = 0

# Scrape the genres one after another; `genre_text` must be set before each call
# because the function uses it as the key for the books it records.
for genre in genres:
    genre_href = genre.find("a")["href"]
    genre_text = genre.get_text(strip=True)
    print(genre_text)

    genre_url = base_url + genre_href
    scrape_books_from_genre(genre_url)


In [11]:
# Pick a single genre entry (index 1) to explore by hand.
genre = genres[1]

genre_href = genre.find("a")["href"]
genre_url = base_url + genre_href      # the href is appended directly to the site root (see printed URL)
print(genre_url)

# Fetch and parse that genre's first listing page.
genre_response = requests.get(genre_url, timeout=5, headers=header)
genre_soup = BeautifulSoup(genre_response.text, "html.parser")

https://books.toscrape.com/catalogue/category/books/mystery_3/index.html


In [12]:
# Take the first book card of the listing page and build its detail-page URL.
books = genre_soup.select("article.product_pod")
book = books[0]
book_href = book.h3.select_one("a")["href"]
# [9:] drops the first 9 characters of the href — presumably a "../../../" prefix; TODO confirm.
book_url = base_url + "catalogue/" + book_href[9:]
book_url

'https://books.toscrape.com/catalogue/sharp-objects_997/index.html'

In [13]:
# Follow the pagination link: the "next" href is a bare file name (see output),
# so it is swapped in for "index.html" in the genre URL.
next_button = genre_soup.find("li", class_="next")
print(next_button)
next_href = next_button.find("a")["href"]
genre_url = genre_url.replace("index.html", next_href)      # NOTE: rebinds genre_url to the page-2 URL
genre_url

<li class="next"><a href="page-2.html">next</a></li>


'https://books.toscrape.com/catalogue/category/books/mystery_3/page-2.html'

In [31]:
# Inspect the star-rating element of the selected book: its class list carries the rating word.
rating = book.select_one("p.star-rating")
rating["class"]

['star-rating', 'Two']

In [45]:
# Build the selected book's detail-page URL, fetch it and parse it.
book_href = book.h3.select_one("a")["href"]
# [9:] drops the first 9 characters of the href — presumably a "../../../" prefix; TODO confirm.
book_url = base_url + "catalogue/" + book_href[9:]
print(book_url)
book_response = requests.get(book_url, timeout=5, headers=header)
book_soup = BeautifulSoup(book_response.text, "html.parser")

# Plain text of the whole page, displayed for inspection.
book_soup.text

https://books.toscrape.com/catalogue/its-only-the-himalayas_981/index.html




In [52]:
# Look at the striped table on the book page and read the header cell of its first row.
table = book_soup.select_one("table.table.table-striped")
rows = table.select("tr")
row = rows[0]
row.select_one("th").get_text(strip=True)

'UPC'

In [2]:
# How similar is a loosely typed author name to its canonical spelling?
# (difflib is imported with the other modules in the import cell at the top of the notebook.)
typed_name = "jk rowling"
canonical_name = "j.k. rowling"
similarity = difflib.SequenceMatcher(None, typed_name, canonical_name).ratio()
similarity

0.9090909090909091

In [6]:
# Fuzzy lookup: find the best-matching known author name for a misspelt query.
authors = {"j.k. rowling": "j", "jane austen": "k"}
match = difflib.get_close_matches(
    "jk rowlin",
    authors,        # iterating a dict yields its keys, i.e. the candidate names
    n=1,            # keep only the single best candidate
    cutoff=0.85,    # minimum similarity ratio for a candidate to count
)
match

['j.k. rowling']

In [2]:
# Fetch one book's detail page and list its breadcrumb links
# (per the output below: Home, Books, then the book's genre).
response = requests.get(
    "https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html",
    timeout=5,      # bound the wait so a stalled connection cannot hang the kernel
)
soup = BeautifulSoup(response.text, "html.parser")

breadcrumb = soup.select("ul.breadcrumb li a")
breadcrumb

[<a href="../../index.html">Home</a>,
 <a href="../category/books_1/index.html">Books</a>,
 <a href="../category/books/poetry_23/index.html">Poetry</a>]