# code to extract information such as the number of pages, number of books on each page, total number of books

In [1]:
import requests
from bs4 import BeautifulSoup

# Make a GET request to the first page of the website
url = "https://books.toscrape.com/catalogue/page-1.html"
response = requests.get(url)

# Parse the HTML content using Beautiful Soup
soup = BeautifulSoup(response.content, "html.parser")

# Find the total number of pages
num_pages = int(soup.find("li", {"class": "current"}).text.strip().split()[-1])

# Find the number of books on each page
num_books_per_page = len(soup.find_all("article", {"class": "product_pod"}))

# Find the total number of books
total_num_books = (num_pages - 1) * 20 + num_books_per_page

# Print the results
print("Number of pages:", num_pages)
print("Number of books on each page:", num_books_per_page)
print("Total number of books:", total_num_books)


Number of pages: 50
Number of books on each page: 20
Total number of books: 1000


# The number of books in each category. 


In [2]:
import requests
from bs4 import BeautifulSoup

# Make a GET request to the website
url = "https://books.toscrape.com/"
response = requests.get(url)

# Parse the HTML content using Beautiful Soup
soup = BeautifulSoup(response.content, "html.parser")

# Find the link to the "Books" category
books_category = soup.select('a[href*="/category/books_1/"]')

# Find all the subcategories within the "Books" category
subcategories = books_category[0].find_next("ul").find_all("a")

# Loop through each subcategory
for subcategory in subcategories:
    category_url = url + subcategory['href']
    category_response = requests.get(category_url)
    category_soup = BeautifulSoup(category_response.content, "html.parser")
    num_books = len(category_soup.select('article.product_pod'))
    print(f"{subcategory.text.strip()}: {num_books} books")


Travel: 11 books
Mystery: 20 books
Historical Fiction: 20 books
Sequential Art: 20 books
Classics: 19 books
Philosophy: 11 books
Romance: 20 books
Womens Fiction: 17 books
Fiction: 20 books
Childrens: 20 books
Religion: 7 books
Nonfiction: 20 books
Music: 13 books
Default: 20 books
Science Fiction: 16 books
Sports and Games: 5 books
Add a comment: 20 books
Fantasy: 20 books
New Adult: 6 books
Young Adult: 20 books
Science: 14 books
Poetry: 19 books
Paranormal: 1 books
Art: 8 books
Psychology: 7 books
Autobiography: 9 books
Parenting: 1 books
Adult Fiction: 1 books
Humor: 10 books
Horror: 17 books
History: 18 books
Food and Drink: 20 books
Christian Fiction: 6 books
Business: 12 books
Biography: 5 books
Thriller: 11 books
Contemporary: 3 books
Spirituality: 6 books
Academic: 1 books
Self Help: 5 books
Historical: 2 books
Christian: 3 books
Suspense: 1 books
Short Stories: 1 books
Novels: 1 books
Health: 4 books
Politics: 3 books
Cultural: 1 books
Erotica: 1 books
Crime: 1 books


## The data is saved in a CSV file for further analysis.

In [3]:
import requests
from bs4 import BeautifulSoup
import csv

# Make a GET request to the website
url = "https://books.toscrape.com/"
response = requests.get(url)

# Parse the HTML content using Beautiful Soup
soup = BeautifulSoup(response.content, "html.parser")

# Find the link to the "Books" category
books_category = soup.select('a[href*="/category/books_1/"]')

# Find all the subcategories within the "Books" category
subcategories = books_category[0].find_next("ul").find_all("a")

# Create a list to store the category and number of books
data = []

# Loop through each subcategory and get the number of books
for subcategory in subcategories:
    subcategory_url = url + subcategory['href']
    subcategory_response = requests.get(subcategory_url)
    subcategory_soup = BeautifulSoup(subcategory_response.content, "html.parser")
    num_books = len(subcategory_soup.select("article.product_pod"))
    data.append({"Category": subcategory.text.strip(), "Num Books": num_books})

# Save the data to a CSV file
with open("data/categories_data.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=["Category", "Num Books"])
    writer.writeheader()
    writer.writerows(data)
