In [None]:
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Base URL of the site
base_url = 'http://books.toscrape.com/catalogue/'

# Fetch one book's detail page and extract its description, genre, price, stock and rating
def get_book_details(book_url, timeout=10):
    """Scrape the detail page of a single book.

    Args:
        book_url: Absolute URL of the book's detail page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A tuple ``(description, genre, price, stock, rating)`` of strings.
        Elements that cannot be found on the page fall back to
        'No description available', 'Unknown genre', 'No price available',
        '0' (stock) and 'No rating available' respectively.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(book_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into placeholders.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Description: taken from the <meta name="description"> tag
    description_tag = soup.find('meta', {'name': 'description'})
    content = description_tag.get('content') if description_tag is not None else None
    description = content.strip() if content is not None else 'No description available'

    # Genre: the third link of the breadcrumb trail
    breadcrumb = soup.find('ul', class_='breadcrumb')
    breadcrumb_links = breadcrumb.find_all('a') if breadcrumb is not None else []
    genre = breadcrumb_links[2].text if len(breadcrumb_links) > 2 else 'Unknown genre'

    # Price: kept as the raw text shown on the page
    price_tag = soup.find('p', class_='price_color')
    price = price_tag.text if price_tag is not None else 'No price available'

    # Stock: keep only the digits of the availability text. A missing tag or an
    # explicit "Out of stock" yields '0' so that `stock` is always defined.
    stock_tag = soup.find('p', class_='instock availability')
    raw_stock = stock_tag.text if stock_tag is not None else 'Out of stock'
    if 'Out of stock' in raw_stock:
        stock = '0'
    else:
        stock = re.sub(r'[^0-9]', '', raw_stock)

    # Rating: the second CSS class of the star-rating element
    rating_tag = soup.find('p', class_='star-rating')
    rating_classes = (rating_tag.get('class') or []) if rating_tag is not None else []
    rating = rating_classes[1] if len(rating_classes) > 1 else 'No rating available'

    return description, genre, price, stock, rating

In [None]:
# Collect the details of every book listed on one catalogue page
def get_books_from_page(page_url, timeout=10):
    """Scrape all books listed on a single listing page.

    Args:
        page_url: Absolute URL of the listing page.
        timeout: Seconds to wait for the listing page's HTTP response.

    Returns:
        A list with one dict per successfully scraped book, with the keys
        'Title', 'Description', 'Genre', 'URL', 'Rating', 'Price' and 'Stock'.
        The list is empty when the page contains no product entries.
        Books whose detail page cannot be processed are reported on stdout
        and skipped.
    """
    response = requests.get(page_url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    book_data = []
    for book in soup.find_all('article', class_='product_pod'):
        link_tag = book.find('h3').find('a')
        # Resolve the (possibly '../'-prefixed) href against the page it was
        # found on, rather than stripping a fixed prefix and assuming a root.
        book_url = urljoin(page_url, link_tag['href'])
        try:
            description, genre, price, stock, rating = get_book_details(book_url)
            book_data.append({
                'Title': link_tag['title'],
                'Description': description,
                'Genre': genre,
                'URL': book_url,
                'Rating': rating,
                'Price': price,
                'Stock': stock
            })
        except Exception as e:
            # Best effort: one failing book must not abort the whole page.
            print(f"Error processing book {book_url}: {e}")
    return book_data

# Main entry point: walk through the listing pages and collect every book
def scrape_books(base_page_url, max_pages=None):
    """Scrape consecutive listing pages until one yields no books.

    Page N is derived from ``base_page_url`` by replacing 'page-1.html' with
    'page-N.html'. If the URL does not contain 'page-1.html' it cannot be
    paginated this way, so only that single page is scraped.

    Args:
        base_page_url: URL of the first listing page.
        max_pages: Optional upper bound on the number of pages to fetch;
            ``None`` (the default) means no limit.

    Returns:
        A list of the book dicts returned by ``get_books_from_page`` for all
        visited pages, in page order.
    """
    all_books = []
    # Without the placeholder every iteration would request the same URL and
    # the loop would never terminate.
    paginated = 'page-1.html' in base_page_url
    page_number = 1
    while max_pages is None or page_number <= max_pages:
        print(f'Scraping page {page_number}...')
        page_url = base_page_url.replace('page-1.html', f'page-{page_number}.html')
        books = get_books_from_page(page_url)
        if not books:
            # An empty page marks the end of the catalogue.
            break
        all_books.extend(books)
        if not paginated:
            break
        page_number += 1
    return all_books

In [None]:
# Starting URL (first listing page)
base_page_url = 'http://books.toscrape.com/catalogue/page-1.html'

# Run the scraper
books_data = scrape_books(base_page_url)

# Convert the collected records into a pandas DataFrame
df = pd.DataFrame(books_data)

# Save the data to a CSV file (without the index column)
df.to_csv('books_to_scrape.csv', index=False)

In [None]:
# Display the scraped DataFrame.
# NOTE(review): the saved output shows an 'Unnamed: 0' column and unparsed
# Stock text, which the code in this notebook would not produce - it looks
# like it comes from an earlier run; re-run the notebook to confirm.
df

Unnamed: 0,Title,Description,Genre,URL,Rating,Price,Stock
0,A Light in the Attic,It's hard to imagine a world without A Light i...,Poetry,http://books.toscrape.com/catalogue/a-light-in...,Three,£51.77,\n\n \n In stock (22 available)\n \n
1,Tipping the Velvet,"""Erotic and absorbing...Written with starling ...",Historical Fiction,http://books.toscrape.com/catalogue/tipping-th...,One,£53.74,\n\n \n In stock (20 available)\n \n
2,Soumission,"Dans une France assez proche de la nôtre, un h...",Fiction,http://books.toscrape.com/catalogue/soumission...,One,£50.10,\n\n \n In stock (20 available)\n \n
3,Sharp Objects,"WICKED above her hipbone, GIRL across her hear...",Mystery,http://books.toscrape.com/catalogue/sharp-obje...,Four,£47.82,\n\n \n In stock (20 available)\n \n
4,Sapiens: A Brief History of Humankind,From a renowned historian comes a groundbreaki...,History,http://books.toscrape.com/catalogue/sapiens-a-...,Five,£54.23,\n\n \n In stock (20 available)\n \n
...,...,...,...,...,...,...,...
995,Alice in Wonderland (Alice's Adventures in Won...,,Classics,http://books.toscrape.com/catalogue/alice-in-w...,One,£55.53,\n\n \n In stock (1 available)\n \n
996,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",High school student Kei Nagai is struck dead i...,Sequential Art,http://books.toscrape.com/catalogue/ajin-demi-...,Four,£57.06,\n\n \n In stock (1 available)\n \n
997,A Spy's Devotion (The Regency Spies of London #1),"In England’s Regency era, manners and elegance...",Historical Fiction,http://books.toscrape.com/catalogue/a-spys-dev...,Five,£16.97,\n\n \n In stock (1 available)\n \n
998,1st to Die (Women's Murder Club #1),"James Patterson, bestselling author of the Ale...",Mystery,http://books.toscrape.com/catalogue/1st-to-die...,One,£53.98,\n\n \n In stock (1 available)\n \n


In [None]:
# List the column names of the DataFrame
df.columns

Index(['Title', 'Description', 'Genre', 'URL', 'Rating', 'Price', 'Stock'], dtype='object')