In [2]:
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Prefix that each book's relative link is appended to when building its
# detail-page URL.
base_url = "https://books.toscrape.com/catalogue/"
# First listing page of the catalogue, expressed relative to the prefix above.
start_url = base_url + "page-1.html"

# Collected results: one dict per scraped book.
books_data = []

def get_full_details(book_url):
    """Fetch one book's detail page and extract its product fields.

    Args:
        book_url: URL of the book's detail page.

    Returns:
        A dict with the keys ``category``, ``code``, ``price_excl_tax``,
        ``price_incl_tax``, ``tax``, ``stock_status``,
        ``number_of_stock_available``, ``description`` and
        ``number_of_reviews``. Values are strings taken from the page
        (the stock count is returned as a digit string, not an int).
        Table-derived values and ``category`` are ``None`` when the
        corresponding element is absent; ``description`` is ``""``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page has no product information table.
    """
    # A timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(book_url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Product information table: each row pairs a <th> label with a <td> value.
    table = soup.find('table', class_='table table-striped')
    if table is None:
        raise ValueError(f"No product information table found at {book_url}")
    info = {}
    for row in table.find_all('tr'):
        label, value = row.find('th'), row.find('td')
        if label is not None and value is not None:
            info[label.text] = value.text

    # Description: the first <p> following the #product_description marker.
    description = ""
    desc_tag = soup.find('div', id='product_description')
    if desc_tag is not None:
        desc_paragraph = desc_tag.find_next('p')
        if desc_paragraph is not None:
            description = desc_paragraph.text

    # Category: the third breadcrumb entry.
    category = None
    breadcrumb = soup.find('ul', class_='breadcrumb')
    if breadcrumb is not None:
        crumbs = breadcrumb.find_all('li')
        if len(crumbs) > 2:
            category = crumbs[2].text.strip()

    # Keep only the number from text such as "In stock (19 available)".
    stock_text = info.get('Availability', '0')
    stock_match = re.search(r'\d+', stock_text)

    return {
        'category': category,
        'code': info.get('UPC'),
        'price_excl_tax': info.get('Price (excl. tax)'),
        'price_incl_tax': info.get('Price (incl. tax)'),
        'tax': info.get('Tax'),
        'stock_status': "In Stock" if "In stock" in stock_text else "Out of Stock",
        'number_of_stock_available': stock_match.group() if stock_match else "0",
        'description': description,
        'number_of_reviews': info.get('Number of reviews'),
    }

print("Memulai proses scraping 1000 buku... Mohon tunggu.")

# Stop once this many books have been collected.
max_books = 1000
# Seconds to wait for a listing page; without a timeout a stalled connection
# would block the scrape forever.
page_timeout = 30

current_page = 1
while len(books_data) < max_books:
    url = f"{base_url}page-{current_page}.html"
    response = requests.get(url, timeout=page_timeout)
    if response.status_code != 200:
        # Any non-200 answer ends the crawl (presumably the page after the
        # last listing page).
        break

    soup = BeautifulSoup(response.content, 'html.parser')
    books = soup.find_all('article', class_='product_pod')
    if not books:
        # A listing page without products gives nothing to collect; stop
        # rather than keep requesting further pages.
        break

    for book in books:
        if len(books_data) >= max_books:
            # Do not overshoot the limit when a page holds more than needed.
            break

        # Data available on the listing (catalogue) page.
        link_tag = book.find('h3').find('a')
        relative_link = link_tag['href']
        # Resolve against the listing page URL so "../" segments are
        # handled by URL rules rather than string surgery.
        full_link = urljoin(url, relative_link)

        title = link_tag['title']
        # Second CSS class of the star-rating element, e.g. "Three".
        rating_class = book.find('p', class_='star-rating')['class'][1]
        cover_url = urljoin(url, book.find('img')['src'])

        # Visit the product page for the remaining columns.
        detail_data = get_full_details(full_link)

        # Merge listing-page and detail-page fields into one record.
        book_info = {
            'title': title,
            'rating': rating_class,
            'cover': cover_url,
            **detail_data,
        }

        books_data.append(book_info)

        # Progress report every 50 books.
        if len(books_data) % 50 == 0:
            print(f"Berhasil mengambil {len(books_data)} buku...")

    current_page += 1

# Column order requested for the final table.
column_order = [
    'category', 'code', 'cover', 'title', 'rating', 'price_excl_tax',
    'price_incl_tax', 'tax', 'stock_status', 'number_of_stock_available',
    'description', 'number_of_reviews',
]

# Convert to a DataFrame already in the requested order. Passing ``columns``
# to the constructor (instead of selecting afterwards) also produces an
# empty, correctly labelled frame when nothing was scraped, where a
# post-hoc ``df[[...]]`` selection would raise KeyError.
df = pd.DataFrame(books_data, columns=column_order)

# Show the first 5 rows.
print("\nSelesai!")
# NOTE(review): `display` is not imported in this cell; presumably the
# IPython/Jupyter built-in — confirm before running as a plain script.
display(df.head())

# Save to CSV for uploading to GitHub.
df.to_csv('hasil_scraping_books.csv', index=False)

Memulai proses scraping 1000 buku... Mohon tunggu.
Berhasil mengambil 50 buku...
Berhasil mengambil 100 buku...
Berhasil mengambil 150 buku...
Berhasil mengambil 200 buku...
Berhasil mengambil 250 buku...
Berhasil mengambil 300 buku...
Berhasil mengambil 350 buku...
Berhasil mengambil 400 buku...
Berhasil mengambil 450 buku...
Berhasil mengambil 500 buku...
Berhasil mengambil 550 buku...
Berhasil mengambil 600 buku...
Berhasil mengambil 650 buku...
Berhasil mengambil 700 buku...
Berhasil mengambil 750 buku...
Berhasil mengambil 800 buku...
Berhasil mengambil 850 buku...
Berhasil mengambil 900 buku...
Berhasil mengambil 950 buku...
Berhasil mengambil 1000 buku...

Selesai!


Unnamed: 0,category,code,cover,title,rating,price_excl_tax,price_incl_tax,tax,stock_status,number_of_stock_available,description,number_of_reviews
0,Poetry,a897fe39b1053632,https://books.toscrape.com/media/cache/2c/da/2...,A Light in the Attic,Three,£51.77,£51.77,£0.00,In Stock,22,It's hard to imagine a world without A Light i...,0
1,Historical Fiction,90fa61229261140a,https://books.toscrape.com/media/cache/26/0c/2...,Tipping the Velvet,One,£53.74,£53.74,£0.00,In Stock,20,"""Erotic and absorbing...Written with starling ...",0
2,Fiction,6957f44c3847a760,https://books.toscrape.com/media/cache/3e/ef/3...,Soumission,One,£50.10,£50.10,£0.00,In Stock,20,"Dans une France assez proche de la nôtre, un h...",0
3,Mystery,e00eb4fd7b871a48,https://books.toscrape.com/media/cache/32/51/3...,Sharp Objects,Four,£47.82,£47.82,£0.00,In Stock,20,"WICKED above her hipbone, GIRL across her hear...",0
4,History,4165285e1663650f,https://books.toscrape.com/media/cache/be/a5/b...,Sapiens: A Brief History of Humankind,Five,£54.23,£54.23,£0.00,In Stock,20,From a renowned historian comes a groundbreaki...,0
