# Import

In [None]:
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup


# Fungsi bantu untuk mengambil detail tiap buku

In [None]:
def get_book_details(book_url, timeout=10):
    """Fetch one book product page and return its details as a flat dict.

    Args:
        book_url: Absolute URL of the product page to scrape.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole run.

    Returns:
        A dict with the keys "category", "code", "cover", "title", "rating",
        "price (excl. tax)", "price (incl. tax)", "tax", "stock status",
        "number of stock available", "description", "number of reviews" and
        "url". Prices, tax and the review count are returned as the raw
        strings found in the page; the stock count is an int.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
        AttributeError, TypeError: If an expected element (breadcrumb, title,
            rating, cover image or detail table) is missing from the page.
    """
    response = requests.get(book_url, timeout=timeout)
    # Fail here with a clear HTTP error instead of a confusing parse error
    # further down when the server returned an error page.
    response.raise_for_status()

    # NOTE(review): response.text is decoded with whatever encoding requests
    # infers from the response headers. The stray characters that are cleaned
    # from the price columns later in this file suggest that inference may be
    # wrong for this site -- confirm before switching to response.content,
    # because the downstream cleaning depends on the current output.
    soup = BeautifulSoup(response.text, "html.parser")

    # --- Category: third entry of the breadcrumb trail ---
    category = soup.select_one("ul.breadcrumb li:nth-of-type(3) a").text.strip()

    # --- Title ---
    title = soup.find("div", class_="product_main").h1.text.strip()

    # --- Rating: second CSS class of the star-rating element ---
    rating = soup.find("p", class_="star-rating")["class"][1]

    # --- Cover image: strip the relative "../" segments from the src ---
    cover = "https://books.toscrape.com/" + soup.find("div", class_="item active").img["src"].replace("../", "")

    # --- Detail table (UPC, prices, stock, reviews) as header -> value ---
    table = soup.find("table", class_="table table-striped")
    details = {row.th.text: row.td.text for row in table.find_all("tr")}

    code = details.get("UPC", "")
    price_excl = details.get("Price (excl. tax)", "")
    price_incl = details.get("Price (incl. tax)", "")
    tax = details.get("Tax", "")

    availability = details.get("Availability", "")
    stock_status = "In stock" if "In stock" in availability else "Out of stock"

    # First run of digits in the availability text is the stock count;
    # fall back to 0 when the text contains no number.
    stock_match = re.search(r'\d+', availability)
    stock_available = int(stock_match.group()) if stock_match else 0

    # --- Description (may be absent) ---
    desc = soup.select_one("#product_description ~ p")
    description = desc.text.strip() if desc else "No description available"

    # --- Number of reviews ---
    num_reviews = details.get("Number of reviews", "0")

    return {
        "category": category,
        "code": code,
        "cover": cover,
        "title": title,
        "rating": rating,
        "price (excl. tax)": price_excl,
        "price (incl. tax)": price_incl,
        "tax": tax,
        "stock status": stock_status,
        "number of stock available": stock_available,
        "description": description,
        "number of reviews": num_reviews,
        "url": book_url
    }

# Loop scraping URL

In [None]:
# Walk every catalogue listing page, follow each book link and collect the
# per-book detail dictionaries in all_books.
base_url = "https://books.toscrape.com/catalogue/page-{}.html"
all_books = []

for page in range(1, 51):  # 50 listing pages in total (1000 books)
    print(f"ðŸ“„ Scraping halaman {page} ...")
    try:
        # A timeout keeps one stalled connection from hanging the whole run;
        # raise_for_status turns an error page into an explicit failure
        # instead of a listing that silently yields zero books.
        response = requests.get(base_url.format(page), timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Gagal ambil halaman {page} | Error: {e}")
        continue

    soup = BeautifulSoup(response.text, "html.parser")
    books = soup.select("article.product_pod h3 a")

    for b in books:
        # Listing hrefs are relative to the catalogue directory.
        book_url = "https://books.toscrape.com/catalogue/" + b["href"]
        try:
            book_data = get_book_details(book_url)
            all_books.append(book_data)
        except Exception as e:
            # Best effort: report the failing book and keep going.
            print(f"Gagal ambil data: {book_url} | Error: {e}")
        # Small pause after every request, successful or not, to avoid
        # overloading the server.
        time.sleep(0.2)

print("âœ… Scraping selesai. Total buku:", len(all_books))

ðŸ“„ Scraping halaman 1 ...
ðŸ“„ Scraping halaman 2 ...
ðŸ“„ Scraping halaman 3 ...
ðŸ“„ Scraping halaman 4 ...
ðŸ“„ Scraping halaman 5 ...
ðŸ“„ Scraping halaman 6 ...
ðŸ“„ Scraping halaman 7 ...
ðŸ“„ Scraping halaman 8 ...
ðŸ“„ Scraping halaman 9 ...
ðŸ“„ Scraping halaman 10 ...
ðŸ“„ Scraping halaman 11 ...
ðŸ“„ Scraping halaman 12 ...
ðŸ“„ Scraping halaman 13 ...
ðŸ“„ Scraping halaman 14 ...
ðŸ“„ Scraping halaman 15 ...
ðŸ“„ Scraping halaman 16 ...
ðŸ“„ Scraping halaman 17 ...
ðŸ“„ Scraping halaman 18 ...
ðŸ“„ Scraping halaman 19 ...
ðŸ“„ Scraping halaman 20 ...
ðŸ“„ Scraping halaman 21 ...
ðŸ“„ Scraping halaman 22 ...
ðŸ“„ Scraping halaman 23 ...
ðŸ“„ Scraping halaman 24 ...
ðŸ“„ Scraping halaman 25 ...
ðŸ“„ Scraping halaman 26 ...
ðŸ“„ Scraping halaman 27 ...
ðŸ“„ Scraping halaman 28 ...
ðŸ“„ Scraping halaman 29 ...
ðŸ“„ Scraping halaman 30 ...
ðŸ“„ Scraping halaman 31 ...
ðŸ“„ Scraping halaman 32 ...
ðŸ“„ Scraping halaman 33 ...
ðŸ“„ Scraping halaman 34 ...
ðŸ“„ Scraping halaman 3

# Membuat DataFrame

In [None]:
# Build a DataFrame from the scraped dictionaries in all_books.
df_books = pd.DataFrame(all_books)

# Hasil DataFrame

In [None]:
# Report how many books were collected, then preview the first rows.
total_books = len(df_books)
print("Jumlah buku:", total_books)
df_books.head()

Jumlah buku: 1000


Unnamed: 0,category,code,cover,title,rating,price (excl. tax),price (incl. tax),tax,stock status,number of stock available,description,number of reviews,url
0,Poetry,a897fe39b1053632,https://books.toscrape.com/media/cache/fe/72/f...,A Light in the Attic,Three,Ã‚Â£51.77,Ã‚Â£51.77,Ã‚Â£0.00,In stock,22,It's hard to imagine a world without A Light i...,0,https://books.toscrape.com/catalogue/a-light-i...
1,Historical Fiction,90fa61229261140a,https://books.toscrape.com/media/cache/08/e9/0...,Tipping the Velvet,One,Ã‚Â£53.74,Ã‚Â£53.74,Ã‚Â£0.00,In stock,20,"""Erotic and absorbing...Written with starling ...",0,https://books.toscrape.com/catalogue/tipping-t...
2,Fiction,6957f44c3847a760,https://books.toscrape.com/media/cache/ee/cf/e...,Soumission,One,Ã‚Â£50.10,Ã‚Â£50.10,Ã‚Â£0.00,In stock,20,"Dans une France assez proche de la nÃƒÂ´tre, un ...",0,https://books.toscrape.com/catalogue/soumissio...
3,Mystery,e00eb4fd7b871a48,https://books.toscrape.com/media/cache/c0/59/c...,Sharp Objects,Four,Ã‚Â£47.82,Ã‚Â£47.82,Ã‚Â£0.00,In stock,20,"WICKED above her hipbone, GIRL across her hear...",0,https://books.toscrape.com/catalogue/sharp-obj...
4,History,4165285e1663650f,https://books.toscrape.com/media/cache/ce/5f/c...,Sapiens: A Brief History of Humankind,Five,Ã‚Â£54.23,Ã‚Â£54.23,Ã‚Â£0.00,In stock,20,From a renowned historian comes a groundbreaki...,0,https://books.toscrape.com/catalogue/sapiens-a...


# Membersihkan karakter "Ã‚" pada mata uang

In [None]:
# Convert the three money columns from scraped text to float.
# Everything except digits, the decimal point and a minus sign is stripped in
# one pass, so the currency symbol and any stray mis-decoded characters in
# front of it are removed without hard-coding which characters they are.
for col in ['price (excl. tax)', 'price (incl. tax)', 'tax']:
    # Skip columns that are already numeric so the cell can be re-run safely;
    # the .str accessor raises AttributeError on a float column.
    if pd.api.types.is_numeric_dtype(df_books[col]):
        continue
    df_books[col] = df_books[col].str.replace(r'[^\d.\-]', '', regex=True).astype(float)

# Hasil DataFrame setelah pembersihan

In [None]:
# Preview the first rows after the currency columns were cleaned.
df_books.head()

Unnamed: 0,category,code,cover,title,rating,price (excl. tax),price (incl. tax),tax,stock status,number of stock available,description,number of reviews,url
0,Poetry,a897fe39b1053632,https://books.toscrape.com/media/cache/fe/72/f...,A Light in the Attic,Three,51.77,51.77,0.0,In stock,22,It's hard to imagine a world without A Light i...,0,https://books.toscrape.com/catalogue/a-light-i...
1,Historical Fiction,90fa61229261140a,https://books.toscrape.com/media/cache/08/e9/0...,Tipping the Velvet,One,53.74,53.74,0.0,In stock,20,"""Erotic and absorbing...Written with starling ...",0,https://books.toscrape.com/catalogue/tipping-t...
2,Fiction,6957f44c3847a760,https://books.toscrape.com/media/cache/ee/cf/e...,Soumission,One,50.1,50.1,0.0,In stock,20,"Dans une France assez proche de la nÃƒÂ´tre, un ...",0,https://books.toscrape.com/catalogue/soumissio...
3,Mystery,e00eb4fd7b871a48,https://books.toscrape.com/media/cache/c0/59/c...,Sharp Objects,Four,47.82,47.82,0.0,In stock,20,"WICKED above her hipbone, GIRL across her hear...",0,https://books.toscrape.com/catalogue/sharp-obj...
4,History,4165285e1663650f,https://books.toscrape.com/media/cache/ce/5f/c...,Sapiens: A Brief History of Humankind,Five,54.23,54.23,0.0,In stock,20,From a renowned historian comes a groundbreaki...,0,https://books.toscrape.com/catalogue/sapiens-a...


In [None]:
# Write the dataset to CSV without the index column, then confirm on stdout.
df_books.to_csv(path_or_buf="books_toscrape_complete.csv", index=False)
print("âœ… Dataset lengkap tersimpan sebagai 'books_toscrape_complete.csv'")

âœ… Dataset lengkap tersimpan sebagai 'books_toscrape_complete.csv'
