# === Install dependencies ===

In [2]:

!pip install requests beautifulsoup4 pandas tqdm




# === Import Library ===

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

# === Fungsi bantu untuk parsing ===

In [4]:
def get_book_info(book_url):
    """Mengambil detail dari setiap buku"""
    base = "https://books.toscrape.com/catalogue/"
    url = book_url if "catalogue" in book_url else base + book_url
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")

      # Ambil informasi umum
    title = soup.find("h1").get_text(strip=True)
    description_tag = soup.select_one("#product_description ~ p")
    description = description_tag.get_text(strip=True) if description_tag else ""

    # Informasi tabel
    table = soup.find("table", class_="table table-striped")
    rows = table.find_all("tr")
    data = {row.th.text.strip(): row.td.text.strip() for row in rows}

    # Ambil gambar
    image_url = soup.find("img")["src"].replace("../../", "https://books.toscrape.com/")

     # Ambil rating
    rating_class = soup.find("p", class_="star-rating")["class"]
    rating = [r for r in rating_class if r != "star-rating"][0]

    # Ambil kategori
    category = soup.select("ul.breadcrumb li a")[-1].text.strip()

    # Ambil stok
    stock_info = soup.find("p", class_="instock availability").get_text(strip=True)
    import re
    number_stock = re.findall(r"\d+", stock_info)
    number_stock = int(number_stock[0]) if number_stock else 0

    # Return data dictionary
    return {
        "code": data.get("UPC"),
        "cover": image_url,
        "title": title,
        "rating": rating,
        "price (excl. tax)": data.get("Price (excl. tax)"),
        "price (incl. tax)": data.get("Price (incl. tax)"),
        "tax": data.get("Tax"),
        "stock status": stock_info,
        "number of stock available": number_stock,
        "description": description,
        "number of reviews": data.get("Number of reviews"),
        "category": category
    }


# === Loop semua halaman ===

In [5]:
# === Loop semua halaman ===

base_url = "https://books.toscrape.com/catalogue/page-{}.html"
books_data = []

for page in tqdm(range(1, 51)):  # total ada 50 halaman, masing2 20 buku = 1000 buku
    url = base_url.format(page)
    res = requests.get(url)
    if res.status_code != 200:
        break
    soup = BeautifulSoup(res.text, "html.parser")
    books = soup.select("article.product_pod h3 a")

    for b in books:
        book_link = b["href"]
        book_data = get_book_info(book_link)
        books_data.append(book_data)

100%|██████████| 50/50 [02:18<00:00,  2.77s/it]



# === Simpan ke DataFrame ===

In [6]:
df = pd.DataFrame(books_data)
print("Total buku:", len(df))
df.head()

Total buku: 1000


Unnamed: 0,code,cover,title,rating,price (excl. tax),price (incl. tax),tax,stock status,number of stock available,description,number of reviews,category
0,a897fe39b1053632,https://books.toscrape.com/media/cache/fe/72/f...,A Light in the Attic,Three,Â£51.77,Â£51.77,Â£0.00,In stock (22 available),22,It's hard to imagine a world without A Light i...,0,Poetry
1,90fa61229261140a,https://books.toscrape.com/media/cache/08/e9/0...,Tipping the Velvet,One,Â£53.74,Â£53.74,Â£0.00,In stock (20 available),20,"""Erotic and absorbing...Written with starling ...",0,Historical Fiction
2,6957f44c3847a760,https://books.toscrape.com/media/cache/ee/cf/e...,Soumission,One,Â£50.10,Â£50.10,Â£0.00,In stock (20 available),20,"Dans une France assez proche de la nÃ´tre, un ...",0,Fiction
3,e00eb4fd7b871a48,https://books.toscrape.com/media/cache/c0/59/c...,Sharp Objects,Four,Â£47.82,Â£47.82,Â£0.00,In stock (20 available),20,"WICKED above her hipbone, GIRL across her hear...",0,Mystery
4,4165285e1663650f,https://books.toscrape.com/media/cache/ce/5f/c...,Sapiens: A Brief History of Humankind,Five,Â£54.23,Â£54.23,Â£0.00,In stock (20 available),20,From a renowned historian comes a groundbreaki...,0,History


# === Simpan ke file CSV ===

In [7]:
df.to_csv("books_1000.csv", index=False)
print("✅ Data berhasil disimpan ke 'books_1000.csv'")


✅ Data berhasil disimpan ke 'books_1000.csv'


In [8]:
from google.colab import files
files.download("books_1000.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>