## Install and Import Libraries

In [None]:
!pip install requests



In [None]:
!pip install beautifulsoup4



In [None]:
!pip install lxml



In [None]:
import requests
from bs4 import BeautifulSoup
import lxml
from urllib.parse import urljoin

### Get book_detail (details of a book from its individual product page)

In [None]:
def book_detail(url_detail, timeout=30):
    """Fetch one book's detail page and extract its stock count and UPC.

    Args:
        url_detail: Absolute URL of the book's product page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the crawl forever.

    Returns:
        A dict with two string values:
            'availability': the number taken from the availability text.
            'upc': the text of the first cell in the product-information table.

    Raises:
        requests.exceptions.HTTPError: the server answered with a 4xx/5xx status.
        requests.exceptions.Timeout: no response arrived within ``timeout``.
        AttributeError: the page lacks one of the expected elements
            (``find`` returned ``None``).
    """
    response = requests.get(url_detail, timeout=timeout)
    # Fail here with a clear HTTP error instead of parsing an error page.
    response.raise_for_status()

    # Hand BeautifulSoup the raw bytes so it picks the encoding from the
    # document itself rather than from the HTTP-header guess behind `.text`.
    soup = BeautifulSoup(response.content, 'lxml')

    book_detail_page = soup.find('article', class_='product_page')

    # Availability: the text is expected to look like 'In stock (22 available)',
    # so the third space-separated token is '(22'; drop the parenthesis.
    head_col = book_detail_page.find('div', class_='col-sm-6 product_main')
    stock_text = head_col.find('p', class_='instock availability').get_text(strip=True)
    availability = stock_text.split(' ')[2].replace('(', '')

    # UPC: value cell of the first row of the product-information table.
    product_info = book_detail_page.find('table', class_='table table-striped')
    upc = product_info.find('tr').find('td').get_text()

    return {
        'availability': availability,
        'upc': upc,
    }

### Get Book Overview

In [None]:
def book_overview():
    """Scrape the books listed on the first page of books.toscrape.com.

    For every book card on the landing page this also requests the book's own
    detail page (through ``book_detail``) to obtain its UPC and stock count.

    Returns:
        A list with one dict per book, keyed by 'title', 'upc', 'price'
        (float), 'stock_status' and 'availability'.

    Raises:
        requests.exceptions.HTTPError: the listing page answered with a
            4xx/5xx status.
        requests.exceptions.Timeout: the listing page did not respond in time.
    """
    url_html = 'https://books.toscrape.com/'
    # A timeout keeps a stalled connection from hanging the notebook cell.
    response = requests.get(url_html, timeout=30)
    response.raise_for_status()

    # Raw bytes let BeautifulSoup take the encoding from the document itself;
    # decoding via `.text` relies on the HTTP headers and can garble
    # non-ASCII characters in titles.
    soup = BeautifulSoup(response.content, 'lxml')

    data = []
    for book_card in soup.find_all('article', class_='product_pod'):
        # The <a> inside <h3> carries the untruncated title and the detail link.
        a_tag = book_card.find('h3').find('a')
        full_title = a_tag['title']

        # urljoin resolves the relative href against the page URL instead of
        # relying on the base string ending with a slash.
        full_url = urljoin(url_html, a_tag['href'])
        detail_product = book_detail(full_url)

        # Keep only what follows the pound sign, e.g. '51.77'.
        price_text = book_card.find('p', class_='price_color').text
        price = price_text.split('£')[-1]

        stock_status = book_card.find('p', class_='instock availability').get_text(strip=True)

        data.append({
            'title': full_title,
            'upc': detail_product['upc'],
            'price': float(price),
            'stock_status': stock_status,
            'availability': detail_product['availability'],
        })

    return data

In [None]:
# pandas is used here only to tabulate the scraped records.
import pandas as pd

# Scrape the first listing page; each book also triggers a detail-page request.
buku = book_overview()
# One row per book; the columns come from the dict keys built in book_overview().
dataBuku = pd.DataFrame(buku)
# Bare expression so the notebook renders the table as the cell output.
dataBuku

Unnamed: 0,title,upc,price,stock_status,availability
0,A Light in the Attic,a897fe39b1053632,51.77,In stock,22
1,Tipping the Velvet,90fa61229261140a,53.74,In stock,20
2,Soumission,6957f44c3847a760,50.1,In stock,20
3,Sharp Objects,e00eb4fd7b871a48,47.82,In stock,20
4,Sapiens: A Brief History of Humankind,4165285e1663650f,54.23,In stock,20
5,The Requiem Red,f77dbf2323deb740,22.65,In stock,19
6,The Dirty Little Secrets of Getting Your Dream...,2597b5a345f45e1b,33.34,In stock,19
7,The Coming Woman: A Novel Based on the Life of...,e72a5dfc7e9267b2,17.93,In stock,19
8,The Boys in the Boat: Nine Americans and Their...,e10e1e165dc8be4a,22.6,In stock,19
9,The Black Maria,1dfe412b8ac00530,52.15,In stock,19


### Get All Data From Web with Pagination

In [None]:
def scrape_with_next_button():
    """Scrape every book on books.toscrape.com by following the "next" link.

    Starts at the site's index page, collects all book cards on the current
    page (requesting each book's detail page through ``book_detail``), then
    moves to the page referenced by the ``<li class="next">`` element until no
    such element exists.

    Returns:
        A list with one dict per book, keyed by 'title', 'upc', 'price'
        (float), 'stock_status' and 'availability'.

    Raises:
        requests.exceptions.HTTPError: a listing page answered with a
            4xx/5xx status.
        requests.exceptions.Timeout: a listing page did not respond in time.
    """
    current_url = "https://books.toscrape.com/index.html"
    data = []

    while current_url:
        print(f"Sedang memproses link: {current_url}")
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(current_url, timeout=30)
        response.raise_for_status()

        # Raw bytes let BeautifulSoup take the encoding from the document
        # itself; decoding via `.text` relies on the HTTP headers and can
        # garble non-ASCII characters in titles.
        soup = BeautifulSoup(response.content, 'lxml')

        for book_card in soup.find_all('article', class_='product_pod'):
            # The <a> inside <h3> carries the untruncated title and the link.
            a_tag = book_card.find('h3').find('a')
            full_title = a_tag['title']

            # hrefs are relative to the page they appear on.
            full_url = urljoin(current_url, a_tag['href'])
            detail_product = book_detail(full_url)

            # Keep only what follows the pound sign, e.g. '51.77'.
            price_text = book_card.find('p', class_='price_color').text
            price = price_text.split('£')[-1]

            stock_status = book_card.find('p', class_='instock availability').get_text(strip=True)

            data.append({
                'title': full_title,
                'upc': detail_product['upc'],
                'price': float(price),
                'stock_status': stock_status,
                'availability': detail_product['availability'],
            })

        # --- Locate the next page ---
        next_tag = soup.find('li', class_='next')
        if next_tag is None:
            # No "next" button: this was the last page.
            print("Semua halaman selesai diambil.")
            break

        # The href is relative to the current page ('catalogue/page-2.html'
        # on the index, 'page-3.html' inside catalogue/), so resolving it
        # against current_url covers both forms without special-casing.
        current_url = urljoin(current_url, next_tag.find('a')['href'])

    return data

In [None]:
import pandas as pd

# Crawl every listing page by following the "next" link; the function prints
# each page URL as it goes and returns one dict per book.
all_books = scrape_with_next_button()

Sedang memproses link: https://books.toscrape.com/index.html
Sedang memproses link: https://books.toscrape.com/catalogue/page-2.html
Sedang memproses link: https://books.toscrape.com/catalogue/page-3.html
Sedang memproses link: https://books.toscrape.com/catalogue/page-4.html
Sedang memproses link: https://books.toscrape.com/catalogue/page-5.html
Sedang memproses link: https://books.toscrape.com/catalogue/page-6.html
Sedang memproses link: https://books.toscrape.com/catalogue/page-7.html
Sedang memproses link: https://books.toscrape.com/catalogue/page-8.html
Sedang memproses link: https://books.toscrape.com/catalogue/page-9.html
Sedang memproses link: https://books.toscrape.com/catalogue/page-10.html
Sedang memproses link: https://books.toscrape.com/catalogue/page-11.html
Sedang memproses link: https://books.toscrape.com/catalogue/page-12.html
Sedang memproses link: https://books.toscrape.com/catalogue/page-13.html
Sedang memproses link: https://books.toscrape.com/catalogue/page-14.htm

In [None]:
# Tabulate the crawl result; the columns come from the keys of each book dict.
df = pd.DataFrame(all_books)
# Bare expression so the notebook renders the table as the cell output.
df

Unnamed: 0,title,upc,price,stock_status,availability
0,A Light in the Attic,a897fe39b1053632,51.77,In stock,22
1,Tipping the Velvet,90fa61229261140a,53.74,In stock,20
2,Soumission,6957f44c3847a760,50.10,In stock,20
3,Sharp Objects,e00eb4fd7b871a48,47.82,In stock,20
4,Sapiens: A Brief History of Humankind,4165285e1663650f,54.23,In stock,20
...,...,...,...,...,...
995,Alice in Wonderland (Alice's Adventures in Won...,cd2a2a70dd5d176d,55.53,In stock,1
996,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",bfd5e1701c862ac3,57.06,In stock,1
997,A Spy's Devotion (The Regency Spies of London #1),19fec36a1dfb4c16,16.97,In stock,1
998,1st to Die (Women's Murder Club #1),f684a82adc49f011,53.98,In stock,1


In [None]:
# Write the table to a CSV file; index=False leaves out the DataFrame's row index.
df.to_csv('web_scrape_books.csv', index=False)