## Install and Import Libraries

In [0]:
!pip install requests

In [0]:
!pip install beautifulsoup4

In [0]:
!pip install lxml

In [0]:
!pip install sqlalchemy

In [0]:
import requests
from bs4 import BeautifulSoup
import lxml
from urllib.parse import urljoin
import sqlite3
import pandas as pd
from sqlalchemy import create_engine

## Extract (Web Scraping)

### Get book_detail (Detail buku dari halaman buku tertentu)

In [0]:
def book_detail(url_detail, timeout=30):
  """Scrape the stock count and UPC from a single book detail page.

  Args:
    url_detail: URL of the book detail page to download.
    timeout: Seconds to wait for the server before giving up; passed
      straight to ``requests.get``. Defaults to 30.

  Returns:
    dict with two string values:
      'availability': third space-separated token of the availability
        text with '(' removed (for text shaped like
        'In stock (22 available)' this yields '22').
      'upc': text of the first cell in the first row of the product table.

  Raises:
    requests.exceptions.RequestException: on timeout, connection failure,
      or a 4xx/5xx response (raised by ``raise_for_status``).
    AttributeError: if an expected element is missing from the page.
    IndexError: if the availability text has fewer than three tokens.
  """
  # A request without a timeout can block forever on a stalled connection,
  # and an unchecked error response would be parsed as if it were a book page.
  response = requests.get(url_detail, timeout=timeout)
  response.raise_for_status()

  soup = BeautifulSoup(response.text, 'lxml')

  # Container holding all details of the book
  book_detail_page = soup.find('article', class_='product_page')

  # Availability: take the third token (e.g. '(22') and drop the parenthesis
  head_col = book_detail_page.find('div', class_='col-sm-6 product_main')
  availability_text = head_col.find('p', class_='instock availability').get_text(strip=True)
  availability = availability_text.split(' ')[2].replace('(', '')

  # UPC: first data cell of the first row in the product information table
  product_info = book_detail_page.find('table', class_='table table-striped')
  upc = product_info.find('tr').find('td').get_text()

  return {
      'availability' : availability,
      'upc' : upc
  }

### Get Book Overview

In [0]:
def book_overview(timeout=30):
  """Scrape every book card on the books.toscrape.com landing page.

  For each card the book's detail page is also fetched through
  ``book_detail`` to obtain its UPC and stock count.

  Args:
    timeout: Seconds to wait for the landing-page response; passed to
      ``requests.get``. Defaults to 30.

  Returns:
    list of dicts, one per book card, with keys 'upc', 'title',
    'price' (float), 'stock_status' and 'availability'.

  Raises:
    requests.exceptions.RequestException: on timeout, connection failure,
      or a 4xx/5xx response for the landing page.
    AttributeError / KeyError / ValueError: if a card lacks an expected
      element or attribute, or its price text is not numeric.
  """
  url_html = 'https://books.toscrape.com/'

  # Fail fast on a stalled connection or an HTTP error instead of parsing
  # whatever body came back.
  response = requests.get(url_html, timeout=timeout)
  response.raise_for_status()

  soup = BeautifulSoup(response.text, 'lxml')

  data = []
  for book_card in soup.find_all('article', class_='product_pod'):
    # Title and link both live on the <a> inside the card's <h3>
    a_tag = book_card.find('h3').find('a')
    full_title = a_tag['title']

    # urljoin resolves the href relative to the page URL (same approach as
    # scrape_with_next_button) rather than relying on plain concatenation.
    full_url = urljoin(url_html, a_tag['href'])
    detail_product = book_detail(full_url)

    # Price: keep only what follows the pound sign
    price_text = book_card.find('p', class_='price_color').text
    price = price_text.split('£')[-1]

    # Stock status text shown on the card
    stock_status = book_card.find('p', class_='instock availability').get_text(strip=True)

    data.append({
        'upc' : detail_product['upc'],
        'title' : full_title,
        'price' : float(price),
        'stock_status' : stock_status,
        'availability' : detail_product['availability']
    })

  return data

### Get All Data From Web with Pagination

In [0]:
def scrape_with_next_button(timeout=30):
    """Crawl books.toscrape.com page by page, following the "next" link.

    Starts at the index page, scrapes every book card on the current page
    (fetching each book's detail page through ``book_detail``), then moves
    to the page referenced by the ``li.next`` link until none is found.

    Args:
        timeout: Seconds to wait for each listing-page response; passed to
            ``requests.get``. Defaults to 30.

    Returns:
        list of dicts, one per book, with keys 'upc', 'title',
        'price' (float), 'stock_status' and 'availability'.

    Raises:
        requests.exceptions.RequestException: on timeout, connection
            failure, or a 4xx/5xx response for a listing page.
        AttributeError / KeyError / ValueError: if a card lacks an expected
            element or attribute, or its price text is not numeric.
    """
    current_url = "https://books.toscrape.com/index.html"
    data = []

    while current_url:
        print(f"Sedang memproses link: {current_url}")

        # Fail fast on a stalled connection or an HTTP error instead of
        # parsing an error page and silently ending the crawl early.
        response = requests.get(current_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        for book_card in soup.find_all('article', class_='product_pod'):
            # Title and link both live on the <a> inside the card's <h3>
            a_tag = book_card.find('h3').find('a')
            full_title = a_tag['title']

            # Resolve the book link relative to the page it was found on
            full_url = urljoin(current_url, a_tag['href'])
            detail_product = book_detail(full_url)

            # Price: keep only what follows the pound sign
            price_text = book_card.find('p', class_='price_color').text
            price = price_text.split('£')[-1]

            # Stock status text shown on the card
            stock_status = book_card.find('p', class_='instock availability').get_text(strip=True)

            data.append({
                'upc' : detail_product['upc'],
                'title' : full_title,
                'price' : float(price),
                'stock_status' : stock_status,
                'availability' : detail_product['availability']
            })

        # --- Find the next page ---
        next_tag = soup.find('li', class_='next')

        if next_tag:
            # The href is relative to the current page ("catalogue/page-2.html"
            # on the index, "page-3.html" inside catalogue/), so resolving it
            # against current_url covers both cases without special-casing.
            current_url = urljoin(current_url, next_tag.find('a')['href'])
        else:
            # No "next" button: stop the loop
            current_url = None
            print("Semua halaman selesai diambil.")

    return data

In [0]:
# NOTE(review): pandas is already imported in the import cell near the top of
# the notebook; this repeated import is harmless but redundant.
import pandas as pd

# Run the full crawl. This issues an HTTP request for every listing page and
# for every book's detail page, and returns one dict per book.
all_books = scrape_with_next_button()

In [0]:
# Build a DataFrame from the list of scraped book dicts; the bare `df` as the
# last expression displays the frame as the cell output.
df = pd.DataFrame(all_books)
df

## Transform

### Add Price Category Column

In [0]:
# Function that assigns a price category to a book
def categorize_price(price):
    """Map a numeric price to a price-class label.

    Args:
        price: Numeric price to classify.

    Returns:
        'Budget' when price is below 20.0, 'Mid-Range' when it is at most
        50.0, and 'Premium' for anything else.
    """
    # Guard clauses, checked from the cheapest tier upwards
    if price < 20.0:
        return 'Budget'
    if price <= 50.0:
        return 'Mid-Range'
    return 'Premium'
    
# Add a 'price_class' column by applying categorize_price to every value in 'price'
df['price_class'] = df['price'].apply(categorize_price)

In [0]:
# Display the final data, now including the 'price_class' column
df

In [0]:
# Save the data to a CSV file; index=False leaves the DataFrame index out of the file
df.to_csv('scrape_books.csv', index=False)

## Load (SQLite)

In [0]:
# Create the engine outside the try block so `engine` is always bound for the
# cells that query the database afterwards.
engine = create_engine('sqlite:///db_books.db')

try:
  # Write the frame to the 'books' table, replacing the table if it already exists.
  df.to_sql('books', engine, if_exists='replace', index=False)
except Exception as err:
  # Report the failure, then re-raise: swallowing it would let later cells
  # run against a missing or stale 'books' table.
  print(f"Terjadi kesalahan saat dalam menyimpan database: {err}")
  raise

In [0]:
# Example check: read the five highest-priced rows back from the 'books'
# table, using the `engine` created in the load cell.
query = "SELECT title, price FROM books ORDER BY price DESC LIMIT 5"
df_check = pd.read_sql(query, con=engine)

# Print a header line followed by the query result
print("--- 5 Buku Termahal ---")
print(df_check)