In [24]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [25]:
# Fetch the catalogue landing page.
url = 'http://books.toscrape.com/'
# A timeout keeps this cell from hanging indefinitely if the server never
# answers; raise_for_status() aborts on a 4xx/5xx reply instead of silently
# parsing an error page that contains no books.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Build the BeautifulSoup tree from the decoded text.
# NOTE: `response.text` is kept as-is in this first version; the CSV preview
# printed further down shows prices as 'Â£..' with this approach, and the
# later section of the notebook parses `response.content` instead.
soup = BeautifulSoup(response.text, 'html.parser')

# Collect every <article class="product_pod"> element; the loop below reads
# one book's title, price and rating from each of them.
books = soup.find_all('article', class_='product_pod')

In [26]:
# One accumulator per scraped field. They are filled in lockstep by the
# extraction loop, so position i in each list refers to the same book.
titles, prices, ratings = [], [], []

In [27]:
# Start from empty lists so that re-running this cell does not append the same
# books a second time (the lists themselves are created in a separate cell).
titles.clear()
prices.clear()
ratings.clear()

for book in books:
    # Book title: taken from the `title` attribute of the link inside <h3>.
    title = book.h3.a['title']
    titles.append(title)

    # Price text exactly as it appears on the page (currency symbol included).
    price = book.find('p', class_='price_color').text
    prices.append(price)

    # Star rating: second entry of the `class` list on the first <p> of the
    # article, a word such as 'Three'.
    rating = book.p['class'][1]
    ratings.append(rating)

In [28]:
# Assemble the three parallel lists into a DataFrame, one column per field.
df = pd.DataFrame(dict(Title=titles, Price=prices, Rating=ratings))

In [29]:
# Write the table to disk without the index column, then report completion.
csv_path = 'books_scraped.csv'
df.to_csv(csv_path, index=False)

print("Scraping selesai! Data disimpan dalam 'books_scraped.csv'.")

Scraping selesai! Data disimpan dalam 'books_scraped.csv'.


In [30]:
# List all files in the working directory
!ls

books_scraped.csv  sample_data


In [31]:
import pandas as pd

# Load the exported CSV back in and print its first rows as a sanity check.
df = pd.read_csv('books_scraped.csv')
preview = df.head()

print(preview)

                                   Title    Price Rating
0                   A Light in the Attic  Â£51.77  Three
1                     Tipping the Velvet  Â£53.74    One
2                             Soumission  Â£50.10    One
3                          Sharp Objects  Â£47.82   Four
4  Sapiens: A Brief History of Humankind  Â£54.23   Five


# Perbaikan Kode Scraping Buku: Encoding, Simbol Mata Uang, dan Rating

In [51]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Fetch the catalogue landing page.
url = 'http://books.toscrape.com/'
# A timeout keeps this cell from hanging indefinitely if the server never
# answers; raise_for_status() aborts on a 4xx/5xx reply instead of silently
# parsing an error page that contains no books.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Hand BeautifulSoup the raw bytes (`response.content`) so that it works out
# the document encoding itself, rather than parsing text that requests has
# already decoded.
soup = BeautifulSoup(response.content, 'html.parser')

# Collect every <article class="product_pod"> element; the loop below reads
# one book's title, price and rating from each of them.
books = soup.find_all('article', class_='product_pod')

# One accumulator per scraped field. They are filled in lockstep by the
# extraction loop, so position i in each list refers to the same book.
titles, prices, ratings = [], [], []

In [52]:
# Word -> number lookup for the star rating (1-5). Built once, outside the
# loop, because it is identical for every book.
rating_dict = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}

# Start from empty lists so that re-running this cell does not append the same
# books a second time (the lists themselves are created in a separate cell).
titles.clear()
prices.clear()
ratings.clear()

for book in books:
    # Book title: taken from the `title` attribute of the link inside <h3>.
    title = book.h3.a['title']
    titles.append(title)

    # Price: drop any stray 'Â' (the mojibake artefact visible in the earlier
    # export) and surrounding whitespace; the currency symbol is kept.
    price = book.find('p', class_='price_color').text
    clean_price = price.replace('Â', '').strip()
    prices.append(clean_price)

    # Star rating: second entry of the `class` list on the first <p> of the
    # article, converted to a number. Unrecognised words fall back to 0.
    rating = book.p['class'][1]
    numeric_rating = rating_dict.get(rating, 0)
    ratings.append(numeric_rating)

In [53]:
# Assemble the three parallel lists into a DataFrame, one column per field.
df = pd.DataFrame(dict(Title=titles, Price=prices, Rating=ratings))

# Write the cleaned table to disk without the index column, then report completion.
csv_path = 'books_scraped_clean.csv'
df.to_csv(csv_path, index=False)

print("Scraping selesai! Data disimpan dalam 'books_scraped_clean.csv'.")

Scraping selesai! Data disimpan dalam 'books_scraped_clean.csv'.


In [54]:
# List the files in the working directory
!ls

books_scraped_clean.csv  books_scraped.csv  sample_data


In [55]:
import pandas as pd

# Load the cleaned CSV back in and print its first rows as a sanity check.
df = pd.read_csv('books_scraped_clean.csv')
preview = df.head()

print(preview)

                                   Title   Price  Rating
0                   A Light in the Attic  £51.77       3
1                     Tipping the Velvet  £53.74       1
2                             Soumission  £50.10       1
3                          Sharp Objects  £47.82       4
4  Sapiens: A Brief History of Humankind  £54.23       5


In [56]:
# Download the cleaned CSV through Colab's `files` helper. This needs the
# `google.colab` package, so the cell presumably only works inside a Colab
# runtime.
from google.colab import files
files.download('books_scraped_clean.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>