# Web Scraping dari Website Books to Scrape

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Mengambil halaman web
url = 'http://books.toscrape.com/'
response = requests.get(url)

# Membuat objek BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Mengekstrak data buku
books = soup.find_all('article', class_='product_pod')

In [3]:
# Lists to hold scraped data
titles = []
prices = []
ratings = []

In [4]:
for book in books:
    # Judul buku
    title = book.h3.a['title']
    titles.append(title)

    # Harga buku
    price = book.find('p', class_='price_color').text
    prices.append(price)

    # Rating buku
    rating = book.p['class'][1]
    ratings.append(rating)

In [5]:
# Menyimpan data ke dalam DataFrame dan ekspor sebagai CSV
df = pd.DataFrame({
    'Title': titles,
    'Price': prices,
    'Rating': ratings
})

In [6]:
# Ekspor ke CSV
df.to_csv('books_scraped.csv', index=False)

print("Scraping selesai! Data disimpan dalam 'books_scraped.csv'.")

Scraping selesai! Data disimpan dalam 'books_scraped.csv'.


In [7]:
# Melihat semua file di direktori kerja
!ls

books_scraped.csv  sample_data


In [8]:
import pandas as pd

# Baca file CSV
df = pd.read_csv('books_scraped.csv')

print(df.head())

                                   Title    Price Rating
0                   A Light in the Attic  Â£51.77  Three
1                     Tipping the Velvet  Â£53.74    One
2                             Soumission  Â£50.10    One
3                          Sharp Objects  Â£47.82   Four
4  Sapiens: A Brief History of Humankind  Â£54.23   Five


# Perbaikan Kode Scraping Buku: Encoding, Simbol Mata Uang, dan Rating

In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Mengambil halaman web
url = 'http://books.toscrape.com/'
response = requests.get(url)

# Membuat objek BeautifulSoup dengan encoding yang tepat
soup = BeautifulSoup(response.content, 'html.parser')

# Mengekstrak data buku
books = soup.find_all('article', class_='product_pod')

# Lists to hold scraped data
titles = []
prices = []
ratings = []

In [10]:
for book in books:
    # Judul buku
    title = book.h3.a['title']
    titles.append(title)

    # Harga buku - perbaiki encoding dan simpan simbol mata uang
    price = book.find('p', class_='price_color').text
    clean_price = price.replace('Â', '').strip()
    prices.append(clean_price)

    # Rating buku - konversi ke angka (1-5)
    rating = book.p['class'][1]
    rating_dict = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
    numeric_rating = rating_dict.get(rating, 0)
    ratings.append(numeric_rating)

In [11]:
# Menyimpan data ke dalam DataFrame dan ekspor sebagai CSV
df = pd.DataFrame({
    'Title': titles,
    'Price': prices,
    'Rating': ratings
})

# Ekspor ke CSV
df.to_csv('books_scraped_clean.csv', index=False)

print("Scraping selesai! Data disimpan dalam 'books_scraped_clean.csv'.")

Scraping selesai! Data disimpan dalam 'books_scraped_clean.csv'.


In [12]:
!ls

books_scraped_clean.csv  books_scraped.csv  sample_data


In [13]:
import pandas as pd

# Baca file CSV
df = pd.read_csv('books_scraped_clean.csv')

print(df.head())

                                   Title   Price  Rating
0                   A Light in the Attic  £51.77       3
1                     Tipping the Velvet  £53.74       1
2                             Soumission  £50.10       1
3                          Sharp Objects  £47.82       4
4  Sapiens: A Brief History of Humankind  £54.23       5


In [14]:
from google.colab import files
files.download('books_scraped_clean.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Menggunakan GraphQL

In [34]:
import requests
import pandas as pd

url = "https://countries.trevorblades.com/"

# Query GraphQL untuk mendapatkan data negara
query = """
{
  countries {
    code
    name
    continent {
      name
    }
    languages {
      name
    }
  }
}
"""

In [35]:
# Mengirim request ke GraphQL API
response = requests.post(url, json={'query': query})

# Memeriksa status response
if response.status_code == 200:
    print("Request berhasil!")
else:
    print(f"Terjadi kesalahan: {response.status_code}")

Request berhasil!


In [36]:
# Mengambil data dari response
data = response.json()

# Ekstrak data negara
countries = data['data']['countries']

# Membuat DataFrame dari data yang diambil
# Menggunakan prefix untuk mencegah konflik nama kolom
df = pd.json_normalize(
    countries,
    record_path=['languages'],
    meta=['code', 'name', ['continent', 'name']],
    meta_prefix='country_',
    record_prefix='language_'
)

# Mengganti nama kolom untuk kemudahan membaca
df.rename(
    columns={
        'country_name': 'country_name',
        'continent_name': 'continent',
        'language_name': 'language'
    },
    inplace=True
)

# Tampilkan DataFrame
print(df.head())

  language country_code          country_name country_continent.name
0  Catalan           AD               Andorra                 Europe
1   Arabic           AE  United Arab Emirates                   Asia
2   Pashto           AF           Afghanistan                   Asia
3    Uzbek           AF           Afghanistan                   Asia
4  Turkmen           AF           Afghanistan                   Asia


In [37]:
# Simpan DataFrame ke CSV
df.to_csv('countries_data.csv', index=False)

print("Data negara disimpan dalam 'countries_data.csv'.")

Data negara disimpan dalam 'countries_data.csv'.


In [38]:
!ls

books_scraped_clean.csv  books_scraped.csv  countries_data.csv	sample_data


In [39]:
from google.colab import files

files.download('countries_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>