In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
#First catalogue page of the demo site we are going to scrape
url = "https://books.toscrape.com/catalogue/page-1.html"

In [3]:
#Download the page. The timeout (in seconds) stops the cell from hanging forever
#when the server never answers; raise_for_status() raises an HTTPError on a 4xx/5xx
#reply so that an error page is never handed to the parser by mistake.
response = requests.get(url, timeout=10)
response.raise_for_status()
#print(response) shows <Response [200]> -> meaning that the request was successful

In [4]:
#Keep only the body of the reply: '.content' is the raw payload of the Response object
#NOTE(review): this rebinds 'response' from the Response object to its content, so
#running this cell a second time without re-running the request cell will fail
#(the content itself has no '.content' attribute).
response = response.content

In [5]:
#print(response): will show the raw content downloaded from the page
#Next we use BeautifulSoup to parse this raw HTML into a structure we can search

In [6]:
#Parse the downloaded markup with Python's built-in 'html.parser'
soup = BeautifulSoup(markup=response, features='html.parser')
#print(soup) shows the full HTML code of the page
#Use the browser's inspector on the same page to work out
#exactly which tags hold the features we want to scrape

In [7]:
#The book listing is inside ol: ordered list
#'find' returns the first <ol> tag of the document, or None when there is no such tag
ol = soup.find('ol')
if ol is None:
    #Stop here with a clear message instead of an AttributeError on None further down
    raise ValueError("No <ol> element found: the request may have failed or the page layout changed")
#print(ol) gives the whole content inside the ordered list but we'll need to filter the content we need
#What we need for each book is the title, the price and the star rating

In [8]:
#find_all: collects every 'article' tag inside the ordered list whose class is 'product_pod'
#Here the class filter is passed through the 'attrs' dictionary, which is equivalent to
#the 'class_' keyword ('class' alone is a reserved word in Python)
articles = ol.find_all(name='article', attrs={'class': 'product_pod'})
#print(articles) shows every 'article' tag, one per book listed on page number 1

In [39]:
def _parse_price(price_text):
    """Return the numeric part of a price string (e.g. '£51.77') as a float.

    Only digits and the decimal point are kept, so the result does not depend on
    how many characters the currency prefix has (a mis-decoded pound sign can show
    up as two characters, which would break a fixed 'skip the first char' slice).
    """
    return float(''.join(ch for ch in price_text if ch in '0123456789.'))


def _parse_book(article):
    """Extract [title, price, star] from one 'article' tag of class 'product_pod'.

    title: the 'alt' attribute of the book's cover image ('img' tag).
    price: the text of the 'p' tag of class 'price_color', converted to float.
    star:  the rating word (e.g. 'Three') carried in the class list of the
           'p' tag of class 'star-rating'.
    """
    title = article.find('img').attrs['alt'] #'alt' is an attribute, therefore, we use 'attrs'
    #Look the rating paragraph up by its class rather than relying on it being the first 'p'
    rating_tag = article.find('p', class_='star-rating')
    #The class list holds 'star-rating' plus the rating word; keep only the rating word
    star = next(name for name in rating_tag['class'] if name != 'star-rating')
    #'.text' gives only the text inside the tag, i.e. the price with its currency symbol
    price = _parse_price(article.find('p', class_='price_color').text)
    return [title, price, star]


books = [] #empty list that will contain one [title, price, star] entry per book
for article in articles:
    books.append(_parse_book(article))
print(books) #prints a list of the books on the first page with title, price, stars

[['A Light in the Attic', 51.77, 'Three'], ['Tipping the Velvet', 53.74, 'One'], ['Soumission', 50.1, 'One'], ['Sharp Objects', 47.82, 'Four'], ['Sapiens: A Brief History of Humankind', 54.23, 'Five'], ['The Requiem Red', 22.65, 'One'], ['The Dirty Little Secrets of Getting Your Dream Job', 33.34, 'Four'], ['The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull', 17.93, 'Three'], ['The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics', 22.6, 'Four'], ['The Black Maria', 52.15, 'One'], ['Starving Hearts (Triangular Trade Trilogy, #1)', 13.99, 'Two'], ["Shakespeare's Sonnets", 20.66, 'Four'], ['Set Me Free', 17.46, 'Five'], ["Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)", 52.29, 'Five'], ['Rip it Up and Start Again', 35.02, 'Five'], ['Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991', 57.25, 'Three'], ['Olio', 23.88, 'One'], ['Mesaerion: The Best Science Fiction Storie