In [1]:
# Goal: Get the title of every book with a 2-star rating

In [2]:
# import web scraping libraries
import requests
import bs4

In [4]:
# get the URL of the site and the pages
# URL to loop across pages: https://books.toscrape.com/catalogue/page-1.html

In [5]:
# URL template for the catalogue pages; '{}' is filled in with the page number
base_url = 'https://books.toscrape.com/catalogue/page-{}.html'

In [6]:
# example: build the URL for page 20 (str.format accepts the number directly)
base_url.format(20)

'https://books.toscrape.com/catalogue/page-20.html'

In [9]:
# inspect the book elements and grab the titles with a 2-star rating
# get the response from page 1; the timeout (in seconds) keeps a stalled
# connection from hanging the kernel, and raise_for_status() surfaces an HTTP
# error here instead of letting an error page be parsed further down
res = requests.get(base_url.format('1'), timeout=10)
res.raise_for_status()

In [10]:
# convert to soup object
# pass the raw bytes (res.content) instead of res.text: decoding via res.text
# garbles non-ASCII characters (the price shows up as 'Â£' instead of '£'),
# whereas BeautifulSoup can detect the encoding from the document itself
soup = bs4.BeautifulSoup(res.content, 'lxml')

In [13]:
# sanity check: each book sits in a .product_pod container, expect 20 per page
len(soup.select('.product_pod'))

20

In [26]:
# my own solution, written before continuing the video:
# print the title of every 2-star book on the page that is currently loaded
for pod in soup.select('.product_pod'):
    # skip books whose rating element is not marked with the 'Two' class
    if not pod.select('.star-rating.Two'):
        continue
    # the link inside <h3> carries the full title in its title attribute
    title_link = pod.select('h3 a')[0]
    print(title_link['title'])

Starving Hearts (Triangular Trade Trilogy, #1)
Libertarianism for Beginners
It's Only the Himalayas


In [28]:
# get the containers (.product_pod) of all book items on the page
products = soup.select('.product_pod')

In [29]:
# take the first book as an example to test the selectors on
example = products[0]

In [31]:
# one way: a plain substring search in the example's HTML text
'star-rating Three' in str(example)

True

In [32]:
# preferred way: select the element by its CSS classes;
# a match comes back as a list containing the element
example.select('.star-rating.Three')

[<p class="star-rating Three">
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 </p>]

In [33]:
# check that an empty list is returned when nothing matches
example.select('.star-rating.Wrong')

[]

In [34]:
# an empty result is falsy, so `not` validates the "no match" condition
not example.select('.star-rating.Wrong')

True

In [36]:
# display the example's HTML to plan which element to scrape
example

<article class="product_pod">
<div class="image_container">
<a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">Â£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>

In [37]:
# the title is in a link, but each book has two links:
# 1: the image link (wraps the thumbnail)
# 2: the text link (has the title attribute)
example.select('a')

[<a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>,
 <a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a>]

In [38]:
# use the text link (index 1): its visible text is truncated,
# but its title attribute holds the complete title
example.select('a')[1]

<a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a>

In [39]:
# read the complete title from the link's title attribute
example.select('a')[1]['title']

'A Light in the Attic'

In [40]:
# Plan: get the book title if the book is rated 2 stars, using:
# - a non-empty result from example.select('.star-rating.Two') as the rating check
# - example.select('a')[1]['title'] to read the title

In [47]:
# prepare list
two_star_titles = []

# scan all pages of the website (total 50 pages)
# a single Session reuses the connection instead of reconnecting for every page
with requests.Session() as session:
    for page in range(1, 51):
        # fetch the page; the timeout (in seconds) keeps a stalled connection
        # from hanging the loop, and raise_for_status() stops on an HTTP error
        # instead of silently parsing an error page and missing its books
        scrape_url = base_url.format(page)
        res = session.get(scrape_url, timeout=10)
        res.raise_for_status()

        # parse the raw bytes rather than res.text: res.text garbles non-ASCII
        # characters on this site (e.g. 'Â£'), which would corrupt titles,
        # whereas BeautifulSoup can detect the encoding from the document itself
        soup = bs4.BeautifulSoup(res.content, 'lxml')

        # scrape the books on the page, selected by .product_pod
        books = soup.select('.product_pod')

        # check each book
        for book in books:
            # whether it has the 2-star rating (an empty result is falsy)
            if book.select('.star-rating.Two'):
                # if yes, add the book title to the list; target the link
                # inside <h3> explicitly rather than relying on it being the
                # second <a> of the container
                book_title = book.select('h3 a')[0]['title']
                two_star_titles.append(book_title)

In [45]:
# 196 out of 1000 books have a 2-star rating
len(two_star_titles)

196

In [46]:
# print the first 10 titles in the list
print(two_star_titles[:10])

['Starving Hearts (Triangular Trade Trilogy, #1)', 'Libertarianism for Beginners', "It's Only the Himalayas", 'How Music Works', 'Maude (1883-1993):She Grew Up with the country', "You can't bury them all: Poems", 'Reasons to Stay Alive', 'Without Borders (Wanderlove #1)', 'Soul Reader', 'Security']
