# Using lxml and XPath to scrape a web page

## Installation

In [None]:
%pip install lxml beautifulsoup4 requests

In [3]:
# importing the libraries 
import re

import requests
from bs4 import BeautifulSoup
from lxml import etree

## Basic Setup


In [4]:
# Basic setup: download the page once and build the lxml tree that every
# XPath query in this notebook runs against.
url = "https://books.toscrape.com/index.html"
try:
    # Always pass a timeout: without one, requests may wait indefinitely on an
    # unresponsive server and hang the kernel. Timeout errors are subclasses of
    # RequestException, so the handler below still reports them.
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # turn 4xx/5xx responses into HTTPError

    # parse with BeautifulSoup using the lxml parser (soup is reused for prettify())
    # NOTE(review): prices scraped later show up as "Â£.." (see &#194;&#163; in the
    # outputs), which suggests response.text is decoded with the wrong charset.
    # Confirm before changing this; later cells depend on the current text.
    soup = BeautifulSoup(response.text, 'lxml')

    # convert the BeautifulSoup object into an lxml element so .xpath() is available
    dom = etree.HTML(str(soup))

    print(f"Request successful with status code: {response.status_code}")
except requests.exceptions.HTTPError as http_err:
    print(f"HTTP error occurred : {http_err}")  # outputs the HTTP errors
except requests.exceptions.RequestException as err:
    print(f"Error occurred: {err}")  # for other request-related errors

Request successful with status code: 200


In [None]:
# Inspect the parsed page: render the soup as indented HTML and print it
page_html = soup.prettify()
print(page_html)

## Scrape the webpage using XPath

In [None]:
# The <title> element carries the title of the webpage.
# text() selects its text nodes, so xpath() returns them as a list.
title = dom.xpath('//title/text()')

# raw result: a list holding the title text, whitespace included
print(title)

# cleaned result: first entry with the surrounding whitespace removed
first_title = title[0]
print(first_title.strip())

['\n    All products | Books to Scrape - Sandbox\n']
All products | Books to Scrape - Sandbox


In [19]:
# Collect every <a> element in the document
anchor = dom.xpath('//a')

# Serialize each matched element back to markup so we can see what was selected
for link in anchor:
    markup = etree.tostring(link, pretty_print=True)
    print(markup.decode('utf-8'))

<a href="index.html">Books to Scrape</a>

<a href="index.html">Home</a>


<a href="catalogue/category/books_1/index.html">
                            
                                Books
                            
                        </a>


<a href="catalogue/category/books/travel_2/index.html">
                            
                                Travel
                            
                        </a>


<a href="catalogue/category/books/mystery_3/index.html">
                            
                                Mystery
                            
                        </a>


<a href="catalogue/category/books/historical-fiction_4/index.html">
                            
                                Historical Fiction
                            
                        </a>


<a href="catalogue/category/books/sequential-art_5/index.html">
                            
                                Sequential Art
                            
   

In [20]:
# Select by id: any element (*) whose id attribute is "promotions_left"
id_select = dom.xpath('//*[@id="promotions_left"]')

for element in id_select:
    markup = etree.tostring(element, pretty_print=True)
    print(markup.decode('utf-8'))

<div id="promotions_left">
</div>




In [24]:
# Select by class. The target element has two classes (class="alert alert-warning"),
# so pad @class with spaces and search for ' alert ' to match one whole class token.
class_select = dom.xpath("//*[contains(concat(' ',@class,' '), ' alert ')]")

for element in class_select:
    markup = etree.tostring(element, pretty_print=True)
    print(markup.decode('utf-8'))





In [32]:
# Find the <a> elements whose href is exactly 'index.html'
a_href = dom.xpath('//a[@href="index.html"]')

# For each match show the serialized element, then its stripped text
for link in a_href:
    markup = etree.tostring(link, pretty_print=True)
    print(markup.decode('utf-8'))
    link_text = link.text
    print(link_text.strip())

# The text can also be selected directly in XPath, giving a list of strings
a_href_text = dom.xpath('//a[@href="index.html"]/text()')
print(a_href_text)


<a href="index.html">Books to Scrape</a>

Books to Scrape
<a href="index.html">Home</a>


Home
['Books to Scrape', 'Home']


In [34]:
# Combine two attribute tests with `or`: match <a> tags whose href is either the
# Mystery category page or the Travel category page.
a_tags = dom.xpath('//a[@href="catalogue/category/books/mystery_3/index.html" or @href="catalogue/category/books/travel_2/index.html"]')

for link in a_tags:
    markup = etree.tostring(link, pretty_print=True)
    print(markup.decode('utf-8'))
    link_text = link.text
    print(link_text.strip())

<a href="catalogue/category/books/travel_2/index.html">
                            
                                Travel
                            
                        </a>


Travel
<a href="catalogue/category/books/mystery_3/index.html">
                            
                                Mystery
                            
                        </a>


Mystery


In [37]:
# Select all <a> tags whose href contains the word "category"
a_tags = dom.xpath("//a[contains(@href, 'category')]")

for item in a_tags:
    print(etree.tostring(item, pretty_print=True).decode('utf-8'))

# Alternative way.
# Note: XPath 1.0 has no native regular-expression support, so we select the
# candidate elements with XPath and filter them in Python with the `re` module
# (imported in the notebook's import cell rather than in the middle of this cell).
print("If we need to use a full regular expression with XPath, you can apply it after retrieving the elements by filtering them in Python with re.")
a_tags = dom.xpath('//a')
# a.get("href", "") falls back to an empty string for anchors without an href
filtered_a_tags = [a for a in a_tags if re.search(r'category', a.get("href", ""))]

for item in filtered_a_tags:
    print(etree.tostring(item, pretty_print=True).decode('utf-8'))


<a href="catalogue/category/books_1/index.html">
                            
                                Books
                            
                        </a>


<a href="catalogue/category/books/travel_2/index.html">
                            
                                Travel
                            
                        </a>


<a href="catalogue/category/books/mystery_3/index.html">
                            
                                Mystery
                            
                        </a>


<a href="catalogue/category/books/historical-fiction_4/index.html">
                            
                                Historical Fiction
                            
                        </a>


<a href="catalogue/category/books/sequential-art_5/index.html">
                            
                                Sequential Art
                            
                        </a>


<a href="catalogue/category/books/classics_6/i

In [40]:
# Select <a> tags whose text contains the word "Fiction", directly in XPath
filtered_a_tags = dom.xpath('//a[contains(text(), "Fiction")]')
for link in filtered_a_tags:
    markup = etree.tostring(link, pretty_print=True)
    print(markup.decode('utf-8'))

# The same selection done with a regular expression applied in Python
print("Alternative way using Regular Expression")
a_tags = dom.xpath("//a")
filtered_a_tags = []
for link in a_tags:
    # skip anchors with no leading text before testing the pattern
    if link.text and re.search(r'Fiction', link.text):
        filtered_a_tags.append(link)
for link in filtered_a_tags:
    markup = etree.tostring(link, pretty_print=True)
    print(markup.decode('utf-8'))

<a href="catalogue/category/books/historical-fiction_4/index.html">
                            
                                Historical Fiction
                            
                        </a>


<a href="catalogue/category/books/womens-fiction_9/index.html">
                            
                                Womens Fiction
                            
                        </a>


<a href="catalogue/category/books/fiction_10/index.html">
                            
                                Fiction
                            
                        </a>


<a href="catalogue/category/books/science-fiction_16/index.html">
                            
                                Science Fiction
                            
                        </a>


<a href="catalogue/category/books/adult-fiction_29/index.html">
                            
                                Adult Fiction
                            
                        </a>


<a 

In [41]:
# Filter <a> tags where the href contains "category" AND the text contains "Fiction".
# Query the anchors here instead of relying on `a_tags` left over from an earlier
# cell, so this cell produces the same result regardless of what ran before it.
a_tags = dom.xpath("//a")
filtered_a_tags = [
    a for a in a_tags
    # a.get("href", "") avoids passing None to re.search; a.text may be None too
    if re.search(r"category", a.get("href", "")) and (a.text and re.search(r"Fiction", a.text))
]

# Print matching <a> tags
for a_tag in filtered_a_tags:
    print(etree.tostring(a_tag, pretty_print=True).decode("utf-8"))

<a href="catalogue/category/books/historical-fiction_4/index.html">
                            
                                Historical Fiction
                            
                        </a>


<a href="catalogue/category/books/womens-fiction_9/index.html">
                            
                                Womens Fiction
                            
                        </a>


<a href="catalogue/category/books/fiction_10/index.html">
                            
                                Fiction
                            
                        </a>


<a href="catalogue/category/books/science-fiction_16/index.html">
                            
                                Science Fiction
                            
                        </a>


<a href="catalogue/category/books/adult-fiction_29/index.html">
                            
                                Adult Fiction
                            
                        </a>


<a 

In [43]:
# Combine two attribute-existence tests with `and`: keep only the <a> tags
# that carry both an href and a title attribute.
filtered_a_tags = dom.xpath("//a[@href and @title]")

# Show each matching <a> tag as markup
for link in filtered_a_tags:
    markup = etree.tostring(link, pretty_print=True)
    print(markup.decode("utf-8"))


<a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a>

<a href="catalogue/tipping-the-velvet_999/index.html" title="Tipping the Velvet">Tipping the Velvet</a>

<a href="catalogue/soumission_998/index.html" title="Soumission">Soumission</a>

<a href="catalogue/sharp-objects_997/index.html" title="Sharp Objects">Sharp Objects</a>

<a href="catalogue/sapiens-a-brief-history-of-humankind_996/index.html" title="Sapiens: A Brief History of Humankind">Sapiens: A Brief History ...</a>

<a href="catalogue/the-requiem-red_995/index.html" title="The Requiem Red">The Requiem Red</a>

<a href="catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994/index.html" title="The Dirty Little Secrets of Getting Your Dream Job">The Dirty Little Secrets ...</a>

<a href="catalogue/the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993/index.html" title="The Coming Woman: A Novel Based on the Life of the Infamous

In [44]:
# Select all <p> tags whose class attribute starts with "price"
# (starts-with() tests the beginning of the attribute string)
filtered_p_tags = dom.xpath("//p[starts-with(@class, 'price')]")

for paragraph in filtered_p_tags:
    markup = etree.tostring(paragraph, pretty_print=True)
    print(markup.decode("utf-8"))

<p class="price_color">&#194;&#163;51.77</p>


<p class="price_color">&#194;&#163;53.74</p>


<p class="price_color">&#194;&#163;50.10</p>


<p class="price_color">&#194;&#163;47.82</p>


<p class="price_color">&#194;&#163;54.23</p>


<p class="price_color">&#194;&#163;22.65</p>


<p class="price_color">&#194;&#163;33.34</p>


<p class="price_color">&#194;&#163;17.93</p>


<p class="price_color">&#194;&#163;22.60</p>


<p class="price_color">&#194;&#163;52.15</p>


<p class="price_color">&#194;&#163;13.99</p>


<p class="price_color">&#194;&#163;20.66</p>


<p class="price_color">&#194;&#163;17.46</p>


<p class="price_color">&#194;&#163;52.29</p>


<p class="price_color">&#194;&#163;35.02</p>


<p class="price_color">&#194;&#163;57.25</p>


<p class="price_color">&#194;&#163;23.88</p>


<p class="price_color">&#194;&#163;37.59</p>


<p class="price_color">&#194;&#163;51.33</p>


<p class="price_color">&#194;&#163;45.17</p>




In [46]:
# Select all <p> tags whose class attribute ends with "color".
# The test is done in Python: fetch every <p>, then keep those whose class
# matches the regex 'color$' (missing class attributes are treated as '').
p_tags = dom.xpath("//p")
filtered_p_tags = []
for paragraph in p_tags:
    class_attr = paragraph.get('class','')
    if re.search(r'color$', class_attr):
        filtered_p_tags.append(paragraph)

for paragraph in filtered_p_tags:
    markup = etree.tostring(paragraph, pretty_print=True)
    print(markup.decode("utf-8"))

<p class="price_color">&#194;&#163;51.77</p>


<p class="price_color">&#194;&#163;53.74</p>


<p class="price_color">&#194;&#163;50.10</p>


<p class="price_color">&#194;&#163;47.82</p>


<p class="price_color">&#194;&#163;54.23</p>


<p class="price_color">&#194;&#163;22.65</p>


<p class="price_color">&#194;&#163;33.34</p>


<p class="price_color">&#194;&#163;17.93</p>


<p class="price_color">&#194;&#163;22.60</p>


<p class="price_color">&#194;&#163;52.15</p>


<p class="price_color">&#194;&#163;13.99</p>


<p class="price_color">&#194;&#163;20.66</p>


<p class="price_color">&#194;&#163;17.46</p>


<p class="price_color">&#194;&#163;52.29</p>


<p class="price_color">&#194;&#163;35.02</p>


<p class="price_color">&#194;&#163;57.25</p>


<p class="price_color">&#194;&#163;23.88</p>


<p class="price_color">&#194;&#163;37.59</p>


<p class="price_color">&#194;&#163;51.33</p>


<p class="price_color">&#194;&#163;45.17</p>




In [50]:
# The book titles are stored in the title attribute of the <a> tags, so the
# attribute values can be selected directly with /@title.
title_query = '//a[@title]/@title'
titles = dom.xpath(title_query)
titles

['A Light in the Attic',
 'Tipping the Velvet',
 'Soumission',
 'Sharp Objects',
 'Sapiens: A Brief History of Humankind',
 'The Requiem Red',
 'The Dirty Little Secrets of Getting Your Dream Job',
 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
 'The Black Maria',
 'Starving Hearts (Triangular Trade Trilogy, #1)',
 "Shakespeare's Sonnets",
 'Set Me Free',
 "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
 'Rip it Up and Start Again',
 'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991',
 'Olio',
 'Mesaerion: The Best Science Fiction Stories 1800-1849',
 'Libertarianism for Beginners',
 "It's Only the Himalayas"]

In [56]:
# Star rating of each book: select the class attribute of every <p> that has the
# star-rating class (e.g. class="star-rating One") and keep the last
# space-separated token, which is the rating word.
rating = dom.xpath("//p[contains(concat(' ',@class,' '), ' star-rating ')]/@class")
rating_list = []
for class_attr in rating:
    tokens = class_attr.split(' ')
    rating_list.append(tokens[-1])
rating_list

['Three',
 'One',
 'One',
 'Four',
 'Five',
 'One',
 'Four',
 'Three',
 'Four',
 'One',
 'Two',
 'Four',
 'Five',
 'Five',
 'Five',
 'Three',
 'One',
 'One',
 'Two',
 'Two']

In [7]:
# Accessing child elements
#
# <div class="product_price">
#     <p class="price_color">
#         Â£17.93
#     </p>
#     ...
# </div>
#
# Select the text of every price <p> that is a direct child (child::) of the
# product_price <div>.
prices = dom.xpath('//div[@class="product_price"]/child::p[@class="price_color"]/text()')

# The price text arrives as "Â£51.77": the pound sign shows up as the two
# characters U+00C2 U+00A3. Strip those characters by value rather than slicing
# a fixed two-character prefix, so parsing still yields the right number if the
# page is ever decoded correctly ("£51.77", one-character prefix).
price = [float(p.strip().lstrip('\u00c2\u00a3')) for p in prices]
price

[51.77,
 53.74,
 50.1,
 47.82,
 54.23,
 22.65,
 33.34,
 17.93,
 22.6,
 52.15,
 13.99,
 20.66,
 17.46,
 52.29,
 35.02,
 57.25,
 23.88,
 37.59,
 51.33,
 45.17]

In [None]:
# Each book on this page is an <li> that is a direct child of the
# <ol class="row"> list; select all of those <li> elements.
list_books = dom.xpath('//ol[@class="row"]/child::li')

for book_item in list_books:
    markup = etree.tostring(book_item, pretty_print=True)
    print(markup.decode("utf-8"))

In [10]:
# Reach the same prices with the descendant axis: any price_color <p> nested at
# any depth below the <ol class="row"> list.
prices = dom.xpath('//ol[@class="row"]/descendant::p[@class="price_color"]/text()')

# The price text arrives as "Â£51.77" (pound sign rendered as U+00C2 U+00A3).
# Strip the currency characters by value instead of slicing a fixed
# two-character prefix, so a correctly decoded "£51.77" parses correctly too.
price = [float(p.strip().lstrip('\u00c2\u00a3')) for p in prices]
price

[51.77,
 53.74,
 50.1,
 47.82,
 54.23,
 22.65,
 33.34,
 17.93,
 22.6,
 52.15,
 13.99,
 20.66,
 17.46,
 52.29,
 35.02,
 57.25,
 23.88,
 37.59,
 51.33,
 45.17]

In [None]:
# Get the same prices using the general sibling axes
#
# <div class="product_price">
#     <p class="price_color">
#         Â£51.77
#     </p>
#     <p class="instock availability">
#         <i class="icon-ok">
#         </i>
#         In stock
#     </p>
#     ...
# </div>
#
# Use the "instock" <p> as the starting point and walk to its sibling to reach
# the price. In this markup the price_color <p> comes BEFORE the instock <p>,
# so the preceding-sibling axis is needed; following-sibling would only work if
# the price came after the instock paragraph.
prices = dom.xpath('//p[contains(@class, "instock")]/preceding-sibling::p[@class="price_color"]/text()')

# The price text arrives as "Â£51.77" (pound sign rendered as U+00C2 U+00A3).
# Strip the currency characters by value instead of slicing a fixed
# two-character prefix, so a correctly decoded "£51.77" parses correctly too.
price = [float(p.strip().lstrip('\u00c2\u00a3')) for p in prices]
price

[51.77,
 53.74,
 50.1,
 47.82,
 54.23,
 22.65,
 33.34,
 17.93,
 22.6,
 52.15,
 13.99,
 20.66,
 17.46,
 52.29,
 35.02,
 57.25,
 23.88,
 37.59,
 51.33,
 45.17]

In [26]:
# Adjacent sibling: reach the <small> tag starting from the <a> tag before it
'''
<a href="index.html">
    Books to Scrape
</a>
<small>
    We love being scraped!
</small>
'''
# following-sibling::*[1] keeps only the sibling immediately after the <a>;
# [self::small] then accepts it only if that sibling is a <small> element.
adjacent_small_query = '//a[@href="index.html"]/following-sibling::*[1][self::small]/text()'
small = dom.xpath(adjacent_small_query)
small

[' We love being scraped!']

In [29]:
# Select the first <p> element among the siblings
#
# <div class="product_price">
#     <p class="price_color">
#         Â£51.77
#     </p>
#     <p class="instock availability">
#         <i class="icon-ok">
#         </i>
#         In stock
#     </p>
#     ...
# </div>
#
# Same prices again, this time selected by position: p[1] is the first <p>
# child of each product_price <div>.
prices = dom.xpath('//div[@class="product_price"]/p[1]/text()')

# The price text arrives as "Â£51.77" (pound sign rendered as U+00C2 U+00A3).
# Strip the currency characters by value instead of slicing a fixed
# two-character prefix, so a correctly decoded "£51.77" parses correctly too.
price = [float(p.strip().lstrip('\u00c2\u00a3')) for p in prices]
price

[51.77,
 53.74,
 50.1,
 47.82,
 54.23,
 22.65,
 33.34,
 17.93,
 22.6,
 52.15,
 13.99,
 20.66,
 17.46,
 52.29,
 35.02,
 57.25,
 23.88,
 37.59,
 51.33,
 45.17]

In [32]:
# Select the last <p> child of each product_price <div> using last().
# Note: per the printed output these are the "instock availability" paragraphs,
# not prices, even though the result is stored in `prices`.
prices = dom.xpath('//div[@class="product_price"]/p[last()]')

for paragraph in prices:
    markup = etree.tostring(paragraph, pretty_print=True)
    print(markup.decode("utf-8"))

<p class="instock availability">
<i class="icon-ok"/>
    
        In stock
    
</p>


<p class="instock availability">
<i class="icon-ok"/>
    
        In stock
    
</p>


<p class="instock availability">
<i class="icon-ok"/>
    
        In stock
    
</p>


<p class="instock availability">
<i class="icon-ok"/>
    
        In stock
    
</p>


<p class="instock availability">
<i class="icon-ok"/>
    
        In stock
    
</p>


<p class="instock availability">
<i class="icon-ok"/>
    
        In stock
    
</p>


<p class="instock availability">
<i class="icon-ok"/>
    
        In stock
    
</p>


<p class="instock availability">
<i class="icon-ok"/>
    
        In stock
    
</p>


<p class="instock availability">
<i class="icon-ok"/>
    
        In stock
    
</p>


<p class="instock availability">
<i class="icon-ok"/>
    
        In stock
    
</p>


<p class="instock availability">
<i class="icon-ok"/>
    
        In stock
    
</p>


<p class="instock availability">

In [41]:
# Generalized positional selection: pick the 3rd and the 7th book from the list
# by substituting each position into the li[...] predicate.
n = [3,7]
for i in n:
    books_3_7 = dom.xpath(f'//ol[@class="row"]/li[{i}]')
    # xpath() returns a list; the single <li> at that position is its first entry
    selected_book = books_3_7[0]
    markup = etree.tostring(selected_book, pretty_print=True)
    print(markup.decode("utf-8"))

<li class="col-xs-6 col-sm-4 col-md-3 col-lg-3">
<article class="product_pod">
<div class="image_container">
<a href="catalogue/soumission_998/index.html"><img alt="Soumission" class="thumbnail" src="media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg"/></a>
</div>
<p class="star-rating One">
<i class="icon-star"/>
<i class="icon-star"/>
<i class="icon-star"/>
<i class="icon-star"/>
<i class="icon-star"/>
</p>
<h3><a href="catalogue/soumission_998/index.html" title="Soumission">Soumission</a></h3>
<div class="product_price">
<p class="price_color">&#194;&#163;50.10</p>
<p class="instock availability">
<i class="icon-ok"/>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>
</li>


<li class="col-xs-6 col-sm-4 col-md-3 col-lg-3">
<article class="product_pod">
<div class="image_container">
<a href="catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994/in

In [44]:
# Negation: select all the <a> tags that do NOT have a title attribute
filtered_a_tags = dom.xpath('//a[not(@title)]')

for link in filtered_a_tags:
    markup = etree.tostring(link, pretty_print=True)
    print(markup.decode("utf-8"))

<a href="index.html">Books to Scrape</a>

<a href="index.html">Home</a>


<a href="catalogue/category/books_1/index.html">
                            
                                Books
                            
                        </a>


<a href="catalogue/category/books/travel_2/index.html">
                            
                                Travel
                            
                        </a>


<a href="catalogue/category/books/mystery_3/index.html">
                            
                                Mystery
                            
                        </a>


<a href="catalogue/category/books/historical-fiction_4/index.html">
                            
                                Historical Fiction
                            
                        </a>


<a href="catalogue/category/books/sequential-art_5/index.html">
                            
                                Sequential Art
                            
   