In [70]:
from lxml.html import parse

# Download and parse the example page into an lxml ElementTree.
# The imported `parse` is called directly: the import above binds only that
# name, so spelling the call as `lxml.html.parse(...)` raises a NameError on a
# fresh kernel (the name `lxml` itself is never imported).
tree = parse('http://cssbook.net/d/restaurants/index.html')

# get the restaurant names via XPath
print([heading.text_content().strip() for heading in tree.xpath('//h3')])

# get the restaurant names via CSS selector (cssselect is called on the root element)
print([heading.text_content().strip() for heading in tree.getroot().cssselect('h3')])

['Pizzeria Roma', 'Trattoria Napoli', 'Curry King']
['Pizzeria Roma', 'Trattoria Napoli', 'Curry King']


In [69]:
# Three ways of extracting text from the restaurant <div> elements.
# `tree` must be the parsed document (an ElementTree), because .getroot() is
# called on it in the CSS-selector example at the end of this cell.
restaurant_xpath = '//div[@class="restaurant"]'

# 1) XPath text() nodes: every text node that is a direct child of the element.
print('Appending "/text()" to the XPATH gives you exactly the text that is in the element itself, including line-breaks that happen to be in the source code:' )
print(tree.xpath(restaurant_xpath + '/text()'))

# Evaluate the element query once and reuse the matches for the next two examples.
restaurant_divs = tree.xpath(restaurant_xpath)

# 2) The .text attribute. In lxml this holds only the text that precedes the
#    element's first child element, which is why the later line breaks are absent.
print('\nUsing the "text" property of the elements in the list of elements that are matched by the XPATH expression gives you the text of the elements themselves without the line breaks:')
print([div.text for div in restaurant_divs])

# 3) text_content(): the element's text concatenated with that of all descendants.
print('\nUsing the "text_content()" method instead returns the text of the element *and the text of its children*:')
print([div.text_content() for div in restaurant_divs])

# Same extraction as 3), selecting the elements by CSS class from the root element.
print('\nThe same but using CSS Selectors (note the .getroot() method, because the selectors can only be applied to HTML elements, not to DOM trees):')
root_element = tree.getroot()
print([div.text_content() for div in root_element.cssselect('.restaurant')])

Appending "/text()" to the XPATH gives you exactly the text that is in the element itself, including line-breaks that happen to be in the source code:
[' ', '\n      ', '\n      ', '\n    ', ' ', '\n      ', '\n      ', '\n    ', ' ', '\n      ', '\n      ', '\n    ']

Using the "text" property of the elements in the list of elements that are matched by the XPATH expression gives you the text of the elements themselves without the line breaks:
[' ', ' ', ' ']

Using the "text_content()" method instead returns the text of the element *and the text of its children*:
['  Pizzeria Roma \n       Here you can get ... ... \n       Read the full review here\n    ', '  Trattoria Napoli \n       Another restaurant ... ... \n       Read the full review here\n    ', '  Curry King \n       Some description. \n       Read the full review here\n    ']

The same but using CSS Selectors (note the .getroot() method, because the selectors can only be applied to HTML elements, not to DOM trees):
['  Pizze

In [78]:
import requests
from lxml.html import fromstring

# Send a browser-like User-Agent header with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'}

# Fetch the page with an explicit timeout (requests waits indefinitely by
# default) and fail loudly on an HTTP error status instead of silently parsing
# an error page.
response = requests.get('http://cssbook.net/d/restaurants/index.html', headers=headers, timeout=30)
response.raise_for_status()
htmlsource = response.text

# fromstring() returns the root element directly (not an ElementTree), so
# .xpath() and .cssselect() are available on `tree` without calling .getroot().
tree = fromstring(htmlsource)
print([e.text_content().strip() for e in tree.xpath('//h3')])

['Pizzeria Roma', 'Trattoria Napoli', 'Curry King']
