In [70]:
from lxml.html import parse
tree = lxml.html.parse('http://cssbook.net/d/restaurants/index.html')

# get the restaurant names via XPATH 
print([e.text_content().strip() for e in tree.xpath('//h3')])

# get the restaurant names via CSS Selector
print([e.text_content().strip() for e in tree.getroot().cssselect('h3')])

['Pizzeria Roma', 'Trattoria Napoli', 'Curry King']
['Pizzeria Roma', 'Trattoria Napoli', 'Curry King']


In [69]:
# three ways of extracting text
print('Appending "/text()" to the XPATH gives you exactly the text that is in the element itself, including line-breaks that happen to be in the source code:' )
print(tree.xpath('//div[@class="restaurant"]/text()'))

print('\nUsing the "text" property of the elements in the list of elements that are matched by the XPATH expression gives you the text of the elements themselves without the line breaks:')
print([e.text for e in tree.xpath('//div[@class="restaurant"]')])

print('\nUsing the "text_content()" method instead returns the text of the element *and the text of its children*:')
print([e.text_content() for e in tree.xpath('//div[@class="restaurant"]')])

print('\nThe same but using CSS Selectors (note the .getroot() method, because the selectors can only be applied to HTML elements, not to DOM trees):')
print([e.text_content() for e in tree.getroot().cssselect('.restaurant')])

Appending "/text()" to the XPATH gives you exactly the text that is in the element itself, including line-breaks that happen to be in the source code:
[' ', '\n      ', '\n      ', '\n    ', ' ', '\n      ', '\n      ', '\n    ', ' ', '\n      ', '\n      ', '\n    ']

Using the "text" property of the elements in the list of elements that are matched by the XPATH expression gives you the text of the elements themselves without the line breaks:
[' ', ' ', ' ']

Using the "text_content()" method instead returns the text of the element *and the text of its children*:
['  Pizzeria Roma \n       Here you can get ... ... \n       Read the full review here\n    ', '  Trattoria Napoli \n       Another restaurant ... ... \n       Read the full review here\n    ', '  Curry King \n       Some description. \n       Read the full review here\n    ']

The same but using CSS Selectors (note the .getroot() method, because the selectors can only be applied to HTML elements, not to DOM trees):
['  Pizze

In [78]:
import requests
from lxml.html import fromstring
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'}

htmlsource = requests.get('http://cssbook.net/d/restaurants/index.html', headers = headers).text
tree = fromstring(htmlsource)
print([e.text_content().strip() for e in tree.xpath('//h3')])

['Pizzeria Roma', 'Trattoria Napoli', 'Curry King']


In [150]:
baseurl = 'http://myreviewsite.com/amsterdam/hotels.html?page='
tenpages = [f'{baseurl}{i+1}' for i in range(10)]
print(tenpages)

['http://myreviewsite.com/amsterdam/hotels.html?page=1', 'http://myreviewsite.com/amsterdam/hotels.html?page=2', 'http://myreviewsite.com/amsterdam/hotels.html?page=3', 'http://myreviewsite.com/amsterdam/hotels.html?page=4', 'http://myreviewsite.com/amsterdam/hotels.html?page=5', 'http://myreviewsite.com/amsterdam/hotels.html?page=6', 'http://myreviewsite.com/amsterdam/hotels.html?page=7', 'http://myreviewsite.com/amsterdam/hotels.html?page=8', 'http://myreviewsite.com/amsterdam/hotels.html?page=9', 'http://myreviewsite.com/amsterdam/hotels.html?page=10']


In [151]:
import json
import pandas as pd  # not really needed: we only use pandas to show you the scraped data
from lxml.html import parse

BASEURL = 'http://cssbook.net/d/restaurants/'

def get_restaurants(url):
    '''takes the URL of an overview page as input and returns a list of (restaurantname, link) tuples'''
    tree = parse(url)
    restaurants_names = [e.text.strip() for e in tree.xpath('//div[@class="restaurant"]/h3')]
    restaurants_links = [e.attrib['href'] for e in tree.xpath('//div[@class="restaurant"]//a')]
    return list(zip(restaurants_names, restaurants_links))
   
def get_reviews(url):
    '''takes the URL of a page with reviews as input and yields reviews on it '''
    print(f"Downloading and parsing review page {url}...")
    tree = parse(url)
    usernames = [e.text.strip() for e in tree.xpath('//div[@class="review"]/h3')]
    reviewtexts = [e.text.strip() for e in tree.xpath('//div[@class="review"]/p')]
    ratings = [e.text.strip() for e in tree.xpath('//div[@class="review"]/div[@class="rating"]')]
    for u, rew, rat in zip(usernames, reviewtexts, ratings):
        review = {}
        review["username"] = u.replace("wrote:","").strip()
        review["rewiewtext"] = rew
        review["rating"] = rat
        yield review
    if len(tree.xpath('//span[@class="backbutton"]')) > 0:
        print("Found page with older reviews! I'll recursively call myself to parse that one as well!")
        nexturl = tree.xpath('//span[@class="backbutton"]/a')[0].attrib["href"]
        for nextreview in get_reviews(BASEURL+nexturl):
            yield nextreview
        
print("Retrieving all restaurants and their links...")
restaurantlinks = get_restaurants(BASEURL+'index.html')
print(restaurantlinks)

with open("reviews.json", mode = 'w') as f:
    for restaurant, link in restaurantlinks:
        print(f"Processing reviews for {restaurant}...")
        for r in get_reviews(BASEURL+link):
            r['restaurant'] = restaurant
            f.write(json.dumps(r))
            f.write("\n")
            
# you do not need to use pandas -- just added for illustration purposes here
# Note that we stored one JSON object per line instead of per file
df = pd.read_json("reviews.json", lines=True)
df

Retrieving all restaurants and their links...
[('Pizzeria Roma', 'review0001.html'), ('Trattoria Napoli', 'review0002.html'), ('Curry King', 'review0003.html')]
Processing reviews for Pizzeria Roma...
Downloading and parsing review page http://cssbook.net/d/restaurants/review0001.html...
Processing reviews for Trattoria Napoli...
Downloading and parsing review page http://cssbook.net/d/restaurants/review0002.html...
Processing reviews for Curry King...
Downloading and parsing review page http://cssbook.net/d/restaurants/review0003.html...
Found page with older reviews! I'll recursively call myself to parse that one as well!
Downloading and parsing review page http://cssbook.net/d/restaurants/review0003-1.html...
Found page with older reviews! I'll recursively call myself to parse that one as well!
Downloading and parsing review page http://cssbook.net/d/restaurants/review0003-2.html...


Unnamed: 0,username,rewiewtext,rating,restaurant
0,gourmet2536,"The best thing to do is ordering a full menu, ...",7.0/10,Pizzeria Roma
1,foodie12,The worst food I ever had!,1.0/10,Pizzeria Roma
2,mrsdiningout,"If nothing else is open, you can do it.",6.5/10,Trattoria Napoli
3,foodie12,Best Italian in town!,8.6/10,Trattoria Napoli
4,smith,Love it!,9.0/10,Curry King
5,foodie12,Superb!,9.2/10,Curry King
6,doesntlikeindianfood,"As expected, I didn't like it",4.0/10,Curry King
7,someotherguy,Try the yoghurt curry!,7.7/10,Curry King
8,tasty,We went here for dinner once and,7.0/10,Curry King
9,anna,I have mixed feeling about this one.,6.2/10,Curry King


In [144]:
%cat reviews.json

{"username": "smith", "rewiewtext": "Love it!", "rating": "9.0/10", "restaurant": "Curry King"}
{"username": "foodie12", "rewiewtext": "Superb!", "rating": "9.2/10", "restaurant": "Curry King"}
{"username": "doesntlikeindianfood", "rewiewtext": "As expected, I didn't like it", "rating": "4.0/10", "restaurant": "Curry King"}
{"username": "someotherguy", "rewiewtext": "Try the yoghurt curry!", "rating": "7.7/10", "restaurant": "Curry King"}
{"username": "tasty", "rewiewtext": "We went here for dinner once and", "rating": "7.0/10", "restaurant": "Curry King"}
{"username": "anna", "rewiewtext": "I have mixed feeling about this one.", "rating": "6.2/10", "restaurant": "Curry King"}
{"username": "hans", "rewiewtext": "Not much to say", "rating": "5.0/10", "restaurant": "Curry King"}
{"username": "bee1983", "rewiewtext": "I am a huge fan!", "rating": "10/10", "restaurant": "Curry King"}
{"username": "rhebjf", "rewiewtext": "The service is good, the food not so much", "rating": "6.5/10

In [125]:
x.attrib['href']

'review0001.html'

In [116]:
x.xpath('//a')

[<Element a at 0x7f924b85b458>,
 <Element a at 0x7f924b85b188>,
 <Element a at 0x7f924b85b4f8>]