In [6]:
from lxml.html import parse, fromstring
from urllib.request import urlopen
import json
# we only use pandas to show you the scraped data
import pandas as pd  
from lxml.html import parse
import requests

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import (
    WebDriverWait)
from selenium.webdriver.support import (
    expected_conditions as EC)
from selenium.webdriver.common.by import By
import time


In [8]:
tree=parse(urlopen(
    "https://cssbook.net/d/eat/index.html"))

# get the restaurant names via XPATH 
print([e.text_content().strip() for e in 
       tree.xpath('//h3')])

# get the restaurant names via CSS Selector
print([e.text_content().strip() for e in
       tree.getroot().cssselect('h3')])

['Pizzeria Roma', 'Trattoria Napoli', 'Curry King']
['Pizzeria Roma', 'Trattoria Napoli', 'Curry King']


In [11]:
# three ways of extracting text
print('Appending "/text()" to the XPATH gives you'
      'exactly the text that is in the element'
      'itself, including line-breaks that happen'
      'to be in the source code:' )
print(tree.xpath(
    '//div[@class="restaurant"]/text()'))

print('\nUsing the "text" property of the'
      'elements in the list of elements that are'
      'matched by the XPATH expression gives you'
      'the text of the elements themselves'
      'without the line breaks:')
print([e.text for e in tree.xpath(
    '//div[@class="restaurant"]')])

print('\nUsing the "text_content()" method'
      'instead returns the text of the element'
      '*and the text of its children*:')
print([e.text_content() for e in tree.xpath(
    '//div[@class="restaurant"]')])

print('\nThe same but using CSS Selectors (note'
      'the .getroot() method, because the'
      'selectors can only be applied to HTML'
      'elements, not to DOM trees):')
print([e.text_content() for e in
       tree.getroot().cssselect('.restaurant')])

Appending "/text()" to the XPATH gives youexactly the text that is in the elementitself, including line-breaks that happento be in the source code:
[' ', '\n      ', '\n      ', '\n    ', ' ', '\n      ', '\n      ', '\n    ', ' ', '\n      ', '\n      ', '\n    ']

Using the "text" property of theelements in the list of elements that arematched by the XPATH expression gives youthe text of the elements themselveswithout the line breaks:
[' ', ' ', ' ']

Using the "text_content()" methodinstead returns the text of the element*and the text of its children*:
['  Pizzeria Roma \n       Here you can get ... ... \n       Read the full review here\n    ', '  Trattoria Napoli \n       Another restaurant ... ... \n       Read the full review here\n    ', '  Curry King \n       Some description. \n       Read the full review here\n    ']

The same but using CSS Selectors (notethe .getroot() method, because theselectors can only be applied to HTMLelements, not to DOM trees):
['  Pizzeria Roma \n 

In [12]:
linkelements = tree.xpath('//a')
linktexts = [e.text for e in linkelements]
links = [e.attrib['href'] for e in linkelements]

print(linktexts)
print(links)

['here', 'here', 'here']
['review0001.html', 'review0002.html', 'review0003.html']


In [16]:
import requests
from lxml.html import fromstring
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'}

htmlsource = requests.get('https://cssbook.net/d/eat/index.html', headers = headers).text
tree = fromstring(htmlsource)
print([e.text_content().strip() for e in tree.xpath('//h3')])

['Pizzeria Roma', 'Trattoria Napoli', 'Curry King']


In [14]:
with open('test.html', mode='w') as fo:
    fo.write(htmlsource)

In [17]:
baseurl="http://myreviews.com/nl/hotel.html?page="
tenpages = [f"{baseurl}{i+1}" for i in range(10)]
print(tenpages)

['http://myreviews.com/nl/hotel.html?page=1', 'http://myreviews.com/nl/hotel.html?page=2', 'http://myreviews.com/nl/hotel.html?page=3', 'http://myreviews.com/nl/hotel.html?page=4', 'http://myreviews.com/nl/hotel.html?page=5', 'http://myreviews.com/nl/hotel.html?page=6', 'http://myreviews.com/nl/hotel.html?page=7', 'http://myreviews.com/nl/hotel.html?page=8', 'http://myreviews.com/nl/hotel.html?page=9', 'http://myreviews.com/nl/hotel.html?page=10']


In [19]:
BASEURL = 'https://cssbook.net/d/eat/'

def get_restaurants(url):
    '''takes the URL of an overview page as input and returns a list of (restaurantname, link) tuples'''
    tree = parse(urlopen(url))
    restaurants_names = [e.text.strip() for e in tree.xpath('//div[@class="restaurant"]/h3')]
    restaurants_links = [e.attrib['href'] for e in tree.xpath('//div[@class="restaurant"]//a')]
    return list(zip(restaurants_names, restaurants_links))
   
def get_reviews(url):
    '''takes the URL of a page with reviews as input and yields reviews on it '''
    while True:
        print(f"Downloading and parsing review page {url}...")
        tree = parse(urlopen(url))
        usernames = [e.text.strip() for e in tree.xpath('//div[@class="review"]/h3')]
        reviewtexts = [e.text.strip() for e in tree.xpath('//div[@class="review"]/p')]
        ratings = [e.text.strip() for e in tree.xpath('//div[@class="review"]/div[@class="rating"]')]
        for u, rew, rat in zip(usernames, reviewtexts, ratings):
            review = {}
            review["username"] = u.replace("wrote:","").strip()
            review["reviewtext"] = rew
            review["rating"] = rat
            yield review
        if len(tree.xpath('//span[@class="backbutton"]')) > 0:
            print("Found page with older reviews! I'll process that one next")
            url = BASEURL+tree.xpath('//span[@class="backbutton"]/a')[0].attrib["href"]
        else:
            print("No more pages found.")
            break
        
print("Retrieving all restaurants and their links...")
restaurantlinks = get_restaurants(BASEURL+'index.html')
print(restaurantlinks)

with open("reviews.json", mode = 'w') as f:
    for restaurant, link in restaurantlinks:
        print(f"Processing reviews for {restaurant}...")
        for r in get_reviews(BASEURL+link):
            r['restaurant'] = restaurant
            f.write(json.dumps(r))
            f.write("\n")
            
# you do not need to use pandas -- just added for illustration purposes here
# Note that we stored one JSON object per line instead of per file
df = pd.read_json("reviews.json", lines=True)
df['reviewtext'] = df['reviewtext'].map(lambda x: x[:30]) # shorten for printing
print(df)

Retrieving all restaurants and their links...
[('Pizzeria Roma', 'review0001.html'), ('Trattoria Napoli', 'review0002.html'), ('Curry King', 'review0003.html')]
Processing reviews for Pizzeria Roma...
Downloading and parsing review page https://cssbook.net/d/eat/review0001.html...
No more pages found.
Processing reviews for Trattoria Napoli...
Downloading and parsing review page https://cssbook.net/d/eat/review0002.html...
No more pages found.
Processing reviews for Curry King...
Downloading and parsing review page https://cssbook.net/d/eat/review0003.html...
Found page with older reviews! I'll process that one next
Downloading and parsing review page https://cssbook.net/d/eat/review0003-1.html...
Found page with older reviews! I'll process that one next
Downloading and parsing review page https://cssbook.net/d/eat/review0003-2.html...
No more pages found.
         username                      reviewtext  rating        restaurant
0     gourmet2536  The best thing to do is orderi  7.0/

In [24]:
df['rewiewtext'] = df['rewiewtext'].map(lambda x: x[:30])
print(df)

         username                      rewiewtext  rating  restaurant
0     gourmet2536  The best thing to do is orderi  7.0/10  Pizzeria R
1        foodie12      The worst food I ever had!  1.0/10  Pizzeria R
2    mrsdiningout  If nothing else is open, you c  6.5/10  Trattoria 
3        foodie12           Best Italian in town!  8.6/10  Trattoria 
4           smith                        Love it!  9.0/10  Curry King
5        foodie12                         Superb!  9.2/10  Curry King
6      dontlikeit   As expected, I didn't like it  4.0/10  Curry King
7        otherguy          Try the yoghurt curry!  7.7/10  Curry King
8           tasty  We went here for dinner once a  7.0/10  Curry King
9            anna  I have mixed feeling about thi  6.2/10  Curry King
10           hans                 Not much to say  5.0/10  Curry King
11        bee1983                I am a huge fan!   10/10  Curry King
12         rhebjf  The service is good, the food   6.5/10  Curry King
13  foodcritic555   

In [30]:
df.__repr__?

In [116]:
x.xpath('//a')

[<Element a at 0x7f924b85b458>,
 <Element a at 0x7f924b85b188>,
 <Element a at 0x7f924b85b4f8>]

In [20]:
driver = webdriver.Firefox()
driver.get("https://www.google.com")
element = driver.find_element_by_name("q")
# also check out other options such as 
# .find_element_by_xpath
# or .find_element_by_css_selector
element.send_keys("TinTin")
element.send_keys(Keys.RETURN)
try:
    # let's wait until relevant stuff is loaded
    element2 = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(
            (By.ID, "rso")))
    element3 = driver.\
    find_element_by_partial_link_text("Tintin")
    element3.click()
    # let's be cautious and wait 10 seconds
    # so that everything is loaded
    time.sleep(10)
    driver.save_screenshot("screenshotTinTin.png")
finally:
    # whatever happens, let's make sure that even
    # if there is an error, we close the browser
    driver.quit()

ElementClickInterceptedException: Message: Element <a href="https://en.wikipedia.org/wiki/The_Adventures_of_Tintin"> is not clickable at point (180,343) because another element <div class="jw8mI"> obscures it


In [13]:
URL = 'http://www.geenstijl.nl/mt/archieven/2014/05/das_toch_niet_normaal.html'

# circumvent cookie wall by setting a specific cookie: the key-value pair (cpc: 10)
client = requests.session()
r = client.get(URL)
cookies = client.cookies.items()
cookies.append(('cpc','10'))
response = client.get(URL,cookies=dict(cookies))
# end circumvention

tree = fromstring(response.text)
allcomments = [e.text_content().strip() for e in tree.cssselect(".cmt-content")]
print(f"There are {len(allcomments)} comments.")

There are 72 comments.


In [15]:
r = requests.get(URL,cookies={"cpc": "10"})
tree = fromstring(r.text)
allcomments = [e.text_content().strip() for e in tree.cssselect(".cmt-content")]
print(f"There are {len(allcomments)} comments.")

There are 72 comments.


['@tiswat | 05-05-14 | 22:57\nOud wijf.',
 "Een VZ800 Marauder, wat een giga kutmotor is dat. Na 10 km. heb je al een houten kont. Ooit zo'n ding nieuw gekocht. Anderhalf jaar later was ik blij dat ik er van af was.",
 'Prachtige omschrijving ook; Een vadsig, dik postuur...\nWaarschijnlijk staan ze hem op te wachten bij hoofdkantoor Telegraaf, ze zijn nogal snel in hun eer aangetast.',
 'Ze zijn hem al aan het zoeken?\nwww.telegraaf.nl/binnenland/22589137/_...',
 'proefrit is altijd mogelijk, maar vraag dan het rijbewijs als borg. Niet bij? Dan even terug naar huis en ophalen, anders elders motor kopen. Hoe moeilijk kan het zijn?',
 'Moet je maar geen zaken doen met Rifapen.\nEconomisch uitsluiten, dat kudtvolk! Vrijheid is immers ook vrijheid geen zaken te doen met lieden van een bepaalde etniciteit.',
 'Er worden steeds meer motorfietsen gestolen, ik durf hem ook nergens te parkeren, alleen als ik er vlak naast zit.\nLichtgetint? Verbaast me niks, ze kiezen meestal voor het gemak, en