In [1]:
from lxml.html import parse, fromstring
from urllib.request import urlopen
import json
# we only use pandas to show you the scraped data
import pandas as pd  
from lxml.html import parse
import requests

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import (
    WebDriverWait)
from selenium.webdriver.support import (
    expected_conditions as EC)
from selenium.webdriver.common.by import By
import time

In [2]:
# Download the overview page and parse it into an lxml tree. The `with`
# block closes the HTTP response as soon as parsing has finished instead
# of leaving it open until the object happens to be garbage-collected.
with urlopen(
        "https://cssbook.net/d/eat/index.html") as response:
    tree = parse(response)

# get the restaurant names via XPATH
print([e.text_content().strip() for e in
       tree.xpath("//h3")])

# get the restaurant names via CSS Selector
# (cssselect() is called on the root element, hence .getroot())
print([e.text_content().strip() for e in
       tree.getroot().cssselect("h3")])

['Pizzeria Roma', 'Trattoria Napoli', 'Curry King']
['Pizzeria Roma', 'Trattoria Napoli', 'Curry King']


In [5]:
# Three ways of extracting text from the <div class="restaurant"> elements
# of the previously parsed `tree`, plus the CSS-selector equivalent.

# 1) XPath text() nodes: every text node that is a direct child of the div.
print("Appending `/text()` to the XPATH gives you "
      "exactly the text that is in the element "
      "itself, including line-breaks that happen "
      "to be in the source code:" )
print(tree.xpath(
    "//div[@class='restaurant']/text()"))

# 2) The .text attribute of each matched element.
# (Fixed: the first literal now ends in a space; adjacent string literals
# are concatenated verbatim, which used to print "theelements".)
print("\nUsing the `text` property of the "
      "elements in the list of elements that are "
      "matched by the XPATH expression gives you "
      "the text of the elements themselves "
      "without the line breaks: ")
print([e.text for e in tree.xpath(
    "//div[@class='restaurant']")])

# 3) text_content(): the element's own text plus that of all descendants.
print("\nUsing the `text_content()` method "
      "instead returns the text of the element "
      "*and the text of its children*:")
print([e.text_content() for e in tree.xpath(
    "//div[@class='restaurant']")])

# 4) Same as 3), but selecting the divs with a CSS selector.
print("\nThe same but using CSS Selectors (note "
      "the .getroot() method, because the "
      "selectors can only be applied to HTML "
      "elements, not to DOM trees): ")
print([e.text_content() for e in
       tree.getroot().cssselect(".restaurant")])

Appending `/text()` to the XPATH gives you exactly the text that is in the element itself, including line-breaks that happen to be in the source code:
[' ', '\n      ', '\n      ', '\n    ', ' ', '\n      ', '\n      ', '\n    ', ' ', '\n      ', '\n      ', '\n    ']

Using the `text` property of theelements in the list of elements that are matched by the XPATH expression gives you the text of the elements themselves without the line breaks: 
[' ', ' ', ' ']

Using the `text_content()` method instead returns the text of the element *and the text of its children*:
['  Pizzeria Roma \n       Here you can get ... ... \n       Read the full review here\n    ', '  Trattoria Napoli \n       Another restaurant ... ... \n       Read the full review here\n    ', '  Curry King \n       Some description. \n       Read the full review here\n    ']

The same but using CSS Selectors (note the .getroot() method, because the selectors can only be applied to HTML elements, not to DOM trees): 
['  Pizz

In [51]:
# Grab every <a> element of the page once, then derive two parallel lists
# from it: the visible link text and the target stored in `href`.
linkelements = tree.xpath("//a")
linktexts = [anchor.text for anchor in linkelements]
links = [anchor.attrib["href"] for anchor in linkelements]

# One call, one list per output line.
print(linktexts, links, sep="\n")

['here', 'here', 'here']
['review0001.html', 'review0002.html', 'review0003.html']


In [52]:
import requests
from lxml.html import fromstring
# Send a desktop-Firefox User-Agent string instead of the default one
# that the requests library would announce.
headers = {"User-Agent": "Mozilla/5.0 (Windows "
    "NT 10.0; Win64; x64; rv:60.0) "
    "Gecko/20100101 Firefox/60.0"}

# requests never times out on its own, so without `timeout` this cell
# would hang indefinitely if the server stopped answering.
response = requests.get(
    "https://cssbook.net/d/eat/index.html",
    headers=headers, timeout=30)
# Raise on 4xx/5xx responses rather than silently parsing an error page.
response.raise_for_status()

htmlsource = response.text
tree = fromstring(htmlsource)
print([e.text_content().strip() for e in
       tree.xpath("//h3")])

['Pizzeria Roma', 'Trattoria Napoli', 'Curry King']


In [53]:
# Store the downloaded page on disk. The encoding is spelled out because
# open() otherwise falls back to the platform's locale encoding, which can
# fail with UnicodeEncodeError on characters outside that code page.
with open("test.html", mode="w", encoding="utf-8") as fo:
    fo.write(htmlsource)

In [54]:
baseurl = "https://reviews.com/?page="
# Page numbers are 1-based, so iterate over 1..10 directly.
tenpages = [baseurl + str(page) for page in range(1, 11)]
print(tenpages)

['https://reviews.com/?page=1', 'https://reviews.com/?page=2', 'https://reviews.com/?page=3', 'https://reviews.com/?page=4', 'https://reviews.com/?page=5', 'https://reviews.com/?page=6', 'https://reviews.com/?page=7', 'https://reviews.com/?page=8', 'https://reviews.com/?page=9', 'https://reviews.com/?page=10']


In [7]:
# Directory URL of the example site: the overview page ("index.html") and
# the relative links scraped from the pages are appended to this prefix.
BASEURL = "https://cssbook.net/d/eat/"

def get_restaurants(url):
  """Scrape the restaurants listed on an overview page.

  Takes the URL of an overview page as input and returns a list of
  (name, link) tuples. `name` is the stripped text of the <h3> directly
  inside a <div class="restaurant">; `link` is the unmodified `href`
  value of the first link found inside that same div.

  Name and link are taken from the same div, so they cannot get out of
  step when a div has no link or more than one. Divs that lack either a
  heading or a link are skipped.
  """
  # Close the HTTP response as soon as the page has been parsed.
  with urlopen(url) as response:
    tree = parse(response)

  restaurants = []
  for div in tree.xpath("//div[@class='restaurant']"):
    headings = div.xpath("./h3")
    anchors = div.xpath(".//a[@href]")
    if not headings or not anchors:
      continue
    # .text is None when the heading starts with a child element.
    name = (headings[0].text or "").strip()
    restaurants.append((name, anchors[0].attrib["href"]))
  return restaurants

def get_reviews(url):
  """Yield all reviews of a restaurant, following pagination.

  Starts at `url`, yields one dict per review found on the page and then
  follows the link inside <span class="backbutton"> to the next page,
  until a page has no such link (or links back to a page that was
  already downloaded).

  Each yielded dict has the keys "username", "reviewtext" and "rating".
  Progress messages are printed along the way.
  """
  # Local import so the function does not depend on another cell.
  from urllib.parse import urljoin

  visited = set()
  while True:
    visited.add(url)
    print(f"Downloading {url}...")
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(url) as response:
      tree = parse(response)
    # `.text` is None for elements that start with a child element,
    # hence the `or ""` fallback before stripping.
    names = [(e.text or "").strip() for e in
      tree.xpath("//div[@class='review']/h3")]
    texts = [(e.text or "").strip() for e in
      tree.xpath("//div[@class='review']/p")]
    ratings = [(e.text or "").strip() for e in tree.xpath(
      "//div[@class='rating']")]
    for u,txt,rating in zip(names,texts,ratings):
      review = {}
      # Removing "wrote:" can leave whitespace behind; strip it again.
      review["username"] = u.replace("wrote:","").strip()
      review["reviewtext"] = txt
      review["rating"] = rating
      yield review
    bb=tree.xpath("//span[@class='backbutton']/a[@href]")
    if not bb:
      print("No more pages found.")
      break
    # Resolve the link against the page it was found on, so that this
    # works for any start URL and for absolute as well as relative hrefs.
    next_url = urljoin(url, bb[0].attrib["href"])
    if next_url in visited:
      # Pagination loops back on itself: stop instead of cycling forever.
      print("No more pages found.")
      break
    print("Processing next page")
    url = next_url
        
print("Retrieving all restaurants...")
links = get_restaurants(BASEURL+"index.html")
print(links)

# Stream the reviews to disk in JSON Lines format: one JSON object per
# line, each tagged with the restaurant it belongs to.
with open("reviews.json", mode="w") as outfile:
    for restaurant, link in links:
        print(f"Processing {restaurant}...")
        for review in get_reviews(BASEURL+link):
            review["restaurant"] = restaurant
            outfile.write(json.dumps(review) + "\n")

# The result can be loaded with pandas; lines=True tells read_json
# that the file holds one JSON document per line.
df = pd.read_json("reviews.json", lines=True)
print(df)

Retrieving all restaurants...
[('Pizzeria Roma', 'review0001.html'), ('Trattoria Napoli', 'review0002.html'), ('Curry King', 'review0003.html')]
Processing Pizzeria Roma...
Downloading https://cssbook.net/d/eat/review0001.html...
No more pages found.
Processing Trattoria Napoli...
Downloading https://cssbook.net/d/eat/review0002.html...
No more pages found.
Processing Curry King...
Downloading https://cssbook.net/d/eat/review0003.html...
Processing next page
Downloading https://cssbook.net/d/eat/review0003-1.html...
Processing next page
Downloading https://cssbook.net/d/eat/review0003-2.html...
No more pages found.
          username                                         reviewtext  rating  \
0     gourmet2536   The best thing to do is ordering a full menu, ...  7.0/10   
1        foodie12                          The worst food I ever had!  1.0/10   
2    mrsdiningout             If nothing else is open, you can do it.  6.5/10   
3        foodie12                               Best 

In [59]:
driver = webdriver.Firefox()
# Everything after the browser has been started lives inside the try
# block, so the browser is closed even when loading the page or locating
# the search box fails.
try:
    driver.implicitly_wait(10)
    driver.get("https://www.duckduckgo.com")
    # The find_element_by_* helpers were removed in Selenium 4; use
    # find_element() with a By locator instead.
    element = driver.find_element(By.NAME, "q")
    # also check out other locator strategies such as
    # By.XPATH or By.CSS_SELECTOR
    element.send_keys("TinTin")
    element.send_keys(Keys.RETURN)
    driver.find_element(
        By.CSS_SELECTOR, "#links a").click()
    # let's be cautious and wait 10 seconds
    # so that everything is loaded
    time.sleep(10)
    driver.save_screenshot("screenshotTinTin.png")
finally:
    # whatever happens, close the browser
    driver.quit()

In [6]:
URL = "https://www.geenstijl.nl/5160019/page"

# circumvent cookie wall by setting a specific
# cookie: the key-value pair (cpc: 10)
client = requests.session()
# First request: lets the session collect the cookies the site sets.
# requests has no default timeout, so one is given explicitly to keep
# the cell from hanging on an unresponsive server.
r = client.get(URL, timeout=30)

# Copy the received cookies into a real list before adding our own pair.
cookies = list(client.cookies.items())
cookies.append(("cpc","10"))
response = client.get(
    URL, cookies=dict(cookies), timeout=30)
# end circumvention

tree = fromstring(response.text)
allcomments = [e.text_content().strip() for e in
               tree.cssselect(".cmt-content")]
print(f"There are {len(allcomments)} comments.")

Een kudtkoekiewall. Omdat dat moet, van de kudtkoekiewet.
There are 318 comments.


In [61]:
# Shorter variant: send the (cpc: 10) cookie directly with the request.
# An explicit timeout is passed because requests would otherwise wait
# indefinitely for an unresponsive server.
r = requests.get(URL, cookies={"cpc": "10"}, timeout=30)
tree = fromstring(r.text)
allcomments = [e.text_content().strip() for e in
               tree.cssselect(".cmt-content")]
print(f"There are {len(allcomments)} comments.")

There are 318 comments.
