# Webscraping intro

## Scraping rules
- You should check a site's terms and conditions before you scrape it. It's the site owner's data, and they likely have some rules to govern its use.
- Be nice - A computer will send web requests much quicker than a user can. Make sure you space out your requests a bit so that you don't hammer the site's server.
- Scrapers break - Sites change their layout all the time. If that happens, be prepared to rewrite your code.
- Web pages are inconsistent - There's sometimes some manual clean up that has to happen even after you've gotten your data.

<h3>Import necessary modules</h3>

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import os

## requests
- requests executes HTTP requests, like GET
- The response object returned by requests holds the results of the request: the page content plus other items such as the HTTP status code and headers.
- requests only gets the page content without any parsing.
- Beautiful Soup does the parsing of the HTML and finding content within the HTML.

In [None]:
from urllib.parse import quote_plus

# Base search endpoint; the search term becomes the value of `searchTerm`.
url = 'https://www.zara.com/uk/en/search?searchTerm='
keywords = input("Search: ")
# Percent-encode the typed text so spaces, '&', '#', etc. stay inside the
# query value instead of corrupting the URL.
url += quote_plus(keywords)
url

### Get result page as function

In [None]:
def get_soup(url, keywords='', timeout=10):
    """Fetch ``url + keywords`` and parse the response body with Beautiful Soup.

    Args:
        url: Address to request; ``keywords`` is appended to it verbatim.
        keywords: Optional suffix (e.g. a search term) added to ``url``.
        timeout: Seconds to wait for the server. Without a timeout a
            stalled server would block the call indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built with the ``lxml`` parser, or ``None``
        when the server answers with any status code other than 200.
    """
    response = requests.get(url + keywords, timeout=timeout)
    if response.status_code != 200:
        return None
    return BeautifulSoup(response.content, 'lxml')

In [None]:
# `url` already ends with the search term (it was appended when the URL was
# built), so pass it alone; passing `keywords` as well would make get_soup
# append the term a second time.
soup = get_soup(url)
soup.html;

## Headless Selenium

In [None]:
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options  
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

In [None]:
# Browser configuration used when the Selenium driver is started.
chrome_options = Options()
# Run the Chrome Canary build from its macOS application bundle.
chrome_options.binary_location = '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary'
# Command-line switches passed to Chrome.
chrome_options.add_argument("--headless")
chrome_options.add_argument("--incognito")

In [None]:
# Start Chrome via the chromedriver binary; the relative path is resolved
# against the current working directory.
# NOTE(review): newer Selenium releases deprecate the `executable_path` and
# `chrome_options` keyword names — confirm against the installed version.
driver = webdriver.Chrome(
    executable_path=os.path.abspath('../_driver_headless/chromedriver'),
    chrome_options=chrome_options,
)
# Load the search URL, then show the address the browser reports.
driver.get(url)
driver.current_url

### XML etree

In [None]:
from lxml import etree

In [None]:
# Parse the HTML currently held by the Selenium browser into an lxml tree.
tree = etree.HTML(driver.page_source)
# Serialise the tree back to indented HTML so it can be inspected.
result = etree.tostring(tree, method="html", pretty_print=True)
result;

In [None]:
# All <img> elements in the page; xpath() already returns a list, so no
# comprehension is needed to display it.
tree.xpath("//img")

In [None]:
# XPath positions are 1-based, not 0-based: div[2] is the second <div>
# child of <body>.
list(map(etree.tostring, tree.xpath("/html/body/div[2]//a//img")))

In [None]:
# First 100 characters of the serialised markup of every <div> that is the
# second <div> child of its parent.
[
    etree.tostring(div)[:100]
    for div in tree.xpath("//div[2]")
]

In [None]:
# Summarise the class and id attributes of every <div> in the page.
[
    'class:{}, id:{}'.format(div.xpath("@class"), div.xpath("@id"))
    for div in tree.xpath("//div")
]

In [None]:
# Summarise the class and id attributes of every <section> in the page.
[
    'class:{}, id:{}'.format(section.xpath("@class"), section.xpath("@id"))
    for section in tree.xpath("//section")
]

In [None]:
# Serialise every direct child of the element whose id is "products".
list(map(etree.tostring, tree.xpath("//*[@id='products']/*")))

In [None]:
# Serialise every direct child of the <ul> whose class attribute is exactly
# "product-list _productList".
list(map(etree.tostring, tree.xpath("//ul[@class='product-list _productList']/*")))

In [None]:
# Elements whose text content (descendants included) contains "dress";
# only the first 100 characters of each serialisation are shown.
[
    etree.tostring(element)[:100]
    for element in tree.xpath("//*[contains(., 'dress')]")
]

In [None]:
# Class and name attributes of every element whose text content contains
# "product". The second value is read from @name, so it is labelled "name"
# rather than "id".
[
    'class:{}, name:{}'.format(element.xpath("@class"), element.xpath("@name"))
    for element in tree.xpath("//*[contains(., 'product')]")
]

In [None]:
# <section> elements whose class attribute is exactly "_results".
# Other class of interest: product-list _productList
[
    etree.tostring(section)[:100]
    for section in tree.xpath("//section[@class='_results']")
]

In [None]:
# "section._results" is CSS-selector syntax; XPath reads it as an element
# *named* "section._results", so it would not select <section class="_results">.
# The XPath equivalent of the CSS class test matches "_results" as one of the
# whitespace-separated tokens in @class.
# Other class of interest: product-list _productList
[
    etree.tostring(section)[:100]
    for section in tree.xpath(
        "//section[contains(concat(' ', normalize-space(@class), ' '), ' _results ')]"
    )
]

In [None]:
# The list element is <ul>; the previous "lu" tag name was a typo and could
# never match the product list.
[
    etree.tostring(ul)[:100]
    for ul in tree.xpath("//ul[@class='product-list _productList']")
]

In [None]:
# Any element whose class attribute is exactly "product _product"; only the
# first 100 characters of each serialisation are shown.
[
    etree.tostring(item)[:100]
    for item in tree.xpath("//*[@class='product _product']")
]

In [None]:
# Look up elements by id attribute; xpath() returns a list of matches.
tree.xpath("//*[@id='product-6504767']")

In [None]:
# Serialise each product <li> reached through the absolute path to the
# results list.
product_list = [
    etree.tostring(li)
    for li in tree.xpath("/html/body/div[2]/section/div/section/ul/li")
]
# Show the product-info blocks. Other selectors worth trying:
#   /html/body/div[2]/section
#   //*[@id="product-6504767"]
#   //a[@class="item _item"]/@href
#   class="_ariaResults wai-aria-messages"
tree.xpath('//div[@class="product-info _product-info"]')


In [None]:
# Display the serialised product entries; product_list is already a list,
# so it can be shown directly.
product_list

## Logging in to a web server, e.g. wikipedia

Store your credentials in an encrypted/protected file (line 1 = user name, line 2 = password).

In [None]:
# Read the login credentials: line 1 holds the user name, line 2 the password.
with open('../credentials.txt', encoding='utf-8') as f:
    # splitlines() also copes with Windows (\r\n) line endings, which
    # split('\n') would leave behind as a stray '\r' on each value.
    contents = f.read().splitlines()

if len(contents) < 2:
    raise ValueError(
        "'../credentials.txt' must contain the user name on line 1 "
        "and the password on line 2"
    )
username, password = contents[:2]

### Construct object that contains requested login data
Inspect the login-form in your browser

<h3>get the value of the login token</h3>

In [None]:
def get_login_token(response):
    """Extract the login token embedded in a login page.

    Args:
        response: Object whose ``text`` attribute holds the HTML of the
            login form.

    Returns:
        The ``value`` attribute of the ``<input name="wpLoginToken">``
        element found in that HTML.
    """
    page = BeautifulSoup(response.text, 'lxml')
    token_field = page.find('input', attrs={'name': "wpLoginToken"})
    return token_field.get('value')

In [None]:
# Form fields submitted with the login POST.
payload = {
    'wpName': username,
    'wpPassword': password,
    'wploginattempt': 'Log in',
    'wpEditToken': '+\\',
    'title': 'Special:UserLogin',
    'authAction': 'login',
    'force': '',
    'wpForceHttps': '1',
    'wpFromhttp': '1',
    # Placeholder only: the real token is read from the login page and
    # assigned to this key just before the login request is sent.
    'wpLoginToken': None,
}

<h3>Setup a session, login, and get data</h3>

In [None]:
# One session object is used for all three requests so that cookies set
# during login are sent with the follow-up request.
with requests.Session() as s:
    # Fetch the login form first; it contains the wpLoginToken value that
    # must be sent back with the credentials.
    response = s.get(
        'https://en.wikipedia.org/w/index.php?title=Special:UserLogin&returnto=Main+Page',
        timeout=10,
    )
    # Fail early on an HTTP error instead of parsing an error page.
    response.raise_for_status()

    # Set login token
    payload['wpLoginToken'] = get_login_token(response)

    # Send the login request
    response_post = s.post(
        'https://en.wikipedia.org/w/index.php?title=Special:UserLogin&action=submitlogin&type=login',
        data=payload,
        timeout=10,
    )
    response_post.raise_for_status()

    # Get another page and check if we're still logged in
    response = s.get('https://en.wikipedia.org/wiki/Special:Watchlist', timeout=10)
    response.raise_for_status()
    data = BeautifulSoup(response.content, 'lxml')

In [None]:
# find() returns None when the page has no 'mw-changeslist' element, so check
# before calling get_text() rather than failing with an AttributeError.
changes = data.find('div', class_='mw-changeslist')
if changes is None:
    print("No 'mw-changeslist' element found in the page - is the login still valid?")
else:
    print(changes.get_text())