# Webscraping intro

## Scraping rules
- You should check a site's terms and conditions before you scrape them. It's their data and they likely have some rules to govern it.
- Be nice - A computer will send web requests much quicker than a user can. Make sure you space out your requests a bit so that you don't hammer the site's server.
- Scrapers break - Sites change their layout all the time. If that happens, be prepared to rewrite your code.
- Web pages are inconsistent - There's sometimes some manual clean up that has to happen even after you've gotten your data.

<h3>Import necessary modules</h3>

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import os

## requests
- requests executes HTTP requests, like GET
- The response object holds the results of the request: the page content plus other items such as the HTTP status code and headers.
- requests only gets the page content without any parsing.
- Beautiful Soup does the parsing of the HTML and finding content within the HTML.

In [None]:
# Fetch a sample search page and report whether the server answered 200 OK.
url = "http://www.epicurious.com/search/Tofu Chili"
response = requests.get(url)
print("Success" if response.status_code == 200 else "Failure")

### requests - connect as function

In [None]:
def connect(url):
    """Issue a GET request for ``url`` and report the outcome.

    Prints a success message containing the status code when the server
    answers 200, otherwise prints a failure message.  The response object is
    returned in both cases.
    """
    response = requests.get(url)
    status = response.status_code
    if status != 200:
        print('connection failed')
    else:
        print('successfully connected, response code: {}'.format(status))
    return response

In [None]:
connect(url);

### requests - passing cookies

In [None]:
# Use the session as a context manager so it is closed even if the request
# raises.
with requests.Session() as session:
    # You may pass in custom cookie
    r = session.get('http://httpbin.org/get', cookies={'my-cookie': 'browser'})
    print(r.text)

### requests - streaming
- http://docs.python-requests.org/en/master/user/advanced/#streaming-requests

In [None]:
# Stream the response line by line instead of loading the whole body at once.
# The context manager releases the connection when the block is left.
with requests.get('http://httpbin.org/stream/20', stream=True) as r:
    for line in r.iter_lines():
        # filter out keep-alive new lines
        if line:
            decoded_line = line.decode('utf-8')
            print(json.loads(decoded_line))

### requests pass search keyword

In [None]:
keywords = input("Please enter the things you want to see in a recipe: ")
connect('http://www.epicurious.com/search/' + keywords)

## BeautifulSoup

In [None]:
n_chars = 1000
soup = BeautifulSoup(connect(url).content, 'lxml')
print(soup.prettify()[:n_chars])

### Get result page as function

In [None]:
def result_page(url, keywords=''):
    """Download ``url + keywords`` and parse the body with BeautifulSoup.

    Returns the parsed page (``lxml`` parser) when the server answers 200,
    otherwise ``None``.
    """
    response = requests.get(url + keywords)
    if response.status_code == 200:
        return BeautifulSoup(response.content, 'lxml')
    return None

In [None]:
keywords = input("Please enter the things you want to see in a recipe: ")
url = 'http://www.epicurious.com/search/'
results_page = result_page(url, keywords)
# print(results_page.prettify()[100:])

## Headless Selenium

In [None]:
# https://duo.com/decipher/driving-headless-chrome-with-python
# Install Chrome Canary
# Download latest Chromedriver (to Downloads)
# !mkdir going_headless

In [None]:
# !cp ~/Downloads/chromedriver going_headless/

In [None]:
# !ls going_headless/

In [None]:
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options  
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By 
# from selenium.webdriver.support.ui import WebDriverWait 
# from selenium.webdriver.support import expected_conditions as EC 
# from selenium.common.exceptions import TimeoutException

In [None]:
chrome_options = Options()  
chrome_options.add_argument("--headless")
chrome_options.add_argument("--incognito")
chrome_options.binary_location = '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary'

### Selenium interactive

In [None]:
driver = webdriver.Chrome(executable_path=os.path.abspath('going_headless/chromedriver'), chrome_options=chrome_options)  
driver.get("http://www.duo.com")

### General driver functions

In [None]:
# get_cookies is a method: call it so the cookies are printed rather than the
# bound-method object.
print(driver.name, driver.title, driver.current_url, driver.get_cookies())

In [None]:
# Other driver functions
driver.back()
driver.forward()
driver.refresh()
driver.quit()
# driver.close()

#### Interactive functions

In [None]:
# magnifying_glass = driver.find_element_by_id("js-open-icon")  
# if magnifying_glass.is_displayed():  
#     magnifying_glass.click()  
# else:  
#     menu_button = driver.find_element_by_css_selector(".menu-trigger.local")  
#     menu_button.click()

In [None]:
# search_field = driver.find_element_by_id("site-search")  
# search_field.clear()  
# search_field.send_keys("Olabode")  
# search_field.send_keys(Keys.RETURN)  
# assert "Looking Back at Android Security in 2016" in driver.page_source   
# driver.page_source[:1000]

In [None]:
driver.close()

### Browser Selenium

In [None]:
url ='https://www.zara.com/uk/en/search?searchTerm='
keyword = 'dress'
url += keyword

In [None]:
# menu_man = driver.find_element_by_partial_link_text('MAN').click()
url ='https://www.zara.com/uk/en/search?searchTerm='
keyword = 'man trousers'
url += keyword

In [None]:
browser_options = Options()  
browser_options.add_argument("--incognito")
browser_options.binary_location = '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary'

In [None]:
# Shut down any browser sessions left over from earlier cells.  On a fresh
# kernel ``browser`` (and possibly ``driver``) does not exist yet, so the names
# are looked up defensively instead of raising NameError.
for _name in ('driver', 'browser'):
    _stale = globals().get(_name)
    if _stale is not None:
        try:
            _stale.quit()
        except Exception as exc:  # e.g. the session is already gone
            print('could not quit {}: {}'.format(_name, exc))
browser = webdriver.Chrome(executable_path=os.path.abspath('going_headless/chromedriver'), chrome_options=browser_options)
browser.get(url)

In [None]:
# //*[@id="catalog-area"]/img
# //*[@id="catalog-area"]/script[1]/text()
# //*[@id="catalog-area"]/img

# # country popup
# <div class="popup _popup" role="dialog" id="geolocation-popup" style="z-index: 500; left: 219px; top: 193px; opacity: 1;"><div class="_popup-wrapper"><div class="popup-header "><h1 class="popup-title " data-first="true" tabindex="0" aria-label="."></h1><div class="close  _closeHandler" role="button" aria-label="close" tabindex="0"><i class="icon icon-close"></i></div></div><div class="popup-navigation"><div class="componentNav no-display _componentNav"><div class="prevItem _prevItem"><span class="arrow left-arrow"></span></div><div class="nextItem _nextItem"><span class="arrow right-arrow"></span></div></div></div><div class="popup-content" style="height: auto; min-height: 0px;"><div class="content _content"><header><h2 class="popup-title">Hello,</h2></header><section class="" data-controller="shared/geolocation-main-controller"><div class="info">You are accessing this website from Nederland / Netherlands. Would you like to visit our website in Nederland / Netherlands?</div><section class="button-grp"><button class="button-primary button-big _confirm">Yes, go to the website for Nederland / Netherlands</button><button class="button-secondary button-big _closeHandler">No, continue on the website for United Kingdom</button></section><div class="footer"><span>We are present in over 50 stores, please select yours <a class="_country-selector-trigger">here</a></span></div></section></div></div><div class="popup-footer"><div data-last="true" tabindex="0" aria-lable="."><div></div></div></div></div><div id="loading" style="position: absolute; display: none;"></div></div>
# .popup .popup-header .close
# #geolocation-popup > div._popup-wrapper > div.popup-header > div
# #geolocation-popup > div._popup-wrapper > div.popup-content > div > section > section > button.button-secondary.button-big._closeHandler

# var r = Math.random(),a = r * 10000000000000,i = document.createElement("IMG");i.src = "https://ad.doubleclick.net/ddm/activity/src=8370426;type=counter;cat=remar0;dc_lat=;dc_rdid=;tag_for_child_directed_treatment=;ord=" + a;document.body.appendChild(i);

In [None]:
# #products > ul
# //*[@id="products"]/ul
# //*[@id="product-img-5338551"]
# //*[@id="product-5338551"]/a/div
# #product-5338551 > a > div

In [None]:
# Collect id, timestamp, description and image URL of the product images on
# the page currently loaded in ``browser`` into the DataFrame ``df``.
import time
import pandas as pd

# Seconds to wait after each scroll before reading further elements.
SCROLL_PAUSE_TIME = 6
# SCROLL_HEIGHT = 'document.body.scrollHeight'
df = pd.DataFrame()
for i, tag in enumerate(browser.find_elements_by_xpath('//*[@id="products"]/ul/li/a/div/img')):
    # Scroll every 4 items
    if i % 4 == 0:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_TIME)
        # This check only runs on multiples of 4, so the loop stops at i == 44,
        # not at the first index above 40.
        if i > 40:
            break
    id_ = tag.get_attribute('id')
    alt = tag.get_attribute('alt')
    src = tag.get_attribute('src')
    # Third dash-separated part of the element id; assumes ids shaped like
    # "product-img-5338551" -- TODO confirm against the live page.
    df.loc[i, 'id'] = id_.split('-')[2]
    # Text after the first "=" in the image URL; presumably a timestamp query
    # value -- verify.
    df.loc[i, 'ts'] = src.split('=')[1]
    df.loc[i, 'description'] = alt
    df.loc[i, 'source'] = src
    
    # Backspaces move the cursor back so the running index overwrites itself.
    print('\b\b\b{}'.format(i), sep='', end='', flush=True)

In [None]:
df

In [None]:
driver.quit()
driver = webdriver.Chrome(executable_path=os.path.abspath('going_headless/chromedriver'), chrome_options=chrome_options) 
driver.get(url)

In [None]:
driver.current_url

In [None]:
# menu_man = driver.find_element_by_partial_link_text('MAN').click()

In [None]:
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')

In [None]:
import time
SCROLL_PAUSE_TIME = 2

# Print the source of every image in the parsed page, scrolling the live
# browser to the bottom (after a pause) before each one.
for image in soup.find_all('img'):
    time.sleep(SCROLL_PAUSE_TIME)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # image['src'] raises KeyError for <img> tags without a src attribute;
    # .get() lets those be skipped instead.
    src = image.get('src')
    if src is not None:
        print(src)

In [None]:
# Print the text of every list-item link whose text starts with "MAN".
# BeautifulSoup tags are parsed copies of the HTML and cannot be clicked (the
# former tag.a.click() call always failed and was silently swallowed); to
# click a link, locate the element through the Selenium driver instead.
for tag in soup.find_all('li'):
    link = tag.a
    if link is None:
        # <li> without a link
        continue
    if link.text[:3] == 'MAN':
        print(link.text)

In [None]:
# Map "<alt words joined by _>_<product id>" to the absolute image URL for
# every image whose src starts with "//static"; other sources are printed.
styles = {}
for tag in soup.find_all('img'):
    try:
        src = tag['src']
        if src.startswith('//static'):
            key = '_'.join(tag['alt'].split())
            key += '_{}'.format(tag['data-productid'])
            # The src has no scheme, so prepend one.
            styles[key] = 'https:{}'.format(src)
        else:
            print(src)
    except KeyError:
        # Skip images missing a src, alt or data-productid attribute.
        continue
styles

### Save images

In [None]:
# !mkdir images

In [None]:
# Download every collected image into images/<key>.jpg.
os.makedirs('images', exist_ok=True)  # the target directory may not exist yet
for key, image_url in styles.items():
    r = requests.get(image_url, allow_redirects=True)
    if r.status_code != 200:
        # Do not store an error page under a .jpg name.
        print('skipping {}: HTTP {}'.format(key, r.status_code))
        continue
    # The context manager closes the file handle after writing.
    with open('images/{}.jpg'.format(key), 'wb') as image_file:
        image_file.write(r.content)

In [None]:
#### Take screenshot

In [None]:
driver.get_screenshot_as_file('sample_screenshot_2.png');

In [None]:
# !ls images/

In [None]:
#### use search field interactively

In [None]:
search_field = driver.find_element_by_class_name("search")  
search_field
# search_field.clear()
# search_field.send_keys(keyword)
# search_field.send_keys(Keys.RETURN)  

In [None]:
for li in driver.find_elements_by_tag_name('img')[2:25]:
    src = li.get_attribute('src')
    alt = li.get_attribute('alt')
    print(src)

In [None]:
for tag in driver.find_elements_by_tag_name('img')[22:42]:
    src = tag.get_attribute('src')
    alt = tag.get_attribute('alt')
    print(src, alt)

In [None]:
from selenium.common.exceptions import NoAlertPresentException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver.implicitly_wait(20) # seconds

# Print source and alt text of the first 250 images, dismissing any
# JavaScript alert that shows up along the way.
for tag in driver.find_elements_by_tag_name('img')[:250]:
    src = tag.get_attribute('src')
    alt = tag.get_attribute('alt')
    print(src, alt)
    try:
        # Alert objects offer dismiss()/accept() but no click().
        driver.switch_to.alert.dismiss()
    except NoAlertPresentException:
        # No alert open: carry on with the next image.
        continue

### Open Chrome instance with Selenium

In [None]:
xpath = "//*[@class='product-list _productList']"

In [None]:
products = driver.find_elements_by_xpath(xpath)

In [None]:
events1 = driver.find_elements_by_xpath('//ul[contains(., "product")]/*') # //*[@id="product-6504767"]
events2 = driver.find_elements_by_xpath('//*[@id="products"]/*')
events1, events2

In [None]:
# ``events`` is not defined at this point; print the elements collected in
# ``events1`` and ``events2`` by the previous cell instead.
for event in events1 + events2:
    print(event.text)

In [None]:
#### Python events TODO

In [None]:
def get_events(url):
    """Collect name, location and time of the events listed on ``url``.

    Opens a Chrome session configured by the module-level ``chrome_options``,
    prints every event while parsing it and returns the parsed events.

    Args:
        url: address of the page to load.

    Returns:
        list of dicts with the keys ``'name'``, ``'location'`` and ``'time'``.
    """
    driver = webdriver.Chrome(executable_path=os.path.abspath('going_headless/chromedriver'), chrome_options=chrome_options)
    collected = []
    try:
        driver.get(url)
        # NOTE(review): this list selector uses the class names of the product
        # list scraped earlier in the notebook -- confirm it matches the events
        # page before relying on the result.
        events = driver.find_elements_by_xpath('//ul[contains(., "product-list _productList")]/li')
        print(events)
        for event in events:
            print(event)
            event_details = {
                'name': event.find_element_by_xpath('h3[@class="event-title"]/a').text,
                'location': event.find_element_by_xpath('p/span[@class="event-location"]').text,
                'time': event.find_element_by_xpath('p/time').text,
            }
            print(event_details)
            collected.append(event_details)
    finally:
        # End the browser session even when a lookup above raised.
        driver.quit()
    return collected

In [None]:
events = get_events('https://www.python.org/events/python-events/')
print(events)

In [None]:
### New section

In [None]:
# NOTE(review): at this point ``html`` is whatever an earlier cell assigned
# (driver.page_source, a string) and ``etree`` is never imported in this
# notebook; this cell and the ones below presumably need
# ``from lxml import html, etree`` -- confirm before running.
tree = html.fromstring(str(results_page))

In [None]:
[div for div in tree.xpath("/html/body/div")]

In [None]:
# !!! Index starts @ 1 not 0
[etree.tostring(node)[:100] for node in tree.xpath("/html/body/div")]

In [None]:
[etree.tostring(node)[:100] for node in tree.xpath("//div")]

In [None]:
['class:{}, id:{}'.format(node.xpath("@class"), node.xpath("@id")) for node in tree.xpath("//div")]

In [None]:
['class:{}, id:{}'.format(node.xpath("@class"), node.xpath("@id")) for node in tree.xpath("//section")]

In [None]:
[etree.tostring(node) for node in tree.xpath("//*[@id='products']/*")]

In [None]:
[etree.tostring(node) for node in tree.xpath("//ul[@class='product-list _productList']/*")]

In [None]:
[etree.tostring(node)[:100] for node in tree.xpath("//*[contains(., 'dress')]")]

In [None]:
['class:{}, id:{}'.format(node.xpath("@class"), node.xpath("@name")) for node in tree.xpath("//*[contains(., 'product')]")]

In [None]:
[etree.tostring(div)[:100] for div in tree.xpath("//section[@class='_results']")] # product-list _productList

In [None]:
# XPath has no CSS-style ".class" shorthand; test the class attribute instead.
[etree.tostring(div)[:100] for div in tree.xpath("//section[contains(@class, '_results')]")] # product-list _productList

In [None]:
# The product list is a <ul> element ("lu" was a typo and matched nothing).
[etree.tostring(div)[:100] for div in tree.xpath("//ul[@class='product-list _productList']")]

In [None]:
[etree.tostring(div)[:100] for div in tree.xpath("//*[@class='product _product']")]

In [None]:
tree.xpath("//*[@id='product-6504767']")

In [None]:
# tree.xpath("/html/body/div[2]/section")
product_list = [etree.tostring(li) for li in tree.xpath("/html/body/div[2]/section/div/section/ul/li")]
product_list
# tree.xpath('//*[@id="product-6504767"')
tree.xpath('//div[@class="product-info _product-info"]') # //a[@class="item _item"]/@href')  # class="_ariaResults wai-aria-messages"


In [None]:
[li for li in product_list]
# etree.tostring(tree.xpath("/html/body/div[2]/section/div/section/ul"))
# etree.tostring(product_list)

<h3>BS4 functions</h3>

#### find_all  
list of results

In [None]:
results_page = result_page(url, keyword)
results_page.body.div

In [None]:
n_lines = 5
all_a_tags = results_page.find_all('a')
print(type(all_a_tags))
all_a_tags[:n_lines]

#### find  
first result


In [None]:
div_tag = results_page.find('div')

In [None]:
results_page.find_all('a')[0] == results_page.find('a')

In [None]:
type(div_tag), div_tag

### Recursively apply on elements (traverse)

In [None]:
(results_page
    .find('div')
    .find('a')
    .get_text())

### find and find_all  
as css selectors
<li>using selector=value, e.g. class_='recipe-content-card'
<li>using a dictionary, e.g. {'class':'recipe-content-card'}
<li>class is a reserved word in Python, so pass it as the dictionary key 'class' or as the keyword argument class_

In [None]:
selector = 'recipe-content-card'
results_page.find_all('article', class_=selector)[0] == results_page.find('article', {'class':selector})

### get_text() 
Returns the content enclosed in a tag

In [None]:
results_page.find('article',{'class':selector}).get_text()

### get()
Returns the value of a tag attribute

In [None]:
recipe_tag = results_page.find('article',{'class':selector})
recipe_link = recipe_tag.find('a')
recipe_content = recipe_tag.find('a').get_text()
link_url = recipe_link.get('href')

print('a tag: {}\n - content: {}\n - link url: {}\n - link type: {} '.format(recipe_link, recipe_content, link_url, type(link_url)))

### List of recipes

In [None]:
def get_recipes(url, keywords='', selector=''):
    """Return the recipes listed on a search results page.

    Args:
        url: base search URL; ``keywords`` is appended to it by ``result_page``.
        keywords: search terms.
        selector: CSS class of the ``<article>`` tags holding one recipe each.

    Returns:
        A list of ``(name, link, description)`` tuples, or ``None`` when the
        results page could not be fetched or parsed.  Articles without a link
        are skipped; a missing description becomes ``''``.
    """
    from urllib.parse import urljoin

    try:
        results_page = result_page(url, keywords)
        recipes = results_page.find_all('article', class_=selector)
    except Exception:
        # Covers request failures and result_page() returning None.
        return None

    recipe_list = []
    for recipe in recipes:
        anchor = recipe.find('a')
        if anchor is None or anchor.get('href') is None:
            continue
        # hrefs are site-relative ("/recipes/..."); plain concatenation with a
        # search URL would produce ".../search//recipes/...".
        recipe_link = urljoin(url, anchor.get('href'))
        recipe_name = anchor.get_text()
        description_tag = recipe.find('p', class_='dek')
        recipe_description = description_tag.get_text() if description_tag is not None else ''
        recipe_list.append((recipe_name, recipe_link, recipe_description))
    return recipe_list

In [None]:
url = 'http://www.epicurious.com/search/'
keywords = input('Please enter the things you want to see in a recipe: ')
selector = 'recipe-content-card'
get_recipes(url, keywords, selector)

### Recipe ingredients and preparation

In [None]:
def get_recipe_info(url, keywords='', selector=''):
    """Return ingredients and preparation steps of a single recipe page.

    The page is loaded through ``result_page(url, keywords)``; ``selector``
    is accepted but not used.

    Returns:
        dict with the keys ``'ingredients'`` (text of every
        ``<li class="ingredient">``) and ``'preparation'`` (stripped text of
        every ``<li class="preparation-step">``), or an empty dict when
        anything goes wrong.
    """
    recipe_dict = {}
    try:
        page = result_page(url, keywords)
        ingredients = [item.get_text()
                       for item in page.find_all('li', class_='ingredient')]
        steps = [item.get_text().strip()
                 for item in page.find_all('li', class_='preparation-step')]
        recipe_dict['ingredients'] = ingredients
        recipe_dict['preparation'] = steps
    except:
        pass
    return recipe_dict
        

In [None]:
url = 'http://www.epicurious.com'
link = '/recipes/food/views/spicy-lemongrass-tofu-233844'
recipe_info = get_recipe_info(url + link)
recipe_info

### Get all recipes

In [None]:
def get_all_recipes(url, keywords='', selector=''):
    """Return details of every recipe found by a search.

    Each recipe returned by ``get_recipes`` is looked up with
    ``get_recipe_info`` and extended with its name and description.

    Returns:
        list of dicts holding whatever ``get_recipe_info`` returned plus the
        keys ``'name'`` and ``'description'``; empty when the search failed.
    """
    # get_recipes() returns None when the search page cannot be loaded.
    all_recipes = get_recipes(url, keywords, selector) or []
    results = []
    for name, link, description in all_recipes:
        recipe_dict = get_recipe_info(link)
        recipe_dict['name'] = name
        recipe_dict['description'] = description
        results.append(recipe_dict)
    return results

In [None]:
url = 'http://www.epicurious.com/search/'
keywords = input('Please enter the things you want to see in a recipe: ')
selector = 'recipe-content-card'
all_recipes = get_all_recipes(url, keywords, selector)
all_recipes 

In [None]:
import pandas as pd
pd.DataFrame(all_recipes)

## JSON

In [None]:
import json

data_string = '[{"b": [2, 4], "c": 3.0, "a": "A"}]'
python_data = json.loads(data_string)
print(python_data)
print(python_data[0]['b'][1])

### json.loads recursively decodes a string in JSON format into equivalent python objects
 - data_string's outermost element is converted into a python list
 - the first element of that list is converted into a dictionary
 - each key of that dictionary is converted into a string
 - each value is converted into the matching Python type: a list of two integers, a float and a string

In [None]:
print(type(data_string), type(python_data))
print(type(python_data[0]), python_data[0])
print(type(python_data[0]['b']), python_data[0]['b'])

### json.dumps and json.loads

In [None]:
JSON_string = "JSON throws exception when not in correct format"
print(JSON_string)

# Stringify strings
JSON_stringified = json.dumps(JSON_string)
print(JSON_stringified)

# Correct
json.loads(JSON_stringified)

# JSONDecodeError 
# json.loads(JSON_string)

In [None]:
address = 'Amsterdam, Netherlands'
url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}'.format(address)
response = requests.get(url).json()
type(response), response

In [None]:
base_url = 'http://maps.googleapis.com/maps/api/geocode/json'
my_params = {'address': '100 Broadway, New York, NY, U.S.A', 
             'language': 'en'}
response = requests.get(base_url, params=my_params)
results = response.json()['results']
x_geo = results[0]['geometry']['location']
print(x_geo['lng'], x_geo['lat'])

In [None]:
### Get JSON formatted content

In [None]:
def get_json(url, decode='utf-8'):
    """Fetch ``url`` and return its decoded JSON body.

    Args:
        url: address to request.
        decode: not used; kept so existing calls keep working.

    Returns:
        The decoded JSON document, or ``None`` when the request fails, the
        status code is not 200 or the body is not valid JSON.  A short
        message is printed in each failure case.
    """
    # Bound up front so every failure path returns None instead of reaching
    # the return statement with an unbound name.
    response_data = None
    try:
        response = requests.get(url)
    except requests.RequestException:
        print('something went wrong with requests.get')
        return response_data

    if response.status_code != 200:
        print('HTTP error, response code: {}'.format(response.status_code))
        return response_data

    try:
        response_data = response.json()
    except ValueError:
        # JSON decoding errors are ValueError subclasses.
        print("response not in valid JSON format")
    return response_data

In [None]:
response_data = get_json(url)
response_data

In [None]:
### Get address, latitude, longitude

In [None]:
def get_lat_lng(url):
    """Return ``(formatted_address, lat, lng)`` of the first result at ``url``.

    The JSON document is loaded with ``get_json`` and only the first entry of
    its ``'results'`` list is used.
    """
    first_hit = get_json(url)['results'][0]
    address = first_hit['formatted_address']
    position = first_hit['geometry']['location']
    return address, position['lat'], position['lng']

In [None]:
get_lat_lng(url)

In [None]:
address = 'London Business School'
url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}'.format(address)
get_lat_lng(url)

In [None]:
### Get list of addresses with lat, lon

In [None]:
def get_lat_lng_list(url):
    """Return ``(formatted_address, lat, lng)`` for every result at ``url``.

    The JSON document is loaded with ``get_json``; one tuple is produced per
    entry of its ``'results'`` list, in the same order.
    """
    def as_row(hit):
        # Same field order as the returned tuple.
        address = hit['formatted_address']
        position = hit['geometry']['location']
        return address, position['lat'], position['lng']

    return [as_row(hit) for hit in get_json(url)['results']]

In [None]:
address = 'Baker Street'
url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}'.format(address)
get_lat_lng_list(url)

## Logging in to a web server, e.g. wikipedia

Store your credentials in an encrypted/protected file (line 1 = name, line 2 = password)

In [None]:
# Read the login name (first line) and the password (second line) from disk.
with open('../credentials.txt') as f:
    contents = f.read().split('\n')
username, password = contents[0], contents[1]

### Construct object that contains requested login data
Inspect the login-form in your browser

<h3>get the value of the login token</h3>

In [None]:
def get_login_token(response):
    """Return the value of the ``wpLoginToken`` input in ``response.text``.

    The HTML is parsed with the ``lxml`` parser and the ``value`` attribute of
    the first ``<input name="wpLoginToken">`` is returned.
    """
    login_page = BeautifulSoup(response.text, 'lxml')
    token_field = login_page.find('input', {'name': "wpLoginToken"})
    return token_field.get('value')

In [None]:
# Form fields posted to the login endpoint; username and password come from
# the credentials file read above.
payload = {
    'wpName': username,
    'wpPassword': password,
    'wploginattempt': 'Log in',
    'wpEditToken': '+\\',
    'title': 'Special:UserLogin',
    'authAction': 'login',
    'force': '',
    'wpForceHttps': '1',
    'wpFromhttp': '1',
    # Placeholder text only (a string, not a call): the real token is written
    # into this key inside the session cell further down.
    'wpLoginToken': 'get_login_token(response)'
    }

<h3>Setup a session, login, and get data</h3>

In [None]:
# Log in within one session so the same session object is used for the login
# page, the login request and the follow-up page.
with requests.session() as s:
    # Load the login form first; it carries the token the login request needs.
    response = s.get('https://en.wikipedia.org/w/index.php?title=Special:UserLogin&returnto=Main+Page')
    
    # Set login token
    payload['wpLoginToken'] = get_login_token(response)
    
    # Send the login request
    response_post = s.post('https://en.wikipedia.org/w/index.php?title=Special:UserLogin&action=submitlogin&type=login',
                           data=payload)
    
    # Get another page and check if we're still logged in
    response = s.get('https://en.wikipedia.org/wiki/Special:Watchlist')
    # Parsed watchlist page; read by the next cell.
    data = BeautifulSoup(response.content, 'lxml')

In [None]:
print(data.find('div', class_='mw-changeslist').get_text())

## Use case example - 'Sgraping'

In [None]:
url = 'https://www.jancisrobinson.com'
red_ = '/learn/grape-varieties/red/'
white_ = '/learn/grape-varieties/white/'

In [None]:
def get_hrefs(url, tag, class_, keywords=''):
    """Return the links inside the first ``tag`` element with class ``class_``.

    Args:
        url: base address; also used as prefix for every returned link.
        tag: name of the element that contains the links.
        class_: CSS class of that element.
        keywords: path appended to ``url`` when loading the page.

    Returns:
        list of strings ``'<url><href>'``, or ``None`` when the page could not
        be loaded or holds no matching element.
    """
    try:
        results_page = result_page(url, keywords)
        href_list = results_page.find_all(tag, class_=class_)[0].find_all('a')
    except Exception:
        # Covers request failures, result_page() returning None and pages
        # without a matching element.
        return None
    # Prefix with the ``url`` parameter: ``url_base`` does not exist, so the
    # lookup always failed and the function could only ever return None.
    return ['{}{}'.format(url, href.get('href')) for href in href_list]

In [None]:
hrefs_red = get_hrefs(url, 'ul', 'info-table', red_)
hrefs_white = get_hrefs(url, 'ul', 'info-table', white_)
print(hrefs_white[:5])

### Get text

In [None]:
def get_grape_text(url, tag, class_1, class_2, tag_1, color, keywords=''):
    """Return ``(grape, color, content)`` scraped from one grape page.

    ``grape`` is the text of the first ``tag_1`` element inside the first
    ``tag`` element with class ``class_1``; ``content`` is the text of the
    first ``tag`` element with class ``class_2``.  ``color`` is passed through
    unchanged.  Returns ``None`` when anything goes wrong.
    """
    try:
        page = result_page(url, keywords)
        header = page.find_all(tag, class_=class_1)[0]
        grape = header.find_all(tag_1)[0].get_text()
        body = page.find_all(tag, class_=class_2)[0]
        return (grape, color, body.get_text())
    except:
        return None

### Sgrape all varieties

In [None]:
def get_grape_varieties(url, tag, class_1, class_2, tag_1, color):
    """Scrape name and description of the first ten grapes of each colour.

    Args:
        url: site root passed to ``get_hrefs``.
        tag, class_1, class_2, tag_1: passed through to ``get_grape_text``.
        color: one colour name or an iterable of colour names (for example
            ``['red', 'white']``); each is appended to
            ``'/learn/grape-varieties/'`` to find the overview page.

    Returns:
        list of the tuples returned by ``get_grape_text``; pages that could
        not be read are skipped.
    """
    colours = [color] if isinstance(color, str) else list(color)
    grape_text_list = []
    for colour in colours:
        hrefs = get_hrefs(url, 'ul', 'info-table', '/learn/grape-varieties/' + colour)
        # get_hrefs() returns None when the overview page cannot be read.
        for href in (hrefs or [])[:10]:
            grape_text = get_grape_text(href, tag, class_1, class_2, tag_1, colour)
            if grape_text is not None:
                grape_text_list.append(grape_text)
    return grape_text_list

In [None]:
url, tag, class_1, class_2, tag_1, colors = 'https://www.jancisrobinson.com', 'div', 'learn-header', 'row', 'h1', ['red', 'white']
# Pass the site root (no ``href`` exists at this level) and keep the result:
# the next cell builds a DataFrame from ``grape_text_list``.
grape_text_list = get_grape_varieties(url, tag, class_1, class_2, tag_1, colors)

In [None]:
df_grapes = pd.DataFrame(grape_text_list)
df_grapes.columns = ['Grape', 'Color', 'Description']
df_grapes.head()

### Clean - remove excessive spaces

In [None]:
df_grapes['Grape'] = [str(x.strip()) for x in df_grapes['Grape']]

### Convert to ASCII (English alphabet)

In [None]:
# !pip3 install unidecode
import unidecode

In [None]:
# Encode to english, removing special chars
df_grapes['Grape_utf'] = [str(unidecode.unidecode(x).strip()) for x in df_grapes['Grape']]
df_grapes.head()

### Save the grapes

In [None]:
# Save df_grapes
df_grapes.to_csv('grape_descr.csv')

## Reddit

In [None]:
import os
import re
import praw
import requests
from datetime import datetime
from bs4 import BeautifulSoup
import pprint

In [None]:
# Create credentials file:
# !touch reddit_credentials.txt
# !echo "OUR_CLIENT_ID\nOUR_SECRET" > reddit_credentials.txt
# !chmod 400 reddit_credentials.txt

In [None]:
# Read the Reddit client id (first line) and secret (second line) from disk.
with open('reddit_credentials.txt') as f:
    contents = f.read().split('\n')
OUR_CLIENT_ID, OUR_SECRET = contents[0], contents[1]
# User agent string sent with every Reddit API request.
APP = 'reddit_test_app/1.0'

In [None]:
reddit = praw.Reddit(client_id=OUR_CLIENT_ID,
    client_secret=OUR_SECRET,
    grant_type='client_credentials',
    user_agent=APP)
subs = reddit.subreddit('Python').top(limit=10)
pprint.pprint([(s.score, s.title) for s in subs])

In [None]:
def get_reddit():
    """Build a ``praw.Reddit`` client from the module-level credentials."""
    settings = dict(
        client_id=OUR_CLIENT_ID,
        client_secret=OUR_SECRET,
        grant_type='client_credentials',
        user_agent=APP,
    )
    return praw.Reddit(**settings)

In [None]:
def get_top(subreddit_name):
    """Save up to ten linked articles from a subreddit's top posts as Markdown.

    The top 50 submissions are requested; submissions whose domain starts
    with ``self.`` are skipped.  Each article that ``get_article`` can parse
    is written to ``news-<YYYY-MM-DD>/<subreddit_name>/<title>.md``.

    Args:
        subreddit_name: name of the subreddit, without the ``/r/`` prefix.
    """
    today = datetime.now().strftime(r'%Y-%m-%d')
    dirname = os.path.join('news-%s' % today, subreddit_name)
    os.makedirs(dirname, exist_ok=True)

    # Get top 50 submissions from reddit
    reddit = get_reddit()
    top_subs = reddit.subreddit(subreddit_name).top(limit=50)

    remaining = 10
    for sub in top_subs:
        if remaining <= 0:
            break
        # Skip submissions that belong to reddit itself
        if sub.domain.startswith('self.'):
            continue
        article = get_article(sub.url)
        if not article:
            continue
        text = '\n\n'.join(article['content'])
        filename = re.sub(r'\W+', '_', article['title']) + '.md'
        # Explicit encoding: article text is not limited to the platform's
        # default code page.  The context manager closes the file.
        with open(os.path.join(dirname, filename), 'w', encoding='utf-8') as md_file:
            md_file.write(text)
        remaining -= 1

In [None]:
def get_article(url):
    """Download ``url`` and parse it into an article.

    Progress and the reason for any failure are printed.

    Returns:
        The dict produced by ``parse_article``, or ``None`` when the request
        fails, the response is not a 200 ``text/html`` page, or no article
        could be extracted.
    """
    print('  - Retrieving %s' % url)
    try:
        res = requests.get(url)
        is_html = (res.status_code == 200 and
                   res.headers.get('content-type', '').startswith('text/html'))
        if not is_html:
            print('      x fail or not html')
            return None
        article = parse_article(res.text)
    except Exception as exc:
        # Best effort: one broken page must not abort the whole run, but the
        # reason for skipping it is reported.
        print('      x error: %s' % exc)
        return None

    if article is None:
        print('      x no article content found')
        return None
    print('      => done, title = "%s"' % article['title'])
    return article

In [None]:
def parse_article(text):
    """Extract title and content of an article from HTML ``text``.

    The first ``<h1>`` in the body is taken as the title.  Starting from it,
    the search climbs towards ``<body>`` until an ancestor with at least five
    ``<p>`` elements is found; that ancestor's headings, paragraphs and
    ``<pre>`` blocks become the content.

    Returns:
        ``{'title': str, 'content': list}`` with the content converted by
        ``tag2md`` (title first), or ``None`` when the page has no body, no
        ``<h1>`` or fewer than five paragraphs.
    """
    soup = BeautifulSoup(text, 'html.parser')

    # find the article title; without <body> or <h1> there is no article
    h1 = soup.body.find('h1') if soup.body is not None else None
    if h1 is None:
        return None

    # find the common parent for <h1> and all <p>s.
    root = h1
    while root.name != 'body' and len(root.find_all('p')) < 5:
        root = root.parent

    if len(root.find_all('p')) < 5:
        return None

    # find all the content elements.
    ps = root.find_all(['h2', 'h3', 'h4', 'h5', 'h6', 'p', 'pre'])
    ps.insert(0, h1)
    content = [tag2md(p) for p in ps]

    return {'title': h1.text, 'content': content}

In [None]:
def tag2md(tag):
    """Render a parsed HTML tag as a Markdown snippet.

    ``<p>`` yields its text, ``<h1>``/``<h2>`` yield the text underlined with
    ``=``/``-``, ``<h3>``-``<h6>`` yield ``#``-prefixed headings and ``<pre>``
    yields a fenced code block.  Any other tag yields ``None``.
    """
    kind = tag.name
    if kind == 'p':
        return tag.text
    if kind in ('h1', 'h2'):
        # Underline as long as the heading text.
        rule_char = '=' if kind == 'h1' else '-'
        return f'{tag.text}\n{rule_char * len(tag.text)}'
    if kind in ('h3', 'h4', 'h5', 'h6'):
        # The digit in the tag name is the heading depth.
        depth = int(kind[1:])
        return f'{"#" * depth} {tag.text}'
    if kind == 'pre':
        return f'```\n{tag.text}\n```'
    return None

In [None]:
# Fetch each submission's URL and keep the HTML of successful text/html
# responses in ``html``.
# NOTE(review): ``subs`` was already iterated by the pprint cell above; if it
# is a one-shot iterator this loop sees nothing -- confirm.  ``html`` is
# overwritten on every pass, so only the last page fetched is kept.
for sub in subs:
  res = requests.get(sub.url)
  if (res.status_code == 200 and 'content-type' in res.headers and
      res.headers.get('content-type').startswith('text/html')):
    html = res.text

In [None]:
# Main
subreddits = ['javascript', 'Python', 'news']
for sr in subreddits:
    print('Scraping /r/%s...' % sr)
    get_top(sr)

## Beer
http://blog.kaggle.com/2017/01/31/scraping-for-craft-beers-a-dataset-creation-tutorial/?utm_medium=email&utm_source=intercom&utm_campaign=new+user+onboarding

In [None]:
from urllib.request import urlopen
 
from bs4 import BeautifulSoup
import pandas as pd
import re

In [None]:
# Determines if a table_row is a beer entry
def is_beer_entry(table_row):
    """Tell whether ``table_row`` holds a beer entry.

    A beer row has exactly eight ``<td>`` cells and a first cell that
    ``get_beer_id`` can parse.

    Returns:
        ``False`` when the row does not have eight cells, otherwise the
        result of ``get_beer_id`` for the first cell (the id, or ``None``).
    """
    row_cells = table_row.findAll("td")
    # Check the cell count first: a row without any <td> cell would otherwise
    # fail on row_cells[0].
    if len(row_cells) != 8:
        return False
    return get_beer_id(row_cells[0].text)

In [None]:
# Return the beer entry numerical identifier from the "Entry" column.
def get_beer_id(cell_value):
    """Parse the numeric id from an "Entry" cell such as ``"2692."``.

    Args:
        cell_value: cell text; must be one to four digits followed by a dot.

    Returns:
        The id as ``int``, or ``None`` when the text has a different shape.
    """
    # Raw string: \d and \. are regex escapes, not string escapes.
    r = re.match(r"^(\d{1,4})\.$", cell_value)
    if r is None:
        return None
    return int(r.group(1))

In [None]:
def get_all_beers(html_soup):
    """Return one dict per beer row found in ``html_soup``.

    Every ``<tr>`` accepted by ``is_beer_entry`` is turned into a dict with
    the keys ``id``, ``name``, ``brewery_name``, ``brewery_location``,
    ``style``, ``size``, ``abv`` and ``ibu``, taken from the row's cells in
    that order.
    """
    # Column names for cells 1..7; cell 0 holds the id.
    columns = ("name", "brewery_name", "brewery_location",
               "style", "size", "abv", "ibu")
    beers = []
    for table_row in html_soup.findAll("tr"):
        if not is_beer_entry(table_row):
            continue
        row_cells = table_row.findAll("td")
        beer_entry = {"id": get_beer_id(row_cells[0].text)}
        for offset, column in enumerate(columns, start=1):
            beer_entry[column] = row_cells[offset].text
        beers.append(beer_entry)
    return beers

In [None]:
url = "http://craftcans.com/db.php?search=all&sort=beerid&ord=desc&view=text"
html = urlopen(url)
html_soup = BeautifulSoup(html, 'html.parser')
beers_list = get_all_beers(html_soup)

In [None]:
df = pd.DataFrame(beers_list)
df.head(5)

In [None]:
breweries = df[["brewery_location", "brewery_name"]]
breweries = breweries.drop_duplicates().reset_index(drop=True)
breweries["id"] = breweries.index
breweries.head(5)

In [None]:
# Attach each beer to its brewery via name and location, then keep only the
# beer columns plus the brewery id.
beers = pd.merge(
    df,
    breweries,
    left_on=["brewery_name", "brewery_location"],
    right_on=["brewery_name", "brewery_location"],
    sort=True,
    suffixes=('_beer', '_brewery'),
)
beers_columns_rename = {
    "id_beer": "id",
    "id_brewery": "brewery_id"
}
kept_columns = ["abv", "ibu", "id_beer", "name", "size", "style", "id_brewery"]
beers = beers[kept_columns].rename(columns=beers_columns_rename)
beers.head(5)

In [None]:
# Split "City, State" into two columns and keep name, city and state only.
locations = breweries["brewery_location"]
breweries["city"] = locations.apply(lambda location: location.split(",")[0])
breweries["state"] = locations.apply(lambda location: location.split(",")[1])
breweries = breweries[["brewery_name", "city", "state"]].rename(
    columns={"brewery_name": "name"})

In [None]:
def string_pct_to_float(value):
    """Convert a percentage such as ``"5.2%"`` into a fraction.

    ``value`` is turned into a string, surrounding ``%`` signs are removed
    and the number is divided by 100.  Returns ``None`` when the remaining
    text is not a number.
    """
    number_text = str(value).strip('%')
    try:
        percentage = float(number_text)
    except ValueError:
        return None
    return percentage / 100
 
beers["abv"] = beers["abv"].apply(string_pct_to_float)
 
def string_to_int(value):
    """Convert ``value`` to ``int``.

    Returns:
        The integer, or ``None`` when ``value`` cannot be converted (text
        that is not a whole number, or a non-numeric object such as ``None``).
    """
    try:
        return int(value)
    except (TypeError, ValueError):
        # TypeError covers None and other non-numeric objects.
        return None
 
beers["ibu"] = beers["ibu"].apply(string_to_int)

In [None]:
for possible_value in set(beers["size"].tolist()):
    print(possible_value)

In [None]:
def extract_ounces(value):
    """Extract the leading number of a size string such as ``"12 oz."``.

    Args:
        value: size text that starts with the amount.

    Returns:
        The amount as ``float``, or ``None`` when ``value`` does not start
        with a digit.
    """
    # At most one decimal point, so the match is always a valid float literal.
    match = re.match(r"(\d{1,2}\.?\d*)", value)
    if match is None:
        return None
    return float(match.group(0))
 
beers["ounces"] = beers["size"].apply(extract_ounces)    
del beers["size"]
beers.head(5)

In [None]:
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import sys

In [None]:
def getTitle(url):
    """Return the first ``<h1>`` inside the body of the page at ``url``.

    Returns:
        The ``<h1>`` tag, or ``None`` when the page cannot be opened (the
        error is printed) or has no body.
    """
    from urllib.error import URLError

    try:
        # The context manager closes the connection after reading.
        with urlopen(url) as html:
            markup = html.read()
    except URLError as e:
        # URLError is the base class of HTTPError, so failures that never
        # produce an HTTP status are handled here as well.
        print(e)
        return None
    try:
        bsObj = BeautifulSoup(markup, 'html.parser')  # or 'lxml'
        title = bsObj.body.h1
    except AttributeError:
        # No <body> in the document.
        return None
    return title

In [None]:
title = getTitle("http://www.pythonscraping.com/exercises/exercise1.html")
# Identity check: None is a singleton, and "==" on a parsed tag would run the
# tag's own comparison logic.
if title is None:
    print("Title could not be found")
else:
    print(title)

In [None]:
### Select by class

In [None]:
html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj = BeautifulSoup(html, "html.parser")
nameList = bsObj.findAll("span", {"class": "green"})

for name in nameList:
    print(name.get_text())

In [None]:
### Select by Attribute

In [None]:
html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj = BeautifulSoup(html, "html.parser")
allText = bsObj.findAll(id="text")
print(allText[0].get_text())

In [None]:
### Find descendants(children)

In [None]:
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html, "html.parser")

for child in bsObj.find("table",{"id": "giftList"}).children:
    print(child)

In [None]:
### Find siblings

In [None]:
for sibling in bsObj.find("table",{"id": "giftList"}).tr.next_siblings:
    print(sibling) 

In [None]:
### Find parents

In [None]:
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html, "html.parser")
print(bsObj.find("img",{"src":"../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())

In [None]:
### Regex

In [None]:
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html, "html.parser")
# Raw string so the backslashes reach the regex engine unchanged; "/" needs no
# escaping in a Python regex.
images = bsObj.findAll("img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")})
for image in images:
    print(image["src"])

In [None]:
### Lambda exp

In [None]:
# Print every tag on the page that carries exactly two attributes.
html = urlopen("http://www.pythonscraping.com/pages/page2.html")
bsObj = BeautifulSoup(html, "html.parser")
tags = bsObj.findAll(lambda tag: len(tag.attrs) == 2)
for tag in tags:
    print(tag)