# Web scraping intro

## Scraping rules
- You should check a site's terms and conditions before you scrape it. It's the site's data, and there are likely rules governing its use.
- Be nice - A computer will send web requests much quicker than a user can. Make sure you space out your requests a bit so that you don't hammer the site's server.
- Scrapers break - Sites change their layout all the time. If that happens, be prepared to rewrite your code.
- Web pages are inconsistent - There's sometimes some manual clean up that has to happen even after you've gotten your data.

<h3>Import necessary modules</h3>

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import os

## requests
- requests executes HTTP requests, like GET
- The response object returned by requests holds the results of the request: the page content and other items like the HTTP status code and headers.
- requests only gets the page content without any parsing.
- Beautiful Soup does the parsing of the HTML and finding content within the HTML.

### requests - connect as function

In [None]:
def connect(url, timeout=10):
    """Send a GET request to ``url`` and report whether it succeeded.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on an unresponsive host.

    Returns:
        The ``requests`` response object, whatever its status code.

    Raises:
        requests.RequestException: If the request cannot be completed
            (connection error, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        print('successfully connected, response code: {}'.format(response.status_code))
    else:
        print('connection failed')
    return response

In [None]:
# Base URL of the Epicurious search page; search keywords are appended to it.
url = 'http://www.epicurious.com/search/'

In [None]:
# The trailing semicolon keeps the notebook from echoing the returned response.
connect(url);

### requests pass search keyword

In [None]:
# Ask for the search terms interactively.
keywords = input("Please enter the things you want to see in a recipe: ")
# The keywords are appended to the search URL exactly as typed.
connect(url + keywords)

## BeautifulSoup

In [None]:
# Number of characters of the prettified HTML to print.
n_chars = 1000
# Parse the raw response body with the lxml parser.
soup = BeautifulSoup(connect(url).content, 'lxml')
print(soup.prettify()[:n_chars])

### Get result page as function

In [None]:
def result_page(url, keywords='', timeout=10):
    """Fetch ``url + keywords`` and parse the response body as HTML.

    Args:
        url: Base address of the page.
        keywords: Text appended verbatim to ``url`` (no extra encoding is
            applied here).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on an unresponsive host.

    Returns:
        A ``BeautifulSoup`` object built with the ``lxml`` parser, or ``None``
        when the server answers with any status code other than 200.

    Raises:
        requests.RequestException: If the request cannot be completed
            (connection error, timeout, ...).
    """
    response = requests.get(url + keywords, timeout=timeout)
    if response.status_code != 200:
        return None
    return BeautifulSoup(response.content, 'lxml')

In [None]:
keywords = input("Please enter the things you want to see in a recipe: ")
# result_page returns None for any non-200 response, so soup may be None here.
soup = result_page(url, keywords)

In [None]:
# Attribute-style navigation: the first <div> found inside <body>.
soup.body.div

<h3>BS4 functions</h3>

#### find_all  
list of results

In [None]:
# Number of <a> tags to display from the start of the result.
n_lines = 5
all_a_tags = soup.find_all('a')
# Print the container type that find_all returns.
print(type(all_a_tags))
all_a_tags[:n_lines]

#### find  
first result


In [None]:
# find returns only the first matching tag.
div_tag = soup.find('div')

In [None]:
# Display the tag's type alongside the tag itself (shown as a tuple).
type(div_tag), div_tag

In [None]:
# Check that find gives the same tag as the first element of find_all.
soup.find_all('a')[0] == soup.find('a')

### Recursively apply on elements (traverse)

In [None]:
# Chain the lookups: first <div> in the page, first <a> inside that <div>,
# then the text enclosed by that link.
(soup
    .find('div')
    .find('a')
    .get_text())

### find and find_all  
filtering by CSS class
<ul>
<li>using a keyword argument, e.g. class_='recipe-content-card'</li>
<li>using a dictionary, e.g. {'class':'recipe-content-card'}</li>
<li>class is a reserved word in Python, so pass it as the dictionary key 'class' or as the keyword class_</li>
</ul>

In [None]:
# CSS class carried by each recipe <article> on the result page.
selector = 'recipe-content-card'
# Compare the keyword form (class_=...) with the dictionary form
# ({'class': ...}); both are applied to the parsed page held in `soup`.
soup.find_all('article', class_=selector)[0] == soup.find('article', {'class': selector})

### get_text() 
Returns the content enclosed in a tag

In [None]:
# Text enclosed by the first <article> that carries the selector class.
soup.find('article', {'class':selector}).get_text()

### get()
Returns the value of a tag attribute

In [None]:
# First <article> that carries the selector class.
recipe_tag = soup.find('article',{'class':selector})
# First link inside that article.
recipe_link = recipe_tag.find('a')
# get() reads the value of the link's href attribute.
link_url = recipe_link.get('href')
# Text enclosed by the same link.
recipe_content = recipe_tag.find('a').get_text()

# Show the tag, its text, the href value and the type of that value.
print('a tag: {}\n - content: {}\n - link url: {}\n - link type: {} '.format(recipe_link, recipe_content, link_url, type(link_url)))

### List of recipes

In [None]:
def get_recipes(url, keywords='', selector=''):
    """Collect the recipes listed on a search result page.

    Args:
        url: Search URL; ``keywords`` is appended to it by ``result_page``.
        keywords: Search terms.
        selector: CSS class of the ``<article>`` elements that hold a recipe.

    Returns:
        A list of ``(name, link, description)`` tuples, one per recipe
        article that contains a link, or ``None`` when the result page could
        not be retrieved. ``description`` is ``''`` when the article has no
        ``<p class="dek">`` element.
    """
    # Function-scope import so this cell does not depend on another cell.
    from urllib.parse import urljoin

    try:
        soup = result_page(url, keywords)
    except requests.RequestException:
        # Best effort: a network failure is reported as "no result page".
        return None
    if soup is None:
        return None

    recipe_list = []
    for recipe in soup.find_all('article', class_=selector):
        anchor = recipe.find('a')
        href = anchor.get('href') if anchor is not None else None
        if href is None:
            # An article without a link cannot be followed; skip it rather
            # than discarding every other recipe on the page.
            continue
        # Resolve the href against the page URL. Plain concatenation turns a
        # root-relative href into ".../search//recipes/...".
        recipe_link = urljoin(url, href)
        recipe_name = anchor.get_text()
        description_tag = recipe.find('p', class_='dek')
        recipe_description = description_tag.get_text() if description_tag is not None else ''
        recipe_list.append((recipe_name, recipe_link, recipe_description))

    return recipe_list

In [None]:
# Search endpoint, user-supplied keywords and the class that marks a recipe
# <article> on the result page.
url = 'http://www.epicurious.com/search/'
keywords = input('Please enter the things you want to see in a recipe: ')
selector = 'recipe-content-card'
# Displays the list of (name, link, description) tuples, or None on failure.
get_recipes(url, keywords, selector)

### Recipe ingredients and preparation

In [None]:
def get_recipe_info(url, keywords='', selector=''):
    """Extract the ingredients and preparation steps from a recipe page.

    Args:
        url: Recipe page URL; ``keywords`` is appended to it by
            ``result_page``.
        keywords: Optional suffix for ``url``.
        selector: Unused; kept so existing callers that pass it keep working.

    Returns:
        A dict with the keys ``'ingredients'`` and ``'preparation'``, each
        holding a list of strings, or an empty dict when the page could not
        be retrieved.
    """
    try:
        soup = result_page(url, keywords)
    except requests.RequestException:
        # Best effort: a network failure yields an empty result, not a crash.
        return {}
    if soup is None:
        return {}

    ingredient_list = [
        ingredient.get_text()
        for ingredient in soup.find_all('li', class_='ingredient')
    ]
    # Preparation steps are stripped of surrounding whitespace; ingredient
    # text is kept as found.
    prep_steps_list = [
        prep_step.get_text().strip()
        for prep_step in soup.find_all('li', class_='preparation-step')
    ]
    return {'ingredients': ingredient_list, 'preparation': prep_steps_list}

In [None]:
# Site root and the path of one specific recipe page.
# Note: this rebinds the notebook-wide `url` to the site root.
url = 'http://www.epicurious.com'
link = '/recipes/food/views/spicy-lemongrass-tofu-233844'
# keywords defaults to '', so the page at url + link is fetched as is.
recipe_info = get_recipe_info(url + link)
recipe_info

### Get all recipes

In [None]:
def get_all_recipes(url, keywords='', selector='', delay=1.0):
    """Search for recipes and fetch the details of every hit.

    Args:
        url: Search URL passed on to ``get_recipes``.
        keywords: Search terms passed on to ``get_recipes``.
        selector: CSS class of the recipe articles, passed on to
            ``get_recipes``.
        delay: Seconds to pause between two recipe page requests so the
            server is not hammered; ``0`` disables the pause.

    Returns:
        A list with one dict per recipe. Each dict holds whatever
        ``get_recipe_info`` returned plus the keys ``'name'`` and
        ``'description'``. The list is empty when the search yielded nothing.
    """
    # Function-scope import so this cell does not depend on another cell.
    import time

    # get_recipes returns None when the search page cannot be fetched;
    # treat that as "no recipes" instead of failing on the iteration.
    all_recipes = get_recipes(url, keywords, selector) or []

    results = []
    for index, (name, link, description) in enumerate(all_recipes):
        if index and delay > 0:
            # Space out consecutive requests (no pause before the first one).
            time.sleep(delay)
        recipe_dict = get_recipe_info(link)
        recipe_dict['name'] = name
        recipe_dict['description'] = description
        results.append(recipe_dict)
    return results

In [None]:
# Point url back at the search endpoint: an earlier cell rebinds it to the
# site root, which would turn url + keywords into an invalid address.
url = 'http://www.epicurious.com/search/'
keywords = input('Please enter the things you want to see in a recipe: ')
selector = 'recipe-content-card'
all_recipes = get_all_recipes(url, keywords, selector)
all_recipes

In [None]:
import pandas as pd
# One row per recipe dict; the dict keys become the columns.
pd.DataFrame(all_recipes)