# Webscraping intro

## Scraping rules
- You should check a site's terms and conditions before you scrape it. It's their data and they likely have some rules to govern it.
- Be nice - A computer will send web requests much quicker than a user can. Make sure you space out your requests a bit so that you don't hammer the site's server.
- Scrapers break - Sites change their layout all the time. If that happens, be prepared to rewrite your code.
- Web pages are inconsistent - There's sometimes some manual clean up that has to happen even after you've gotten your data.

<h3>Import necessary modules</h3>

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import os

## requests
- requests executes HTTP requests, like GET
- The Response object returned by requests holds the result of the request: the page content plus other items like the HTTP status code and headers.
- requests only gets the page content without any parsing.
- Beautiful Soup does the parsing of the HTML and finding content within the HTML.

In [None]:
# Site root plus the paths of the red and white grape-variety index pages;
# the paths are appended to `url` when a request is made.
url = 'https://www.jancisrobinson.com'
red_ = '/learn/grape-varieties/red/'
white_ = '/learn/grape-varieties/white/'

In [None]:
def connect(url, timeout=10):
    """Send a GET request to *url* and report whether it succeeded.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely when the server
            does not answer.

    Returns:
        The ``requests.Response`` object, whatever its status code.

    Raises:
        requests.RequestException: If the request cannot be completed
            (for example a connection error or a timeout).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        print('successfully connected, response code: {}'.format(response.status_code))
    else:
        print('connection failed')
    return response

In [None]:
# Try the connection against the red-grape index page; the trailing semicolon
# keeps the notebook from echoing the returned Response object.
connect(url+red_);

## 'Sgraping'

In [None]:
def result_page(url, keywords='', timeout=10):
    """Fetch ``url + keywords`` and parse the response body as HTML.

    Args:
        url: Base address of the page.
        keywords: Text appended verbatim to *url* (a path or query string).
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive site cannot stall the scrape forever.

    Returns:
        A ``BeautifulSoup`` tree built with the ``lxml`` parser, or ``None``
        when the server answers with any status code other than 200.

    Raises:
        requests.RequestException: If the request cannot be completed.
    """
    response = requests.get(url + keywords, timeout=timeout)
    if response.status_code != 200:
        return None
    return BeautifulSoup(response.content, 'lxml')

In [None]:
def get_hrefs(url, tag, class_, keywords=''):
    """Collect the links found inside the first matching element of a page.

    The page at ``url + keywords`` is fetched, the first ``tag`` element with
    CSS class ``class_`` is located, and the ``href`` of every ``<a>`` inside
    it is prefixed with *url*.

    Args:
        url: Site root; used both to fetch the page and as link prefix.
        tag: Name of the container element to search for.
        class_: CSS class of the container element.
        keywords: Text appended to *url* to form the page address.

    Returns:
        A list of link strings, or ``None`` when the page could not be
        fetched or the container element is missing.
    """
    try:
        results_page = result_page(url, keywords)
        if results_page is None:
            return None
        containers = results_page.find_all(tag, class_=class_)
        if not containers:
            return None
        anchors = containers[0].find_all('a')
        # Anchors without an href would otherwise produce '<url>None' links.
        return ['{}{}'.format(url, anchor.get('href'))
                for anchor in anchors if anchor.get('href')]
    except Exception:
        # Best-effort scrape: any fetch/parse error yields None, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return None

In [None]:
# Collect the links listed in the first 'info-table' <ul> of the red and the
# white index pages.
hrefs_red = get_hrefs(url, 'ul', 'info-table', red_)
hrefs_white = get_hrefs(url, 'ul', 'info-table', white_)
# Preview the first five links of each list. get_hrefs returns None on
# failure, in which case this slicing raises TypeError.
hrefs_red[:5], hrefs_white[:5]

### Get text

In [None]:
def get_grape_text(url, tag, class_1, class_2, tag_1, color, keywords=''):
    """Scrape the name and description text from a single grape page.

    Args:
        url: Address of the grape page.
        tag: Name of the elements that hold the name and the description.
        class_1: CSS class of the ``tag`` element containing the name.
        class_2: CSS class of the ``tag`` element containing the description.
        tag_1: Name of the element, inside the ``class_1`` element, whose
            text is the grape name.
        color: Value passed through unchanged as the second tuple item.
        keywords: Text appended to *url* to form the page address.

    Returns:
        A ``(grape, color, content)`` tuple, or ``None`` when the page could
        not be fetched or an expected element is missing.
    """
    try:
        results_page = result_page(url, keywords)
        if results_page is None:
            return None
        header = results_page.find_all(tag, class_=class_1)[0]
        grape = header.find_all(tag_1)[0].get_text()
        content = results_page.find_all(tag, class_=class_2)[0].get_text()
        return grape, color, content
    except Exception:
        # Best-effort scrape: a missing element or failed request yields
        # None, but KeyboardInterrupt/SystemExit still stop a long crawl.
        return None

### Sgrape all varieties

In [None]:
def get_grape_varieties(url, tag, class_1, class_2, tag_1, color):
    """Scrape every grape page linked from the index page of each color.

    Args:
        url: Site root.
        tag: Name of the elements holding name and description on a grape page.
        class_1: CSS class of the element containing the grape name.
        class_2: CSS class of the element containing the description.
        tag_1: Name of the element whose text is the grape name.
        color: A single color string (e.g. ``'red'``) or an iterable of
            color strings; each one selects an index page under
            ``/learn/grape-varieties/``.

    Returns:
        A list of ``(grape, color, content)`` tuples. Index pages and grape
        pages that could not be scraped are skipped.
    """
    # Use the argument itself instead of relying on a global named `colors`.
    colors = [color] if isinstance(color, str) else list(color)
    grape_list = []
    for current_color in colors:
        hrefs = get_hrefs(url, 'ul', 'info-table', '/learn/grape-varieties/' + current_color)
        if not hrefs:
            # get_hrefs returns None when the index page cannot be scraped.
            continue
        for href in hrefs:
            grape = get_grape_text(href, tag, class_1, class_2, tag_1, current_color)
            # Failed pages come back as None and would corrupt the table.
            if grape is not None:
                grape_list.append(grape)
    return grape_list

In [None]:
# Scrape settings: site root, the tag and CSS classes that locate the grape
# name and the description on a grape page, the tag holding the name, and the
# colors to crawl.
url, tag, class_1, class_2, tag_1, colors = 'https://www.jancisrobinson.com', 'div', 'learn-header', 'row', 'h1', ['red', 'white']
# Crawl all grape pages; this issues one request per grape.
grape_list = get_grape_varieties(url, tag, class_1, class_2, tag_1, colors);

In [None]:
# Build a table from the scraped (grape, color, description) tuples.
df_grapes = pd.DataFrame(grape_list)
df_grapes.columns = ['Grape', 'Color', 'Description']
# Print a summary of the columns, then show the first rows.
df_grapes.info()
df_grapes.head()

### Clean - remove excessive spaces

In [None]:
# Strip leading and trailing whitespace from every grape name.
df_grapes['Grape'] = [str(x.strip()) for x in df_grapes['Grape']]

### Convert to ASCII (English alphabet)

In [None]:
# !pip3 install unidecode
import unidecode

In [None]:
# Transliterate each grape name to plain ASCII with unidecode (e.g. accented
# letters lose their accents) and strip surrounding whitespace. The result
# goes into a separate column, so the original spelling is kept in 'Grape'.
df_grapes['Grape_utf'] = [str(unidecode.unidecode(x).strip()) for x in df_grapes['Grape']]
# Display the full table.
df_grapes

### Save the grapes

In [None]:
# Save df_grapes as CSV. The DataFrame index is written as the first column
# because index=False is not passed.
df_grapes.to_csv('../_data/grape_descr.csv')