In [47]:
import pandas as pd
import re
from fuzzywuzzy import fuzz
import json
from datetime import datetime
from bs4 import BeautifulSoup
import urllib
import pdb
import ssl

from selenium import webdriver

import matplotlib.pyplot as plt
%matplotlib inline

# Ordered list of pepper column names.
# NOTE(review): presumably the target layout for the combined dataset; no cell
# visible in this notebook reads SCHEMA — confirm it is still needed.
SCHEMA =  [
    "name", "species", "heat", "region", "origin", "min_shu", "max_shu",
    "min_jrp", "max_jrp", "link", "source_name"
]

In [None]:
# Load the PepperScale snapshot; its "peppers" entry becomes the reference
# DataFrame that compare_pepper_to_pepperscale() matches names against.
with open('../data/pepperscale_20171028.json') as json_data:
    data = json.load(json_data)
pepperscale = pd.DataFrame(data["peppers"])
# Show three randomly chosen rows as a quick sanity check.
pepperscale.sample(3)

# Shared functions

In [57]:
def get_page_html(url, expired_cert=False):
    """Download ``url`` and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.
        expired_cert: When truthy, the request is made with an unverified SSL
            context so pages whose certificate fails validation can still be
            read. Leave it False whenever the certificate is valid.

    Returns:
        BeautifulSoup: the response body decoded as UTF-8 and parsed with
        ``html.parser``.
    """
    # `import urllib` on its own does not guarantee that the `request`
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    headers = {
        "user-agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=headers)

    # Certificate verification is only skipped on explicit request.
    context = ssl._create_unverified_context() if expired_cert else None

    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(request, context=context) as response:
        markup = response.read().decode('utf-8')
    return BeautifulSoup(markup, 'html.parser')

def process_row(row, source_name, schema=["name", "max_shu", "link", "source_name"], default_url=None):
    """Turn one scraped table row into a pepper record.

    Args:
        row: Parsed ``<tr>`` element exposing ``find`` / ``find_all``.
        source_name: Label appended as the record's last value.
        schema: Keys zipped, in order, with the row's ``<td>`` texts followed
            by the link and ``source_name``. The list is only read, never
            mutated.
        default_url: Link recorded when the row contains no anchor. Pass the
            URL of the page being scraped; defaults to ``None``.

    Returns:
        dict: ``schema`` keys mapped to the row values, with the second value
        converted to ``int`` after removing thousands separators.

    Raises:
        ValueError: If the second value is not an integer once commas are
            removed.
    """
    # The fallback used to be the Cayenne Diane URL global, which is defined
    # in a later cell and is the wrong page for every other source.
    anchor = row.find("a")
    row_url = anchor["href"] if anchor is not None else default_url

    row_data = [col.text for col in row.find_all("td")] + [row_url, source_name]
    row_data[1] = int(row_data[1].replace(",", ""))
    return dict(zip(schema, row_data))

# Chiliworld

In [None]:
# Fetch the ChilliWorld Scoville page once; later cells read `chiliworld_html`.
chiliworld_url = "https://www.chilliworld.com/factfile/scoville-scale#ChilliPepperScovilleScale"
chiliworld_html = get_page_html(chiliworld_url)
# Number of <tr> rows under the element whose id is "ChilliPepperScovilleScale".
len(chiliworld_html.find(id="ChilliPepperScovilleScale").find_all("tr"))

In [None]:
def compare_pepper_to_pepperscale(pepper):
    """Return the PepperScale names that fuzzily match ``pepper``.

    PepperScale names are lower-cased and stripped of the substrings
    ``"pepper"`` and ``"chile "`` before comparison.

    Args:
        pepper: Pepper name to look up (any casing).

    Returns:
        list: Normalized PepperScale names whose ``fuzz.ratio`` against
        ``pepper`` is above 70; empty when nothing is close enough.
    """
    candidates = (
        pepperscale["name"]
        .dropna()
        .str.lower()
        .str.replace("pepper", "")
        .str.replace("chile ", "")
    )
    # fuzz.ratio compares raw strings, so casing and leftover edge whitespace
    # (e.g. "carolina reaper " after "pepper" is removed) would lower the
    # score. Normalize both sides the same way before scoring.
    query = pepper.lower().strip()
    return [
        candidate for candidate in candidates
        if fuzz.ratio(query, candidate.strip()) > 70
    ]

def sanitize_name(name):
    """Normalize a ChilliWorld name cell into a title-style pepper name.

    Everything from the first ``(`` or ``,`` onwards is dropped, the text is
    lower-cased, the substrings ``" pepper"`` and ``"the "`` are removed, and
    each remaining word is capitalized.

    Returns ``None`` when the cell's string form contains ``<b>``.

    Note: a later cell in this notebook rebinds ``sanitize_name`` to a
    different implementation for the HotStuff table.
    """
    if "<b>" in str(name):
        return None

    base = name.split("(")[0].split(",")[0].lower()
    for noise in (" pepper", "the "):
        base = base.replace(noise, "")

    words = [word.strip().capitalize() for word in base.split()]
    return " ".join(words)
    
def sanitize_shu(shu):
    return [int(val) for val in shu.replace(" (reported) ", "").replace(",", "").split(" - ")]

def sanitize_location(name):
    """Map the parenthesized part of a ChilliWorld name to a country.

    Only the text after the first ``(`` is inspected. "Wales" or "England"
    give "United Kingdom", "South Carolina" gives "United States" and
    "Australia" gives "Australia" (checked in that order).

    Returns ``None`` for empty input, for cells whose string form contains
    ``<b>``, when there is no ``(``, or when no known place is mentioned.
    """
    if not name or "<b>" in str(name):
        return None

    pieces = name.split("(")
    if len(pieces) <= 1:
        return None
    location = pieces[1]

    # Ordered (place markers, country) pairs; first hit wins.
    country_markers = (
        (("Wales", "England"), "United Kingdom"),
        (("South Carolina",), "United States"),
        (("Australia",), "Australia"),
    )
    for markers, country in country_markers:
        if any(marker in location for marker in markers):
            return country
    return None

def process_chiliworld_peppers(write=False):
    """Build a DataFrame of every pepper in the ChilliWorld Scoville table.

    Reads the rows of the element with id ``ChilliPepperScovilleScale`` in the
    already-fetched ``chiliworld_html``. Each row must hold exactly two
    ``<td>`` cells: SHU first, name second.

    Args:
        write: When true, also save the frame as
            ``../data/chiliworld_<YYYYMMDD>.csv``. This is the same naming
            scheme ``process_chiliworld_peppers_compare`` uses, so writing
            both on the same day overwrites the file.

    Returns:
        pandas.DataFrame: columns ``name``, ``link``, ``min_shu``,
        ``max_shu``, ``origin``; ``min_shu`` is ``None`` when the site lists a
        single value.
    """
    link = "https://www.chilliworld.com/factfile/scoville-scale#ChilliPepperScovilleScale"
    peppers = []

    for row in chiliworld_html.find(id="ChilliPepperScovilleScale").find_all("tr"):
        raw_shu, raw_name = [element.contents[0] for element in row.find_all("td")]
        name = sanitize_name(raw_name)
        if not name:
            # sanitize_name returns None for bold (<b>) header cells
            continue
        shu = sanitize_shu(raw_shu)
        # A single figure is treated as the maximum; there is no minimum.
        min_shu, max_shu = shu if len(shu) > 1 else [None, shu[0]]
        peppers.append([name + " Pepper", link, min_shu, max_shu, sanitize_location(raw_name)])

    labels = ["name", "link", "min_shu", "max_shu", "origin"]
    peppers = pd.DataFrame(peppers, columns=labels)

    # Previously `write` was accepted but never acted upon.
    if write:
        file_name = "../data/chiliworld_{}.csv".format(datetime.now().strftime("%Y%m%d"))
        peppers.to_csv(file_name, index=False)
    return peppers
        
def process_chiliworld_peppers_compare(write=False):
    """Collect the ChilliWorld peppers that have no fuzzy match in PepperScale.

    Reads the rows of the element with id ``ChilliPepperScovilleScale`` in the
    already-fetched ``chiliworld_html``. Rows whose name cell is a header, is
    "Sweet Bell", or matches at least one PepperScale name (see
    ``compare_pepper_to_pepperscale``) are skipped.

    Args:
        write: When true, also save the frame as
            ``../data/chiliworld_<YYYYMMDD>.csv``.

    Returns:
        pandas.DataFrame: columns ``name``, ``link``, ``min_shu``,
        ``max_shu``, ``min_jrp``, ``max_jrp``, ``heat``, ``species``,
        ``origin``; the four middle columns are always ``None``.
    """
    link = "https://www.chilliworld.com/factfile/scoville-scale#ChilliPepperScovilleScale"
    new_peppers = []

    for row in chiliworld_html.find(id="ChilliPepperScovilleScale").find_all("tr"):
        raw_shu, raw_name = [element.contents[0] for element in row.find_all("td")]
        name = sanitize_name(raw_name)
        # manual discard of sweet bell, since bell pepper in pepperscale
        if not name or name == "Sweet Bell":
            continue
        if compare_pepper_to_pepperscale(name):
            continue  # PepperScale already has a close match
        shu = sanitize_shu(raw_shu)
        # A single figure is treated as the maximum; there is no minimum.
        min_shu, max_shu = shu if len(shu) > 1 else [None, shu[0]]
        new_peppers.append(
            [name + " Pepper", link, min_shu, max_shu, None, None, None, None, sanitize_location(raw_name)]
        )

    labels = ["name", "link", "min_shu", "max_shu", "min_jrp", "max_jrp", "heat", "species", "origin"]
    new_peppers = pd.DataFrame(new_peppers, columns=labels)
    if write:
        file_name = "../data/chiliworld_{}.csv".format(datetime.now().strftime("%Y%m%d"))
        new_peppers.to_csv(file_name, index=False)
    # Hand the frame back so the caller can inspect or reuse it.
    return new_peppers

In [None]:
# Run the full ChilliWorld scrape with the `write` flag set.
process_chiliworld_peppers(write=True)

# Pepperheads

In [51]:
ph_url = "https://pepperheadsforlife.com/the-scoville-scale/"
# expired_cert=True makes get_page_html skip SSL certificate verification
# (presumably the site's certificate did not validate — confirm it is still needed).
ph_html = get_page_html(ph_url, expired_cert=True)
# All <tr> rows inside the first <tbody> on the page.
ph_rows = ph_html.find("tbody").find_all("tr")
len(ph_rows)

463

In [58]:
# Convert every Pepperheads table row into a record dict.
ph_data = [process_row(row, "Pepperheads") for row in ph_rows]
# NOTE(review): this displays the whole list; a slice would keep the output short.
ph_data

[{'link': 'http://pepperheadsforlife.com/resiniferatoxin/',
  'max_shu': 16000000000,
  'name': 'Resiniferatoxin',
  'source_name': 'Pepperheads'},
 {'link': 'http://pepperheadsforlife.com/extract-scale/tinyatoxin/',
  'max_shu': 5300000000,
  'name': 'Tinyatoxin',
  'source_name': 'Pepperheads'},
 {'link': 'http://pepperheadsforlife.com/pure-capsaicin-16000000-scoville-units/',
  'max_shu': 16000000,
  'name': 'Pure Capsaicin',
  'source_name': 'Pepperheads'},
 {'link': 'http://pepperheadsforlife.com/extract-scale/dihydrocapsaicin/',
  'max_shu': 15000000,
  'name': 'Dihydrocapsaicin',
  'source_name': 'Pepperheads'},
 {'link': 'http://pepperheadsforlife.com/extract-scale/nonivamide/',
  'max_shu': 9200000,
  'name': 'Nonivamide',
  'source_name': 'Pepperheads'},
 {'link': 'http://pepperheadsforlife.com/the-scoville-scale/nordihydrocapsaicin/',
  'max_shu': 9100000,
  'name': 'Nordihydrocapsaicin',
  'source_name': 'Pepperheads'},
 {'link': 'http://pepperheadsforlife.com/pepper-extrac

# Cayenne Diane

In [41]:
# Fetch the Cayenne Diane Scoville page and collect the rows of its first <tbody>.
cd_url = "https://www.cayennediane.com/the-scoville-scale/"
cd_html = get_page_html(cd_url)
cd_table = cd_html.find("tbody")
cd_rows = cd_table.find_all("tr")
len(cd_rows)

435

In [44]:
# Inspect the first row to see what kind of cells it contains.
cd_rows[0]

<tr class="row-1 odd">
<th class="column-1">Name</th><th class="column-2">Scovilles</th>
</tr>

In [40]:
def process_cd_row(row):
    """Turn one Cayenne Diane table row into a pepper record.

    Args:
        row: Parsed ``<tr>`` element holding a name cell and a Scoville cell
            (both ``<td>``).

    Returns:
        dict: keys ``name``, ``max_shu`` (int, commas removed), ``link`` (the
        row's first anchor href, or ``cd_url`` when it has none) and
        ``source_name`` (always "Cayenne Diane").

    Raises:
        ValueError: If the Scoville cell is not an integer once commas are
            removed.
    """
    anchor = row.find("a")
    row_url = anchor["href"] if anchor is not None else cd_url
    row_data = [col.text for col in row.find_all("td")] + [row_url, "Cayenne Diane"]
    row_data[1] = int(row_data[1].replace(",", ""))
    return dict(zip(["name", "max_shu", "link", "source_name"], row_data))

# cd_rows[0] is the header row: it only has <th> cells, so process_cd_row
# would try int("Cayenne Diane") and raise. Convert only rows with data cells.
cd_data = [process_cd_row(row) for row in cd_rows if row.find("td")]
cd_data

[{'link': 'https://www.cayennediane.com/the-scoville-scale/',
  'max_shu': 16000000,
  'name': 'Pure Capsaicin',
  'source_name': 'Cayenne Diane'},
 {'link': 'https://www.cayennediane.com/peppers/carolina-reaper/',
  'max_shu': 2200000,
  'name': 'Carolina Reaper',
  'source_name': 'Cayenne Diane'},
 {'link': 'https://www.cayennediane.com/peppers/trinidad-moruga-scorpion/',
  'max_shu': 2000000,
  'name': 'Trinidad Moruga Scorpion',
  'source_name': 'Cayenne Diane'},
 {'link': 'https://www.cayennediane.com/peppers/7-pot-douglah/',
  'max_shu': 1853396,
  'name': '7 Pot Douglah',
  'source_name': 'Cayenne Diane'},
 {'link': 'https://www.cayennediane.com/peppers/7-pot-primo/',
  'max_shu': 1473480,
  'name': '7 Pot Primo',
  'source_name': 'Cayenne Diane'},
 {'link': 'https://www.cayennediane.com/peppers/trinidad-scorpion-butch-t-pepper/',
  'max_shu': 1463700,
  'name': 'Trinidad Scorpion Butch T',
  'source_name': 'Cayenne Diane'},
 {'link': 'https://www.cayennediane.com/peppers/komodo

# HotStuff

In [3]:
# Heat-scale page of the HotStuff site; read by the scraping cells and by get_link().
hotstuff_url = "http://ushotstuff.com/Heat.Scale.htm"

In [None]:
# Plain HTTP fetch of the heat-scale page, then count the rows of the table with id "G2".
hotstuff = get_page_html(hotstuff_url)
len(hotstuff.find("table", id="G2").find_all("tr")) # not all of the entries

In [4]:
# Load the same page in a Selenium-driven Chrome and grab the table body from the live DOM.
# NOTE(review): absolute, user-specific chromedriver path — this cell only runs
# on a machine with the same layout.
path_to_chromedriver = '/Users/asiega/Development/chromedriver' # change path as needed
browser = webdriver.Chrome(executable_path = path_to_chromedriver)
browser.get(hotstuff_url)
# <tbody> of the element whose id is "G2".
table_xpath = '//*[@id="G2"]/tbody'
table_element = browser.find_element_by_xpath(table_xpath)
# NOTE(review): no visible cell closes `browser`; the Chrome session stays open.

In [5]:
# Parse the browser-rendered table body and collect its rows.
hotstuff_html = BeautifulSoup(table_element.get_attribute('innerHTML'), "html.parser")
hotstuff_peppers = hotstuff_html.find_all("tr")
len(hotstuff_peppers) # much better!

372

In [61]:
def sanitize_shu(shu):
    """Parse a HotStuff SHU cell into a list of Scoville values.

    Recognized texts (after stripping the cell's ``.text``):
      * ``"a ~ b"``  -> ``[a, b]`` (every ``" ~ "``-separated part is parsed).
      * ``"a-b"``    -> ``[min, max]``. When ``a`` has no comma it is read as
        shorthand that borrows the thousands groups of ``b``
        (``"1-2,000"`` -> ``[1000, 2000]``), unless that would put the
        minimum above the maximum (``"500-1,000"`` -> ``[500, 1000]``).
      * ``"a"``      -> ``[None, a]``.

    Note: this rebinds the ``sanitize_shu`` defined earlier in the notebook
    for the ChilliWorld table; this version takes a cell element, not a
    string.

    Args:
        shu: Element exposing the cell text as ``.text``.

    Raises:
        ValueError: If a part is not an integer once commas are removed, or a
            ``-`` range does not have exactly two sides.
    """
    def _as_int(text):
        return int(text.replace(",", "").strip())

    parts = shu.text.strip().split(" ~ ")
    if len(parts) > 1:
        return [_as_int(part) for part in parts]

    value = parts[0]
    if "-" not in value:
        return [None, _as_int(value)]

    low, high = [piece.strip() for piece in value.split("-")]
    high_value = _as_int(high)
    if "," in low:
        # Fully written lower bound such as "100,000-350,000"; appending the
        # magnitude here used to produce an unparsable number.
        return [_as_int(low), high_value]

    # Shorthand lower bound: reuse the upper bound's thousands groups.
    expanded = int(low + "".join(high.split(",")[1:]))
    if expanded > high_value:
        # A minimum cannot exceed the maximum, so `low` was already complete.
        expanded = int(low)
    return [expanded, high_value]
    

def sanitize_name(name):
    """Return the text of a HotStuff name cell without surrounding whitespace.

    Note: this rebinds the ``sanitize_name`` defined earlier in the notebook
    for the ChilliWorld table; this version takes a cell element, not a
    string.
    """
    raw_text = name.text
    return raw_text.strip()

def get_link(link):
    """Return the URL for a HotStuff row's link cell.

    Args:
        link: The row's first ``<td>`` element.

    Returns:
        str: "http://ushotstuff.com/" plus the href of the first anchor in the
        cell, or ``hotstuff_url`` when the cell has no anchor with an href.
    """
    # Look for the anchor itself rather than "any child": a cell holding only
    # an image or a <br> used to pass the child check and then fail on None.
    anchor = link.find("a", href=True)  # only want first link
    if anchor is None:
        return hotstuff_url
    return "http://ushotstuff.com/" + anchor["href"]

def extract_hotstuff_pepper_info(row):
    """Convert one HotStuff table row into a pepper record.

    Args:
        row: Parsed ``<tr>`` element.

    Returns:
        dict or None: keys ``name``, ``link``, ``min_shu``, ``max_shu``.
        Rows with three ``<td>`` cells are read as (link, name, SHU); rows
        with two cells are read as (name, SHU) and linked to the heat-scale
        page. Any other cell count yields ``None``.

    Raises:
        ValueError: Propagated from ``sanitize_shu`` when the SHU text cannot
            be parsed.
    """
    # Dispatch on the cell count instead of a bare `except`, which used to
    # hide the real error and crash inside the handler on malformed rows.
    cells = row.find_all("td")
    if len(cells) == 3:
        link_cell, name_cell, shu_cell = cells
        link = get_link(link_cell)
    elif len(cells) == 2:
        name_cell, shu_cell = cells
        link = "http://ushotstuff.com/Heat.Scale.htm"
    else:
        return None

    name = sanitize_name(name_cell)
    shu = sanitize_shu(shu_cell)
    return dict(zip(["name", "link", "min_shu", "max_shu"], [name, link] + shu))

def generate_hotstuff_data(pepper_html):
    """Build the HotStuff DataFrame from the scraped table rows.

    Args:
        pepper_html: Sequence of ``<tr>`` elements; the first one is the
            column header and is skipped.

    Returns:
        pandas.DataFrame: one row per record returned by
        ``extract_hotstuff_pepper_info`` (falsy results are dropped), plus a
        constant ``source_name`` column.
    """
    # Parse each row once; the previous comprehension parsed every row twice.
    extracted = (extract_hotstuff_pepper_info(row) for row in pepper_html[1:])  # skip colheader
    clean_hotstuff = [info for info in extracted if info]
    hotstuff_data = pd.DataFrame(clean_hotstuff)
    hotstuff_data["source_name"] = "Uncle Steve's Hot Stuff"
    return hotstuff_data

        
# Build the HotStuff frame from the Selenium-scraped rows and display it.
hotstuff_data = generate_hotstuff_data(hotstuff_peppers)
hotstuff_data

Unnamed: 0,link,max_shu,min_shu,name,source_name
0,http://ushotstuff.com/Heat.Scale.htm,0,,All Sweet Bells,Uncle Steve's Hot Stuff
1,http://ushotstuff.com/Heat.Scale.htm,0,,Aconcagua,Uncle Steve's Hot Stuff
2,http://ushotstuff.com/Heat.Scale.htm,0,,Aji Chuncho,Uncle Steve's Hot Stuff
3,http://ushotstuff.com/Heat.Scale.htm,0,,Aladdin,Uncle Steve's Hot Stuff
4,http://ushotstuff.com/Heat.Scale.htm,0,,Branco Diamante,Uncle Steve's Hot Stuff
5,http://ushotstuff.com/Heat.Scale.htm,0,,Carliston,Uncle Steve's Hot Stuff
6,http://ushotstuff.com/Heat.Scale.htm,0,,Carmagnola Rosso,Uncle Steve's Hot Stuff
7,http://ushotstuff.com/Heat.Scale.htm,0,,Corno Verde,Uncle Steve's Hot Stuff
8,http://ushotstuff.com/Heat.Scale.htm,0,,Corona,Uncle Steve's Hot Stuff
9,http://ushotstuff.com/Heat.Scale.htm,0,,Cuneo Giallo,Uncle Steve's Hot Stuff


In [60]:
# Fetch the HotStuff home page and keep the table with class "T1".
hotstuff_seed_html = get_page_html("http://ushotstuff.com/").find("table", class_="T1")

seed_data = []

# One record per table row, skipping the first row.
for row in hotstuff_seed_html.find_all("tr")[1:]:
    row_tds = row.find_all("td")
    # link: site root + href of the row's first anchor; heat / species: the
    # lower-cased text of the second and third cells.
    row_data = {
        "link": "http://ushotstuff.com/" + row.find("a")["href"],
        "heat": row_tds[1].text.lower(),
        "species": row_tds[2].text.lower()
    }
    seed_data.append(row_data)
    
# seed_data is rebound here from a list of dicts to a DataFrame.
seed_data = pd.DataFrame(seed_data)
seed_data

Unnamed: 0,heat,link,species
0,hot,http://ushotstuff.com/pepper-seeds/AjiHabanero...,baccatuum
1,hot,http://ushotstuff.com/pepper-seeds/AjiOmni-Col...,baccatuum
2,verry hot,http://ushotstuff.com/pepper-seeds/TrinidadSco...,chinense
3,mild-medium,http://ushotstuff.com/pepper-seeds/BishopsCrow...,baccatuum
4,hot,http://ushotstuff.com/pepper-seeds/BolivianRai...,annuum
5,low-medium,http://ushotstuff.com/pepper-seeds/BrazilianSt...,baccatuum
6,hot,http://ushotstuff.com/pepper-seeds/RedSquashSe...,annuum
7,medium,http://ushotstuff.com/pepper-seeds/ChineseMult...,annuum
8,very hot,http://ushotstuff.com/pepper-seeds/ChocolateHa...,chinense
9,hot,http://ushotstuff.com/pepper-seeds/CobinchoSee...,exile


In [46]:
# Left join keeps every heat-scale row. No `on=` is given, so pandas joins on
# the columns the two frames share (only `link` in the frames shown above).
hotstuff_data.merge(seed_data, how="left")

Unnamed: 0,link,max_shu,min_shu,name,source_name,heat,species
0,http://ushotstuff.com/Heat.Scale.htm,0,,All Sweet Bells,Uncle Steve's Hot Stuff,,
1,http://ushotstuff.com/Heat.Scale.htm,0,,Aconcagua,Uncle Steve's Hot Stuff,,
2,http://ushotstuff.com/Heat.Scale.htm,0,,Aji Chuncho,Uncle Steve's Hot Stuff,,
3,http://ushotstuff.com/Heat.Scale.htm,0,,Aladdin,Uncle Steve's Hot Stuff,,
4,http://ushotstuff.com/Heat.Scale.htm,0,,Branco Diamante,Uncle Steve's Hot Stuff,,
5,http://ushotstuff.com/Heat.Scale.htm,0,,Carliston,Uncle Steve's Hot Stuff,,
6,http://ushotstuff.com/Heat.Scale.htm,0,,Carmagnola Rosso,Uncle Steve's Hot Stuff,,
7,http://ushotstuff.com/Heat.Scale.htm,0,,Corno Verde,Uncle Steve's Hot Stuff,,
8,http://ushotstuff.com/Heat.Scale.htm,0,,Corona,Uncle Steve's Hot Stuff,,
9,http://ushotstuff.com/Heat.Scale.htm,0,,Cuneo Giallo,Uncle Steve's Hot Stuff,,


In [62]:

# Heat-scale page loaded by _scrape_pepper_page().
BASE_URL = "http://ushotstuff.com/Heat.Scale.htm"
# Site root. NOTE(review): the functions in this cell spell this URL out as a
# literal instead of reading SEED_URL — confirm which is intended.
SEED_URL = "http://ushotstuff.com/"


### FETCHER FUNCTIONS

def run(headers, driver_path):
    """Scrape the HotStuff heat scale and join the seed-table data onto it.

    Args:
        headers: HTTP headers used for the plain (non-Selenium) request to
            the site root.
        driver_path: Path to the chromedriver executable used by Selenium.

    Returns:
        pandas.DataFrame: one row per pepper from the heat-scale page with a
        constant ``source_name`` column, left-joined on ``link`` with the
        ``heat`` / ``species`` columns of the seed table.
    """
    # scrape and sanitize base pepper data (each row is parsed once)
    base_html = _scrape_pepper_page(driver_path)
    extracted = (_extract_hotstuff_pepper_info(row) for row in base_html[1:])  # skip colheader
    base_data = pd.DataFrame([info for info in extracted if info])
    base_data["source_name"] = "Uncle Steve's Hot Stuff"
    print("%d peppers fetched from HotStuff!" % len(base_data))

    # scrape seed data
    # Use this cell's own fetcher so `headers` is sent as headers; the shared
    # get_page_html(url, expired_cert) would read the dict as the
    # certificate flag. Read the local `seed_html`, not the notebook global
    # `hotstuff_seed_html` left over from an earlier cell.
    seed_html = _get_page_html(SEED_URL, headers).find("table", class_="T1")
    seed_data = pd.DataFrame(
        [_get_seed_row(row) for row in seed_html.find_all("tr")[1:]]  # skip header row
    )

    # join seed data to base data, where available
    return base_data.merge(seed_data, how="left", on="link")

def _get_page_html(url, headers):
    """Download ``url`` with the given headers and return it parsed.

    Args:
        url: Address of the page to fetch.
        headers: Dict of HTTP headers sent with the request.

    Returns:
        BeautifulSoup: the response body decoded as UTF-8 and parsed with
        ``html.parser``.
    """
    # `import urllib` on its own does not guarantee that the `request`
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    request = urllib.request.Request(url, headers=headers)
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(request) as response:
        markup = response.read().decode('utf-8')
    return BeautifulSoup(markup, 'html.parser')

def _scrape_pepper_page(driver_path):
    """Load the heat-scale page in Chrome and return its table rows.

    Args:
        driver_path: Path to the chromedriver executable.

    Returns:
        The ``<tr>`` elements found in the ``<tbody>`` of the element with id
        "G2", parsed with BeautifulSoup from the browser-rendered HTML.
    """
    # set up browser with selenium
    browser = webdriver.Chrome(executable_path=driver_path)
    try:
        browser.get(BASE_URL)

        # find and parse table element with all of the pepper rows
        table_xpath = '//*[@id="G2"]/tbody'
        table_element = browser.find_element_by_xpath(table_xpath)
        # Copy the markup out of the browser before it is shut down.
        table_markup = table_element.get_attribute('innerHTML')
    finally:
        # Always end the Chrome session, even when the page or table lookup
        # fails; otherwise every call leaves a browser process running.
        browser.quit()

    hotstuff_html = BeautifulSoup(table_markup, "html.parser")
    return hotstuff_html.find_all("tr")


## SANITIZATION FUNCTIONS

def _get_seed_row(row):
    row_tds = row.find_all("td")
    row_data = {
        "link": "http://ushotstuff.com/" + row.find("a")["href"],
        "heat": row_tds[1].text.lower(),
        "species": row_tds[2].text.lower()
    }
    return row_data

def _sanitize_shu(shu):
    shu = shu.text.strip().split(" ~ ")
    if len(shu) == 1:
        if "-" in shu[0]:
            min_shu, max_shu = shu[0].split("-")
            magnitude = "".join(max_shu.split(",")[1:])
            return [int(min_shu+magnitude), int(max_shu.replace(",", ""))]
        return [None, int(shu[0].replace(",", ""))]
    return [int(s.replace(",", "")) for s in shu]

def _sanitize_name(name):
    return name.text.strip()

def _get_link(link):
    if len(link.findChildren()) == 0:
        return link # link, heat, species to match to seed data later
    return "http://ushotstuff.com/" + link.find("a", href=True)["href"] # only want first link

def _extract_hotstuff_pepper_info(row):
    """Convert one HotStuff table row into a pepper record.

    Args:
        row: Parsed ``<tr>`` element.

    Returns:
        dict or None: keys ``name``, ``link``, ``min_shu``, ``max_shu``.
        Rows with three ``<td>`` cells are read as (link, name, SHU). Rows
        with two cells — the not-quite-pepper entries at the end of the
        table, kept for comparison — are read as (name, SHU) and linked to
        the heat-scale page. Any other cell count yields ``None``.

    Raises:
        ValueError: Propagated from ``_sanitize_shu`` when the SHU text
            cannot be parsed.
    """
    # Dispatch on the cell count instead of a bare `except`, which used to
    # hide the real error and crash inside the handler on malformed rows.
    cells = row.find_all("td")
    if len(cells) == 3:
        link_cell, name_cell, shu_cell = cells
        link = _get_link(link_cell)
    elif len(cells) == 2:
        name_cell, shu_cell = cells
        link = "http://ushotstuff.com/Heat.Scale.htm"
    else:
        return None

    name = _sanitize_name(name_cell)
    shu = _sanitize_shu(shu_cell)
    return dict(zip(["name", "link", "min_shu", "max_shu"], [name, link] + shu))

In [64]:
# Browser-like user-agent header passed to run().
HEADERS = {
    "user-agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
}
# NOTE(review): absolute, user-specific chromedriver path — adjust per machine.
DRIVER_PATH = '/Users/asiega/Development/chromedriver'

# Execute the full HotStuff pipeline and display the merged frame.
run(HEADERS, DRIVER_PATH)

AttributeError: 'NoneType' object has no attribute 'keys'