In [3]:
import pandas as pd
import re
from fuzzywuzzy import fuzz
import json

import matplotlib.pyplot as plt
%matplotlib inline

### STRIP PEPPERSCALE DATA

In [8]:
# Read ../data/peppers_20171013.json and build a DataFrame from the list
# stored under its "peppers" key.
with open('../data/peppers_20171013.json') as json_data:
    data = json.load(json_data)
pepperscale = pd.DataFrame(data["peppers"])
# Show three randomly sampled rows as a quick look at the loaded frame.
pepperscale.sample(3)

Unnamed: 0,heat,link,max_jrp,max_shu,min_jrp,min_shu,name,origin,region,species
63,medium,http://www.pepperscale.com/japones-pepper,12,30000,2,15000.0,Japones Pepper,Japan,Asia,annuum
81,extra hot,http://www.pepperscale.com/thai-peppers,40,100000,6,50000.0,Thai Pepper,Thailand,Asia,annuum
100,super hot,http://www.pepperscale.com/chocolate-habanero,231,577000,53,425000.0,Chocolate Habanero,Jamaica,Central America and the Caribbean,chinense


# ChilliWorld

In [7]:
import urllib2
from bs4 import BeautifulSoup

# Browser-like User-Agent header sent with the request.
# NOTE(review): presumably required because the site rejects the default
# urllib2 agent - confirm.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
}
request = urllib2.Request("https://www.chilliworld.com/factfile/scoville-scale#ChilliPepperScovilleScale", headers=headers)

# Download the raw page bytes, making sure the connection is released even if
# the read fails part-way through.
response = urllib2.urlopen(request)
try:
    page_html = response.read()
finally:
    response.close()

In [None]:
import pdb  # NOTE(review): not used in any cell visible here - looks like a debugging leftover; confirm before removing
# Parse the downloaded page bytes with the 'html.parser' backend.
chiliworld_html = BeautifulSoup(page_html, 'html.parser')

def compare_pepper_to_pepperscale(pepper, threshold=70):
    """Return the PepperScale names that fuzzily match ``pepper``.

    Both the query and every PepperScale name are normalised the same way
    (lower-cased, the words "pepper" / "chile " removed, surrounding
    whitespace stripped) before being compared with ``fuzz.ratio``.
    ``fuzz.ratio`` is case- and whitespace-sensitive, so without a shared
    normalisation a title-cased query such as "Thai" scores below the
    threshold against the candidate "thai " and is wrongly treated as unknown.

    Parameters
    ----------
    pepper : str
        Pepper name to look up.
    threshold : int, optional
        A candidate is kept when its ``fuzz.ratio`` score is strictly greater
        than this value. Defaults to 70.

    Returns
    -------
    list of str
        Normalised PepperScale names whose score exceeds ``threshold``;
        empty when nothing matches.
    """
    query = pepper.lower().replace("pepper", "").replace("chile ", "").strip()
    known_names = (
        pepperscale["name"]
        .dropna()  # a missing name cannot be compared as a string
        .str.lower()
        .str.replace("pepper", "")
        .str.replace("chile ", "")
        .str.strip()  # removing "pepper" leaves a trailing space that skews the ratio
    )
    return [known for known in known_names if fuzz.ratio(query, known) > threshold]

def sanitize_name(name):
    """Normalise a raw name cell into a title-cased pepper name.

    Everything from the first "(" or "," onwards is dropped, the substrings
    " pepper" and "the " are removed from the lower-cased remainder, and each
    remaining word is capitalised.

    Parameters
    ----------
    name : str or None
        Content of the name cell. NOTE(review): callers pass bs4 nodes here;
        a Tag's ``encode()`` is expected to serialise its markup, which is
        what makes the "<b>" check below skip bold header cells - confirm.

    Returns
    -------
    str or None
        The cleaned name, or ``None`` when ``name`` is ``None`` or its
        encoded form contains "<b>".
    """
    if name is None:
        return None
    # Compare text with text: the original tested a str literal against the
    # encoded bytes, which only works through Python 2's implicit coercion and
    # raises TypeError on Python 3.
    markup = name.encode('utf-8').decode('utf-8')
    if "<b>" in markup:
        return None
    pepper_name = name.split("(")[0].split(",")[0]
    sanitized_pepper_name = pepper_name.lower().replace(" pepper", "").replace("the ", "")
    return " ".join([part.strip().capitalize() for part in sanitized_pepper_name.split()])
    
def sanitize_shu(shu):
    """Convert a raw Scoville cell such as "855,000 - 1,041,427" to integers.

    The marker " (reported) " and all thousands separators are removed, the
    remainder is split on " - ", and every piece is converted with ``int``.

    Returns a list with one integer per piece (one element for a single
    value, two for a range). Raises ``ValueError`` if a piece is not an
    integer literal.
    """
    cleaned = shu.replace(" (reported) ", "")
    cleaned = cleaned.replace(",", "")
    values = []
    for piece in cleaned.split(" - "):
        values.append(int(piece))
    return values

def sanitize_location(name):
    """Map the parenthesised location in a raw name cell to a country.

    The text between the first "(" and the next "(" (or the end of the
    string) is searched for a small set of known place names.

    Parameters
    ----------
    name : str or None
        Content of the name cell, e.g. "Carolina Reaper (South Carolina, USA)".

    Returns
    -------
    str or None
        "Australia", "United Kingdom" or "United States" when a known place
        is found; ``None`` when ``name`` is empty, its encoded form contains
        "<b>", it has no parenthesised part, or the place is not recognised.
    """
    if not name:
        return None
    # Work on text rather than encoded bytes: mixing str literals with bytes
    # only works through Python 2's implicit coercion and raises TypeError on
    # Python 3. Encoding once also replaces the three encode() calls.
    text = name.encode('utf-8').decode('utf-8')
    if "<b>" in text:
        return None
    parts = text.split("(")
    if len(parts) < 2:
        return None
    location = parts[1]
    # Order matters: "New South Wales" contains "Wales", so the Australian
    # rule must be tried before the United Kingdom one.
    rules = (
        (("Australia", "New South Wales"), "Australia"),
        (("Wales", "England"), "United Kingdom"),
        (("South Carolina",), "United States"),
    )
    for needles, country in rules:
        if any(needle in location for needle in needles):
            return country
    return None

def process_peppers():
    """Collect ChilliWorld peppers that have no fuzzy match in PepperScale.

    Walks every ``<tr>`` inside the element with id
    "ChilliPepperScovilleScale" of ``chiliworld_html``. For each row the
    first child of the two ``<td>`` cells is taken as the raw SHU value and
    the raw name. Rows are skipped when they do not contain exactly two
    populated cells, when ``sanitize_name`` rejects the name, when the name
    is "Sweet Bell", or when ``compare_pepper_to_pepperscale`` finds at least
    one match.

    Returns
    -------
    pandas.DataFrame
        One row per new pepper with columns name, link, min_shu, max_shu,
        heat, jrp, species and origin. heat, jrp and species are always
        ``None``; min_shu is ``None`` when the page lists a single value.
    """
    # Loop-invariant values, built once.
    link = "https://www.chilliworld.com/factfile/scoville-scale#ChilliPepperScovilleScale"
    labels = ["name", "link", "min_shu", "max_shu", "heat", "jrp", "species", "origin"]

    new_peppers = []
    for row in chiliworld_html.find(id="ChilliPepperScovilleScale").find_all("tr"):
        cells = row.find_all("td")
        # Rows without exactly two populated cells cannot be unpacked into
        # (shu, name); skip them instead of crashing the whole run.
        if len(cells) != 2 or not all(cell.contents for cell in cells):
            continue
        raw_shu, raw_name = [cell.contents[0] for cell in cells]

        name = sanitize_name(raw_name)
        # manual discard of sweet bell, since bell pepper in pepperscale
        if not name or name == "Sweet Bell":
            continue
        # Already known to PepperScale: nothing to add.
        if len(compare_pepper_to_pepperscale(name)) != 0:
            continue

        shu = sanitize_shu(raw_shu)
        # A single value is treated as the maximum, with no known minimum.
        min_shu, max_shu = shu if len(shu) > 1 else [None, shu[0]]
        location = sanitize_location(raw_name)
        new_peppers.append([name + " Pepper", link, min_shu, max_shu, None, None, None, location])
    return pd.DataFrame(new_peppers, columns=labels)

In [None]:
# Run the scrape-and-compare step; as the cell's last expression, the
# returned DataFrame is what gets displayed.
process_peppers()