In [3]:
import json
import os
import re
import urllib
import urllib.request
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz

import matplotlib.pyplot as plt
%matplotlib inline

# Column names; the same eleven fields appear in the PepperScale records
# loaded from ../data/pepperscale_20171028.json.
SCHEMA = [
    "name",
    "species",
    "heat",
    "region",
    "origin",
    "min_shu",
    "max_shu",
    "min_jrp",
    "max_jrp",
    "link",
    "source_name",
]

In [2]:
# Load the PepperScale snapshot; the pepper records are stored under the
# top-level "peppers" key of the JSON document.
with open('../data/pepperscale_20171028.json') as json_data:
    data = json.load(json_data)
pepperscale = pd.DataFrame(data["peppers"])
# Preview three randomly sampled rows (no random_state is passed, so the rows
# shown differ from run to run).
pepperscale.sample(3)

Unnamed: 0,heat,link,max_jrp,max_shu,min_jrp,min_shu,name,origin,region,source_name,species
93,extra hot,http://www.pepperscale.com/goat-pepper,140,350000,12,100000.0,Goat Pepper,Caribbean,Central America and the Caribbean,PepperScale,chinense
31,mild,http://www.pepperscale.com/chilaca-pepper,0,2500,-8,1000.0,Chilaca Pepper,Mexico,North America,PepperScale,annuum
23,mild,http://www.pepperscale.com/anaheim-pepper,0,2500,-16,500.0,Anaheim Pepper,Mexico,North America,PepperScale,annuum


# ChilliWorld Scoville scale

In [4]:
# Download the ChilliWorld Scoville-scale page and keep its HTML as text.
# A browser-style User-Agent is sent with the request -- presumably because the
# site rejects urllib's default agent; TODO confirm.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
}

request = urllib.request.Request(
    "https://www.chilliworld.com/factfile/scoville-scale#ChilliPepperScovilleScale",
    headers=headers,
)
# Use the response as a context manager so the underlying connection is closed
# as soon as the body has been read, even if decoding fails.
with urllib.request.urlopen(request) as response:
    page_html = response.read().decode('utf-8')

In [8]:
# Parse the downloaded HTML, then count the <tr> rows found under the element
# whose id is "ChilliPepperScovilleScale".
chiliworld_html = BeautifulSoup(page_html, 'html.parser')
len(chiliworld_html.find(id="ChilliPepperScovilleScale").find_all("tr"))

44

In [None]:
# NOTE(review): pdb is not referenced by any cell visible here -- looks like
# leftover debugging; confirm before removing.
import pdb
# Re-parses page_html; an earlier cell already assigns the same parse result to
# chiliworld_html, so this only matters if that cell was not run.
chiliworld_html = BeautifulSoup(page_html, 'html.parser')

def compare_pepper_to_pepperscale(pepper, threshold=70):
    """Return the PepperScale names that fuzzily match ``pepper``.

    Candidate names come from the module-level ``pepperscale`` DataFrame: the
    ``name`` column is lower-cased, the substrings "pepper" and "chile " are
    removed, and surrounding whitespace is stripped. ``pepper`` is lower-cased
    and stripped the same way before scoring, because ``fuzz.ratio`` compares
    the strings as given (case-sensitively); without this a capitalised query
    is penalised against the lower-cased candidates.

    Args:
        pepper: Name to look up (any casing).
        threshold: A candidate is kept when its ``fuzz.ratio`` score is
            strictly greater than this value. Defaults to 70.

    Returns:
        list of the normalised PepperScale names scoring above ``threshold``;
        empty when nothing is similar enough.
    """
    query = pepper.lower().strip()
    candidates = (
        pepperscale["name"]
        .str.lower()
        .str.replace("pepper", "")
        .str.replace("chile ", "")
        .str.strip()  # removing "pepper" leaves trailing spaces that skew the ratio
    )
    return [
        candidate
        for candidate in candidates
        if fuzz.ratio(query, candidate) > threshold
    ]

def sanitize_name(name):
    """Turn a raw name cell into a cleaned, word-capitalised pepper name.

    Everything from the first "(" or "," onwards is dropped, the text is
    lower-cased, the substrings " pepper" and "the " are removed, and each
    remaining word is capitalised.

    Args:
        name: Cell content; a string, or an object whose ``str()`` form
            contains "<b>" (presumably a bold header cell).

    Returns:
        The cleaned name, or None when ``str(name)`` contains "<b>".
    """
    if "<b>" in str(name):
        return None

    base = name.split("(")[0]
    base = base.split(",")[0]
    cleaned = base.lower().replace(" pepper", "").replace("the ", "")
    words = (word.strip().capitalize() for word in cleaned.split())
    return " ".join(words)
    
def sanitize_shu(shu):
    """Parse a Scoville cell into a list of integers.

    The marker " (reported) " and all thousands commas are removed, the text is
    split on " - ", and every piece is converted with ``int``.

    Args:
        shu: Text such as "855,000 - 1,041,427" or "16,000,000".

    Returns:
        list of int: one value for a single figure, two for a range.

    Raises:
        ValueError: if any piece is not a valid integer literal.
    """
    cleaned = shu.replace(" (reported) ", "")
    cleaned = cleaned.replace(",", "")
    return list(map(int, cleaned.split(" - ")))

def sanitize_location(name):
    """Map the parenthesised part of a raw name cell to a country name.

    Only the text between the first and second "(" is inspected. Matches are
    tried in order: "Wales"/"England", then "South Carolina", then "Australia".

    Args:
        name: Cell content, e.g. "Carolina Reaper (South Carolina, USA)".

    Returns:
        "United Kingdom", "United States" or "Australia" when a marker is
        found; otherwise None (including when ``name`` is falsy, contains
        "<b>" in its string form, or has no "(").
    """
    if not name or "<b>" in str(name):
        return None

    pieces = name.split("(")
    if len(pieces) < 2:
        return None
    location = pieces[1]

    # Ordered (markers, country) pairs; the first pair with a hit wins.
    country_markers = (
        (("Wales", "England"), "United Kingdom"),
        (("South Carolina",), "United States"),
        (("Australia",), "Australia"),
    )
    for markers, country in country_markers:
        if any(marker in location for marker in markers):
            return country
    return None

def process_chiliworld_peppers(write=False):
    """Collect every pepper in the ChilliWorld Scoville table into a DataFrame.

    Iterates over the ``<tr>`` rows under the element with id
    ``ChilliPepperScovilleScale`` in the module-level ``chiliworld_html``. Each
    row must hold exactly two ``<td>`` cells, read as (SHU, name). Rows whose
    name cell is rejected by ``sanitize_name`` are skipped; no comparison
    against PepperScale is made here.

    Args:
        write: When true, also save the frame as
            ``../data/chiliworld_<YYYYMMDD>.csv`` (today's date). This is the
            same filename pattern ``process_chiliworld_peppers_compare`` uses,
            so whichever runs last overwrites the file.

    Returns:
        pandas.DataFrame with columns ``name``, ``link``, ``min_shu``,
        ``max_shu`` and ``origin``. ``min_shu`` is None for rows that list a
        single SHU figure.
    """
    link = "https://www.chilliworld.com/factfile/scoville-scale#ChilliPepperScovilleScale"
    rows = []

    for row in chiliworld_html.find(id="ChilliPepperScovilleScale").find_all("tr"):
        raw_shu, raw_name = [element.contents[0] for element in row.find_all("td")]
        name = sanitize_name(raw_name)
        if not name:
            continue
        shu = sanitize_shu(raw_shu)
        # A single figure is treated as the maximum, with no known minimum.
        min_shu, max_shu = shu if len(shu) > 1 else [None, shu[0]]
        rows.append([name + " Pepper", link, min_shu, max_shu, sanitize_location(raw_name)])

    labels = ["name", "link", "min_shu", "max_shu", "origin"]
    peppers = pd.DataFrame(rows, columns=labels)

    if write:
        file_name = "{}/chiliworld_{}.csv".format("../data/", str(datetime.now().date()).replace("-", ""))
        peppers.to_csv(file_name, index=False)
    return peppers
        
def process_chiliworld_peppers_compare(write=False):
    """Collect the ChilliWorld peppers that have no fuzzy match in PepperScale.

    Iterates over the ``<tr>`` rows under the element with id
    ``ChilliPepperScovilleScale`` in the module-level ``chiliworld_html``. Each
    row must hold exactly two ``<td>`` cells, read as (SHU, name). A row is
    kept only when ``sanitize_name`` yields a name, that name is not
    "Sweet Bell", and ``compare_pepper_to_pepperscale`` returns no candidates.

    Args:
        write: When true, also save the frame as
            ``../data/chiliworld_<YYYYMMDD>.csv`` (today's date).

    Returns:
        pandas.DataFrame with columns ``name``, ``link``, ``min_shu``,
        ``max_shu``, ``min_jrp``, ``max_jrp``, ``heat``, ``species`` and
        ``origin``; the jrp, heat and species columns are always None.
    """
    link = "https://www.chilliworld.com/factfile/scoville-scale#ChilliPepperScovilleScale"
    rows = []

    for row in chiliworld_html.find(id="ChilliPepperScovilleScale").find_all("tr"):
        raw_shu, raw_name = [element.contents[0] for element in row.find_all("td")]
        name = sanitize_name(raw_name)
        # Manual discard of sweet bell, since bell pepper is already in pepperscale.
        if not name or name == "Sweet Bell":
            continue
        if len(compare_pepper_to_pepperscale(name)) != 0:
            continue
        shu = sanitize_shu(raw_shu)
        # A single figure is treated as the maximum, with no known minimum.
        min_shu, max_shu = shu if len(shu) > 1 else [None, shu[0]]
        rows.append([
            name + " Pepper", link, min_shu, max_shu,
            None, None, None, None,
            sanitize_location(raw_name),
        ])

    labels = ["name", "link", "min_shu", "max_shu", "min_jrp", "max_jrp", "heat", "species", "origin"]
    new_peppers = pd.DataFrame(rows, columns=labels)

    if write:
        file_name = "{}/chiliworld_{}.csv".format("../data/", str(datetime.now().date()).replace("-", ""))
        new_peppers.to_csv(file_name, index=False)
    # Return the frame so a default (write=False) call still yields its result.
    return new_peppers

In [None]:
# Show the kernel's working directory; the relative "../data" paths used in
# this notebook resolve against it. os.getcwd() is used instead of the bare
# `pwd` automagic, which only works in single-line cells with automagic on.
os.getcwd()

In [None]:
# Run the ChilliWorld processing step, passing write=True.
# NOTE(review): confirm process_chiliworld_peppers actually honours `write`.
process_chiliworld_peppers(write=True)