In [161]:
import re
from bs4 import BeautifulSoup as soup
def preprocess(string):
    return re.sub('(\\n|\\t|\\r)', '', string.encode().decode('unicode_escape')).strip()

def extract_cocktail(page):

    # Get name
    name = page.find('h1').text

    # Get image_url
    image_url = page.find('div', {'class':'product-gallery-static'}).find('img').get('src')

    # Get serving_container and instructio
    cells = page.find('article').findAll('div', {'class':'cell'})
    for cell in cells:
        if 'Serve in' in str(cell):
            serving_container = preprocess(cell.find('a').text)
        if 'How to make' in str(cell):
            instructions = preprocess(cell.find('p').text)

    # Get ingredients
    ingredients = []
    itable = page.find('table', {'class':'ingredients-table'})
    rows = itable.findAll('tr')[:-1]
    for row in rows:
        cells = row.findAll('td')
        quantity = preprocess(cells[0].text)
        ingredient_link = cells[1].find('a')
        ingredient_link = ingredient_link.get('href') if ingredient_link else None
        ingredient = preprocess(cells[1].text)
        ingredients.append({'quantity':quantity, 'name':ingredient, 'link':ingredient_link})
    
    return {
        'name':name,
        'image_url':image_url,
        'serving_container':serving_container,
        'instructions':instructions,
        'ingredients':ingredients
    }

In [172]:
%%time
import os
directory = '../data_scraping/diffordsguide_cocktails'
cocktails = []

failed, succeeded = [], []
for filename in os.listdir(directory):
    try:
        file_path = directory + '/' + filename
        with open(file_path,'rb') as f:
            content = f.read()
            page = soup(content)
            cocktails.append(extract_cocktail(page))
        succeeded.append(filename)
    except Exception as e:
        print('FAILED:', filename, str(e))
        failed.append(filename)

FAILED: 3136.txt 'NoneType' object has no attribute 'find'
FAILED: 3863.txt 'NoneType' object has no attribute 'find'
FAILED: 4329.txt 'NoneType' object has no attribute 'find'
FAILED: 4063.txt 'NoneType' object has no attribute 'find'
FAILED: 3095.txt 'NoneType' object has no attribute 'find'
FAILED: 3917.txt 'NoneType' object has no attribute 'find'
FAILED: 3903.txt 'NoneType' object has no attribute 'find'
FAILED: 3902.txt 'NoneType' object has no attribute 'find'
FAILED: 3094.txt 'NoneType' object has no attribute 'find'
FAILED: 4062.txt 'NoneType' object has no attribute 'find'
FAILED: 3282.txt 'NoneType' object has no attribute 'find'
FAILED: 4512.txt 'NoneType' object has no attribute 'find'
FAILED: 920.txt 'NoneType' object has no attribute 'find'
FAILED: 3453.txt 'NoneType' object has no attribute 'find'
FAILED: 3862.txt 'NoneType' object has no attribute 'find'
FAILED: 3137.txt 'NoneType' object has no attribute 'find'
FAILED: 3135.txt 'NoneType' object has no attribute 'find

In [173]:
print(len(succeeded)/float(len(succeeded) + len(failed)), '% successfully processed')

0.9079829372333943 % successfully processed


In [211]:
# import pickle
# pickle.dump(cocktails, open('cleaned_cocktails.pkl', 'wb'))
cocktails = pickle.load(open('cleaned_cocktails.pkl','rb'))

# Get Master Ingredient List

In [212]:
import pandas as pd
df = pd.DataFrame([ing for cocktail in cocktails for ing in cocktail['ingredients']])
print('-----> Found {} total ingredients for {} total cocktails'.format(len(df), len(cocktails)))
df = df.drop_duplicates('name')
print('-----> Found {} unique ingredients de-duping on name'.format(len(df)))

-----> Found 13412 total ingredients for 2980 total cocktails
-----> Found 689 unique ingredients de-duping on name


# Ingredient Parsing

In [139]:
import requests
url = 'https://www.diffordsguide.com/beer-wine-spirits/525/bacardi-carta-blanca-light-rum'
content = str(requests.get(url).content)
ingredient_id = url.split('beer-wine-spirits/')[1].split('/')[0]
with open('./diffordsguide_ingredients/' + ingredient_id + '.txt', 'w') as f:
    f.write(content)

In [141]:

with open('./diffordsguide_ingredients/' + str(525) + '.txt','rb') as f:
    content = f.read()
page = soup(content)

In [158]:

def extract_ingredient(page):

    # Get name
    name = page.find('h1').text

    # Get image_url
    image_url = page.find('div', {'class':'product-gallery-static'}).find('img').get('src')

    # Get serving_container and instructio
    for cell in page.findAll('p'):
        if 'alc./vol:' in str(cell):
            alc_per_vol = preprocess(cell.text.replace('alc./vol:',''))
        if 'Proof:' in str(cell):
            proof = preprocess(cell.text.replace('Proof:',''))
        if 'Vintage:' in str(cell):
            vintage = preprocess(cell.text.replace('Vintage:',''))
        if 'Aged:' in str(cell):
            age = preprocess(cell.text.replace('Aged:',''))

    return {
        'name':name,
        'image_url':image_url,
        'alc_per_vol':alc_per_vol,
        'proof':proof,
        'vintage':vintage,
        'age':age
    }

In [157]:

extract_ingredient(page)

{'name': 'Bacardi Carta Blanca Superior White Rum',
 'image_url': 'https://cdn.diffords.com/contrib/bws/2019/03/5c7d58056dd16.jpg',
 'alc_per_vol': '37.5%',
 'proof': '75Â°',
 'vintage': 'Non-vintage',
 'age': 'No age statement'}