# Data Format And Export
1. Classify ingredients
2. Formats cocktails & generates cocktails_db.pkl file <i>(deployed with flask server)</i>
3. Formats ingredients & generates ingredients_db.pkl file

In [98]:
import pandas as pd
import pickle
import re
from collections import OrderedDict, Counter
from data_utils import preprocess, name_to_id

# Load the cleaned cocktail records. The context manager closes the file
# handle as soon as unpickling finishes.
# NOTE(security): pickle can execute arbitrary code on load; only open files
# from a trusted source.
with open('cleaned_cocktails.pkl', 'rb') as cocktails_file:
    cocktails = pickle.load(cocktails_file)

# 1. Classify & Format Ingredients 

In [99]:
# Flatten every cocktail's ingredient list into one row per ingredient occurrence.
df = pd.DataFrame([ing for cocktail in cocktails for ing in cocktail['ingredients']])
print('-----> Found {} total ingredients for {} total cocktails'.format(len(df), len(cocktails)))

# Keep only the first occurrence of each ingredient name.
df = df.drop_duplicates('name')
print('-----> Found {} unique ingredients de-duping on name'.format(len(df)))

# Take an explicit copy of the column subset so the assignments below write to
# an independent frame rather than a slice (avoids SettingWithCopyWarning).
df = df[['name', 'link']].copy()
df['name_clean'] = df['name'].map(preprocess)
df['objectID'] = df['name'].map(name_to_id)

-----> Found 13412 total ingredients for 2980 total cocktails
-----> Found 689 unique ingredients de-duping on name


In [100]:
def cr(keywords):
    """Compile one regex that matches any of `keywords` between word boundaries.

    Each keyword is inserted verbatim (it is not escaped), so an entry may
    itself be a regex fragment. The alternatives are joined as
    ``\\bA\\b|\\bB\\b|...``.
    """
    alternation = r'\b|\b'.join(keywords)
    return re.compile(r'\b{}\b'.format(alternation))

# Ordered table of category name -> compiled keyword pattern.
#
# Order is significant: classify() returns the first category whose pattern
# matches, so more specific entries must come before broader ones
# (e.g. 'coffee liqueur' before 'coffee', 'tonic water' before 'water').
# The table is built from a list of pairs rather than a dict literal so the
# order is preserved on every Python version, not only 3.7+.
mapping = OrderedDict([
    ('vodka', cr(['vodka'])),
    ('tequila', cr(['tequila'])),
    ('vermouth', cr(['vermouth', 'noilly prat extra dry'])),
    ('mezcal', cr(['mezcal'])),
    ('brandy', cr(['brandy', 'avallen calvados'])),
    ('rum', cr(['rum'])),
    ('whiskey', cr(['whiskey', 'scotch', 'whisky'])),
    ('gin', cr(['gin'])),
    ('amaro', cr(['amaro'])),
    ('sherry', cr(['sherry'])),
    ('coffee liqueur', cr(['coffee.*liqueur', 'kahlua'])),
    ('irish cream liqueur', cr(['irish cream liqueur'])),
    ('cinnamon schnapps', cr(['cinnamon schnapps'])),
    ('apple schnapps', cr(['apple schnapps'])),
    ('peach schnapps', cr(['peach schnapps'])),
    ('triple sec', cr(['triple sec'])),
    ('st-germain', cr(['st-germain', 'st germain', 'stgermain'])),
    ('sambuca', cr(['sambuca'])),
    ('grand marnier', cr(['grand marnier'])),
    ('cognac', cr(['cognac'])),
    ('tonic water', cr(['tonic water', 'soda water'])),
    ('white wine', cr(['white wine'])),
    ('red wine', cr(['red wine'])),
    ('port wine', cr(['port'])),
    ('simple syrup', cr(['sugar syrup', 'simple syrup', 'cane syrup'])),
    ('bitters', cr(['bitters'])),
    ('lime juice', cr(['lime'])),
    ('lemon juice', cr(['lemon'])),
    ('apple juice', cr(['apple juice'])),
    ('pineapple juice', cr(['pineapple juice'])),
    ('cranberry juice', cr(['cranberry juice'])),
    ('absinthe', cr(['absinthe'])),
    ('coffee', cr(['coffee', 'espresso'])),
    ('egg', cr(['egg'])),
    ('orange juice', cr(['orange'])),
    ('tea', cr(['tea'])),
    ('grenadine', cr(['grenadine'])),
    ('ginger ale', cr(['ginger ale'])),
    ('kiwi', cr(['kiwi fruit'])),
    ('watermelon', cr(['watermelon'])),
    ('passion fruit', cr(['passion fruit'])),
    ('champagne', cr(['champagne'])),
    ('amaretto', cr(['amaretto'])),
    ('worcestershire sauce', cr(['worcestershire sauce'])),
    ('kumquats', cr(['kumquats'])),
    ('basil', cr(['basil'])),
    ('sprite', cr(['lemonlime soda', 'sprite', '7up'])),
    ('cola', cr(['cola'])),
    ('figs', cr(['figs'])),
    ('mint', cr(['mint'])),
    ('pear', cr(['pear'])),
    ('water', cr(['water'])),
    ('ice', cr(['ice cubes?'])),
])

def classify(ingredient):
    """Return the category for an ingredient string.

    Walks `mapping` in order and returns the key of the first pattern that
    matches anywhere in `ingredient`. When nothing matches, the ingredient
    string itself is returned unchanged.
    """
    matching_categories = (
        category
        for category, pattern in mapping.items()
        if pattern.search(ingredient)
    )
    return next(matching_categories, ingredient)

# Assign a category to every unique ingredient from its cleaned name, then
# build an objectID -> category lookup.
df['category'] = df['name_clean'].map(classify)
ing_map = dict(zip(df['objectID'], df['category']))

# 2. Formats Cocktails & Generate cocktails_db.pkl File

In [101]:
def format_cocktail_ingredient(ing):
    """Annotate one cocktail ingredient dict with its id and category id.

    Adds two keys to `ing` in place and returns the same dict:
      - 'id': name_to_id() of the ingredient's 'name'
      - 'category_id': name_to_id() of the category stored in `ing_map`
        for that id
    """
    # NOTE: a latin1 -> utf-8 re-encoding of ing['name'] was tried here
    # previously and is currently disabled.
    ingredient_id = name_to_id(ing['name'])
    ing['id'] = ingredient_id
    ing['category_id'] = name_to_id(ing_map[ingredient_id])
    return ing
    
# Build the cocktails table: one row per cocktail, with each ingredient
# annotated by format_cocktail_ingredient().
cdf = pd.DataFrame(cocktails)
cdf['objectID'] = cdf['name'].map(name_to_id)
cdf['ingredients'] = cdf['ingredients'].map(
    lambda ingredients: [format_cocktail_ingredient(ing) for ing in ingredients]
)
cdf['num_ingredients'] = cdf['ingredients'].map(len)
cdf['ingredient_categories'] = cdf['ingredients'].map(
    lambda ingredients: [ing['category_id'] for ing in ingredients]
)

# Write through a context manager so the file is flushed and closed
# deterministically.
with open('cocktails_db.pkl', 'wb') as cocktails_db_file:
    pickle.dump(cdf, cocktails_db_file)

# 3. Formats Ingredients & Generates ingredients_db.pkl File

In [102]:
# Count how often each ingredient category id appears across all cocktails'
# ingredient lists (used for ranking purposes). A category that occurs twice
# in one cocktail is counted twice.
c = Counter(ing for ing_list in cdf.ingredient_categories for ing in ing_list)
icdf = pd.DataFrame(c.most_common(), columns=['objectID', 'count'])
icdf['name'] = icdf.objectID.map(lambda x: x.replace('-', ' '))

# Flag curated categories (those with an entry in `mapping`). The comparison
# is done on ids, the same way 'category_id' is derived, because `name` has
# every '-' replaced by ' ' and so could never equal a hyphenated key such as
# 'st-germain'.
priority_ids = {name_to_id(key) for key in mapping}
icdf['priority'] = icdf.objectID.map(lambda x: 1 if x in priority_ids else 0)

# Write through a context manager so the file is flushed and closed
# deterministically.
with open('ingredients_db.pkl', 'wb') as ingredients_db_file:
    pickle.dump(icdf, ingredients_db_file)



# SCRATCH

## Explore Ingredient ngrams

In [39]:
def get_ngrams(string, n=1):
    """Split `string` on '-' and return its n-grams as space-joined strings.

    Returns a list with one entry per run of `n` consecutive tokens; the list
    is empty when there are fewer than `n` tokens.
    """
    tokens = string.split('-')
    # Zipping the token list against itself shifted by 0..n-1 positions yields
    # each window of n consecutive tokens.
    shifted_views = (tokens[offset:] for offset in range(n))
    return [" ".join(window) for window in zip(*shifted_views)]
        
from collections import Counter
# Tally every bigram produced by get_ngrams() over the cleaned ingredient names.
c = Counter(
    ngram
    for name_clean in df.name_clean.tolist()
    for ngram in get_ngrams(name_clean, 2)
)