# Data Collection

## Recipe Crawler

In [1]:
# import packages
import requests
from bs4 import BeautifulSoup 
import time

In [2]:
base_url = "http://www.drinksmixer.com/cat/1/"
pages = range(1,125)  # total 124 pages exist (2019.10.29)

cocktail_links = []
cocktail_name_list = []

for i in pages:
    
    # Set URL
    url = base_url+str(i)
    req = requests.get(url)
    html = req.text
    
    # Parse HTML with bs4
    soup = BeautifulSoup(html,'html.parser')
    
    # Find all recipe links
    drinks_box = soup.find("div",{"class":"m1"}).find("div",{"class":"min"}).find("div",{"class":"clr"}).find("tr")
    urls_in_page = drinks_box.find_all("a")
    
    for link in urls_in_page:    
        cocktail_links.append("http://www.drinksmixer.com" + link["href"])
        cocktail_name_list.append(link.text)
        
        
print("Links collected in {}".format(time.ctime()))

Links collected in Tue Oct 29 15:28:23 2019


In [3]:
print(len(cocktail_links),len(cocktail_name_list))

12334 12334


### Save Data into Pickle

In [4]:
# import pickle

# with open("./pickle_data/list_of_cocktail_recipe_links.pickle", "wb") as t:
#     pickle.dump(cocktail_links, t)

# with open("./pickle_data/list_of_cocktail_names.pickle", "wb") as y:
#     pickle.dump(cocktail_name_list, y)

### Load Data into Pickle

In [30]:
import pickle 

with open("./pickle_data/list_of_cocktail_recipe_links.pickle", "rb") as t:
    cocktail_links = pickle.load(t)

with open("./pickle_data/list_of_cocktail_names.pickle", "rb") as y:
    cocktail_name_list = pickle.load(y)

## Cocktail Ingredient Crawler

In [37]:
from selenium import webdriver

path_to_chromedriver = "/Users/nowgeun/Desktop/chromedriver"
driver = webdriver.Chrome(path_to_chromedriver)

In [35]:
# Tracking progress
done = []

In [46]:
cocktail_recipes = {}
cocktail_instructions = {}
cocktail_descriptions = {}

for one_url in cocktail_links:
    driver.get(one_url)
    
    # Cocktail name
    cocktail_name = driver.find_element_by_class_name("recipe_title").text

    # Cocktail Recipe (Ingredients)
    cocktail_recipe = driver.find_element_by_class_name("recipe_data").find_elements_by_class_name("ingredient")
    
    # Cocktail Recipe (Instructions)
    try:
        cocktail_inst = driver.find_element_by_xpath("//*[@class='RecipeDirections instructions']")
    except:
        pass
    
    recipe_dict = {} # Recipe of one cocktail
    
    for ingrdnt in cocktail_recipe:
        amount = ingrdnt.find_element_by_class_name("amount").text
        ing_name = ingrdnt.find_element_by_class_name("name").text
        
        recipe_dict[ing_name] = amount
    
    # Save one cocktail recipe to the whole dictionaries
    cocktail_recipes[cocktail_name] = recipe_dict
    
    if cocktail_inst:
        cocktail_instructions[cocktail_name] = cocktail_inst.text.strip()
        
    done.append(one_url)

In [41]:
import time

cocktail_instructions = {}

for one_url in cocktail_links:
    driver.get(one_url)
    # Cocktail name
    cocktail_name = driver.find_element_by_class_name("recipe_title").text
    
    # Cocktail Recipe (Instruction)
    try:
        cocktail_inst = driver.find_element_by_xpath("//*[@class='RecipeDirections instructions']")
    except:
        pass
    
    if cocktail_inst:
        cocktail_instructions[cocktail_name] = cocktail_inst.text.strip()

    done.append(one_url)

In [45]:
# Redundant Recipes were in the list

len(set(cocktail_name_list)) == len(cocktail_recipes.keys())

True

In [44]:
# # Save it into pickle 
# import pickle

# with open("./pickle_data/cocktail_recipe_dict.pickle", "wb") as f:
#     pickle.dump(cocktail_recipes, f)


# with open("./pickle_data/cocktail_recipe_instruction.pickle", "wb") as j:
#     pickle.dump(cocktail_instructions, j)

### Load Data

In [4]:
import pickle
import pandas as pd
import numpy as np

with open("./pickle_data/cocktail_recipe_dict.pickle", "rb") as g:
    cocktail_recipes = pickle.load(g)
    
with open("./pickle_data/cocktail_recipe_instruction.pickle", "rb") as h:
    cocktail_instructions = pickle.load(h)

### Cleaning Data

In [5]:
data_dict = {}
for key,value in cocktail_recipes.items():
    data_dict[key.replace(" recipe","")] = value

In [6]:
df = pd.DataFrame(data_dict)
df = df.fillna("") # fill NaN value with empty string ""
print(df.shape)
df.head()

(1974, 12242)


Unnamed: 0,'61 Imperial,.50 Caliber,007,10 Deep,100 Miles per Hour,136,151 Reasons,17 Twist,1800 Agave Nectar Squeeze,1800 Cosmolito,...,Zombie #4,Zombie #5,Zombie #6,Zombie (UK Style),Zone 23,Zoot,Zorbatini,Zorro's Revenge,Zultry Zoe,Zulu
white rum,1/2 oz,,,,,,,,,,...,1 tbsp,,,,,,,,,
vodka,1/2 oz,,,1.5 oz,,,,,,,...,,,,,1 oz,,,,,
151 proof rum,1/2 oz,,,,,,3/4 oz,,,,...,1 tsp,1/2 oz,2 cl,,,,,,,
creme de bananes,1/2 oz,,,,,,,,,,...,,,,,,,,,,1/2 oz
Blue Curacao liqueur,1 oz,,,,2 oz,,,,,,...,,,,,,,,,,


#### Some tests & checking data

In [7]:
# Test that no columns overlap
assert len(list(map(lambda x: x.lower(), list(df.columns)))) == len(set(df.columns))

In [8]:
# How many different values are existent?
all_values = list(np.unique(df.values))
len(all_values)

1713

In [9]:
from random import randint

# Generate a list of random numbers
rand_num_list = [randint(0,1713) for i in range(0,40)]

# Let's see some sample...
see_sample = [all_values[i] for i in rand_num_list]
see_sample

['1/4 - 1/2 ozfresh',
 '1/2 gal',
 '5 - 10 drops',
 '7 cl',
 '3/4 - 1 ozpremium',
 '1 sliced up',
 '3 leaves',
 '1 bottleOld 1889',
 '3 tbspfresh',
 '1/4 cup',
 '1 splash(4 ml)',
 '4 - 6 oziced',
 '2 ozblackberry infused',
 '8 ozSunny Delight',
 'Whole',
 '50 clfresh',
 '1 tspcrushed',
 '2 cupshalved',
 'fill with 1/3',
 '4 ozcan',
 '4-5',
 '1/4 shot',
 '1 tsp(heaped)',
 '3 1/2 ozfull pulp',
 '2 squirts',
 '3/4 tsp',
 'dry',
 '4 - 5 cl',
 '1 piece',
 '1 ozCanadian Hunter',
 '33 cl',
 '1 splash(4 ml)',
 '1 peeled, whole',
 'juice of 1',
 '2 splashes',
 '3 - 5 dashes',
 '1 partVanilla flavored',
 '1/3 pint',
 '1 ozSour Monkey',
 '50 mlWyborrow']

#### Exploring all the units in this recipe

In [10]:
import re

no_numbers = []
for amt in all_values:
    no_numbers.append(re.sub("[^a-zA-Z ]","",amt).strip())

In [11]:
no_numbers = list(set(no_numbers))
len(no_numbers)

835

In [12]:
print(no_numbers)

['', 'ozpink', 'Top with', 'glasschilled', 'ozchilled can', 'garnish', 'serving', 'ozcold', 'shelled', 'bottleBig Tom', 'tbspblack', 'kaffir', 'ozGrapefruit', 'long green seedless', 'ozBAWLS Guarana', 'ozextra dry', 'ozchilled', 'mlboiling', 'ozDiet', 'pinchground', 'mlfresh', 'clyellow', 'oziced', 'ozIzze Sparkling Clementine', 'Squeezed', 'ozPampero Anniversario', 'ozcanned', 'drops', 'ozblood', 'peeled and sliced', 'ozRye', 'ozsparkling', 'quart', 'garlic stuffed', 'ozconcentrated', 'pinchdried', 'fill with unsweetened', 'shotsStrawberry', 'ozred', 'glasses', 'fresh chopped', 'squirts', 'Carlo Rossi Burgundy', 'handfulcrushed', 'whole fresh', 'ozfull pulp', 'dropblue', 'ozpulpfree', 'ozSquirt', 'Slice of', 'partsice cold', 'chocolate', 'shot', 'count', 'ozCrown Royal', 'ozreposado', 'long', 'bottle', 'squirt', 'ozDemerara El Dorado', 'bag', 'tspcoarsely ground', 'fresh squeezed', 'scoopsneopolitan', 'partsBerry Blue', 'ozSweetened', 'ozKajmir', 'whole Baby', 'liters', 'ozlemon with'

## Problems and Challenges

1. Too many units (need them unified...)

2. Too many versions of cocktails. (Zombie1, Zombie2, Old fashioned(New York style))

3. Sparse matrix --> Computation time

4. Same ingredients in different name --> Named Entity Recognition Problem? (NER)

### Liquid Measurements

In [13]:
# use oz as standard unit
liquid_units = {"oz":1,
                "ml":0.033814,
                "cl": 0.33814,
                "tsp":0.166667,
                "teaspoon":0.166667,
                "tea spoon":0.166667,
                "tbsp":0.5,
                "tablespoon":0.5,
                "table spoon":0.5,
                "cup": 8,
                "cups": 8,
                "qt":0.03125,
                "quart":0.03125,
                "drop":0.0016907
               }

In [14]:
def frac_to_dec_converter(num_strings):
    """
    Takes a list of strings that contains fractions and convert them into floats.

    @Params
    - list_of_texts: list of str

    @Returns
    - list of floats

    @Example:
    [ln] >> frac_to_dec_converter(["1", "1/2", "3/2"])
    [Out] >> [1.0, 0.5, 1.5]
    """
    result = []

    for frac_str in num_strings:
        try:
            converted = float(frac_str)
        except ValueError:
            num, denom = frac_str.split('/')
            try:
                leading, num = num.split(' ')
                total = float(leading)
            except ValueError:
                total = 0
            frac = float(num) / float(denom)
            converted = total + frac

        result.append(converted)
        
    return result

In [15]:
def unit_unify(list_of_texts):
    """
    Takes a list of strings that contains liquid units, and convert them into fluid ounces.
    
    @Params
    - list_of_texts: list of str
    
    @Returns
    - list of str
    
    @Example:
    [ln] >> detector(["1 oz", "2ml", "4cup"])
    [Out] >> ["1 oz", "0.067628 oz", "32 oz"]
    """
    import re # use regex to find units
    
    # Defining re pattern
    pattern = r"(^[\d -/]+)(oz|ml|cl|tsp|teaspoon|tea spoon|tbsp|tablespoon|table spoon|cup|cups|qt|quart|drop|drops)"
    
    # Create Empty list to store refined data
    new_list = []
    
    
    # Search
    for text in list_of_texts:
        re_result = re.search(pattern, text)
        
        # If there is a matching result
        if re_result:
            # Seperate the matched pattern into two groups: amount(numbers), unit(measurement)
            amount = re_result.group(1).strip()
            unit = re_result.group(2).strip()

            # Convert all unit into oz
            ### Checking range in values 
            if "-" in amount:
                ranged = True
            else:
                ranged = False
            
            ### Replace non digit characters to plus sign
            ###### Dealing with exception type1: (1 /12 oz should be 1/12 oz)
            amount = re.sub(r"(\d) (/\d)",r"\1\2",amount) 
            amount = amount.replace("-","+").replace(" ","+").strip()
            ###### Dealing with exception type2: (1 - 2 produces 1+++2)
            amount = re.sub(r"[+]+","+",amount)
            ### Split them and add
            amount_in_dec = frac_to_dec_converter(amount.split("+"))
            amount = np.sum(amount_in_dec)
            
            if ranged:
                to_oz = (amount*liquid_units[unit])/2
            else:
                to_oz = amount*liquid_units[unit]

            # append refined string to the new list
            new_list.append(str(round(to_oz,2))+" oz")

        else:
            new_list.append(text)
            
    return new_list

In [16]:
for drink in df.columns:
    df[drink] = unit_unify(df[drink])

In [161]:
df.to_csv("recipe_cleaned_v1.csv")

#### Read cleaned Recipe

In [19]:
import pandas as pd

In [20]:
df = pd.read_csv("recipe_cleaned_v1.csv", index_col=0, dtype=str)
df = df.fillna("0")
df

FileNotFoundError: [Errno 2] File recipe_cleaned_v1.csv does not exist: 'recipe_cleaned_v1.csv'

In [21]:
for col in df:
    pair = zip(df.index,df[col].values)

In [22]:
list(df.index)

['white rum',
 'vodka',
 '151 proof rum',
 'creme de bananes',
 'Blue Curacao liqueur',
 'pineapple juice',
 "Jack Daniel's® Tennessee whiskey",
 'Jim Beam® bourbon whiskey',
 'dry gin',
 'Absolut® vodka',
 'Squirt® citrus soda',
 'Stoli® Ohranj vodka',
 'orange juice',
 'soda water',
 'Malibu® coconut rum',
 'ice',
 'cranberry-raspberry juice',
 'limes',
 'Wild Turkey® bourbon whiskey',
 'Bacardi® 151 rum',
 'Southern Comfort® peach liqueur',
 'Yukon Jack® Canadian whisky',
 'grenadine syrup',
 'rum',
 'Pepsi® Vanilla cola',
 '7-Up® soda',
 'lemonade',
 'Smirnoff® Raspberry Twist vodka',
 'Mountain Dew® citrus soda',
 '1800® Silver Tequila',
 'Agavero agave liqueur',
 'agave juice',
 'lime',
 'orange liqueur',
 'cranberry juice',
 'lime juice',
 '1800® Reposado Tequila',
 'creme de cassis',
 'apple cider',
 'lemon juice',
 'simple syrup',
 'club soda',
 'mint',
 'lemon',
 'passion-fruit juice',
 'raspberry liqueur',
 'pear puree',
 'pear',
 'PAMA® pomegranate liqueur',
 'grapefruit ju

### Check what hasn't been converted

In [23]:
import re

still_not_numbers = []
for amt in df.values.ravel():
    still_not_numbers.append(re.sub("[^a-zA-Z ]","",amt).strip())

In [24]:
still_not_numbers = list(set(still_not_numbers))

In [25]:
len(still_not_numbers)

428

In [26]:
still_not_numbers

['',
 'Top with',
 'glasschilled',
 'garnish',
 'shelled',
 'kaffir',
 'serving',
 'bottleBig Tom',
 'long green seedless',
 'pinchground',
 'Squeezed',
 'peeled and sliced',
 'garlic stuffed',
 'fill with unsweetened',
 'pinchdried',
 'shotsStrawberry',
 'glasses',
 'fresh chopped',
 'squirts',
 'handfulcrushed',
 'whole fresh',
 'Carlo Rossi Burgundy',
 'Slice of',
 'partsice cold',
 'chocolate',
 'shot',
 'count',
 'long',
 'bottle',
 'squirt',
 'bag',
 'fresh squeezed',
 'scoopsneopolitan',
 'partsBerry Blue',
 'whole Baby',
 'liters',
 'thin slice',
 'Freshly squeezed',
 'pintshulled quartered',
 'scoops',
 'packet',
 'medium ripe',
 'chocolate covered',
 'mediumsized',
 'black',
 'fresh tangelo or',
 'jiggerTanduay',
 'package',
 'splashfresh',
 'cansweetened',
 'splash proof',
 'splashsweetened',
 'raw',
 'small scoop',
 'bar',
 'fresh cut',
 'packageMarshmellows',
 'stoned',
 'fresh',
 'pinchcracked',
 'touch',
 'glass',
 'gallon',
 'dashfresh',
 'splashsmall',
 'fresh mint',
 

In [27]:
recipe_corpus = " ".join(list(cocktail_instructions.values()))

In [28]:
list(cocktail_instructions.values())

['In a tall chimney glass or hurricane glass, fill with cubed ice; add all liquor except the blue curacao. Add the pineapple juice, stir well, and float the blue curacao on top. Garnish with a pineapple slice and a maraschino cherry, and serve.',
 "Pour the Jack Daniel's Tennessee whiskey, Jim Beam bourbon whiskey, dry gin and Absolut vodka into a highball glass half-filled with ice cubes. Add Squirt, and serve.",
 'Serve on ice in a highball glass.',
 'All right start with 1/2 cup of ice mix the vodka and the coconut at the same time but leave the vodka for an extra second...then add the cran-rasb until it turns red and then add ur lime and tampico....make sure to mix it..',
 'Mix all ingredients together over ice in a highball glass. Shake up, then top with sprite and serve.',
 'Mix the rum, pepsi vanilla and grenadine in a cup and serve chilled, or mix the grenadine and rum in a shot glass and drop into a cup of pepsi vanilla.',
 'Stir together in a highball glass filled with ice cu

## Preprocessing for embedding

In [30]:
def phrase_to_word(phrase):
    """
    Replace spaces in a phrase to underscores to treat it as one word
    """
    if type(phrase) == str:
        return "_".join(phrase.split())
    
    elif type(phrase) == list:
        return ["_".join(element.split()) for element in phrase]

##### Processing recipe ingredients

In [18]:
cocktail_recipes[".50 Caliber recipe"]

{"Jack Daniel's® Tennessee whiskey": '2 oz',
 'Jim Beam® bourbon whiskey': '2 oz',
 'dry gin': '2 oz',
 'Absolut® vodka': '2 oz',
 'Squirt® citrus soda': '1 oz'}

In [19]:
cocktail_instructions

{"'61 Imperial recipe": 'In a tall chimney glass or hurricane glass, fill with cubed ice; add all liquor except the blue curacao. Add the pineapple juice, stir well, and float the blue curacao on top. Garnish with a pineapple slice and a maraschino cherry, and serve.',
 '.50 Caliber recipe': "Pour the Jack Daniel's Tennessee whiskey, Jim Beam bourbon whiskey, dry gin and Absolut vodka into a highball glass half-filled with ice cubes. Add Squirt, and serve.",
 '007 recipe': 'Serve on ice in a highball glass.',
 '10 Deep recipe': 'All right start with 1/2 cup of ice mix the vodka and the coconut at the same time but leave the vodka for an extra second...then add the cran-rasb until it turns red and then add ur lime and tampico....make sure to mix it..',
 '100 Miles per Hour recipe': 'Mix all ingredients together over ice in a highball glass. Shake up, then top with sprite and serve.',
 '136 recipe': 'Mix the rum, pepsi vanilla and grenadine in a cup and serve chilled, or mix the grenadin

In [155]:
corpus = []

for key,value in cocktail_instructions.items():
    drink_name = phrase_to_word(key)
    drink_name = drink_name.replace(" recipe","")
    ingred_names = phrase_to_word(list(cocktail_recipes[key].keys()))
    ingred_names = ", ".join(ingred_names)
    
    sentence = "{} is made with {}".format(drink_name, ingred_names)
    
    corpus.append(value + " " + sentence)
    

## Embedding with Fasttext

In [156]:
import fasttext
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

In [157]:
corpus = " ".join(corpus)
with open("./corpus.txt","w") as f:
    f.write(corpus)

In [158]:
model = fasttext.train_unsupervised("./corpus.txt")

### Approach 1

In [159]:
embedded_drinks = [model.get_word_vector(x) for x in df.columns]

In [160]:
sim_matrix = pd.DataFrame(cos_sim(embedded_drinks), columns=phrase_to_word(list(df.columns)),
                            index=phrase_to_word(list(df.columns)))

In [161]:
def cos_sim_drinks(drink):
    """
    Returns the top 30 cosine similarity score of input drink name and other drinks.
    """
    return sim_matrix.sort_values(by=[drink],ascending=False)[drink][:30]

In [25]:
df

Unnamed: 0,'61 Imperial,.50 Caliber,007,10 Deep,100 Miles per Hour,136,151 Reasons,17 Twist,1800 Agave Nectar Squeeze,1800 Cosmolito,...,Zombie #4,Zombie #5,Zombie #6,Zombie (UK Style),Zone 23,Zoot,Zorbatini,Zorro's Revenge,Zultry Zoe,Zulu
white rum,0.5 oz,0,0,0,0,0,0,0,0,0,...,0.5 oz,0,0,0,0,0,0,0,0,0
vodka,0.5 oz,0,0,1.5 oz,0,0,0,0,0,0,...,0,0,0,0,1.0 oz,0,0,0,0,0
151 proof rum,0.5 oz,0,0,0,0,0,0.75 oz,0,0,0,...,0.17 oz,0.5 oz,0.68 oz,0,0,0,0,0,0,0
creme de bananes,0.5 oz,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.5 oz
Blue Curacao liqueur,1.0 oz,0,0,0,2.0 oz,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Alexander Keith's® Red Amber Ale,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
onion,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Faygo® Classic blueberry soda,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Vault® Zero,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [108]:
df.columns.tolist()

["'61 Imperial",
 '.50 Caliber',
 '007',
 '10 Deep',
 '100 Miles per Hour',
 '136',
 '151 Reasons',
 '17 Twist',
 '1800 Agave Nectar Squeeze',
 '1800 Cosmolito',
 '1800 Manzana Cider',
 '1800 Mint Lemonade',
 '1800 Passion',
 '1800 Pear Margarita',
 '1800 Pineapple Pomegranate',
 '1800 Pink Grapefruit Margarita',
 '1800 Pomegranate Breeze',
 '1800 Silver Dog',
 '187 Cocktail',
 '1964 Car Bomb',
 '1:00 Sunrise',
 '2 Step',
 '2000 Flushes',
 '2012',
 '209 East Cocktail',
 '242',
 '3 Fast Olives',
 '3 Fingers of Ron Burgundy',
 '3 for a Dollar Special',
 '302',
 '357 Magnum',
 '4 Godfathers',
 '414SS Daiquiri',
 '42 Flying Mules',
 '502',
 '57 Chevy',
 '5th Avenue',
 '6 Wise Men',
 '6am Sunrise',
 '7 Coconuts',
 '7 on 7',
 '7-Nut',
 '73 Bus',
 '73 Bus #2',
 '8-Ounce Twista',
 "80's Berry Wine Cooler",
 "80's Orange Wine Cooler",
 "80's Peach Wine Cooler",
 "80's Wine Cooler",
 '8th Birthday',
 'A Big Pink Dink',
 'A Bitter Canadian',
 'A Brain Teaser',
 "A Captain's Paradise",
 'A Clockwo

In [113]:
cos_sim_drinks("Bangin' Your Wife's Girlfriend")

Bar_Slut                                   1.000000
Apple_Slut                                 0.434556
YoYo_Blow_Out                              0.416003
Scotch_Daisy                               0.359501
Fruity_Slut                                0.346615
Babygirl                                   0.343489
Dirty_Bloody_Red_Headed_Slut               0.324251
Hawaiian_Ho                                0.315853
Hairy_Armpit                               0.311395
Cream_Bomb                                 0.307801
Hawaiian_Sea_Breeze                        0.303913
Pink_Mikey                                 0.302442
Cosmopolitan_Heston_Bar_Style_(Indiana)    0.298396
Bangkok_Bomb                               0.297128
Red_Gin                                    0.293167
French_Horn                                0.286994
Negus                                      0.284294
Hawaiian_Shoreline                         0.278864
Super_Slice                                0.278154
Reve_Satin  

### Approach 2

In [119]:
cocktail_recipes["Bangin' Your Wife's Girlfriend recipe"]

{'anisette': '1 tsp',
 'bourbon whiskey': '1 1/2 oz',
 'creme de cassis': '1 tsp',
 'Pernod® licorice liqueur': '1 tsp',
 'sugar syrup': '1 tsp'}

In [129]:
def drink_to_vector(drink_name):
    """
    Embedding drink name to vector using embedded ingredients
    """
    try:
        amounts = np.array([float(x.split(" ")[0]) for x in list(df[drink_name])])
    except:
        print(drink_name)
    ingred_vectors = np.array([model.get_word_vector(x) for x in df.index])
    
    return np.dot(amounts, ingred_vectors)

## TF-IDF

In [139]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words="english", token_pattern="(?u)\\b[\\w0-9][\\wÀ-ÖØ-öø-ÿ®`',.0-9&™\-()!:*]+\\b")
X = tfidf.fit_transform(corpus)

In [154]:
pd.DataFrame(X.todense(), columns=tfidf.get_feature_names())

Unnamed: 0,"0,05l","0,5l",0.15,0.5,0.75,007_recipe,0f,1-1,1-2,1-5,...,zorbatini_recipe,zorro's_revenge_recipe,zubrowka,zubrowka®_vodka,zultry_zoe_recipe,zulu_recipe,zwack,zwack®_apricot_brandy,â½,â¾
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.705351,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12237,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
12238,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.586639,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
12239,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.678458,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
12240,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.365441,0.000000,0.0,0.0,0.0,0.0


## Co-occurrence Network

In [45]:
import networkx as nx

### Common Ingredient Cocktail Network

- **Node (Vertex)**: Cocktail
- **Edge**: If a pair of cocktail shares ingredient, weight 1 will be added. 

In [61]:
def common_value_network(data,threshold):
    """
    Take dictionary of dicts/lists/sets and compute number of common elements.
    
    @parameter
    ----
    data - dict of dicts/lists/sets
    threshold - least number of common elements required.
    
    @returns
    ----
    list - [(data.key1, data.key2, {"weight": int}), 
            (data.key2, data.key5, {"weight": int}), ... ]
    
    @example
    ----
    >> abc = {"a":[1,2,3], "b":[2,5,6], "c":[5,6,7]}
    >> common_value_network(abc, 2)

    [("b","c", {"weight": 2})]
    
    """
    from itertools import combinations
    
    # Pairing dictionary keys (cocktails)
    drink_pairs = combinations(data_dict.keys(),2)
    
    # List to save edges
    edges = []

    for pair in drink_pairs:
        common_ing = set(data[pair[0]]).intersection(set(data[pair[1]]))
        if len(common_ing) >= threshold:
            edges.append((pair[0],pair[1],{"weight": len(common_ing)}))
            
    return edges

In [62]:
cvn_thres1 = common_value_network(data_dict,1)
cvn_thres2 = common_value_network(data_dict,2)
cvn_thres3 = common_value_network(data_dict,3)

print(len(cvn_thres1), len(cvn_thres2), len(cvn_thres3), end="\n")

10787826 1179265 122416


#### Save edge lists as pickle

In [65]:
# with open("./pickle_data/common_ingredient_cocktail_network_threshold_1.pickle", "wb") as f:
#     pickle.dump(cvn_thres1, f)    
# with open("./pickle_data/common_ingredient_cocktail_network_threshold_2.pickle", "wb") as g:
#     pickle.dump(cvn_thres2, g)    
# with open("./pickle_data/common_ingredient_cocktail_network_threshold_3.pickle", "wb") as h:
#     pickle.dump(cvn_thres3, h)
    
# f.close()
# g.close()
# h.close()