# Scripts for learning 

In [1]:
dimensions_ingredients = 702
dimensions_nutrition_facts = 22
number_of_clusters=100

In [2]:
# Some Imports
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import TruncatedSVD
import sklearn.metrics as sm
import numpy as np
import json
import pickle

In [3]:
# Define Inputs
X = None
Y = None

# Load Recipes
filename="recipes.json"
with open(filename) as json_data:
    recipes = json.load(json_data)
    Y = list(recipes.keys())
    X = np.zeros([len(Y), dimensions_ingredients+dimensions_nutrition_facts])
    index = 0
    for label, recipe in recipes.items():
        for ingredient in recipe["ingredients"]:
            X[index, ingredient[0]] = ingredient[1]
        for fact in recipe["nutrition"]:
            X[ index, dimensions_ingredients + fact[0] ] = fact[1]
        index+=1
print("File "+filename+" was successfully read")

File recipes.json was successfully read


In [4]:
def removeOutlayers(X,Y,thr=45.0):
    X_new = []
    Y_new = []
    mtrs = [(np.mean(xi), np.std(xi)) for xi in X.T]
    for xi,yi in zip(X,Y):
        out = False
        for j,f in enumerate(xi):
            if f > thr*mtrs[j][1]:
                out = True
        if not out:
            X_new.append(xi)
            Y_new.append(yi)
    return np.asarray(X_new), Y_new
    

X_new, Y_new = removeOutlayers(X,Y)

print('Original observations', X.shape)
print('After removing outlayers', X_new.shape)
        

Original observations (18215, 724)
After removing outlayers (17273, 724)


In [5]:
X = X_new
Y = Y_new

In [6]:
# See one example
print(Y[0])
print(X[0,:])

German Chocolate Picnic Cake
[  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   6.65849942e-02   0.00000000e+00   0.00000000e+00
   0.

In [7]:
# Train K-Means
# model = KMeans(n_clusters=number_of_clusters)
# model.fit(X)

In [8]:
# Show the labels
# model.labels_

In [9]:
# Show one of the groups
groups = []
for group in range(0, number_of_clusters):
    groups.append([])
#for index in range(0,len(Y)):
    # groups[model.labels_[index]].append(Y[index])

# Now go with nearest neighbors

In [10]:
X_ingredients = X[:,0:dimensions_ingredients]

from sklearn.neighbors import NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=10, algorithm='ball_tree').fit(X_ingredients)

In [11]:
# Find the nearest neighbor to a given one
def _neighbors_of(X):
    X_=np.mean(X, axis=0)
    X_ = X_.reshape(1,-1)
    neighbors=[]
    distances, indices = nbrs.kneighbors(X_)
    for n_index in indices[0]:
        neighbors.append(Y[int(n_index)])
    return neighbors
    
def neighbors_of(index):
    X=X_ingredients[index:index+1,:]
    return _neighbors_of(X)
    
neighbors_of(100)

['Dinner Tonight: Pasta with Roasted Cauliflower, Chickpeas, and Ricotta',
 'Broccoli Ricotta Pasta',
 'Broccoli And Feta Pasta Salad',
 'Pasta with Pistachios, Meyer Lemon and Broccoli',
 'Dinner Tonight: Pasta e Broccoli',
 'Pasta with Cauliflower',
 'Fresh Sardines Grilled in Paper With Melted Butter and Lemon',
 'Wasabi Honey Coleslaw',
 'Pasta with Prosciutto and Lettuce',
 'Honey-Balsamic Bean Salad']

In [12]:
# Or find the nearest to a group of those
def neighbors_of_list(elements, cluster=False):
    X=X_ingredients[elements,:]
    if cluster:
        print("Not implemented")
    else:
        return _neighbors_of(X)
        
neighbors_of_list([0, 268, 3930, 100])

['Chinois Chicken Salad',
 'Autumn Game Casserole',
 'Five Heads of Garlic Roast Chicken',
 'Roast Chicken with Truffles and Truffle Butter',
 'Chicken Stracciatella with Baby Spinach',
 'Sticky barbecue chicken wings and drumsticks',
 'Braised Rabbit with Mustard and Fennel',
 "James's Chicken Curry with Cashews",
 'Quick Chicken-Sesame Rice Balls',
 'Preserved Lemon Chicken']

# Singular value decomposition


In [13]:
svd = TruncatedSVD(n_components=100, n_iter=9, random_state=42)
X_svd = svd.fit_transform(X)

# Clustering 

In [14]:
# Train K-Means
number_of_clusters=120
model = KMeans(n_clusters=number_of_clusters)
model.fit(X_svd[100:])


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=120, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [15]:
preds = model.predict(X_svd[-100:])

In [16]:
def printCluster(preds,Y,n):
    c = 0
    for i in range(len(preds)):
        if preds[i]==n:
            c += 1
            print(Y[i])
    print ("Total:", c)
            
            
def getCentroid(cluster):
    return np.asarray([np.mean(c) for c in np.transpose(cluster)])

def getCentroidNutrition(cluster):
    return np.asarray([np.mean(c) for c in np.transpose(cluster[-22:])])

def getIndices(preds, idx):
    return [i for i in range(len(preds)) if preds[i] == idx]

def dist(x,y):   
    return np.sqrt(np.sum((x-y)**2))

In [17]:
preds

array([ 99,  79,  19,  51,  39,  99,  99,  87, 115,  70,  92, 117,  78,
        15,  83,  95,  15,  99,  70,  96,  45,  46,   5,  96,  70,  46,
        46,   0,  83,  78,   0,   5,  75,  70, 112,  70,  79,  95,  70,
        79,  66,  75, 105,  60,  94,  45,  51,  39,  15,  39,  66, 104,
        96,  31, 105,  74,  70, 105, 105,  96,  99,   5,  72,   5,  39,
        70,  46,  41,  94,  41, 117,  99,  99,  97, 115,  74,  60,  11,
        15,  60,  15,  70,  39,  37,  94,  15,   0,  70,  39,  87,  70,
        95,  41,  41,   5,  96,  70,  10,  96,  81], dtype=int32)

In [49]:
#Hierachical clustering
#aggModel = AgglomerativeClustering(n_clusters=120, linkage='ward')
aggModel = pickle.load(open('aggClusterModel.pkl','rb'))
preds = aggModel.fit_predict(X_svd)

In [19]:
preds[236]

74

In [35]:
c10 = getCentroid(X_svd[getIndices(preds,10)])
c25 = getCentroid(X_svd[getIndices(preds,25)])
c39 = getCentroid(X_svd[getIndices(preds,39)])
c16 = getCentroid(X_svd[getIndices(preds,16)])
c24 = getCentroid(X_svd[getIndices(preds,24)])
c = getCentroid(X_svd[getIndices(preds,20)])

print(dist(c10,c25))
print(dist(c10,c16))
print(dist(c10,c39))
print(dist(c10,c24))
print(dist(c10,c))
print(dist(c,c39))

0.449451345785
0.516873117783
1.05036437734
2.4941222311
4.93094324146
4.69125493973


In [47]:
c10 = getCentroidNutrition(X_svd[getIndices(preds,10)])
c25 = getCentroidNutrition(X_svd[getIndices(preds,25)])
c39 = getCentroidNutrition(X_svd[getIndices(preds,39)])
c16 = getCentroidNutrition(X_svd[getIndices(preds,16)])
c24 = getCentroidNutrition(X_svd[getIndices(preds,24)])
c2 = getCentroidNutrition(X_svd[getIndices(preds,2)])

print(dist(c39,c25))
print(dist(c39,c16))
print(dist(c10,c39))
#print(dist(c10,c24))
#print(dist(c10,c2))
#print(dist(c2,c24))

1.34100197858
1.60209833113
1.09844816872


In [33]:
printCluster(preds,Y,39)

At Home Cherry Garcia Ice Cream
Rabbit River Farms Egg Dip Recipe
German Chocolate Cake Recipe
Cranberry Mousse
Warm Chocolate Cakes with Mascarpone Cream
Recipe
French in a Flash (Classic): Vanilla Bean Crème Brûlée
Caramel Apples with Nuts
Bittersweet Chocolate Mousse
Chocolate Hazelnut Cheesecake Recipe
Apple pie with dulce de leche
French Waffles with Cointreau Whipped Cream
Chocolate Nut Muffins
Blueberry Soured Cream Cake
Chocolate Nut Upside-Down Cake
Aurora Tart
Matt And Ted Lee's Caramel Cake Recipe
Honey Lemon Pots De Creme
Goat-Cheese Panna Cotta With Bacon Syrup and Cashews
Béarnaise sauce
Toasted-Coconut Refrigerator Cake
Carrot Cake Ice Cream
Chocolate Espresso Mousse Cake
Chocolate Nutella Cookies
Chocolate Pots de Creme
Bananas Foster French Toast
Apple Dumplings
Pomegranate-Lemon-Mascarpone Tart with Coconut-Macadamia Nut Crust
Chocolate Whiskey Pots De Creme
Easy Vanilla Ice Cream
Macadamia Cake With Lime Syrup
Gashouse Eggs
Vanilla ice cream
Chocolate-Banana Tart
Lem

In [40]:
[(i,y) for i,y in enumerate(Y[0:100])]

[(0, 'German Chocolate Picnic Cake'),
 (1, 'Thai Black Rice Salad'),
 (2, 'Brown Rice Salad with Crunchy Sprouts and Seeds'),
 (3, 'Gluten-free blueberry muffins'),
 (4, 'Milk Chocolate and Black Pepper Ice Cream Recipe'),
 (5, 'Debesmanna (Cranberry Mousse)'),
 (6, 'Herb Roasted Chicken with Lemon and Sage'),
 (7, 'Mexican Mojito'),
 (8, 'Tropical Sunrise Porridge (Oatmeal)'),
 (9, 'Miso Cod | Steamy Kitchen'),
 (10, 'Sugared Jelly Candies'),
 (11, "Ramen with Chashu Pork from 'Hiroko's American Kitchen'"),
 (12, 'The Best Chocolate Ice Cream'),
 (13, 'Slow-Cooked Lamb with Lemon and Oregano'),
 (14, 'Sour Cherry Lattice Pie'),
 (15, 'Spinach Salad with Cherries, Goat Cheese and Walnuts'),
 (16, 'Korean Dipping Sauce'),
 (17, 'Potato Pancakes with Smoked Sturgeon Recipe'),
 (18, 'Grilled aubergines with miso'),
 (19, 'All-American Baked Potatoes'),
 (20, 'Peanut Butter Krispy Treats'),
 (21, 'Roasted Cauliflower, Chickpeas, and Olives Recipe'),
 (22, 'Stuffed Round Zucchinis With Red 

In [48]:
modelFile='aggClusterModel.pkl'
pickle.dump(aggModel, open(modelFile,'wb'))

# Clustering using only nutritional information

In [25]:
X_n = np.asarray([x[-22:] for x in X])

In [36]:
#Hierachical clustering
aggModelNut = AgglomerativeClustering(n_clusters=120, linkage='ward')
predsNut = aggModelNut.fit_predict(X_n)

In [44]:
predsNut[10]

5

In [45]:
printCluster(predsNut,Y,5)

Gluten-free blueberry muffins
Debesmanna (Cranberry Mousse)
Mexican Mojito
Tropical Sunrise Porridge (Oatmeal)
Sugared Jelly Candies
Sour Cherry Lattice Pie
Korean Dipping Sauce
Potato Pancakes with Smoked Sturgeon Recipe
Peanut Butter Krispy Treats
Three-Herb and Tofu Lettuce Wraps with Soy-Honey Dipping Sauce
Country-Style Apple Butter
Lentil and chickpea salad with feta and tahini
Hazelnut-Orange Shortbread
Seafood Pizza
Shrimp & Plum Kebabs
Gruyère and Black Pepper Popovers
Brûléed Cappuccino
Cranberry Sauce With Orange And Fennel Seed
Dinner Tonight: Pasta with Roasted Cauliflower, Chickpeas, and Ricotta
Buta Kakuni (Japanese Braised Porkbelly)
Pomegranate Molasses and Pine Nut Cookies
American Red Snapper Recipe
Breakfast Oven Fries Recipe
Time for a Drink: The Kangaroo, aka Vodka Martini
Cranberry Sauce with Apples and Japanese Pickled Ginger
Chinese Celery Salad
Goat Cheese and Walnut Covered Grapes
Dark Chocolate Muesli with hazelnuts
Apple Cardamom Crisp
Sweet-And-Spicy Coles

Quick Lemon Cottage Cake Recipe
Baja-Battered Fish
Dinner Tonight: Linguine With Squash, Bacon, and Goat Cheese
Crabmeat and Dill Dumplings
Miso-Agave Mayonnaise (Vegan)
Sushi Rice And Seasoning
Chinese Sausage and Broccolini Fried Rice Recipe
Meat Loaf And Mashed Potatoes
Chili-Stuffed Tomatoes
Rustic Chunky Applesauce
Baked Caprese Rigatoni
Chocolate Hazelnut Biscotti Recipe
Mango and Pineapple Sorbet
Montreal Bagels
White Fish Salad
Country Ham Biscuits
Mixed Greens Salad
Liquid Nitrogen Ice Cream Recipe
Old English Cider Sorbetto
Sweet Wine & Honey Roasted Pears
Cunning Citrus Cake
Raspberry & Champagne Ice Cream Floats
Gluten Free Orange Cake
Mushroom Risotto Recipe
Ciderhouse Whiskey
Iced Fruit Punch
Meyer Lemon Ginger Marmalade
Lentil and Herb Salad
Celery Root Soup Recipe
Raw Artichoke, Mushroom, And Parmesan Salad
Dinner Tonight: Chicken Soup with Jasmine Rice and Ginger
Halloween Meringue Cookie Recipe
French Toast With Raspberry Syrup
Noodles With Black Bean Sauce
Tomato Gra

In [29]:
#25,10,16, 39 