# Recipes for ML Model

## Import JSON Dataset

In [1]:
# Mount Google Drive at /content/drive (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import json
import os

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

### Data Reading

In [0]:
# Folder that holds the project data, relative to the working directory.
main_dir = "drive/My Drive/Final"
# File name of the recipes dataset; joined onto main_dir when it is read.
datajson_dir = "full_format_recipes.json"

In [4]:
# Load the recipes JSON file from the project folder and preview the last rows.
recipes_path = os.path.join(main_dir, datajson_dir)
json_df = pd.read_json(recipes_path)
json_df.tail()

Unnamed: 0,directions,fat,date,categories,calories,desc,protein,rating,title,ingredients,sodium
20125,[Beat whites in a bowl with an electric mixer ...,2.0,2004-08-20 04:00:00+00:00,"[Mixer, Cheese, Egg, Fry, Cocktail Party, Parm...",28.0,,2.0,3.125,Parmesan Puffs,"[2 large egg whites, 3 oz Parmigiano-Reggiano,...",64.0
20126,[Bring broth to simmer in saucepan.Remove from...,28.0,2008-02-28 22:06:54+00:00,"[Side, Kid-Friendly, High Fiber, Dinner, Parme...",671.0,Cooking the artichokes with the rice infuses t...,22.0,4.375,Artichoke and Parmesan Risotto,"[5 1/2 cups (or more) low-salt chicken broth, ...",583.0
20127,"[Using a sharp knife, cut a shallow X in botto...",38.0,2005-10-21 18:21:20+00:00,"[Onion, Poultry, turkey, Vegetable, Bake, Kid-...",563.0,,31.0,4.375,Turkey Cream Puff Pie,"[1 small tomato, 1 small onion, finely chopped...",652.0
20128,[Heat 2 tablespoons oil in heavy medium skille...,24.0,2004-08-20 04:00:00+00:00,"[Milk/Cream, Citrus, Dairy, Fish, Garlic, Past...",631.0,"Sharon Hooykaas of Los Alamitos, California, w...",45.0,4.375,Snapper on Angel Hair with Citrus Cream,"[4 tablespoons olive oil, 4 shallots, thinly s...",517.0
20129,[Position rack in bottom third of oven and pre...,10.0,2004-08-20 04:00:00+00:00,"[Pork, Bake, Roast, Christmas, Ham, Winter, Bo...",560.0,"Although labeled fully cooked, the ham will st...",73.0,4.375,Baked Ham with Marmalade-Horseradish Glaze,"[1 18-pound fully cooked bone-in smoked ham, r...",3698.0


In [5]:
# Shape of dataframe
json_df.shape

(20130, 11)

In [6]:
# Remove the columns that are not needed for the analysis.
unused_columns = ['date', 'rating', 'categories']
json_df = json_df.drop(columns=unused_columns)
json_df.tail()

Unnamed: 0,directions,fat,calories,desc,protein,title,ingredients,sodium
20125,[Beat whites in a bowl with an electric mixer ...,2.0,28.0,,2.0,Parmesan Puffs,"[2 large egg whites, 3 oz Parmigiano-Reggiano,...",64.0
20126,[Bring broth to simmer in saucepan.Remove from...,28.0,671.0,Cooking the artichokes with the rice infuses t...,22.0,Artichoke and Parmesan Risotto,"[5 1/2 cups (or more) low-salt chicken broth, ...",583.0
20127,"[Using a sharp knife, cut a shallow X in botto...",38.0,563.0,,31.0,Turkey Cream Puff Pie,"[1 small tomato, 1 small onion, finely chopped...",652.0
20128,[Heat 2 tablespoons oil in heavy medium skille...,24.0,631.0,"Sharon Hooykaas of Los Alamitos, California, w...",45.0,Snapper on Angel Hair with Citrus Cream,"[4 tablespoons olive oil, 4 shallots, thinly s...",517.0
20129,[Position rack in bottom third of oven and pre...,10.0,560.0,"Although labeled fully cooked, the ham will st...",73.0,Baked Ham with Marmalade-Horseradish Glaze,"[1 18-pound fully cooked bone-in smoked ham, r...",3698.0


In [7]:
# Find how many NaN are in the columns
json_df.isna().sum()

directions       19
fat            4222
calories       4154
desc           6635
protein        4201
title            19
ingredients      19
sodium         4156
dtype: int64

In [8]:
# Find if there are 0 values in calories
json_df['calories'].eq(0).sum()

13

In [9]:
# Calories is the main focus of this dataset, so rows without a usable
# calorie value are removed. This step removes the rows whose calories
# value is exactly 0; rows with NaN calories pass this filter unchanged.

has_nonzero_calories = json_df['calories'].ne(0)
json_df = json_df[has_nonzero_calories]
json_df.shape

(20117, 8)

In [10]:
# Remove the rows whose calories value is missing (NaN).
json_df = json_df.dropna(subset=['calories'])
json_df.shape

(15963, 8)

In [11]:
# Count the NaN values that remain in each column
json_df.isna().sum()

directions        0
fat              63
calories          0
desc           5328
protein          42
title             0
ingredients       0
sodium            2
dtype: int64

In [12]:
# Renumber the rows from 0 after the filtering above, replace every
# remaining NaN with 0, then confirm that no NaN is left.
json_df = json_df.reset_index(drop=True).fillna(0)
json_df.isna().sum()

directions     0
fat            0
calories       0
desc           0
protein        0
title          0
ingredients    0
sodium         0
dtype: int64

In [13]:
# Count the values equal to 0 in each column
json_df.eq(0).sum()

directions        0
fat            1285
calories          0
desc           5328
protein         891
title             0
ingredients       0
sodium           54
dtype: int64

In [14]:
json_df.columns

Index(['directions', 'fat', 'calories', 'desc', 'protein', 'title',
       'ingredients', 'sodium'],
      dtype='object')

In [0]:
# Finally, drop every row that holds a 0 in any column.
# One boolean mask is accumulated over all columns and applied once.
keep_row = pd.Series(True, index=json_df.index)
for col in json_df.columns:
    keep_row &= json_df[col].ne(0)
json_df = json_df[keep_row]

In [16]:
json_df.eq(0).sum()

directions     0
fat            0
calories       0
desc           0
protein        0
title          0
ingredients    0
sodium         0
dtype: int64

In [17]:
# Renumber the rows from 0 (the old index is discarded) and show the new shape.
json_df = json_df.reset_index(drop=True)
json_df.shape

(9757, 8)

In [18]:
json_df.dtypes

directions      object
fat            float64
calories       float64
desc            object
protein        float64
title           object
ingredients     object
sodium         float64
dtype: object

In [0]:
# Encode the object-typed 'title' column as integer ids.

# *****************************************
# Not used yet
# *****************************************

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
# fit_transform already returns one integer code per row, so the codes are
# collected directly instead of being appended one by one in a loop.
result_cat = list(label_encoder.fit_transform(json_df['title']))

titles_encoded = pd.DataFrame(result_cat, columns=['result'])
titles_encoded.tail()

Unnamed: 0,result
15958,8551
15959,392
15960,13314
15961,11696
15962,712


In [0]:
# Disabled: replacing the titles in json_df with encoded values.
# NOTE(review): `result` is not defined in the visible notebook — confirm
# which variable is meant before re-enabling this line.
#json_df['title'] = result['result']
json_df.tail()

Unnamed: 0,directions,fat,calories,desc,protein,title,ingredients,sodium
9752,"[In a large pot, place the chicken legs, bay l...",59.0,843.0,"My brother, Al, is an inspiring teacher at Jam...",59.0,Chicken with White Wine and Herbs,"[8 chicken legs, 3 bay leaves, 1 teaspoon drie...",351.0
9753,[1. Preheat the oven to 400°F. Spray a baking ...,70.0,1086.0,Kosher Status: Poultry,78.0,Crispy Salt and Pepper Chicken with Caramelize...,"[Cooking spray, One 3 1/2-pound chicken, cut i...",1323.0
9754,[Bring broth to simmer in saucepan.Remove from...,28.0,671.0,Cooking the artichokes with the rice infuses t...,22.0,Artichoke and Parmesan Risotto,"[5 1/2 cups (or more) low-salt chicken broth, ...",583.0
9755,[Heat 2 tablespoons oil in heavy medium skille...,24.0,631.0,"Sharon Hooykaas of Los Alamitos, California, w...",45.0,Snapper on Angel Hair with Citrus Cream,"[4 tablespoons olive oil, 4 shallots, thinly s...",517.0
9756,[Position rack in bottom third of oven and pre...,10.0,560.0,"Although labeled fully cooked, the ham will st...",73.0,Baked Ham with Marmalade-Horseradish Glaze,"[1 18-pound fully cooked bone-in smoked ham, r...",3698.0


In [0]:
# Split the recipes by calories: at most 250 is a snack, above 250 is a meal.
calories = json_df['calories']
snacks = json_df.loc[calories.le(250)]
meals = json_df.loc[calories.gt(250)]

In [42]:
snacks.shape

(2889, 8)

In [46]:
meals.shape

(6868, 8)

In [44]:
snacks.head()

Unnamed: 0,directions,fat,calories,desc,protein,title,ingredients,sodium
2,"[Stir together soy sauce, sugar, sesame oil, w...",10.0,170.0,Bulgogi,7.0,Korean Marinated Beef,"[1/4 cup soy sauce, 1 tablespoon sugar, 2 teas...",1272.0
6,[Sprinkle steaks with salt and pepper. Heat oi...,12.0,174.0,This recipe can be prepared in 45 minutes or l...,11.0,Beef Tenderloin with Garlic and Brandy,[4 6- to 7-ounce beef tenderloin steaks (each ...,176.0
7,"[Using the tip of a paring knife, score an X i...",3.0,134.0,Where a chutney and mustard sauce overlap. Cho...,4.0,Peach Mustard,"[1 large ripe peach, 2 tablespoons sugar, 1 te...",1394.0
9,[Butter and sugar six 2/3-to 3/4-cup ramekins....,5.0,146.0,Classic spoon bread is a savory pudding served...,4.0,Sweet Buttermilk Spoon Breads,"[1 cup water, 2/3 cup buttermilk, 1/3 cup heav...",160.0
13,"[Gently combine the eggs, cucumbers, shallots,...",20.0,215.0,The glories of summer are captured in this pal...,6.0,Cucumber-Basil Egg Salad,"[6 hard-cooked eggs, diced (2 cups), 3/4 cup s...",250.0


In [0]:
nutrition_cols = ['fat', 'calories', 'protein', 'sodium']
text_cols = ['directions', 'desc', 'title', 'ingredients']

# Order both groups by ascending calories.
snacks = snacks.sort_values(by='calories')
meals = meals.sort_values(by='calories')

# For each group, separate the numeric columns from the text columns.
train_snacks, snacks_desc = snacks[nutrition_cols], snacks[text_cols]
train_meals, meals_desc = meals[nutrition_cols], meals[text_cols]

In [48]:
train_snacks.head()

Unnamed: 0,fat,calories,protein,sodium
6306,1.0,13.0,1.0,38.0
4638,1.0,15.0,1.0,21.0
1127,1.0,15.0,1.0,26.0
2627,1.0,15.0,1.0,57.0
4792,1.0,15.0,1.0,26.0


In [49]:
train_meals.head()

Unnamed: 0,fat,calories,protein,sodium
8856,9.0,251.0,6.0,88.0
8754,11.0,251.0,5.0,90.0
2581,11.0,251.0,5.0,161.0
9630,9.0,251.0,3.0,117.0
4223,15.0,251.0,24.0,490.0


In [50]:
snacks_desc.head()

Unnamed: 0,directions,desc,title,ingredients
6306,[Mix first 4 ingredients in medium bowl. Cut p...,"These hors d'oeuvres are light, flaky and abso...",Pepperoni and Asiago Pinwheels,"[1/2 cup grated Asiago cheese*, 3/4 teaspoon d..."
4638,[Preheat oven to 375°F with racks in upper and...,These have the lively crisp exterior and cloud...,Hickory-Bacon and Roasted-Corn Gougeres,"[4 hickory-smoked bacon slices (1/4 pound), 3/..."
1127,[Whisk together all ingredients in a small bow...,Ras-El-Hanout,Moroccan Spice Blend,"[1 teaspoon ground cumin, 1 teaspoon ground gi..."
2627,[Cut cucumbers crosswise into generous 1/2-inc...,If you make the components of this hors d'oeuv...,Wasabi Lime Crab Salad in Cucumber Cups,[4 seedless cucumbers (usually plastic-wrapped...
4792,[Whisk together all ingredients in a small bow...,Ras-El-Hanout,Moroccan Spice Blend,"[1 teaspoon ground cumin, 1 teaspoon ground gi..."


In [51]:
meals_desc.head()

Unnamed: 0,directions,desc,title,ingredients
8856,"[Put one can of milk in each of 2 bowls, then ...","Candy corn may be synonymous with Halloween, b...","""Candy Corn"" Frozen Citrus Cream Pops","[2 (14-ounce) cans sweetened condensed milk, 1..."
8754,[Purée all ingredients in a blender just until...,"If you remember drinking Orange Julius, this s...",Orange Vanilla Shakes,"[4 cups premium vanilla ice cream, 1 cup whole..."
2581,[Toss first 4 ingredients in medium bowl. Add ...,The technique: Eggs change everything: The yol...,Sweet Potato Pudding with Pecan and Gingersnap...,"[3/4 cup coarsely chopped gingersnaps, 1/2 cup..."
9630,[Preheat oven to 350°F. Butter 8x8-inch metal ...,"Susan Richardson of Edina, Minnesota, writes: ...",Classic Date Bars,"[1 1/2 cups water, 1 1/2 cups chopped pitted d..."
4223,"[Preheat the oven to 325°F, Individually wrap ...",Editor's note: The recipe and introductory tex...,Italian Tuna and Shaved Fennel Sandwich with B...,"[1 baguette, split in half and cut into 4 sect..."


### Export the Data

In [0]:
# Write each table to its own CSV file, without the index column.
csv_exports = [
    ("snacks.csv", train_snacks),
    ("snacks_desc.csv", snacks_desc),
    ("meals.csv", train_meals),
    ("meals_desc.csv", meals_desc),
]
for csv_name, table in csv_exports:
    table.to_csv(csv_name, index=False)

In [0]:
# Calorie amount to look for among the snacks.
cal_for_snack = 52.6
# Index label of the snack whose calories are closest to that amount.
calorie_gap = (train_snacks['calories'] - cal_for_snack).abs()
calorie_gap.idxmin()

8392

In [0]:
# Show the snack closest to the calorie target. The row label is computed
# here instead of hard-coding the value printed by an earlier run, so the
# cell stays correct if the data or the target changes.
closest_snack_idx = train_snacks['calories'].sub(cal_for_snack).abs().idxmin()
train_snacks.loc[closest_snack_idx]

fat          4.0
calories    53.0
protein      3.0
sodium      93.0
Name: 8392, dtype: float64

## Divide data

In [0]:
# Target: the recipe title, encoded as an integer class id.
# The regression, Keras and RMSE cells below need a numeric target, and the
# output recorded for this cell shows int64 values. Encoding here keeps the
# cell working on a fresh kernel, where json_df['title'] still holds the
# raw title strings.
y = pd.Series(
    LabelEncoder().fit_transform(json_df['title']),
    index=json_df.index,
    name='title',
)
y.tail()

15958     8551
15959      392
15960    13314
15961    11696
15962      712
Name: title, dtype: int64

In [0]:
# Join columns of directions, title, ingredients and desc into one column
# with json format
#yy = pd.DataFrame(index=y.index, columns=['result'])
#for i in range(len(y)):
#    yy['result'][i] = ("{'directions':" + str(y['directions'][i]) + 
#                            ", 'title':" + str(y['title'][i]) + ", 'ingredients':" + 
#                            str(y['ingredients'][i]) + ", 'desc':" + 
#                            str(y['desc'][i]) + "}")

In [0]:
# Feature matrix: the four numeric nutrition columns.
feature_columns = ['fat', 'calories', 'protein', 'sodium']
X = json_df[feature_columns]
X.tail()

Unnamed: 0,fat,calories,protein,sodium
15958,2.0,28.0,2.0,64.0
15959,28.0,671.0,22.0,583.0
15960,38.0,563.0,31.0,652.0
15961,24.0,631.0,45.0,517.0
15962,10.0,560.0,73.0,3698.0


In [0]:
# Set train and test data (15% of the rows are held out for testing).
# A fixed random_state makes the split identical on every run, so the
# scores reported below can be reproduced.
# TODO: use yy later to get a JSON-formatted target that shows every field
# of the predicted recipe.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)
X_train.shape, X_test.shape

((13568, 4), (2395, 4))

## Train the Model

### Create Linear Regression Model

In [0]:
from sklearn.linear_model import LinearRegression

In [0]:
# Fit a linear regression on the training split; fit() returns the
# estimator itself, which is then displayed.
lnr = LinearRegression(n_jobs=1).fit(X_train, y_train)
lnr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [0]:
lnr.score(X_test, y_test)

-0.0009759201709598209

### Instantiate Random Forest Model

In [0]:
import random

#from sklearn.ensemble import RandomForestClassifier

from keras.models import Sequential
from keras.layers import Dense

from sklearn.metrics import mean_squared_error

In [0]:
# Train a random forest of 10 trees on the training split.
# random.seed only seeds Python's own generator; scikit-learn does not read
# it, so the forest is made reproducible through random_state instead.
random.seed(42)
rf = RandomForestClassifier(
    n_estimators=10,
    verbose=2,
    max_features='sqrt',
    random_state=42,
)
rf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.8s remaining:    0.0s


building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10


In [0]:
# Define the network: 4 inputs -> Dense(100, relu) -> Dense(50, relu) -> Dense(1).
model = Sequential([
    Dense(100, input_dim=4, activation="relu"),
    Dense(50, activation="relu"),
    Dense(1),
])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 100)               500       
_________________________________________________________________
dense_11 (Dense)             (None, 50)                5050      
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 51        
Total params: 5,601
Trainable params: 5,601
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Compile model
# The network ends in a single linear unit trained with mean squared error,
# so mean absolute error is tracked; classification accuracy says nothing
# useful about a continuous prediction.
model.compile(loss="mean_squared_error", optimizer="adam", metrics=["mae"])

In [0]:
# Fit Model
# 10000 is only an upper bound on the number of epochs: training stops as
# soon as the training loss has not improved for 20 consecutive epochs, so
# a full re-run of the notebook stays feasible.
from keras.callbacks import EarlyStopping

stop_on_plateau = EarlyStopping(monitor="loss", patience=20)
model.fit(X_train, y_train, epochs=10000, callbacks=[stop_on_plateau])

Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Epoch 52/10000
Epoch 53/10000
Epoch 54/10000
Epoch 55/10000
Epoch 56/10000
Epoch 57/10000
Epoch 58/10000
Epoch 59/10000
Epoch 60/10000
Epoch 61/10000
Epoch 62/10000
Epoch 63/10000
Epoch 64/10000
Epoch 65/10000
Epoch 66/10000
Epoch 67/10000
Epoc

### Evaluate the models

#### Evaluate the Random Forest model

In [0]:
# Root mean squared error of the random forest predictions on the test split.
pred = rf.predict(X_test)
rf_mse = mean_squared_error(y_test, pred)
score = np.sqrt(rf_mse)
score

#### Evaluate the NN Model

In [0]:
# Root mean squared error of the network predictions on the test split.
pred = model.predict(X_test)
nn_mse = mean_squared_error(y_test, pred)
score = np.sqrt(nn_mse)
score

### Test the model

### Instantiate Logistic Regression Model

In [0]:
# Create a logistic regression estimator with verbose output.
# NOTE(review): this rebinds `model`, which the cells above use for the
# Keras network, and the next code cell rebinds `model` again without this
# estimator having been fitted.
model = LogisticRegression(verbose=2)

### Train the model!

In [0]:
# Build and train a two-layer network on the train/test split.
# NOTE(review): TwoLayerNN is neither defined nor imported in the visible
# part of this notebook — confirm where it comes from.
# NOTE(review): the recorded run of this cell ended in a KeyError. The
# inputs are pandas objects; if TwoLayerNN indexes them by position,
# passing NumPy arrays (e.g. X_train.values) may be what it needs — confirm.
in_size = 4          # number of input values per sample
hidden_size = 200    # second constructor argument
out_size = len(y)    # number of rows in y; NOTE(review): presumably the number of distinct titles is intended — confirm
std = 1e-3           # fourth constructor argument; meaning depends on TwoLayerNN

model = TwoLayerNN(in_size, hidden_size, out_size, std)

history_data = model.train(X_train, y_train, X_test, y_test, 
              learning_rate=1e-5, learning_rate_decay=0.95, 
              reg=0.0, num_iters=40, 
              batch_size=128, it_verbose = 10, verbose=True)

KeyError: ignored