In [60]:
# To run this code you will need tensorflow package installed
import numpy as np
import pandas as pd

import time
import os
# Show which files are available in the input directory.
input_listing = os.listdir("../input")
print(input_listing)


['sample_submission.csv', 'test.json', 'train.json']


In [None]:
# Keep the input directory listing in a variable for later reference.
list_of_files = list(os.listdir("../input"))

In [None]:
# Load the training recipes into a DataFrame.
train_json_path = "../input/train.json"
data = pd.read_json(train_json_path)

In [None]:
# Load the recipes that have to be scored into a DataFrame.
test_json_path = "../input/test.json"
test_data = pd.read_json(test_json_path)

In [None]:
# Preview the first rows of the test set.
# (For reference: data.shape = (39774, 3) for the training frame.)
test_data.head(5)

In [None]:
# Add a helper column holding each recipe's ingredient list flattened into a
# single space-separated string, so recipes can be compared as plain text.
data['ingredients_str'] = data['ingredients'].map(' '.join)

In [None]:
# Drop repeated (cuisine, ingredients) combinations, keeping the first
# occurrence of each.
data = data.drop_duplicates(subset=['cuisine', 'ingredients_str'])

In [None]:
# Rows that share their ingredient string with at least one other row are
# treated as ambiguous (same ingredients, different cuisines) and are all removed.
g = data.groupby('ingredients_str')
rows_per_ingredient_string = g['id'].transform('size')
data_to_remove = data[rows_per_ingredient_string > 1]

# Left-join the ambiguous rows back onto the full frame by recipe id: rows without
# a partner get NaN in the right-hand ("_y") columns, and only those rows are kept.
dfTemp = data.merge(data_to_remove, how='left', on='id')
has_no_ambiguous_partner = dfTemp['ingredients_y'].isna()
data = dfTemp[has_no_ambiguous_partner].copy()

In [None]:
# data.shape after some cleaning = (39671, 3)
# Drop the helper and right-hand merge columns that are no longer needed.
data = data.drop(
    ['ingredients_str_x', 'cuisine_y', 'ingredients_y', 'ingredients_str_y'],
    axis=1,
)

# Rename by label rather than by position: assigning a list to `data.columns`
# silently mislabels the columns whenever the loaded column order is not
# (cuisine, id, ingredients), e.g. when read_json keeps the file's key order.
data = data.rename(columns={'cuisine_x': 'cuisine', 'ingredients_x': 'ingredients'})

# Later cells slice the frame by position, so pin the column order explicitly.
# .copy() gives an independent frame so that adding columns later does not
# trigger SettingWithCopyWarning.
data = data[['cuisine', 'id', 'ingredients']].copy()

# Release the intermediate objects so they can be garbage collected.
g = None
data_to_remove = None
dfTemp = None

In [None]:
# Collect every distinct ingredient that appears in the training recipes.
# The result is sorted so that the feature columns created from it come out in
# the same order on every run; iterating a set of strings directly can give a
# different order from one Python process to the next.
# Assumes every ingredient is a string - TODO confirm against the raw data.
list_of_ingredients = sorted(
    {ingredient for recipe in data['ingredients'] for ingredient in recipe}
)

In [None]:
def _add_ingredient_indicators(frame, vocabulary):
    """Return `frame` with one 0/1 indicator column appended per ingredient.

    The indicator matrix is filled in a single pass over the recipes and attached
    with one concat, instead of running one `apply` and one column insert per
    ingredient (slow, and it fragments the frame).

    Args:
        frame: DataFrame with an 'ingredients' column holding a list of
            ingredient names per row.
        vocabulary: list of distinct ingredient names; one column is created per
            entry, in this order. Ingredients not in the vocabulary are ignored.

    Returns:
        A new DataFrame: the original columns followed by the indicator columns
        (int8, values 0 or 1).
    """
    column_of = {name: position for position, name in enumerate(vocabulary)}
    flags = np.zeros((len(frame), len(vocabulary)), dtype=np.int8)
    for row, recipe in enumerate(frame['ingredients']):
        for name in recipe:
            position = column_of.get(name)
            if position is not None:
                flags[row, position] = 1
    indicators = pd.DataFrame(flags, index=frame.index, columns=vocabulary)

    # Drop any column that already carries an ingredient name so that re-running
    # the cell replaces the indicators instead of duplicating them.
    kept = frame.loc[:, ~frame.columns.isin(vocabulary)]
    return pd.concat([kept, indicators], axis=1)


# Create a binary indicator column for every known ingredient, on both the
# training frame and the frame to be scored. The vocabulary is whatever
# list_of_ingredients holds, so both frames get identical feature columns.
data = _add_ingredient_indicators(data, list_of_ingredients)
test_data = _add_ingredient_indicators(test_data, list_of_ingredients)

In [None]:
# Spot-check a few of the new indicator columns on the test set.
# (The same columns, plus 'cuisine', can be inspected on `data`.)
sample_columns = ['id', 'ingredients', 'romaine lettuce', 'ground pepper', 'eggs']
test_data[sample_columns].head(5)

In [None]:
# One-hot encode the target variable. The dtype is pinned to uint8 because some
# pandas versions return boolean dummies, and the label columns are later handed
# to TensorFlow ops that expect numeric input.
dfDummies = pd.get_dummies(data["cuisine"]).astype(np.uint8)
dfMaster = pd.concat([data, dfDummies], axis=1)

# Hold out 20% of the rows for evaluation. The split is seeded so the reported
# accuracy can be reproduced from one run to the next.
from sklearn.model_selection import train_test_split
SPLIT_SEED = 42
dfTrain, dfTest = train_test_split(dfMaster, test_size=0.2, random_state=SPLIT_SEED)

In [None]:
def _split_features_labels(frame, n_labels):
    """Split a dfMaster-style frame into float32 feature and label matrices.

    Expected column layout: three leading metadata columns
    (cuisine, id, ingredients), then the ingredient indicators, then `n_labels`
    one-hot cuisine columns at the end.

    Args:
        frame: DataFrame with the layout described above.
        n_labels: number of trailing one-hot label columns (must be > 0).

    Returns:
        (features, labels) as two float32 NumPy arrays with one row per recipe.
    """
    if n_labels <= 0:
        # A zero here would turn the `:-n_labels` slice into an empty selection.
        raise ValueError("n_labels must be positive")
    features = frame.iloc[:, 3:-n_labels].values.astype(np.float32)
    labels = frame.iloc[:, -n_labels:].values.astype(np.float32)
    return features, labels


# Build the matrices that are fed to TensorFlow. Slicing the frames in one go
# avoids iterating row by row and materialising millions of Python objects.
# The label width comes from the one-hot frame instead of a hard-coded 20.
n_label_columns = dfDummies.shape[1]

train_X, train_Y = _split_features_labels(dfTrain, n_label_columns)
test_X, test_Y = _split_features_labels(dfTest, n_label_columns)


In [None]:
import tensorflow as tf

In [None]:
# Hyper-parameters (these can be changed in order to tune the model).
learning_rate = 0.01
training_epochs = 60
display_step = 10

# Layer sizes. The input and output widths are read from the prepared training
# matrices rather than hard-coded, so the placeholders keep matching the data
# when the cleaning steps change the number of ingredients or cuisines.
n_input = len(train_X[0])
n_hidden = 10000
n_output = len(train_Y[0])

# Placeholders for a batch of feature rows and the matching one-hot labels.
X = tf.placeholder("float", [None, n_input])
Y = tf.placeholder("float", [None, n_output])

# Trainable parameters of the single hidden layer and the output layer.
# NOTE(review): all parameters start from unit-variance normal noise; with
# thousands of inputs this gives very large initial activations - consider a
# scaled initialiser if training is unstable.
weights = {
    "hidden": tf.Variable(tf.random_normal([n_input, n_hidden])),
    "output": tf.Variable(tf.random_normal([n_hidden, n_output]))
}

bias = {
    "hidden": tf.Variable(tf.random_normal([n_hidden])),
    "output": tf.Variable(tf.random_normal([n_output]))
}

In [None]:
def model(X, weights, bias):
    """Compute the network's raw output scores for a batch of inputs.

    The network is one ReLU hidden layer followed by a linear output layer;
    no softmax is applied here.

    Args:
        X: input tensor, matrix-multiplied with ``weights["hidden"]``.
        weights: mapping with the "hidden" and "output" weight matrices.
        bias: mapping with the "hidden" and "output" bias vectors.

    Returns:
        The output layer's pre-activation tensor.
    """
    hidden_pre_activation = tf.matmul(X, weights["hidden"]) + bias["hidden"]
    hidden_activation = tf.nn.relu(hidden_pre_activation)

    return tf.add(tf.matmul(hidden_activation, weights["output"]), bias["output"])

In [None]:
# Define the cost function and the optimizer.
pred = model(X, weights, bias)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

init = tf.global_variables_initializer()

# Feature matrix for the recipes to be scored, built here so this cell does not
# depend on a variable that is only defined further down the notebook.
# Assumes the first two columns of test_data are id and ingredients and every
# later column is an ingredient indicator.
score_features = test_data.iloc[:, 2:].values.astype(np.float32)

# Train on the full training set each epoch, then collect the raw network
# outputs for the held-out split and for the recipes to be scored.
with tf.Session() as sess:
    sess.run(init)

    for epochs in range(training_epochs):
        _, c = sess.run([optimizer, cost], feed_dict={X: train_X, Y: train_Y})
        if (epochs + 1) % display_step == 0:
            print(" Epochs: {}, Cost: {}".format((epochs+1), c))
    print("Done!")

    # Evaluate on the held-out rows, not on the rows the model was trained on.
    test_result = sess.run(pred, feed_dict={X: test_X})
    score_result = sess.run(pred, feed_dict={X: score_features})

# Accuracy on the held-out split: share of rows whose highest-scoring class
# matches the one-hot label. Computed with NumPy on the fetched outputs, so no
# extra graph operations or feeds are needed.
correct_pred = np.argmax(test_result, axis=1) == np.argmax(np.asarray(test_Y), axis=1)
accuracy = float(np.mean(correct_pred))
print("Accuracy: {}".format(accuracy))
    
    

In [None]:
# Feature matrix for the recipes to be scored: every column after the first two
# is passed to the network. Assumes the first two columns are id and ingredients.
# Slicing the frame once replaces a row-by-row loop that built a Python list of
# lists; the result is a float32 NumPy array with one row per recipe.
score_X = test_data.iloc[:, 2:].values.astype(np.float32)

In [None]:
# Turn each row of network outputs into the index of its highest-scoring class.
score_array = [np.argmax(row_scores) for row_scores in score_result]