In [None]:
# coding: utf8
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
# Any results you write to the current directory are saved as output.

In [None]:
import json
# Load the training recipes from the refined Kaggle JSON file.
recipeRaw = pd.read_json("kaggle/train_refined.json")
# Collapse each recipe's "ingredients" entry into one space-separated string.
recipeRaw["ingredientsFlat"] = recipeRaw["ingredients"].map(' '.join)
# Preview the first rows.
recipeRaw.head()

In [None]:
# Load the test recipes and flatten each ingredient list into a single
# space-separated string.
recipeRawTest = pd.read_json("kaggle/test.json")
recipeRawTest["ingredientsFlat"] = recipeRawTest["ingredients"].apply(lambda x: ' '.join(x))
testdocs = recipeRawTest["ingredientsFlat"].values
# Stack train and test rows into one frame. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat produces the same stacked frame
# (original index labels kept, columns not sorted).
recipeRawCombined = pd.concat([recipeRaw, recipeRawTest])
# Peek at the rows from position 40000 onward.
# NOTE(review): 40000 presumably lands past the training rows so that test
# rows are shown -- confirm against the size of the training set.
recipeRawCombined[40000:].head()

In [None]:
from sklearn import preprocessing
# Fit a label encoder on the training cuisines so every cuisine name gets an
# integer class id, then show the learned class names.
cuisine_names = recipeRaw["cuisine"].values
le = preprocessing.LabelEncoder()
le.fit(cuisine_names)
list(le.classes_)

In [None]:
from keras.utils.np_utils import to_categorical
# Targets: cuisine names -> integer class ids -> one-hot rows.
labels_enc = le.transform(recipeRaw["cuisine"].values)
labels = to_categorical(labels_enc)

# Text corpora: training set, test set, and the stacked train+test frame.
docs = recipeRaw["ingredientsFlat"].values
testdocs = recipeRawTest["ingredientsFlat"].values
docsCombined = recipeRawCombined["ingredientsFlat"].values

# Show the one-hot label matrix.
labels

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Build the word index over the combined train+test text so both sets share
# a single vocabulary.
t = Tokenizer()
t.fit_on_texts(docsCombined)
# One extra slot on top of the word count, because index 0 is never assigned
# to a word by the Keras tokenizer.
vocab_size = 1 + len(t.word_index)
print(vocab_size)

# Every document is padded (at the end) or truncated to this many tokens.
max_length = 40

# Integer-encode the training documents, then pad them.
encoded_docs = t.texts_to_sequences(docs)
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

# Same treatment for the test documents.
encoded_test_docs = t.texts_to_sequences(testdocs)
padded_test_docs = pad_sequences(encoded_test_docs, maxlen=max_length, padding='post')

print(len(padded_docs))

In [None]:
# Load the whole GloVe embedding table into memory as {word: float32 vector}.
embeddings_index = dict()
# The context manager closes the file even if a line fails to parse, and the
# file is read explicitly as UTF-8 rather than in the platform default
# encoding.
with open('glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            continue
        # First token is the word, the remaining tokens are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

In [None]:
# Frame indexed by word, with the tokenizer's integer index in column 0.
vocab = pd.DataFrame.from_dict(t.word_index, orient="index")
# Keep only the words (drop the integer column, move the index into a "word"
# column) and write them to vocab.csv.
vocab_words = vocab.drop(columns=[0]).reset_index()
vocab_words = vocab_words.rename(columns={"index": "word"})
vocab_words.to_csv("vocab.csv", index=False, encoding='utf-8')

In [None]:
# Weight matrix with one 100-wide row per tokenizer index. Rows for words that
# have no GloVe vector (and row 0) stay all-zero.
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in t.word_index.items():
    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

In [None]:
# Sanity check: the matrix was allocated as (vocab_size, 100).
print(embedding_matrix.shape)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from sklearn.model_selection import KFold

# fix random seed for reproducibility
# NOTE(review): this seeds NumPy only; the Keras backend may need its own
# seed for fully repeatable runs -- confirm.
seed = 88
np.random.seed(seed)

# define the 5-fold cross validation test harness
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)


def build_model():
    """Build and compile a fresh CNN classifier on top of the frozen GloVe embeddings.

    Architecture: Embedding (pre-loaded weights, not trainable) -> Conv1D ->
    MaxPooling1D -> Flatten -> Dense(250, relu) -> Dense(n_classes, softmax).

    Returns:
        A compiled, untrained ``Sequential`` model.
    """
    model = Sequential()
    # The input length follows max_length (the padding length) and the vector
    # width follows the embedding matrix, so neither is repeated as a literal.
    model.add(Embedding(vocab_size, embedding_matrix.shape[1], weights=[embedding_matrix],
                        input_length=max_length, trainable=False))
    model.add(Conv1D(filters=100, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(250, activation='relu'))
    # One output unit per cuisine class; softmax yields a probability per class.
    model.add(Dense(le.classes_.size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


cvscores = []
for train, test in kfold.split(padded_docs, labels):
    # A new model per fold so no weights carry over between folds.
    model = build_model()
    # Summarize the architecture once, on the first fold. summary() prints the
    # table itself and returns None, so it is not wrapped in print().
    if not cvscores:
        model.summary()
    # fit on the training part of the fold, score on the held-out part
    model.fit(padded_docs[train], labels[train], epochs=5, verbose=0)
    scores = model.evaluate(padded_docs[test], labels[test], verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))
# NOTE: after the loop `model` is the model from the last fold, i.e. it was
# trained on 4/5 of the training data; later cells predict with this object.

In [None]:
# Per-class probabilities for every test recipe (one row per recipe).
predictions = model.predict(padded_test_docs)
print(predictions.shape)
# Map each row's most probable class index back to its cuisine name in one
# vectorized lookup.
recipeRawTest["cuisine"] = le.classes_[np.argmax(predictions, axis=1)]
# Write the submission without the ingredient columns.
recipeRawTest.drop(["ingredients","ingredientsFlat"],axis=1).to_csv("kaggle/submission.csv",index=False, encoding='utf-8')
# Preview the labelled frame. This must be the last expression of the cell:
# a bare .head() in the middle of a cell is evaluated and thrown away.
recipeRawTest.head()

In [None]:
from ipywidgets import widgets, Layout
from ipywidgets import Button, HBox, VBox

# Single-line text boxes used by makeUpperCase.
inputText = widgets.Text()
outputText = widgets.Text()

# test url: https://kitchenstories.io/en/recipes/grilled-steak-salad

# Multi-line input area whose contents are read when the button is clicked.
textarea = widgets.Textarea(
    description='Input:',
    placeholder='Type something',
    value='Hello World',
    layout=Layout(height='300px', width='50%'),
    disabled=False,
)

# Multi-line area that receives the prediction text.
output_textarea = widgets.Textarea(
    description='Output:',
    placeholder='Type something',
    value='Output Area',
    layout=Layout(
        display='flex',
        flex_flow='row',
        align_items='stretch',
        border='solid',
        width='auto',
    ),
    disabled=False,
)

def filter_text(in_text):
    """Split pasted text into cleaned, lower-cased lines.

    For every line of ``in_text`` (split on ``'\\n'``):

    * if the line contains a tab, everything before the first tab is dropped;
    * the line is lower-cased;
    * the phrases ``'for serving'`` and ``'for frying'`` are removed;
    * surrounding whitespace is stripped.

    Blank lines are kept as empty strings.

    Returns:
        list of str, one entry per input line.
    """
    cleaned = []
    for raw_line in in_text.split('\n'):
        tab_at = raw_line.find('\t')
        if tab_at != -1:
            # Keep from the first tab onward; the tab itself is stripped below.
            raw_line = raw_line[tab_at:]
        lowered = raw_line.lower()
        for phrase in ('for serving', 'for frying'):
            lowered = lowered.replace(phrase, '')
        cleaned.append(lowered.strip())
    return cleaned

def buttonHandler(sender):
    """Predict the cuisine for the text currently in ``textarea``.

    The text is cleaned with ``filter_text``, written to
    ``gui_ingredient_test_input.json`` as a one-recipe JSON list, scored with
    ``testit``, and the two best cuisines with their integer percentages are
    written to ``output_textarea``.

    Args:
        sender: the widget that fired the click; not used.
    """
    # No case conversion is done here: filter_text lower-cases every line
    # itself, and an upper()/lower() round trip would alter some non-ASCII
    # characters (e.g. 'ß' -> 'ss').
    outext = filter_text(textarea.value)
    content = [{'ingredients': outext}]
    path_gui_ingredient_input = 'gui_ingredient_test_input.json'
    json_data_ = json.dumps(content, indent=4, sort_keys=True, ensure_ascii=False)
    # ensure_ascii=False can emit non-ASCII characters, so the file is written
    # explicitly as UTF-8 instead of in the platform default encoding.
    with open(path_gui_ingredient_input, 'w', encoding='utf-8') as f:
        f.write(json_data_)
    pred1, prob1, pred2, prob2 = testit(path_gui_ingredient_input)
    output_textarea.value = pred1 + " : " + str(int(prob1)) + " %\n" + pred2 + " : " + str(int(prob2)) + " %"

def makeUpperCase(sender):
    """Copy the text of ``inputText`` into ``outputText``, upper-cased.

    ``sender`` is accepted to match the widget callback signature; it is not
    used.
    """
    shouted = inputText.value.upper()
    outputText.value = shouted

def testit(path_gui_ingredient_input):
    """Return the two most probable cuisines for the first recipe in a JSON file.

    The file is read with ``pd.read_json``; each row's ``"ingredients"`` entry
    is joined into one string, encoded with the notebook's tokenizer ``t``,
    padded to ``max_length`` and scored with ``model``.

    Args:
        path_gui_ingredient_input: path of the JSON file to score.

    Returns:
        Tuple ``(best_name, best_pct, second_name, second_pct)`` for the first
        row only; the percentages are the model outputs multiplied by 100.

    Raises:
        ValueError: if the model returns no predictions.
    """
    # Local names on purpose: they no longer shadow the notebook-level
    # recipeRawTest / testdocs / padded_test_docs variables.
    recipes = pd.read_json(path_gui_ingredient_input)
    flat_docs = recipes["ingredients"].apply(lambda x: ' '.join(x)).values
    encoded = t.texts_to_sequences(flat_docs)
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    predictions = model.predict(padded)

    if len(predictions) == 0:
        # Fail with a clear message instead of silently returning None.
        raise ValueError("no predictions produced for %r" % (path_gui_ingredient_input,))

    # Only the first recipe is reported.
    prediction = predictions[0]
    # Indices of the two largest scores, highest first.
    idx = prediction.argsort()[-2:][::-1]
    return le.classes_[idx[0]], prediction[idx[0]]*100, le.classes_[idx[1]], prediction[idx[1]]*100

# Button that runs buttonHandler when clicked.
button = widgets.Button(
    description='Magical Prediction',  # label typo ("Predicton") fixed
    disabled=False,
    button_style='success', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Predict',
)
button.on_click(buttonHandler)
# NOTE(review): Text.on_submit appears to be deprecated in recent ipywidgets
# releases in favour of observing 'value' with continuous_update=False --
# confirm before upgrading. inputText is not part of the layout shown below.
inputText.on_submit(makeUpperCase)
# VBox([inputText, outputText, textarea, button, output_textarea])
# Last expression of the cell, so the layout is rendered: input on the left,
# button stacked above the output area on the right.
HBox([textarea, VBox([button, output_textarea])])