# Analysing different Machine Learning models that solve the "What's Cooking" Kaggle Competition problem

## https://www.kaggle.com/c/whats-cooking/

## To run locally, make sure the "train.json" and "test.json" files are copied into the same folder as this file.

## Imports
### numpy and sklearn are required for all solutions

In [None]:
import json
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

import sys
!{sys.executable} -m pip install nltk
import nltk
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

!{sys.executable} -m pip install unidecode
from unidecode import unidecode

import re
import string
import datetime

tfidf_vectorizer = TfidfVectorizer(stop_words = "english", binary = True)
label_encoder = LabelEncoder()


## To Run on Google Colab, follow these instructions:
1) Set read_from_google_drive to True <br>
2) Uncomment the 3 lines at the bottom of the following cell <br>
3) In the `device_folder` line you just uncommented, replace `<Insert Folder Name or Path>` with the path of the Google Drive folder that contains the "train.json" and "test.json" files.<br>
   The solution files are also written to that folder.

In [None]:
# Folder prefix prepended to "train.json" / "test.json" by ReadDataFromDrive, and the
# folder the submission files are written to. Empty string = current working directory.
device_folder = ""
# Passed to ReadAndPreProcessData: True reads through ReadDataFromDrive, False through ReadData.
read_from_google_drive = False

# Google Colab only: uncomment the three lines below and fill in the folder path.
# device_folder = "gdrive/<Insert Folder Name or Path>/"
# from google.colab import drive
# drive.mount('/content/gdrive')

## Functions

In [None]:
def ReadDataFromDrive ():
    """Load the training and testing sets from the folder named by ``device_folder``.

    Reads ``<device_folder>train.json`` and ``<device_folder>test.json``;
    ``device_folder`` is the module-level setting and is expected to end with
    a path separator (or be empty).

    Returns:
        A ``(training_set, testing_set)`` tuple holding the parsed JSON content
        of the two files.

    Raises:
        OSError: if either file cannot be opened.
        json.JSONDecodeError: if either file is not valid JSON.
    """
    folder = str(device_folder)

    # Context managers close each file as soon as it has been parsed instead of
    # leaving the handle open until garbage collection. The encoding is pinned
    # so decoding does not depend on the platform default.
    with open(folder + "train.json", encoding = "utf-8") as train_file:
        training_set = json.load(train_file)

    with open(folder + "test.json", encoding = "utf-8") as test_file:
        testing_set = json.load(test_file)

    return training_set, testing_set

def ReadData ():
    """Load the training and testing sets from the current working directory.

    Reads ``train.json`` and ``test.json``.

    Returns:
        A ``(training_set, testing_set)`` tuple holding the parsed JSON content
        of the two files.

    Raises:
        OSError: if either file cannot be opened.
        json.JSONDecodeError: if either file is not valid JSON.
    """
    # Context managers close each file as soon as it has been parsed instead of
    # leaving the handle open until garbage collection. The encoding is pinned
    # so decoding does not depend on the platform default.
    with open("train.json", encoding = "utf-8") as train_file:
        training_set = json.load(train_file)

    with open("test.json", encoding = "utf-8") as test_file:
        testing_set = json.load(test_file)

    return training_set, testing_set

def RemoveUnits (ingredient):
    """Drop unit / size / preparation words and English stop words from a string.

    The input is split on whitespace; every word that appears in the local
    unit list or in ENGLISH_STOP_WORDS is discarded, and the remaining words
    are joined back together with single spaces.

    Args:
        ingredient: whitespace-separated text (a single word also works).

    Returns:
        The filtered text; an empty string when every word was discarded.
    """
    unit_words = ["lb", "lbs", "kg", "g", "gm", "oz", "small", "medium", "large", "diced", "slice", "sliced"]

    kept = []
    for token in ingredient.split():
        # A word is dropped when it is a unit word or a stop word.
        if token in unit_words or token in ENGLISH_STOP_WORDS:
            continue
        kept.append(token)

    return " ".join(kept)

def PrepareData (X):
    """Normalise a list of recipe strings into cleaned, lemmatised text.

    Each entry is processed in this order: digits are removed, the text is
    lower-cased, the trademark / registered-sign / right-single-quote symbols
    are deleted, the remaining characters are transliterated to ASCII with
    unidecode, ASCII punctuation is stripped, every word is passed through
    RemoveUnits, and the surviving words are lemmatised with WordNet.

    Args:
        X: iterable of strings, one whitespace-separated ingredient text per recipe.

    Returns:
        A list with one cleaned string per input entry; words are separated by
        single spaces.
    """
    # These symbols have to be deleted BEFORE unidecode runs: unidecode emits
    # ASCII only, so afterwards the symbols have already been turned into ASCII
    # substitutes and a later replace on the original characters never matches.
    symbol_table = str.maketrans("", "", u"\u2122\u00AE\u2019")
    punctuation_table = str.maketrans("", "", string.punctuation)
    digits = re.compile(r"\d+")
    lemmatizer = WordNetLemmatizer()

    prepared = []
    for entry in X:
        text = digits.sub("", entry).lower()
        text = unidecode(text.translate(symbol_table))
        text = text.translate(punctuation_table)

        # RemoveUnits returns "" for a word it discards; those are skipped so
        # the result never contains empty words or doubled spaces.
        words = (RemoveUnits(word) for word in text.split())
        prepared.append(" ".join(lemmatizer.lemmatize(word) for word in words if word))

    return prepared
    
def ReadAndPreProcessData (drive = False):
    """Read both data files and return cleaned ingredient text plus labels and ids.

    Args:
        drive: when truthy the files are read with ReadDataFromDrive (using the
            ``device_folder`` prefix); otherwise with ReadData (working directory).

    Returns:
        A 4-tuple ``(train_ingredients, train_cuisines, test_ingredients, test_ids)``:
        the PrepareData-cleaned ingredient string of every training recipe, the
        "cuisine" value of every training recipe, the cleaned ingredient string
        of every test recipe, and the "id" value of every test recipe.
    """
    train, test = ReadDataFromDrive() if drive else ReadData()

    # Each recipe's ingredient list is flattened into one space-separated string
    # before cleaning. The training ids are not part of the return value, so
    # they are no longer collected.
    train_ingredients = PrepareData([" ".join(entry["ingredients"]) for entry in train])
    train_cuisines = [entry["cuisine"] for entry in train]

    test_ingredients = PrepareData([" ".join(entry["ingredients"]) for entry in test])
    test_ids = [entry["id"] for entry in test]

    return train_ingredients, train_cuisines, test_ingredients, test_ids
  
def CreateSubmission (test_ids, y_test, stats, folder_path = device_folder):
    """Write the predictions to a two-column ``id,cuisine`` CSV file.

    The file is named ``<folder_path>submission_<stats>_<MM_DD_HH_MM>.csv``,
    where the last part is the current local month, day, hour and minute.

    Args:
        test_ids: sequence of recipe ids, one per prediction.
        y_test: sequence of predicted cuisine names, aligned with ``test_ids``.
        stats: free-text tag embedded in the file name.
        folder_path: prefix for the output file name; defaults to the value
            ``device_folder`` had when this function was defined.
    """
    # Same characters as str(now)[5:16] with " ", "-" and ":" turned into "_".
    timestamp = datetime.datetime.now().strftime("%m_%d_%H_%M")
    fname = folder_path + "submission_" + stats + "_" + timestamp + ".csv"

    # One (id, cuisine) row per prediction, with the header row stacked on top.
    header = ['id', 'cuisine']
    rows = np.vstack((test_ids, y_test)).T
    table = np.vstack((header, rows))

    np.savetxt(fname, table, delimiter=",", fmt="%s")
    

## Reading and Preprocessing Data

In [None]:
# Load both JSON files and clean the ingredient text (see ReadAndPreProcessData).
train_ingredients, train_cuisines, test_ingredients, test_ids = ReadAndPreProcessData(read_from_google_drive)

# Fit the vectorizer and the label encoder on the training data only.
# NOTE(review): the float16 cast presumably reduces memory use - confirm that
# every estimator used below accepts this dtype.
X = tfidf_vectorizer.fit_transform(train_ingredients).astype("float16")
y = label_encoder.fit_transform(train_cuisines)

# transform() (not fit_transform) reuses the vectorizer fitted on the training text.
X_test = tfidf_vectorizer.transform(test_ingredients).astype("float16")



# Solutions

## (1)
## Model: Naive Bayes
## Best Score: 70.172%
## Requirements: sklearn

In [None]:
from sklearn.naive_bayes import ComplementNB

# Complement Naive Bayes with smoothing alpha = 0.7; fit() hands back the
# fitted estimator, so construction and training can be chained.
nb = ComplementNB(alpha=0.7).fit(X, y)
prediction = nb.predict(X_test)

# Decode the integer predictions to cuisine names and save them as a submission CSV.
CreateSubmission(
    test_ids,
    label_encoder.inverse_transform(prediction),
    "complement_naive_bayes",
    folder_path = device_folder,
)

## (2)
## Model: k-Nearest Neighbors
## Best Score: 75.643%
## Requirements: sklearn

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 21-neighbour k-NN classifier; fit() hands back the fitted estimator, so
# construction and training can be chained.
knn = KNeighborsClassifier(n_neighbors=21).fit(X, y)
prediction = knn.predict(X_test)

# Decode the integer predictions to cuisine names and save them as a submission CSV.
CreateSubmission(
    test_ids,
    label_encoder.inverse_transform(prediction),
    "knn",
    folder_path = device_folder,
)

## (3)
## Model: Ensemble (Random Forests and Extra Trees)
## Best Score: 75.693% and 77.986%
## Requirements: sklearn

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Random forest of 300 trees; fit() returns the fitted estimator itself.
clf = RandomForestClassifier(n_estimators = 300).fit(X, y)

# Predict class indices for the test set, then map them back to cuisine names.
predicted_indices = clf.predict(X_test)
predictions = label_encoder.inverse_transform(predicted_indices)

CreateSubmission (
    test_ids,
    predictions,
    "random_forest_classifier_300_estimators",
    folder_path = device_folder,
)

In [None]:
# Extra-trees ensemble of 300 trees; fit() returns the fitted estimator itself.
clf = ExtraTreesClassifier(n_estimators = 300).fit(X, y)

# Predict class indices for the test set, then map them back to cuisine names.
predicted_indices = clf.predict(X_test)
predictions = label_encoder.inverse_transform(predicted_indices)

CreateSubmission (
    test_ids,
    predictions,
    "extra_trees_classifier_300_estimators",
    folder_path = device_folder,
)

## (4)
## Model: Logistic Regression
## Best Score: 78.439%
## Requirements: sklearn

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Multinomial logistic regression fitted with the SAG solver and a tolerance of 1e-5.
clf = LogisticRegression(tol = 1e-5, solver='sag', multi_class='multinomial')
clf.fit(X,y)

# Take the most probable class per recipe and map it back to its cuisine name.
output = clf.predict_proba(X_test)
final_output = label_encoder.inverse_transform(np.argmax(output, axis=1))

# Pass the output folder explicitly, like every other solution cell, so the file
# follows the current device_folder rather than the default captured when
# CreateSubmission was defined.
CreateSubmission(test_ids, final_output, "logistic_regression_tolerance_1e-5", folder_path = device_folder)

## (5)
## Model: Support Vector Classifiers
## Requirements: sklearn v0.20.1


In [None]:
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

## Best Score: 81.858%

In [None]:
# Degree-2 polynomial-kernel SVC with C = 10250; every other argument is spelled
# out explicitly so the run does not depend on library defaults.
clf = SVC(C=10250, cache_size=200, class_weight=None, coef0=0.0,
          decision_function_shape='ovr', degree=2, gamma='scale', kernel='poly',
          max_iter=-1, probability=False, random_state=None, shrinking=True,
          tol=0.001, verbose=False)
# The SVC is wrapped in a one-vs-rest meta-classifier before fitting.
classifier = OneVsRestClassifier(clf)
classifier.fit(X,y)

# Convert the predicted class indices back to cuisine names and write the submission file.
predictions = label_encoder.inverse_transform(classifier.predict(X_test))
CreateSubmission (test_ids, predictions, "svc_polynomial", folder_path = device_folder)

## Second Best Score: 81.476%

In [None]:
# RBF-kernel SVC (C = 1, gamma = 1) with a looser stopping tolerance of 1e-2.
clf2 = SVC(kernel='rbf', C=1, gamma=1, tol=1e-2)
# The SVC is wrapped in a one-vs-rest meta-classifier before fitting.
classifier2 = OneVsRestClassifier(clf2)
classifier2.fit(X,y)

# Convert the predicted class indices back to cuisine names.
predictions = label_encoder.inverse_transform(classifier2.predict(X_test))

CreateSubmission (test_ids, predictions, "svc_rbf", folder_path = device_folder)


## (6)
## Model: Neural Networks
## Requirements: Tensorflow and Keras
### Computationally Intensive!


## Functions

In [None]:
import tensorflow as tf

# Import through the public tf.keras namespace: tensorflow.python.* is a private
# implementation path and is not available in every TensorFlow release.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import History


In [None]:
def CreateNeuralNetwork (input_size, neurons_1, neurons_2, dropout = 0.50, initial_dropout = True, n_classes = 20):
    """Build and compile a feed-forward classifier with two hidden layers.

    Layer order: [Dropout ->] Dense(neurons_1, relu) -> Dropout ->
    Dense(neurons_2, relu) -> Dropout -> Dense(n_classes, softmax).
    The model is compiled with the "adadelta" optimizer, the
    "sparse_categorical_crossentropy" loss and the accuracy metric.

    Args:
        input_size: number of input features per sample.
        neurons_1: width of the first hidden layer.
        neurons_2: width of the second hidden layer.
        dropout: rate used by every Dropout layer.
        initial_dropout: when truthy, a Dropout layer is applied directly to the
            input before the first hidden layer.
        n_classes: number of output units; defaults to 20, the value that was
            previously hard-coded.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()

    # Whichever layer comes first has to declare the input shape.
    if initial_dropout:
        model.add(Dropout(dropout, input_shape = (input_size, )))
        model.add(Dense(neurons_1, activation = "relu", kernel_initializer = "glorot_uniform"))
    else:
        model.add(Dense(neurons_1, activation = "relu", kernel_initializer = "glorot_uniform", input_shape = (input_size, )))

    model.add(Dropout(dropout))
    model.add(Dense(neurons_2, activation = "relu", kernel_initializer = "glorot_uniform"))
    model.add(Dropout(dropout))
    model.add(Dense(n_classes, activation = "softmax"))

    model.compile(optimizer = "adadelta", loss = "sparse_categorical_crossentropy", metrics=['accuracy'])

    return model

# Pass verbose = True in order to see epochs (forwarded to the fit() function call)
def GetTrainedNeuralNetwork (X, y, iterations = 1, epochs = 2000, neurons = (256, 32), dropout = 0.50, batch_size = 2048, val_split = 0.2, initial_dropout = True, verbose = False):
    """Create a network with CreateNeuralNetwork and train it on ``(X, y)``.

    Args:
        X: training feature matrix; ``X.shape[1]`` is used as the input size.
        y: training labels, passed straight to ``fit``.
        iterations: not used by this function; kept so existing calls still work.
        epochs: number of training epochs.
        neurons: two-item sequence giving the widths of the first and second
            hidden layers. The default is a tuple so it cannot be mutated
            between calls; lists are still accepted.
        dropout: rate used by every Dropout layer.
        batch_size: mini-batch size passed to ``fit``.
        val_split: fraction passed to ``fit`` as ``validation_split``.
        initial_dropout: forwarded to CreateNeuralNetwork.
        verbose: forwarded to ``fit``; the default False matches the previous
            hard-coded value.

    Returns:
        A ``(neural_network, training_history)`` tuple: the trained model and the
        History callback that was attached to the ``fit`` call.
    """
    training_history = History()

    neural_network = CreateNeuralNetwork (X.shape[1], neurons[0], neurons[1], dropout, initial_dropout)
    neural_network.fit(X, y, epochs = epochs, batch_size = batch_size, callbacks = [training_history], validation_split = val_split, verbose = verbose)

    return neural_network, training_history
    
def GetStatString (n1, n2, d, ind, e):
    """Build the tag that identifies a neural-network run in output file names.

    Args:
        n1: first hidden-layer width, zero-padded to four digits.
        n2: second hidden-layer width, zero-padded to three digits.
        d: dropout rate.
        ind: whether initial dropout was used.
        e: number of epochs.

    Returns:
        A string of the form
        ``network_(<n1>,<n2>)_dropout_(<ind>_<d>)_epochs_<e>``.
    """
    layer_1 = '%04d' % n1
    layer_2 = '%03d' % n2

    template = "network_({},{})_dropout_({}_{})_epochs_{}"
    return template.format(layer_1, layer_2, str(ind), str(d), str(e))
    

def WriteTrainingHistoryToFile (history, stats, folder_path = device_folder):
    """Save a training history to ``<folder_path>run_<stats>.txt``.

    The ``history.history`` attribute is converted to its string form and every
    single quote is replaced by a double quote before it is written.

    Args:
        history: object exposing a ``history`` attribute (the callback returned
            by GetTrainedNeuralNetwork).
        stats: tag embedded in the file name.
        folder_path: prefix for the output file name; defaults to the value
            ``device_folder`` had when this function was defined.
    """
    target = folder_path + "run_" + stats + ".txt"
    text = str(history.history).replace("'", '"')

    with open (target, "w") as out_file:
        out_file.write(text)

## Best Score: 81.566%

In [None]:
n_runs = 10
epochs = 2000
neurons = [1024, 128]
dropout = 0.50
predictions = []

# Train n_runs separate networks on the full training set (val_split = 0) and
# collect each network's class probabilities for the test set.
for run in range(n_runs):
    neural_network, training_history = GetTrainedNeuralNetwork (X, y, epochs = epochs, neurons = neurons, dropout = dropout, val_split = 0)
    # predict() returns the softmax outputs of the last layer. It replaces
    # Sequential.predict_proba, which returned the same values but has been
    # removed from recent Keras releases.
    predictions.append(neural_network.predict(X_test))

# Average the probabilities over the runs, then pick the most probable cuisine.
final_predictions = np.mean(predictions, axis=0)
y_test = label_encoder.inverse_transform(np.argmax(final_predictions, axis=1))

stats = GetStatString (neurons[0], neurons[1], dropout, True, epochs)


CreateSubmission (test_ids, y_test, stats, folder_path = device_folder)

## Second Best Score: 81.345%

In [None]:
n_runs = 10
epochs = 2500
neurons = [512, 128]
dropout = 0.50
predictions = []

# Train n_runs separate networks on the full training set (val_split = 0) and
# collect each network's class probabilities for the test set.
for run in range(n_runs):
    neural_network, training_history = GetTrainedNeuralNetwork (X, y, epochs = epochs, neurons = neurons, dropout = dropout, val_split = 0)
    # predict() returns the softmax outputs of the last layer. It replaces
    # Sequential.predict_proba, which returned the same values but has been
    # removed from recent Keras releases.
    predictions.append(neural_network.predict(X_test))

# Average the probabilities over the runs, then pick the most probable cuisine.
final_predictions = np.mean(predictions, axis=0)
y_test = label_encoder.inverse_transform(np.argmax(final_predictions, axis=1))


stats = GetStatString (neurons[0], neurons[1], dropout, True, epochs)
CreateSubmission (test_ids, y_test, stats, folder_path = device_folder)