## b) About the data

In [1]:
import numpy as np
from matplotlib import pylab as plt
import json
from pprint import pprint

In [32]:
# Load the training recipes. The encoding is given explicitly because JSON
# text is UTF-8 by specification, while open() would otherwise fall back to
# the platform's locale encoding.
with open("cuisine/train.json", "r", encoding="utf-8") as file:
    train_data = json.load(file)

In [33]:
# Split the training records into two per-recipe lists (ids, cuisine) and one
# flat list holding every ingredient mention across all recipes.
ids = [recipe["id"] for recipe in train_data]
cuisine = [recipe["cuisine"] for recipe in train_data]
ingredients = [
    name
    for recipe in train_data
    for name in recipe["ingredients"]
]

In [15]:
# Summarise the training set: number of recipes, then the number of distinct
# cuisines and of distinct ingredient names.
summary = (
    ("Training set sample size: ", len(ids)),
    ("Cuisine: ", len(set(cuisine))),
    ("Ingredient: ", len(set(ingredients))),
)
for label, count in summary:
    print(label, count)

Training set sample size:  39774
Cuisine:  20
Ingredient:  6714


## c) Feature Vector

In [22]:
# Map each distinct ingredient name to a column index of the feature vector.
# The names are sorted first: iterating a bare set of strings follows hash
# order, which changes between interpreter runs unless PYTHONHASHSEED is
# fixed, so the column layout would otherwise not be reproducible.
ingredient_i = {
    name: index
    for index, name in enumerate(sorted(set(ingredients)))
}

In [34]:
# Peek at a single raw training record (index 1) to see its fields.
train_data[1]

{'cuisine': 'southern_us',
 'id': 25693,
 'ingredients': ['plain flour',
  'ground pepper',
  'salt',
  'tomatoes',
  'ground black pepper',
  'thyme',
  'eggs',
  'green tomatoes',
  'yellow corn meal',
  'milk',
  'vegetable oil']}

### Convert train data

In [35]:
ingredient_count = len(ingredient_i)

# One-hot encode the training recipes: row r, column ingredient_i[name] is 1
# when recipe r lists that ingredient, 0 otherwise. The matrix is allocated
# once and filled in place instead of building a list of per-recipe Python
# lists and converting it afterwards, which keeps peak memory far lower.
train_data_arr = np.zeros((len(train_data), ingredient_count), dtype=int)
for row, recipe in enumerate(train_data):
    for name in recipe["ingredients"]:
        train_data_arr[row, ingredient_i[name]] = 1

# Cuisine labels, in the same row order as train_data_arr.
train_label_arr = np.array(cuisine, dtype=str)

train_data_arr.shape

(39774, 6714)

### Convert test data

In [39]:
# Explicit encoding: JSON text is UTF-8 by specification, whereas open()
# defaults to the platform's locale encoding.
with open("cuisine/test.json", "r", encoding="utf-8") as file:
    test_data = json.load(file)

# Recipe ids, in the same row order as test_data_arr.
test_id = np.array([recipe["id"] for recipe in test_data], dtype=int)

# One-hot encode the test recipes using the ingredient -> column map and the
# column count already in scope. The matrix is allocated once and filled in
# place rather than assembled from per-recipe Python lists.
test_data_arr = np.zeros((len(test_data), ingredient_count), dtype=int)
for row, recipe in enumerate(test_data):
    for name in recipe["ingredients"]:
        column = ingredient_i.get(name)
        # Ingredients missing from the map have no column, so they are skipped.
        if column is not None:
            test_data_arr[row, column] = 1

test_data_arr.shape

(9944, 6714)

## d) Naive Bayes and f) Logistic Regression

In [50]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; fall back to
# its replacement, `sklearn.model_selection`, so this cell still imports on
# current releases.
try:
    from sklearn.cross_validation import KFold
except ImportError:
    from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB

In [52]:
# The legacy `sklearn.cross_validation.KFold(n, n_folds=...)` API was removed
# in scikit-learn 0.20. Import its replacement here so the `n_splits` /
# `.split()` calls below bind to the right class (available since 0.18).
from sklearn.model_selection import KFold

fold = 3
# Contiguous, unshuffled folds -- the default of both the legacy and the
# current KFold, so the partitioning is unchanged.
kf = KFold(n_splits=fold)

clf_B = BernoulliNB()
clf_G = GaussianNB()
clf_L = LogisticRegression()

# Accumulate each classifier's mean held-out accuracy over the folds.
avg_B, avg_G, avg_L = 0, 0, 0
for train_i, test_i in kf.split(train_data_arr):
    data_train = train_data_arr[train_i]
    data_test = train_data_arr[test_i]
    label_train = train_label_arr[train_i]
    label_test = train_label_arr[test_i]

    # fit() is called afresh on every fold's training rows.
    clf_B.fit(data_train, label_train)
    clf_G.fit(data_train, label_train)
    clf_L.fit(data_train, label_train)

    # Dividing by `fold` here makes the running sum end up as the mean.
    avg_B += clf_B.score(data_test, label_test) / fold
    avg_G += clf_G.score(data_test, label_test) / fold
    avg_L += clf_L.score(data_test, label_test) / fold

print("GaussianNB: ", avg_G)
print("BernoulliNB: ", avg_B)
print("LogisticRegression: ", avg_L)

GaussianNB:  0.379846130638
BernoulliNB:  0.683536983959
LogisticRegression:  0.775556896465


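## e) BernoulliNB scores 0.68, whereas GaussianNB scores only 0.38. BernoulliNB performs better because every feature is binary (it records only whether an ingredient is present or absent), which is exactly what the Bernoulli model assumes, while GaussianNB models each feature as a continuous, normally distributed value.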

## g) Kaggle Submission

In [54]:
# Refit logistic regression on the complete training matrix, then label the
# test recipes with it (fit() returns the estimator, so the calls chain).
pred_L = clf_L.fit(train_data_arr, train_label_arr).predict(test_data_arr)

In [58]:
import pandas as pd

In [63]:
# Start the submission table with a single "id" column holding the test ids.
result_df = pd.DataFrame({"id": test_id})

In [64]:
# Attach the predicted cuisine of each test recipe as a "cuisine" column.
result_df = result_df.assign(cuisine=pred_L)

In [65]:
# Show the first rows of the submission table as a sanity check.
result_df.head()

Unnamed: 0,id,cuisine
0,18009,british
1,28583,southern_us
2,41580,italian
3,29752,cajun_creole
4,35687,italian


In [66]:
# Write the submission file; index=False leaves the DataFrame index out so
# only the table's own columns are written.
result_df.to_csv("kaggle_cuisine.csv", index=False)