In [24]:
import json
import numpy as np
np.random.seed(20)
from collections import Counter
import csv
import operator
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA

In [2]:
# Load the labelled recipes; the context manager closes the file handle.
with open('train.json') as train_file:
    data = json.load(train_file)

# Shuffle in place before splitting (the NumPy global RNG is seeded in the import cell).
np.random.shuffle(data)

# Collect the labels only AFTER the shuffle: np.random.shuffle reorders `data`
# in place, so labels gathered beforehand would not line up with data[i] when
# a model is later fit on an encoding of `data` together with all_labels.
all_labels = [d['cuisine'] for d in data]

# The first 35000 shuffled recipes are used for training; the rest are held out.
train_data = data[:35000]
train_labels = [d['cuisine'] for d in train_data]
validation_data = data[35000:]
val_labels = [d['cuisine'] for d in validation_data]

with open('test.json') as test_file:
    test_data = json.load(test_file)
test_ids = [d['id'] for d in test_data]

In [3]:
def evaluate(preds, labels):
    """Print the fraction of predictions that equal their paired label.

    Pairs are formed with ``zip`` (so comparison stops at the shorter input)
    and the number of matches is divided by ``len(preds)``.
    """
    matches = sum(1 for guess, truth in zip(preds, labels) if guess == truth)
    print(matches / len(preds))
    
def writeTest(preds):
    """Write test-set predictions to ``test_preds.csv``.

    The file gets an ``id,cuisine`` header row followed by one row per
    ``(test_id, prediction)`` pair, pairing the module-level ``test_ids``
    with ``preds`` positionally.
    """
    # newline='' lets the csv module control line endings itself; without it,
    # platforms that translate newlines on write emit a blank line after each row.
    with open('test_preds.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(("id","cuisine"))
        writer.writerows(zip(test_ids,preds))

In [8]:
def getWordCounts(data,label):
    """Count ingredient occurrences across the dishes of one cuisine.

    ``label`` names the cuisine to count; the special value ``'all'`` counts
    every dish. Returns a dict mapping ingredient -> number of occurrences.
    """
    tally = {}
    for recipe in data:
        # The cuisine is read for every dish, even when label == 'all'.
        kind = recipe['cuisine']
        if label != 'all' and label != kind:
            continue
        for item in recipe['ingredients']:
            tally[item] = tally.get(item, 0) + 1
    return tally

def getFrequency(word_count):
    """Convert raw counts into shares of the total count.

    Returns a new dict with the same keys, each value divided by the sum of
    all values in ``word_count``. An empty input yields an empty dict.
    """
    grand_total = sum(word_count.values())
    return {token: tally / grand_total for token, tally in word_count.items()}

def pickTopNWords(N,freq_overall,freq_cat):
    """Select the N words most over-represented in a category.

    Each word in ``freq_cat`` is scored by ``freq_cat[word] - freq_overall[word]``
    and the N highest-scoring words are returned as a set.
    """
    lift = {token: share - freq_overall[token] for token, share in freq_cat.items()}

    # Sort ascending, then flip. Unlike sorting with reverse=True, this places
    # later-inserted words first among words with equal scores.
    ranked = sorted(lift.items(), key=lambda pair: pair[1])[::-1]

    return {token for token, _score in ranked[:N]}

def assignIndex(lst):
    """Map each item of ``lst`` to its position; for repeated items the last position wins."""
    return {item: position for position, item in enumerate(lst)}

def getIngredients(data):
    """Build an ingredient -> column-index mapping over every dish in ``data``.

    Indices are assigned in order of first appearance, so the same input
    always produces the same mapping. (Numbering the ingredients by iterating
    a ``set`` of strings would change between interpreter sessions, because
    string hashing is randomized per process by default.)

    Returns a dict whose values are the consecutive integers 0..n-1, where n
    is the number of distinct ingredients.
    """
    ingredient_index = dict()
    for dish in data:
        for ing in dish['ingredients']:
            # setdefault keeps the index already given to a repeated
            # ingredient; len(...) is the next unused index.
            ingredient_index.setdefault(ing, len(ingredient_index))

    return ingredient_index

In [9]:
#baseline: always predict the cuisine that occurs most often in the training split

cuisineCtr = Counter(dish['cuisine'] for dish in train_data)
mostPopularCuisine = cuisineCtr.most_common(1)[0][0]

# One copy of the majority label per validation / test recipe.
val_preds = [mostPopularCuisine] * len(validation_data)
test_preds = [mostPopularCuisine] * len(test_data)
    

In [10]:
# Print the validation accuracy of the current val_preds, then write the
# current test_preds to test_preds.csv.
evaluate(val_preds, val_labels)
writeTest(test_preds)

0.18977796397151236


In [11]:
trainY = [dish['cuisine'] for dish in train_data]

cuisines = set(train_labels)

# How many of its most over-represented ingredients each cuisine contributes.
TOP_WORDS_PER_CUISINE = 50

#Get Word Counts (overall, and separately for each cuisine)
overall_word_counts = getWordCounts(train_data,'all')
cuisine_word_counts = dict()
for cuisine in cuisines:
    cuisine_word_counts[cuisine] = getWordCounts(train_data,cuisine)

#Get Frequencies (each count divided by the total count of its group)
overall_frequencies = getFrequency(overall_word_counts)
cuisine_frequencies = dict()
for cuisine in cuisines:
    cuisine_frequencies[cuisine] = getFrequency(cuisine_word_counts[cuisine])

#Get Top Words: union of every cuisine's top ingredients
TopWords = set()
for cuisine in cuisines:
    TopWords = TopWords | pickTopNWords(TOP_WORDS_PER_CUISINE,overall_frequencies,cuisine_frequencies[cuisine])

#Assign Index
# Sort before indexing: the iteration order of a set of strings differs between
# interpreter sessions (string hashes are randomized), which would give each
# ingredient a different index on every kernel restart.
ingredient_to_index = assignIndex(sorted(TopWords))
#ingredient_to_index = getIngredients(data)


In [12]:
wordSet = set(TopWords)
# Number the words in sorted order so every word maps to the same feature
# column on every run; enumerating the set directly would follow its iteration
# order, which changes between interpreter sessions for strings.
wordId = {word: column for column, word in enumerate(sorted(wordSet))}

def feature(datum):
    """Encode one dish as a 0/1 list over the words in the module-level ``wordSet``.

    Position ``wordId[w]`` is set to 1 for every ingredient ``w`` of the dish
    that is in ``wordSet``; other ingredients are ignored.
    """
    feat = [0] * len(wordSet)
    columns = (wordId[w] for w in datum['ingredients'] if w in wordSet)
    for column in columns:
        feat[column] = 1
    return feat

def oneHotEncode(data,item_to_index):
    """Turn each dish into a 0/1 vector over the indexed ingredients.

    For every dish in ``data`` a list of ``len(item_to_index)`` zeros is
    created, and position ``item_to_index[ing]`` is set to 1 for each of the
    dish's ingredients present in ``item_to_index``. Ingredients that are not
    in the mapping are ignored. Returns the list of these vectors.
    """
    width = len(item_to_index)
    encoded = []
    for recipe in data:
        row = [0] * width
        known_slots = (
            item_to_index[item]
            for item in recipe['ingredients']
            if item in item_to_index
        )
        for slot in known_slots:
            row[slot] = 1
        encoded.append(row)

    return encoded

In [13]:
# Top-word features for the training and validation splits.
X = list(map(feature, train_data))
X_val = list(map(feature, validation_data))

# One-hot features over every ingredient seen in the training split.
indexing = getIngredients(train_data)
X_all, X_val_all = (
    oneHotEncode(split, indexing) for split in (train_data, validation_data)
)

In [14]:
# #KNN

# neigh = KNeighborsClassifier(n_neighbors=10)
# neigh.fit(X_all, train_labels)
# train_preds = neigh.predict(X_all)
# val_preds = neigh.predict(X_val_all)

In [15]:
# evaluate(train_preds, train_labels)
# evaluate(val_preds, val_labels)

In [16]:
#SVM on the top-word features
# fit() returns the estimator itself, so construction and fitting are chained.
clf = svm.LinearSVC(C=0.1, multi_class="ovr").fit(X, train_labels)
train_preds, val_preds = clf.predict(X), clf.predict(X_val)

In [17]:
# Print training accuracy first, then validation accuracy, for the current
# train_preds / val_preds.
evaluate(train_preds, train_labels)
evaluate(val_preds, val_labels)

0.7513714285714286
0.7306242144951822


In [18]:
#SVM all: same classifier settings, trained on the one-hot encoding of every training ingredient
# fit() returns the estimator itself, so construction and fitting are chained.
clf = svm.LinearSVC(C=0.1, multi_class="ovr").fit(X_all, train_labels)
train_preds, val_preds = clf.predict(X_all), clf.predict(X_val_all)

# Training accuracy is printed first, validation accuracy second.
evaluate(train_preds, train_labels)
evaluate(val_preds, val_labels)

0.8778857142857143
0.7919983242563887


In [None]:
#SVM all + PCA: reduce the one-hot features with PCA before fitting the SVM
clf = svm.LinearSVC(C=0.1, multi_class="ovr")
pca = PCA(n_components=4000)
pca.fit(X_all)
newX = pca.transform(X_all)
# Project the validation features with the SAME fitted PCA. The classifier is
# trained on the projected columns, so it cannot score the raw one-hot vectors
# in X_val_all (they have a different number of columns).
newX_val = pca.transform(X_val_all)

clf.fit(newX, train_labels)
train_preds = clf.predict(newX)
val_preds = clf.predict(newX_val)

# Training accuracy is printed first, validation accuracy second.
evaluate(train_preds, train_labels)
evaluate(val_preds, val_labels)

In [19]:
# Final model: index and encode every labelled recipe, then predict the test set.
indexingTest = getIngredients(data)

X_total = oneHotEncode(data, indexingTest)
X_test = oneHotEncode(test_data, indexingTest)

# Take the labels from `data` in its current order, so that row i of X_total
# and label i come from the same recipe. `data` was shuffled in place after
# it was loaded, so a label list captured before that shuffle would be
# misaligned with X_total.
total_labels = [d['cuisine'] for d in data]

clftest = svm.LinearSVC(C=0.1, multi_class="ovr")
clftest.fit(X_total, total_labels)

test_preds = clftest.predict(X_test)
writeTest(test_preds)

In [23]:
# Display the keys of `indexing` (the ingredient names found in the training
# split) in sorted order. The full list is long.
sorted(indexing)

['(    oz.) tomato sauce',
 '(   oz.) tomato paste',
 '(10 oz.) frozen chopped spinach',
 '(10 oz.) frozen chopped spinach, thawed and squeezed dry',
 '(14 oz.) sweetened condensed milk',
 '(14.5 oz.) diced tomatoes',
 '(15 oz.) refried beans',
 '1% low-fat buttermilk',
 '1% low-fat chocolate milk',
 '1% low-fat cottage cheese',
 '1% low-fat milk',
 '2 1/2 to 3 lb. chicken, cut into serving pieces',
 '2% low fat cheddar chees',
 '2% low-fat cottage cheese',
 '2% lowfat greek yogurt',
 '2% reduced-fat milk',
 '25% less sodium chicken broth',
 '33% less sodium cooked deli ham',
 '33% less sodium cooked ham',
 '33% less sodium ham',
 '33% less sodium smoked fully cooked ham',
 '40% less sodium taco seasoning',
 '40% less sodium taco seasoning mix',
 '7 Up',
 '8 ounc ziti pasta, cook and drain',
 '95% lean ground beef',
 'A Taste of Thai Rice Noodles',
 'Accent Seasoning',
 'Adobo All Purpose Seasoning',
 'Alaskan king crab legs',
 'Alexia Waffle Fries',
 'Alfredo sauce',
 'Amarena cherrie