In [32]:
import json
import csv
import random
import operator
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
np.random.seed(16)

In [25]:
# Load the labelled recipes; the context manager closes the file handle as soon
# as parsing is done instead of leaving it to the garbage collector.
with open('train.json') as train_file:
    data = json.load(train_file)
# Shuffle in place (driven by NumPy's global RNG state) so the split below is
# a random one.
np.random.shuffle(data)

# The first 35000 shuffled recipes are used for fitting, the rest for validation.
train_data = data[:35000]
validation_data = data[35000:]

# Recipes to predict for; their ids are kept for the submission file.
with open('test.json') as test_file:
    test_data = json.load(test_file)
test_ids = [dish['id'] for dish in test_data]

In [13]:
def getTopNPerCat(data,labels,N):
    """Map the N most frequent ingredients of every cuisine to column indices.

    Args:
        data: iterable of dishes, each a dict with a 'cuisine' entry and an
            'ingredients' list.
        labels: iterable of cuisine names. Every cuisine occurring in ``data``
            must be listed, otherwise a KeyError is raised.
        N: number of top ingredients kept per cuisine.

    Returns:
        dict mapping ingredient -> index. Indices start at 0 and follow the
        order of first appearance while walking ``labels`` and, per label, its
        ingredients from most to least frequent.
    """
    # Materialise once: labels is walked twice, which would silently produce an
    # empty result for a one-shot iterable such as a generator.
    labels = list(labels)

    # One ingredient -> count table per cuisine.
    word_count_per_cusine = {l: dict() for l in labels}
    for dish in data:
        word_count = word_count_per_cusine[dish['cuisine']]
        for ing in dish['ingredients']:
            word_count[ing] = word_count.get(ing, 0) + 1

    ingredient_index = dict()
    for l in labels:
        # Stable ascending sort followed by a reversal: most frequent first,
        # ties in reverse order of first appearance.
        ranked = sorted(word_count_per_cusine[l].items(), key=operator.itemgetter(1))
        ranked.reverse()
        for ing, _count in ranked[:N]:
            # Dict membership replaces a linear scan of a growing list; an
            # ingredient already indexed for an earlier cuisine keeps its index.
            if ing not in ingredient_index:
                ingredient_index[ing] = len(ingredient_index)

    return ingredient_index

In [14]:
def evaluate(preds, labels):
    """Return the fraction of predictions that equal their true label.

    Args:
        preds: sized sequence of predicted labels.
        labels: sized sequence of true labels, aligned with ``preds``.

    Returns:
        Accuracy as a float between 0 and 1; 0.0 when there are no predictions.

    Raises:
        ValueError: if ``preds`` and ``labels`` differ in length (zip would
            otherwise silently ignore the surplus items).
    """
    if len(preds) != len(labels):
        raise ValueError(
            "preds and labels must have the same length, got %d and %d"
            % (len(preds), len(labels))
        )
    # Nothing to score: report 0.0 rather than dividing by zero.
    if len(preds) == 0:
        return 0.0

    corr = sum(1 for a, b in zip(preds, labels) if a == b)
    return corr/len(preds)

def writeTest(preds, ids=None, path='test_preds.csv'):
    """Write predictions as a two-column CSV with the header row (id, cuisine).

    Args:
        preds: iterable of predicted cuisines, aligned with ``ids``.
        ids: iterable of dish ids; defaults to the module-level ``test_ids``.
        path: destination file, overwritten if it already exists.
    """
    if ids is None:
        ids = test_ids
    # newline='' is what the csv module asks for: it lets the writer control
    # line endings itself, avoiding an extra '\r' per row on Windows.
    with open(path, 'w', newline='') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(("id","cuisine"))
        writer.writerows(zip(ids,preds))

def getLabels(all_labels):
    """Return the set of distinct values found in ``all_labels``."""
    # The set constructor already drops repeated values.
    return set(all_labels)
  
def getWordCounts(data,label):
    """Count how often each ingredient occurs in the selected dishes.

    Args:
        data: iterable of dishes, each a dict with an 'ingredients' list and,
            when filtering by cuisine, a 'cuisine' entry.
        label: cuisine to restrict the count to, or the string 'all' to count
            every dish.

    Returns:
        dict mapping ingredient -> number of occurrences.
    """
    word_count = dict()
    count_everything = label == 'all'
    for dish in data:
        # The cuisine is only looked up when filtering, so dishes without a
        # 'cuisine' key can still be counted with label='all'.
        if count_everything or dish['cuisine'] == label:
            for ing in dish['ingredients']:
                word_count[ing] = word_count.get(ing, 0) + 1
    return word_count

def getFrequency(word_count):
    """Turn a word -> count mapping into word -> share of the total count.

    An empty mapping yields an empty dict.
    """
    total_count = sum(word_count.values())
    return {word: count/total_count for word, count in word_count.items()}

def pickTopNWords(N,freq_overall,freq_cat):
    """Pick the N words most over-represented in a category.

    Each word of ``freq_cat`` is scored as its category frequency minus its
    overall frequency; every such word must also be a key of ``freq_overall``.
    Returns the N highest-scoring words as a set.
    """
    surplus = {
        word: cat_freq - freq_overall[word]
        for word, cat_freq in freq_cat.items()
    }

    # Ascending stable sort, then walk it backwards: highest score first, with
    # tied words in reverse insertion order.
    ascending = sorted(surplus.items(), key=lambda pair: pair[1])
    best_first = ascending[::-1]

    return {word for word, _score in best_first[:N]}
    
def assignIndex(lst):
    """Return a dict mapping each item of ``lst`` to its position in the list.

    If an item occurs more than once, its last position wins.
    """
    return {item: position for position, item in enumerate(lst)}
    
def getIngredients(data):
    """Map every distinct ingredient in ``data`` to a column index.

    Args:
        data: iterable of dishes, each a dict with an 'ingredients' list of
            mutually sortable values (e.g. strings).

    Returns:
        dict mapping ingredient -> index, with indices assigned in sorted
        ingredient order.
    """
    ingredients = set()
    for dish in data:
        ingredients.update(dish['ingredients'])

    # Sorting fixes the column order. Iterating the raw set would give string
    # ingredients an order that depends on per-process hash randomisation, so
    # the feature layout would change between kernel restarts.
    return {ing: i for i, ing in enumerate(sorted(ingredients))}

def oneHotEncode(data,item_to_index):
    """Encode each dish as a 0/1 list with one slot per indexed ingredient.

    Args:
        data: iterable of dishes, each a dict with an 'ingredients' list.
        item_to_index: dict mapping ingredient -> position in the vector.

    Returns:
        List of lists, one per dish, of length ``len(item_to_index)``.
        Ingredients missing from ``item_to_index`` are ignored.
    """
    width = len(item_to_index)
    encoded = []
    for dish in data:
        row = [0]*width
        known_positions = [
            item_to_index[ing]
            for ing in dish['ingredients']
            if ing in item_to_index
        ]
        for position in known_positions:
            row[position] = 1
        encoded.append(row)

    return encoded

# Frequency-Based Ridge

In [15]:
# Number of most over-represented ingredients kept per cuisine.
TOP_WORDS_PER_CUISINE = 100

trainY = [dish['cuisine'] for dish in train_data]

cuisines = getLabels(trainY)
# Alternative feature sets tried here: getTopNPerCat(train_data,cuisines,100)
# and getIngredients(data).

#Get Word Counts: overall and per cuisine
overall_word_counts = getWordCounts(train_data,'all')
cuisine_word_counts = dict()
for cuisine in cuisines:
    cuisine_word_counts[cuisine] = getWordCounts(train_data,cuisine)

#Get Frequencies: each count as a share of its table's total
overall_frequencies = getFrequency(overall_word_counts)
cuisine_frequencies = dict()
for cuisine in cuisines:
    cuisine_frequencies[cuisine] = getFrequency(cuisine_word_counts[cuisine])

#Get Top Words: union of every cuisine's most over-represented ingredients
TopWords = set()
for cuisine in cuisines:
    TopWords = TopWords | pickTopNWords(TOP_WORDS_PER_CUISINE,overall_frequencies,cuisine_frequencies[cuisine])

#Assign Index
# sorted() instead of list(): the iteration order of a set of strings depends
# on per-process hash randomisation, which made the feature columns differ
# between kernel restarts.
ingredient_to_index = assignIndex(sorted(TopWords))

trainX = oneHotEncode(train_data,ingredient_to_index)

validationX = oneHotEncode(validation_data,ingredient_to_index)
validationY = [dish['cuisine'] for dish in validation_data]

testX = oneHotEncode(test_data,ingredient_to_index)

In [16]:
# Fit a ridge classifier (alpha=1.0) on the current trainX / trainY.
# fit() is the cell's last expression, so the fitted estimator's repr is shown.
classifier = RidgeClassifier(alpha=1.0)
classifier.fit(trainX, trainY)

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=None, solver='auto',
        tol=0.001)

In [17]:
#Training Evaluation
train_preds = classifier.predict(trainX)
evaluate(train_preds,trainY)

0.7464571428571428

In [18]:
#Validation Evaluation
valid_preds = classifier.predict(validationX)
evaluate(valid_preds,validationY)

0.7276916631755341

In [10]:
#Write Test Predictions
test_preds = classifier.predict(testX)
writeTest(test_preds)

NameError: name 'csv' is not defined

In [None]:
# Distinct cuisine labels in the training targets, and how many there are.
cusines = set(trainY)

len(cusines)

# One-hot encode every word. Ridge

In [26]:
#Get Ingredients
ingredient_to_index = getIngredients(train_data)

#Train Vectors
trainX = oneHotEncode(train_data,ingredient_to_index)
trainY = [dish['cuisine'] for dish in train_data]

#Validation Vectors
validationX = oneHotEncode(validation_data,ingredient_to_index)
validationY = [dish['cuisine'] for dish in validation_data]

#Test Vector
testX = oneHotEncode(test_data,ingredient_to_index)

In [27]:
# Fit a ridge classifier (alpha=1.0) on the current trainX / trainY.
# This rebinds `classifier`, replacing any previously fitted model.
classifier = RidgeClassifier(alpha=1.0)
classifier.fit(trainX, trainY)

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=None, solver='auto',
        tol=0.001)

In [28]:
#Training Evaluation
train_preds = classifier.predict(trainX)
evaluate(train_preds,trainY)

0.8433142857142857

In [29]:
#Validation Evaluation
valid_preds = classifier.predict(validationX)
evaluate(valid_preds,validationY)

0.7624633431085044

In [None]:
#Write Test Predictions
test_preds = classifier.predict(testX)
writeTest(test_preds)

# Logistic Regression. One-hot encode every word. 78.23%

In [42]:
#Get Ingredients
ingredient_to_index = getIngredients(train_data)

#Train Vectors
trainX = oneHotEncode(train_data,ingredient_to_index)
trainY = [dish['cuisine'] for dish in train_data]

#Validation Vectors
validationX = oneHotEncode(validation_data,ingredient_to_index)
validationY = [dish['cuisine'] for dish in validation_data]

In [43]:
# Fit a one-vs-rest logistic regression on the current trainX / trainY.
# This rebinds `classifier`, replacing any previously fitted model.
classifier = LogisticRegression(multi_class='ovr')
classifier.fit(trainX, trainY)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [44]:
#Training Evaluation
train_preds = classifier.predict(trainX)
evaluate(train_preds,trainY)

0.8706

In [45]:
#Validation Evaluation
valid_preds = classifier.predict(validationX)
evaluate(valid_preds,validationY)

0.7878089652283201

In [49]:
# Refit on training + validation data combined before predicting the test set.
all_data = train_data + validation_data
#allY = trainY + validationY
# The ingredient index is rebuilt from the combined data, so it can differ
# from the index used in the earlier train/validation cells.
ingredient_to_index = getIngredients(all_data)

allX = oneHotEncode(all_data,ingredient_to_index)
allY = [dish['cuisine'] for dish in all_data]

classifier = LogisticRegression(multi_class='ovr')
classifier.fit(allX, allY)

# Test vectors, encoded with the index built from all_data
testX = oneHotEncode(test_data,ingredient_to_index)

In [50]:
#Write Test Predictions
test_preds = classifier.predict(testX)
writeTest(test_preds)

# Models to create
Baseline, Naive Bayes, KNN, Random Forest, Ridge, SVM, Logistic Regression

In [None]:
# Scratch cell: the body of getTopNPerCat run at top level with N = 50, so the
# intermediate results stay available as globals.
# NOTE: this rebinds the global `data` to train_data and reuses the global
# names `cuisine` and `word_count` as loop variables.
labels = cuisines
data = train_data
N = 50

# Set up one (initially empty) ingredient -> count dict per cuisine
word_count_per_cusine = dict()
for l in labels:
    word_count_per_cusine[l] = dict()

# Count each ingredient within its dish's cuisine
for dish in data:
    cuisine = dish['cuisine']
    word_count = word_count_per_cusine[cuisine]

    for ing in dish['ingredients']:
        if ing not in word_count:
            word_count[ing] = 1
        else:
            word_count[ing] += 1


# Get the top N ingredients per cuisine: sort ascending by count, reverse,
# then keep the first N (ingredient, count) pairs
TopN_Per_Cat = dict()
for l in labels:
    word_count = word_count_per_cusine[l]
    sorted_word_count = sorted(word_count.items(), key=operator.itemgetter(1))
    sorted_word_count.reverse()
    TopN_Per_Cat[l] = sorted_word_count[:N]

# Concatenate the top-N lists of all cuisines
all_ings_with_count = []
for l in labels:
    all_ings_with_count += TopN_Per_Cat[l]

# Drop the counts, keeping only the ingredient names
all_ings = [ing_count[0] for ing_count in all_ings_with_count]

# Remove duplicate ingredients while keeping first-seen order
ing_vector = []
for item in all_ings:
    if item not in ing_vector:
        ing_vector.append(item)

# Disabled final step (getTopNPerCat performs it): map ingredient to index
# ingredient_index = dict()
# for i in  range(0,len(ing_vector)):
#     ingredient_index[ing_vector[i]] = i



In [15]:
# Number of distinct ingredients in TopWords.
len(TopWords)

515

In [None]:
# Scratch check: `|` on two sets yields their union.
s = {1, 2, 4}
s2 = {1, 3, 4}
s | s2

In [48]:
# Peek at the first ten encoded rows; each row has one entry per indexed
# ingredient, so the printed output is very long.
allX[:10]

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
