In [53]:
import numpy as np
import pandas as pd
import nltk, json, os, re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import vstack


stemmer = PorterStemmer()
nltk.download('stopwords')
cooking_stopwords = ["recipe", "cook", "cooking", "bake", "boil", "grill", "saute", "roast", "simmer", "fry", "stir", "season", "dish", "plate", "meal", "serve"]
standard_stopwords = set(stopwords.words("english"))
stop_words = standard_stopwords.union(cooking_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/aesmeral/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
print(os.listdir('.'))

['output.csv', '.ipynb_checkpoints', 'project.ipynb', 'train.json', 'test.json', 'sample_submission.csv', 'README.md', '.git']


In [34]:
training_data = pd.read_json('./train.json')
testing_data = pd.read_json('./test.json')

In [35]:
print(training_data.keys())

Index(['id', 'cuisine', 'ingredients'], dtype='object')


In [36]:
print(training_data['cuisine'])

0              greek
1        southern_us
2           filipino
3             indian
4             indian
            ...     
39769          irish
39770        italian
39771          irish
39772        chinese
39773        mexican
Name: cuisine, Length: 39774, dtype: object


In [37]:
print(training_data['ingredients'])

0        [romaine lettuce, black olives, grape tomatoes...
1        [plain flour, ground pepper, salt, tomatoes, g...
2        [eggs, pepper, salt, mayonaise, cooking oil, g...
3                      [water, vegetable oil, wheat, salt]
4        [black pepper, shallots, cornflour, cayenne pe...
                               ...                        
39769    [light brown sugar, granulated sugar, butter, ...
39770    [KRAFT Zesty Italian Dressing, purple onion, b...
39771    [eggs, citrus fruit, raisins, sourdough starte...
39772    [boneless chicken skinless thigh, minced garli...
39773    [green chile, jalapeno chilies, onions, ground...
Name: ingredients, Length: 39774, dtype: object


In [45]:
def scrubbed_ingredient(ingredient):
    scrubbed = re.sub(r'[^a-zA-Z ]', '', ingredient).lower()
    scrubbed = re.sub(r'oz|crushed|crumbles|ground|minced|powder|chopped|sliced|boneless|skinless|fresh|frozen|homemade|instance|kraft|large|lean|lowfat|small|smoke|vegan', '', scrubbed).lstrip()
    scrubbed = [stemmer.stem(word) for word in scrubbed.split() if word not in stop_words]
    return "_".join(scrubbed)


In [46]:
ingredients_set = set()

for ingredients_list in training_data['ingredients']:
    scrubbed_ingredients = [scrubbed_ingredient(ingredient) for ingredient in ingredients_list]
    scrubbed_ingredients = [word for word in scrubbed_ingredients if len(word) >= 1]
    ingredients_set.update(scrubbed_ingredients)

sorted_set = sorted(ingredients_set)
print(len(sorted_set))
for ingredient in sorted_set:
    print(ingredient)

6105
aai
abalon
abbamel
absinth
abura_age
acai_juic
accent
accent_season
accompani
achiot
achiot_past
acini_di_pepe
acke
acorn_squash
activ_dri_yeast
adobo
adobo_purpos_season
adobo_sauc
adobo_season
adobo_style_season
adzuki_bean
agar
agar_agar_flake
agav_nectar
agav_tequila
age_balsam_vinegar
age_cheddar_chees
age_gouda
age_manchego_chees
ahi
ahi_tuna_steak
aioli
ajinomoto
ajwain
aka_miso
alaskan_king_crab_leg
alaskan_king_salmon
albacor
albacor_tuna_water
alcohol
ale
aleppo
aleppo_pepper
alexia_waffl_fri
alfalfa_sprout
alfredo_sauc
alfredo_sauc_mix
allpurpos_flour
allspic
allspic_berri
almond
almond_butter
almond_extract
almond_fill
almond_flour
almond_liqueur
almond_milk
almond_oil
almond_past
almond_syrup
alo_juic
alphabet_pasta
alum
amaranth
amarena_cherri
amaretti
amaretti_cooki
amaretto
amaretto_liqueur
amba
amber
amber_rum
amberjack_fillet
amchur
america
american_chees
american_chees_food
american_chees_slice
ammonium_bicarbon
amontillado_sherri
ampalaya
anaheim_chile
anasazi_

In [93]:
vectorizer = CountVectorizer(binary=True, vocabulary=sorted_set)
batch_size = 2500  # Adjust the batch size as needed

result = []

for i in range(0, len(training_data), batch_size):
    batch = training_data['ingredients'].iloc[i:i+batch_size]
    scrubbed_ingredients = [" ".join([scrubbed_ingredient(ingredient) for ingredient in row]) for row in batch]
    X = vectorizer.transform(scrubbed_ingredients)
    result.append(X)
    print(i)

df = pd.DataFrame.sparse.from_spmatrix(vstack(result))
df.columns = sorted_set
df['cuisine'] = training_data['cuisine']
df['id'] = training_data['id']

df.head(100).to_csv('output.csv', index=False)

0
2500
5000
7500
10000
12500
15000
17500
20000
22500
25000
27500
30000
32500
35000
37500


In [70]:
X = df.iloc[:, :-2]
X.shape

(39774, 6105)

In [71]:
y = df['cuisine']
y.shape

(39774,)

In [73]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=10)

In [97]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

k = 11
knn = KNeighborsClassifier(n_neighbors=k, weights='uniform')
knn.fit(X_train, y_train)
y_predict = knn.predict(X_test)

In [98]:
accuracy = accuracy_score(y_test,y_predict)
print(accuracy)

0.5373978629792583


In [76]:
results = pd.DataFrame()
results['actual'] = y_test
results['prediction'] = y_predict

print(results.head())

             actual    prediction
18708       italian       italian
11518  cajun_creole  cajun_creole
3939        chinese        korean
27897   southern_us       british
13952       italian       italian


In [90]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(multi_class='ovr', solver='sag', max_iter=10000)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

In [91]:
accuracy = accuracy_score(y_test,y_predict)
print(accuracy)

0.7753614079195474


In [92]:
print(results.head(100))

             actual    prediction
18708       italian       italian
11518  cajun_creole  cajun_creole
3939        chinese        korean
27897   southern_us       british
13952       italian       italian
...             ...           ...
33786       chinese       chinese
33281    vietnamese       italian
31917       italian       italian
16167       italian         greek
13194       chinese       chinese

[100 rows x 2 columns]


In [95]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.5916404776869894


In [96]:
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.6803268384663733
