In [17]:
import pandas as pd
import numpy as np
import pickle 
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import bigrams 
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

import spacy
nlp = spacy.load("en_core_web_sm")
from sklearn.decomposition import NMF
from sklearn.metrics import pairwise_distances

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arsen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Load the recipe table; later cells read its "Title" and "All Ingredients" columns.
recipes_csv_path = "C:/Users/arsen/Healthylicious/data/cleaned/csv/recipes_cleaned_with_ids.csv"
df = pd.read_csv(recipes_csv_path)

In [19]:
# Load just_ingredients.csv; later cells read its 'IngredientsRemovedAdj' column.
# The path uses forward slashes throughout: the previous literal contained a
# backslash before "Healthylicious", which Python parses as an invalid escape
# sequence (a SyntaxWarning on recent versions).
df1 = pd.read_csv("C:/Users/arsen/Healthylicious/data/cleaned/just ingredients/just_ingredients.csv")

In [20]:
def commatokenizer(text):
    """Split ``text`` on the exact separator ', ' and return the pieces as a list."""
    separator = ', '
    pieces = text.split(separator)
    return pieces

def get_nouns(text):
    """Return the noun-tagged words of ``text`` as a ', '-joined string.

    ``text`` is split into ``\\w+`` word tokens. Each word is passed through
    the spaCy pipeline ``nlp`` on its own, and the word is kept when the
    fine-grained tag (``tag_``) of the first resulting spaCy token is in the
    accepted tag set. Kept words stay in their original order.
    """
    noun_tags = {'NN', 'NNS', 'NNP', 'NNPS', 'NOUN', 'PROPN', 'NE', 'NNE', 'NR'}
    word_tokenizer = RegexpTokenizer(r'\w+')
    kept_words = []
    for word in word_tokenizer.tokenize(text):
        # Each word is tagged in isolation, without sentence context.
        first_tag = nlp(word)[0].tag_
        if first_tag in noun_tags:
            kept_words.append(word)
    return ', '.join(kept_words)

def mytokenizer(combinedlist):
    """Build the token string for one ingredient list: noun bigrams, then nouns.

    Parameters
    ----------
    combinedlist : sequence of two str
        ``combinedlist[0]`` is the ', '-separated ingredient string and
        ``combinedlist[1]`` is the ', '-separated noun string.

    Returns
    -------
    str
        ', '-joined tokens: first every pair of adjacent whitespace-separated
        words inside an ingredient where at least one of the two words is a
        noun, then the nouns themselves in their original order.
    """
    ingredlist = combinedlist[0].split(', ')
    nounlist = combinedlist[1].split(', ')
    # Set lookup keeps the membership test constant-time inside the nested loop.
    noun_lookup = set(nounlist)
    bigramlist = []
    for ingred in ingredlist:
        words = ingred.split()
        # zip(words, words[1:]) yields each pair of adjacent words.
        for first, second in zip(words, words[1:]):
            if first in noun_lookup or second in noun_lookup:
                bigramlist.append(' '.join((first, second)))
    return ', '.join(bigramlist + nounlist)

In [21]:
def process_ingredients(row):
    """Tokenize one ingredient string.

    Extracts the nouns of ``row`` with ``get_nouns`` and passes the pair
    ``[row, nouns]`` to ``mytokenizer``, returning its ', '-joined result.
    """
    return mytokenizer([row, get_nouns(row)])

In [22]:
# Tokenize every entry of 'IngredientsRemovedAdj' and store the result as a new column.
tokenized_ingredients = df1['IngredientsRemovedAdj'].apply(process_ingredients)
df1['TokenizedIngredients'] = tokenized_ingredients

In [23]:
def user_tokenize(ingreds):
    """Tokenize a user-supplied ingredient string.

    Delegates to ``process_ingredients`` so that queries are tokenized by
    exactly the same steps as the recipe rows (noun extraction followed by
    ``mytokenizer``), rather than by a duplicated copy of that logic.

    Parameters
    ----------
    ingreds : str
        ', '-separated ingredient string entered by the user.

    Returns
    -------
    str
        ', '-joined tokens, in the format consumed by ``commatokenizer``.
    """
    return process_ingredients(ingreds)

In [24]:
# Build the TF-IDF document-term matrix from the tokenized ingredient strings.
# token_pattern=None: the regex pattern is unused when a custom tokenizer is
# supplied, and leaving it at its default makes scikit-learn emit a
# "token_pattern will not be used" warning.
# min_df=7 drops terms that occur in fewer than 7 documents; max_df=0.4 drops
# terms that occur in more than 40% of the documents.
vectorizer = TfidfVectorizer(
    tokenizer=commatokenizer,
    token_pattern=None,
    stop_words='english',
    min_df=7,
    max_df=0.4,
)
docs = df1['TokenizedIngredients']
doc_word = vectorizer.fit_transform(docs)

print(doc_word.shape)

(1097, 249)




In [25]:
# Baseline factorization with 20 topics. The model-selection cell that follows
# reassigns nmf_model, doc_topic and topic_word.
nmf_model = NMF(n_components=20, random_state=10, max_iter=1000)
doc_topic = nmf_model.fit_transform(doc_word)
topic_word = nmf_model.components_

In [26]:
# Experiment with different numbers of topics and keep the fit with the
# lowest reconstruction error. (NMF is imported in the first cell.)
# NOTE(review): reconstruction error generally decreases as the number of
# components grows, so this criterion will tend to pick the largest candidate
# -- confirm that this is the intended selection rule.
n_topics = [15, 20, 25, 30]
best_nmf_model = None
best_doc_topic = None
best_score = float('inf')

for n in n_topics:
    candidate_model = NMF(n, random_state=10, max_iter=2000)
    candidate_doc_topic = candidate_model.fit_transform(doc_word)
    score = candidate_model.reconstruction_err_

    if score < best_score:
        best_score = score
        best_nmf_model = candidate_model
        best_doc_topic = candidate_doc_topic

# Reuse the winning fit's outputs instead of fitting the same model a second time.
nmf_model = best_nmf_model
doc_topic = best_doc_topic
topic_word = nmf_model.components_

def display_topics(model, feature_names, num_top_words, topic_names=None):
    """Print a heading and the highest-weighted feature names for every topic.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterated row by row; each row holds one topic's per-feature weights.
    feature_names : indexable
        Maps a feature index to its name.
    num_top_words : int
        How many of the highest-weighted features to list per topic.
    topic_names : indexable, optional
        Per-topic labels. A topic without a truthy label is headed by its index.
    """
    for idx, weights in enumerate(model.components_):
        label = topic_names[idx] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", idx)
        # Indices ordered from highest to lowest weight, cut to the requested count.
        ranked = weights.argsort()[::-1][:num_top_words]
        print(", ".join([feature_names[i] for i in ranked]))

# Show the eight highest-weighted terms of every topic in the selected model.
display_topics(
    model=nmf_model,
    feature_names=vectorizer.get_feature_names_out(),
    num_top_words=8,
)



Topic  0
brown, brown sugar, sugar, cinnamon, oat, butter, pecan, vanilla

Topic  1
pepper, black pepper, garlic, butter, cayenne pepper, cayenne, onion, parsley

Topic  2
stone, stone house, house, house seasoning, parsley, ranch, bread, chili powder

Topic  3
cheddar, cheddar cheese, cheese, bacon, mayonnaise, ham, green onion, milk

Topic  4
olive, olive oil, oil, zucchini, pork, shrimp, garlic, pesto

Topic  5
powder, baking powder, flour, sugar, butter, vanilla, vanilla extract, cinnamon

Topic  6
vanilla, vanilla extract, milk, sugar, cocoa, butter, espresso, condensed milk

Topic  7
chicken, stock, chicken stock, broth, soup, thyme, celery, mushroom

Topic  8
water, yeast, corn, ice, sugar, corn syrup, rice, vegetable

Topic  9
peanut, peanut butter, butter, oat, chocolate chip, honey, chip, vanilla ice

Topic  10
self, self rising, rising flour, flour, milk, butter, cornmeal, yeast

Topic  11
cream, cream cheese, cheese, sour cream, whipping cream, ice cream, ice, vanilla ice


In [27]:

# Multipliers applied to a key ingredient's TF-IDF weight in the user query vector.
key_ingredients_weights = dict([
    ('beef', 10),
    ('chicken', 10),
    ('shrimp', 10),
    ('crab', 10),
    ('venison', 10),
    # Add more key ingredients and their specific weights
])


In [28]:
useringreds = "egg"
usertokens = user_tokenize(useringreds)
print('User Input: ', useringreds)
print('Tokens Generated: ', usertokens, '\n')

# Vectorize user input
user_vec = vectorizer.transform([usertokens])

# Adjust weights for key ingredients.
# Tokens are compared exactly (after lower-casing) rather than by substring
# search in the joined token string, and the column index comes from the
# fitted vocabulary, so a key ingredient that is not in the vocabulary is
# skipped instead of raising ValueError.
feature_names = vectorizer.get_feature_names_out()
user_token_set = {token.lower() for token in usertokens.split(', ')}
for ingredient, weight in key_ingredients_weights.items():
    index = vectorizer.vocabulary_.get(ingredient)
    if ingredient in user_token_set and index is not None:
        user_vec[0, index] *= weight

# Transform user vector into topic space
topic_vec = nmf_model.transform(user_vec)

# Compute similarity and get recommendations: recipe positions ordered by
# ascending cosine distance to the query in topic space.
indices = pairwise_distances(topic_vec, doc_topic, metric='cosine').argsort().ravel()

# NOTE(review): positions index rows of doc_topic (built from df1) but are
# looked up in df -- assumes both frames share the same row order; confirm.
for index in indices[0:5]:
    recipe = df.iloc[index]
    print(recipe.Title.upper())
    print(recipe["All Ingredients"], '\n')

User Input:  egg
Tokens Generated:  egg 

OVEN AND AIR FRYER HARD COOKED EGGS
1 egg 

MERINGUE RECIPE
4 large egg whites, 1/4 teaspoon cream of tartar, 1/4 cup sugar 

ULTIMATE KALE SALAD RECIPE
4 cups kale greens (trimmed, chopped and massaged), 2 cups spring lettuce mix, 8 roasted radishes, 2 cucumber (peeled and chopped), 8 okra pods (halved lengthwise), 4 boiled eggs (halved), 2 avocado (halved), salt and pepper (to taste), dressing of choice 

SALMON EGGS BENEDICT
8 - 9 ounces spinach (steamed), salmon steaks (broiled), 4 eggs (poached), hollandaise sauce, salt and pepper 

CLASSIC EGG SALAD RECIPE
6 large hard cooked eggs, Â¼ cup mayonnaise, salt and pepper (to taste) 



In [46]:
# Save the fitted artifacts as pickle files, one file per object:
# the NMF model, the vectorizer, the doc_topic matrix and the recipes data frame.
# Note: pickle stores functions by reference, so loading vectorizer.pkl requires
# the vectorizer's tokenizer function (commatokenizer) to be importable under
# the same name in the loading process.
artifacts = {
    'nmf_model.pkl': nmf_model,
    'vectorizer.pkl': vectorizer,
    'doc_topic.pkl': doc_topic,
    'df.pkl': df,
}

for filename, artifact in artifacts.items():
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)


In [51]:
# Export the fitted vectorizer's state to JSON (vectorizer.json)


# Extract the relevant attributes.
# Vocabulary indices are coerced to built-in int because they may be NumPy
# integers, which json.dump cannot serialize.
vectorizer_data = {
    'vocabulary_': {term: int(index) for term, index in vectorizer.vocabulary_.items()},
    'idf_': vectorizer.idf_.tolist(),
    'tokenizer': 'commatokenizer',  # Store the tokenizer's name as a reference
}

# Save to a JSON file. ensure_ascii=False writes non-ASCII characters as-is,
# so the file is opened explicitly as UTF-8 rather than with the platform's
# default encoding.
with open('vectorizer.json', 'w', encoding='utf-8') as outfile:
    json.dump(vectorizer_data, outfile, ensure_ascii=False, indent=4)