In [1]:
import pandas as pd
import numpy as np
import string, re
import swifter
import pickle 

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import bigrams 
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer

from nltk.corpus import stopwords
# import nltk
# nltk.download('stopwords')

import spacy
nlp = spacy.load("en_core_web_sm")
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv("C:/Users/arsen/Healthylicious/data/cleaned/csv/combined_dataset_cleaned.csv")

In [4]:
df['ID'] = range(1, len(df) + 1)

In [5]:
cols = ['ID'] + [col for col in df.columns if col != 'ID']
df = df[cols]

print(df.head())

   ID               Category                                      Title  \
0   1            Main Course                   Swedish Meatballs Recipe   
1   2  Appetizer,Main Course  Baked Crispy Buffalo Chicken Wings Recipe   
2   3              Appetizer                           Guacamole Recipe   
3   4              Appetizer                       Perfect Queso Recipe   
4   5              Appetizer                 Buffalo Chicken Dip Recipe   

   Total Time                                    All Ingredients  \
0          30  1 pound ground beef, 2 tablespoons diced onion...   
1          60  6 pounds chicken wings (trimmed), 3 tablespoon...   
2           5  2 avocados (halved, seeded, and peeled), 1 tab...   
3           5  1/2 cup pepper jack cheese, 1 cup grated white...   
4          25  2 cups chicken breasts (cooked and shredded), ...   

                                   Ingredient Groups  \
0  IngredientGroup(ingredients=['1 pound ground b...   
1  IngredientGroup(ingredien

In [6]:
df['Rating Counts'] = np.random.randint(1, 1001, size=len(df))

In [8]:
print(df.columns.tolist())

['ID', 'Category', 'Title', 'Total Time', 'All Ingredients', 'Ingredient Groups', 'Instructions', 'Nutrition', 'Cuisine', 'Yields', 'Image', 'Ratings', 'Description', 'Status', 'Rating Counts']


In [9]:
cols = df.columns.tolist()  # Dapatkan daftar kolom
cols.insert(12, cols.pop(cols.index('Rating Counts')))  # Pindahkan 'Rating Counts' ke posisi ke-13
df = df[cols] 
print(df.columns.tolist())

['ID', 'Category', 'Title', 'Total Time', 'All Ingredients', 'Ingredient Groups', 'Instructions', 'Nutrition', 'Cuisine', 'Yields', 'Image', 'Ratings', 'Rating Counts', 'Description', 'Status']


In [10]:
df = df.set_index('ID')
df = df.dropna(subset=['Title','Rating Counts'])   

In [27]:
measure_words = ['bottle', 'bottles', 'box', 'boxes', 'bunch','bunches', 'bushel','bushels', 
                 'can', 'cans', 'container', 'c', 'cup', 'cups', 'carton', 'cartons', 'chopped',
                 'dash','dashes', 'drop','drops','fl', 'fl.', 'fluid','jar', 'jars','ounce','ounces','oz',
                 'g', 'gallon','gallons', 'glass','glasses', 'gram','grams','kg','kgs', 'lb','lbs',
                 'liter','liters', 'l', 'large', 'medium', 'ml','mls', 'package','pkg','small', 'to taste',
                 'pinch','pinches', 'pint','pints', 'pound','pounds', 'qt', 'qts', 'quart','quarts',
                 'scoop','scoops', 'sliced','slivered','stick','sticks', 'tablespoon','tablespoons',
                 'tbs','tbsp','tbsps', 'teaspoon','teaspoons','tsp','tsps','whole']

In [82]:
def clean_once(text):
    ingredlist = []
    for ingred in text.split(", "):
        ingred = re.sub(r'\([^)]*\)', '', ingred)  # Remove anything inside parentheses
        ingred = re.sub(r'\d', '', ingred)         # Remove digits
        ingred = re.sub(r'\s+', ' ', ingred)       # Remove extra whitespace
        ingred = ingred.replace('/', ' ')          # Remove slashes
        ingred = ' ' + ingred + ' '                # Padding in case measure_word is first or last
        for measure_word in measure_words:         # Remove measure words
            ingred = ingred.replace(' ' + measure_word + ' ', ' ')
        ingred = ingred.strip().lower()            # Strip leading/trailing whitespace and convert to lowercase
        ingredlist.append(ingred)
    return ', '.join(ingredlist)

In [29]:
df['Ingredients'] = df['All Ingredients'].apply(clean_once)

In [30]:
df['Ingredients'][1]

['ground beef',
 'diced onion',
 'egg',
 'stone house seasoning',
 'ground allspice',
 'ground nutmeg',
 'bread or cracker crumbs',
 'fresh parsley',
 'olive oil',
 'butter',
 'all-purpose flour',
 'beef stock or broth',
 'milk or heavy cream',
 'stone house seasoning',
 'worcestershire sauce',
 'fresh parsley']

In [46]:
def preprocessor(text):
    ingredlist = []
    for ingred in text.split(', '):
        ingred = re.sub('\w*\d\w*', ' ', ingred)  # Remove any words containing digits 
        ingred = ingred.replace('"', '').replace("'", '').replace('& ', '').replace('-', '')   
        ingred = re.sub('[%s]' % re.escape(string.punctuation), ' ', ingred)  # Remove punctuation
        ingred = ingred.lower().strip()        
        ingredlist.append(ingred)        
    return ', '.join(ingredlist)

def word_singularizer(word):
    nlp_word = nlp(word)[0]
    lemma = nlp_word.text
    if nlp_word.tag_ in {"NNS", "NNPS"}:
        lemma = nlp_word.lemma_
    return lemma

def text_singularizer(text):
    ingredlist = []
    for ingred in text.split(', '):
        new_list = []
        for word in ingred.split():
            new_list.append(word_singularizer(word))
        ingred = ' '.join(new_list)
        ingredlist.append(ingred)        
    return ', '.join(ingredlist)

def get_nouns(text):
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    nounlist = [word_singularizer(word) for word in tokens if is_noun(word)]
    return ', '.join(nounlist) 

def is_noun(word):
    nouns = {'NN','NNS', 'NNP', 'NNPS','NOUN', 'PROPN', 'NE', 'NNE', 'NR'}
    pos = nlp(word)[0].tag_ 
    if pos in nouns:
        return True
    return False

In [47]:
df['Ingredients'] = df['Ingredients'].fillna('')

In [48]:
teststr = df.iloc[0]['Ingredients']
teststr = ', '.join(teststr)
teststr

'ground beef, diced onion, egg, stone house seasoning, ground allspice, ground nutmeg, bread or cracker crumbs, fresh parsley, olive oil, butter, all-purpose flour, beef stock or broth, milk or heavy cream, stone house seasoning, worcestershire sauce, fresh parsley'

In [49]:
print("Original Ingredients:\n", teststr, '\n')

Original Ingredients:
 ground beef, diced onion, egg, stone house seasoning, ground allspice, ground nutmeg, bread or cracker crumbs, fresh parsley, olive oil, butter, all-purpose flour, beef stock or broth, milk or heavy cream, stone house seasoning, worcestershire sauce, fresh parsley 



In [50]:
print("Nouns:\n", get_nouns(teststr), '\n')
print("Preprocessed:\n", preprocessor(teststr), '\n')
print("Singularized:\n", text_singularizer(preprocessor(teststr)), '\n')
print("Nouns after Singularizing:\n", get_nouns(text_singularizer(preprocessor(teststr))), '\n')

Nouns:
 ground, beef, onion, egg, stone, house, ground, allspice, ground, nutmeg, bread, cracker, crumb, parsley, olive, oil, butter, purpose, flour, beef, stock, broth, milk, cream, stone, house, worcestershire, parsley 

Preprocessed:
 ground beef, diced onion, egg, stone house seasoning, ground allspice, ground nutmeg, bread or cracker crumbs, fresh parsley, olive oil, butter, allpurpose flour, beef stock or broth, milk or heavy cream, stone house seasoning, worcestershire sauce, fresh parsley 

Singularized:
 ground beef, diced onion, egg, stone house seasoning, ground allspice, ground nutmeg, bread or cracker crumb, fresh parsley, olive oil, butter, allpurpose flour, beef stock or broth, milk or heavy cream, stone house seasoning, worcestershire sauce, fresh parsley 

Nouns after Singularizing:
 ground, beef, onion, egg, stone, house, ground, allspice, ground, nutmeg, bread, cracker, parsley, olive, oil, butter, flour, beef, stock, broth, milk, cream, stone, house, worcestershire,

In [52]:
df['Ingredients'] = df['Ingredients'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

In [53]:
df['IngredientsProcessed'] = df['Ingredients'].apply(preprocessor)
df['IngredientsProcNouns'] = df['IngredientsProcessed'].apply(get_nouns)  # Takes 2 min per 1000 rows
df['IngredientsProcessed'] = df['IngredientsProcessed'].apply(text_singularizer)  # Takes 1 min per 1000 rows
df.head()

Unnamed: 0_level_0,Category,Title,Total Time,All Ingredients,Ingredient Groups,Instructions,Nutrition,Cuisine,Yields,Image,Ratings,Rating Counts,Description,Status,Ingredients,IngredientsProcessed,IngredientsProcNouns
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,Main Course,Swedish Meatballs Recipe,30,"1 pound ground beef, 2 tablespoons diced onion...",IngredientGroup(ingredients=['1 pound ground b...,For the meatballs:\r\nMix together all ingredi...,"{'calories': '220 kcal', 'carbohydrateContent'...",American,6 servings,https://addapinch.com/wp-content/uploads/2022/...,4.95,879,The Best Swedish Meatballs feature tender meat...,Complete,"ground beef, diced onion, egg, stone house sea...","ground beef, diced onion, egg, stone house sea...","ground, beef, onion, egg, stone, house, ground..."
2,"Appetizer,Main Course",Baked Crispy Buffalo Chicken Wings Recipe,60,"6 pounds chicken wings (trimmed), 3 tablespoon...",IngredientGroup(ingredients=['6 pounds chicken...,Preheat the oven to 425º F. Line a baking shee...,"{'servingSize': '0.25 pound', 'calories': '419...",American,8 servings,https://addapinch.com/wp-content/uploads/2018/...,5.0,665,Crispy Baked Buffalo Chicken Wings that are so...,Complete,"chicken wings, cornstarch, stone house seasoni...","chicken wing, cornstarch, stone house seasonin...","chicken, wing, cornstarch, stone, house, recip..."
3,Appetizer,Guacamole Recipe,5,"2 avocados (halved, seeded, and peeled), 1 tab...",IngredientGroup(ingredients=['2 avocados (halv...,Mash avocado with a fork or potato masher in a...,"{'carbohydrateContent': '7 g', 'proteinContent...",Mexican,6 servings,https://addapinch.com/wp-content/uploads/2013/...,4.5,607,"This easy Guacamole recipe, made with fresh in...",Complete,"avocados (halved, seeded, and peeled), lime ju...","avocados halved, seeded, and peeled, lime juic...","lime, juice, sea, salt, garlic, onion, ground,..."
4,Appetizer,Perfect Queso Recipe,5,"1/2 cup pepper jack cheese, 1 cup grated white...",IngredientGroup(ingredients=['1/2 cup pepper j...,Stovetop Queso:\r\nAdd cheeses and half of the...,"{'calories': '534 kcal', 'carbohydrateContent'...",American,2 servings,https://addapinch.com/wp-content/uploads/2015/...,5.0,284,This easy and delicious Queso recipe is made w...,Complete,"pepper jack cheese, grated white american chee...","pepper jack cheese, grated white american chee...","pepper, jack, cheese, white, american, cheese,..."
5,Appetizer,Buffalo Chicken Dip Recipe,25,"2 cups chicken breasts (cooked and shredded), ...",IngredientGroup(ingredients=['2 cups chicken b...,Preheat oven to 350º F.\r\nMix together all in...,"{'calories': '106 kcal', 'proteinContent': '4 ...",American,12 servings,https://addapinch.com/wp-content/uploads/2024/...,5.0,117,Buffalo Chicken Dip Recipe is the best easy ap...,Complete,"chicken breasts, cream cheese, ranch dressing ...","chicken breast, cream cheese, ranch dressing o...","chicken, breast, cream, cheese, ranch, cheese,..."


In [54]:

df['IngredientsCombined'] = df.apply(lambda x: [x['IngredientsProcessed'], x['IngredientsProcNouns']],axis=1)

In [56]:
df.iloc[0].IngredientsCombined[0]

'ground beef, diced onion, egg, stone house seasoning, ground allspice, ground nutmeg, bread or cracker crumb, fresh parsley, olive oil, butter, allpurpose flour, beef stock or broth, milk or heavy cream, stone house seasoning, worcestershire sauce, fresh parsley'

In [57]:
test = df.iloc[0].IngredientsProcessed
tokens = RegexpTokenizer(r'\w+').tokenize(test)
print(test)
print(tokens)
print([token for token in tokens if is_noun(token)])

ground beef, diced onion, egg, stone house seasoning, ground allspice, ground nutmeg, bread or cracker crumb, fresh parsley, olive oil, butter, allpurpose flour, beef stock or broth, milk or heavy cream, stone house seasoning, worcestershire sauce, fresh parsley
['ground', 'beef', 'diced', 'onion', 'egg', 'stone', 'house', 'seasoning', 'ground', 'allspice', 'ground', 'nutmeg', 'bread', 'or', 'cracker', 'crumb', 'fresh', 'parsley', 'olive', 'oil', 'butter', 'allpurpose', 'flour', 'beef', 'stock', 'or', 'broth', 'milk', 'or', 'heavy', 'cream', 'stone', 'house', 'seasoning', 'worcestershire', 'sauce', 'fresh', 'parsley']
['ground', 'beef', 'onion', 'egg', 'stone', 'house', 'ground', 'allspice', 'ground', 'nutmeg', 'bread', 'cracker', 'parsley', 'olive', 'oil', 'butter', 'flour', 'beef', 'stock', 'broth', 'milk', 'cream', 'stone', 'house', 'worcestershire', 'parsley']


In [59]:
def commatokenizer(text):
    return text.split(', ')

def mytokenizer(combinedlist):
    ingredlist = combinedlist[0].split(', ')
    nounlist = combinedlist[1].split(', ')
    ingredlist = combinedlist[0].split(', ')
    bigramlist = []
    for ingred in ingredlist:
        bigrms = [bi for bi in bigrams(ingred.split())]
        for bi in bigrms:
            if (bi[0] in nounlist) or (bi[1] in nounlist):
                bigramlist.append(' '.join((bi[0], bi[1])))
    return ', '.join(bigramlist + nounlist)

In [60]:
teststr = df.iloc[0].IngredientsCombined
print(teststr, '\n')
print(mytokenizer(teststr),'\n')
print(commatokenizer(mytokenizer(teststr)))

['ground beef, diced onion, egg, stone house seasoning, ground allspice, ground nutmeg, bread or cracker crumb, fresh parsley, olive oil, butter, allpurpose flour, beef stock or broth, milk or heavy cream, stone house seasoning, worcestershire sauce, fresh parsley', 'ground, beef, onion, egg, stone, house, ground, allspice, ground, nutmeg, bread, cracker, crumb, parsley, olive, oil, butter, flour, beef, stock, broth, milk, cream, stone, house, worcestershire, parsley'] 

ground beef, diced onion, stone house, house seasoning, ground allspice, ground nutmeg, bread or, or cracker, cracker crumb, fresh parsley, olive oil, allpurpose flour, beef stock, stock or, or broth, milk or, heavy cream, stone house, house seasoning, worcestershire sauce, fresh parsley, ground, beef, onion, egg, stone, house, ground, allspice, ground, nutmeg, bread, cracker, crumb, parsley, olive, oil, butter, flour, beef, stock, broth, milk, cream, stone, house, worcestershire, parsley 

['ground beef', 'diced onion

In [61]:
df['IngredientsTokenized'] = df['IngredientsCombined'].apply(mytokenizer)
df.head()

Unnamed: 0_level_0,Category,Title,Total Time,All Ingredients,Ingredient Groups,Instructions,Nutrition,Cuisine,Yields,Image,Ratings,Rating Counts,Description,Status,Ingredients,IngredientsProcessed,IngredientsProcNouns,IngredientsCombined,IngredientsTokenized
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,Main Course,Swedish Meatballs Recipe,30,"1 pound ground beef, 2 tablespoons diced onion...",IngredientGroup(ingredients=['1 pound ground b...,For the meatballs:\r\nMix together all ingredi...,"{'calories': '220 kcal', 'carbohydrateContent'...",American,6 servings,https://addapinch.com/wp-content/uploads/2022/...,4.95,879,The Best Swedish Meatballs feature tender meat...,Complete,"ground beef, diced onion, egg, stone house sea...","ground beef, diced onion, egg, stone house sea...","ground, beef, onion, egg, stone, house, ground...","[ground beef, diced onion, egg, stone house se...","ground beef, diced onion, stone house, house s..."
2,"Appetizer,Main Course",Baked Crispy Buffalo Chicken Wings Recipe,60,"6 pounds chicken wings (trimmed), 3 tablespoon...",IngredientGroup(ingredients=['6 pounds chicken...,Preheat the oven to 425º F. Line a baking shee...,"{'servingSize': '0.25 pound', 'calories': '419...",American,8 servings,https://addapinch.com/wp-content/uploads/2018/...,5.0,665,Crispy Baked Buffalo Chicken Wings that are so...,Complete,"chicken wings, cornstarch, stone house seasoni...","chicken wing, cornstarch, stone house seasonin...","chicken, wing, cornstarch, stone, house, recip...","[chicken wing, cornstarch, stone house seasoni...","chicken wing, stone house, house seasoning, re..."
3,Appetizer,Guacamole Recipe,5,"2 avocados (halved, seeded, and peeled), 1 tab...",IngredientGroup(ingredients=['2 avocados (halv...,Mash avocado with a fork or potato masher in a...,"{'carbohydrateContent': '7 g', 'proteinContent...",Mexican,6 servings,https://addapinch.com/wp-content/uploads/2013/...,4.5,607,"This easy Guacamole recipe, made with fresh in...",Complete,"avocados (halved, seeded, and peeled), lime ju...","avocados halved, seeded, and peeled, lime juic...","lime, juice, sea, salt, garlic, onion, ground,...","[avocados halved, seeded, and peeled, lime jui...","lime juice, sea salt, clove garlic, ground cum..."
4,Appetizer,Perfect Queso Recipe,5,"1/2 cup pepper jack cheese, 1 cup grated white...",IngredientGroup(ingredients=['1/2 cup pepper j...,Stovetop Queso:\r\nAdd cheeses and half of the...,"{'calories': '534 kcal', 'carbohydrateContent'...",American,2 servings,https://addapinch.com/wp-content/uploads/2015/...,5.0,284,This easy and delicious Queso recipe is made w...,Complete,"pepper jack cheese, grated white american chee...","pepper jack cheese, grated white american chee...","pepper, jack, cheese, white, american, cheese,...","[pepper jack cheese, grated white american che...","pepper jack, jack cheese, grated white, white ..."
5,Appetizer,Buffalo Chicken Dip Recipe,25,"2 cups chicken breasts (cooked and shredded), ...",IngredientGroup(ingredients=['2 cups chicken b...,Preheat oven to 350º F.\r\nMix together all in...,"{'calories': '106 kcal', 'proteinContent': '4 ...",American,12 servings,https://addapinch.com/wp-content/uploads/2024/...,5.0,117,Buffalo Chicken Dip Recipe is the best easy ap...,Complete,"chicken breasts, cream cheese, ranch dressing ...","chicken breast, cream cheese, ranch dressing o...","chicken, breast, cream, cheese, ranch, cheese,...","[chicken breast, cream cheese, ranch dressing ...","chicken breast, cream cheese, ranch dressing, ..."


In [63]:
df.iloc[1].IngredientsTokenized

'chicken wing, stone house, house seasoning, recipe buffalo, blue cheese, cheese dressing, ranch dressing, chicken, wing, cornstarch, stone, house, recipe, cheese, ranch'

In [64]:
df0 = df.copy()
df1 = df.copy()

In [67]:
print(df[(df['Ratings'] >= 4) & (df['Rating Counts'] >= 4)]['Ratings'].count())


1289


In [69]:
df1 = df[(df['Ratings'] >= 4) & (df['Rating Counts'] >= 4)]

print(df[df.Title.str.contains('[Hh]ummus')].Title.count())

2


In [70]:
STOP_WORDS = ['ground', 'fresh ground', 'powder', 'cream of', 'boneless', 'skinless', 'skinless chicken', 'half'
             'butter or', 'juice', 'fresh lemon', 'rind', 'salt and', 'and pepper', 'boiling water', 'cold water',
             'sauce', 'soy', 'chip', 'soda', 'oil', 'or vegetable', 'oil or', 'vegetable', 'seed',
             'shredded cheddar', 'grated cheddar', 'jack', 'monterey', 'cheese or', 
            'half', 'of chicken', 'shredded mozzarella', 'cottage', 'confectioner', 'unsalted butter', 'of chicken',
            'condensed cream', 'package', 'package cream', 'whip', 'stalk celery', 'bay', 'leaf',
            'slice', 'slice white', 'slice bacon', 'slice bread', 'slice white', 'creamy', 'butter or', 'or butter',
            'salt', 'yeast or', 'pepper or', 'white', 'lowfat', 'skim', 'milk or', 'instant', 'whip', 'light', 
            'light corn', 'flake', 'worcestershire', 'dijon', 'cream or', 'salt pepper', 'wheat', 'squeezed lemon',
             ]

In [71]:
vectorizer = TfidfVectorizer(tokenizer = commatokenizer,
                  stop_words=STOP_WORDS,
                  min_df=7,max_df=.4,
                            )
docs = df1['IngredientsTokenized']
doc_word = vectorizer.fit_transform(docs)
    #returns sparsemat, can use pd.DataFrame(doc_term.toarray()) vectorizer.get_feature_names()
print(doc_word.shape)

nmf_model = NMF(20, random_state=10, max_iter=1000)
doc_topic = nmf_model.fit_transform(doc_word)
topic_word = nmf_model.components_



(1289, 415)


In [73]:
def display_topics(model, feature_names, num_top_words, topic_names=None):
    for idx, topic in enumerate(model.components_):
        if not topic_names or not topic_names[idx]:
            print("\nTopic ", idx)
        else:
            print("\nTopic: '",topic_names[idx],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-num_top_words - 1:-1]]))
        
    
display_topics(nmf_model, vectorizer.get_feature_names_out(), 8 )


Topic  0
vanilla, vanilla extract, sugar, milk, confectioner sugar, cream, cocoa, cocoa powder

Topic  1
black pepper, ground black, pepper, kosher, kosher salt, butter, mayonnaise, garlic

Topic  2
stone, house, stone house, house seasoning, worcestershire sauce, beef, fresh parsley, parsley

Topic  3
cheese, cheddar, cheddar cheese, cream, sour cream, cream cheese, bacon, tortilla

Topic  4
pepper, garlic, onion, cloves garlic, red, red pepper, clove garlic, pepper flake

Topic  5
chicken, breast, chicken breast, boneless skinless, bbq sauce, bbq, wing, chicken wing

Topic  6
chocolate, chocolate chip, peanut, semisweet, peanut butter, coconut, milk, coconut oil

Topic  7
parmesan, parmesan cheese, cheese, grated parmesan, parsley, spinach, garlic, pizza

Topic  8
green, salad green, cucumber, red, red onion, onion, grape, grape tomatoes

Topic  9
lemon, lemon juice, zest, lemon zest, mayonnaise, celery, orange, sugar

Topic  10
stock, broth, stock or, or broth, chicken stock, chick

In [74]:
def user_tokenize(ingreds):
    ingreds = preprocessor(ingreds)
    nouns = get_nouns(ingreds)
    ingreds = text_singularizer(ingreds)
    ingredscombined = [ingreds, nouns]
    ingredstokenized = mytokenizer(ingredscombined)
    return ingredstokenized

In [83]:
df['All Ingredients'] = df['All Ingredients'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
test = df.iloc[1]['Ingredients']
print(test,'\n')
print(user_tokenize(clean_once(test)),'\n')

6 pounds chicken wings (trimmed), 3 tablespoons cornstarch, 1 teaspoon Stone House Seasoning, 1 recipe Buffalo Sauce, Blue Cheese Dressing (optional), Ranch Dressing (optional) 

chicken wing, stone house, house seasoning, recipe buffalo, blue cheese, cheese dressing, ranch dressing, chicken, wing, cornstarch, stone, house, recipe, cheese, ranch 



In [86]:
def clean_raw(text):
    text = text["All Ingredients"]
    ingredlist = []
    for ingred in  text.split("#item,"):
        ingred = ingred.replace('#item','')         # Scraping artifact on last item
        ingredlist.append(ingred.strip())
        ingredlist = [ingred for ingred in ingredlist if not ingred.isupper()]  # Important for multi-part recipes
    return ingredlist

In [87]:
test = df.iloc[np.random.choice(range(1,1295))]
print(test.Title)
clean_raw(test)
test.IngredientsTokenized.split(', ')

Pulled Pork Enchilada Salad Recipe


['pulled pork',
 'enchilada sauce',
 'brown sugar',
 'sriracha sauce',
 'lettuce green',
 'black bean',
 'red onion',
 'canola oil',
 'mozzarella cheese',
 'pork',
 'enchilada',
 'coke',
 'brown',
 'sugar',
 'sriracha',
 'green',
 'cucumber',
 'bean',
 'red',
 'onion',
 'canola',
 'oil',
 'tortilla',
 'mozzarella',
 'cheese']

In [105]:
###
useringreds = "butter, ground beef, kosher salt"
###

usertokens = user_tokenize(useringreds)
print('User Input: ', useringreds)
print('Tokens Generated: ', usertokens, '\n')

user_vec = vectorizer.transform([usertokens])
topic_vec = nmf_model.transform(user_vec)
indices = pairwise_distances(topic_vec,doc_topic,metric='cosine').argsort().ravel()
for index in indices[0:5]:
    print(df1.iloc[index].Title.upper())
    print(df1.iloc[index]["All Ingredients"], '\n')

User Input:  butter, ground beef, kosher salt
Tokens Generated:  ground beef, kosher salt, butter, ground, beef, kosher, salt 

SOUTHERN COUNTRY FRIED STEAK
4 cube steaks, 1 cup all-purpose flour, 1 teaspoon Stone House Seasoning, 1 cup buttermilk, 1 teaspoon hot sauce (optional), 1 large egg, 1 cup vegetable oil, 1/2 cup all-purpose flour, 3 1/2 - 4 cups whole milk, 1/2 teaspoon kosher salt, 1/2 teaspoon ground black pepper 

BEEF TENDERLOIN SLIDERS RECIPE
2 pounds beef tenderloin (cooked medium-rare, sliced into 1/2-inch slices), 12 slider buns (split), 4 tablespoons butter, 1 cup horseradish sauce 

SIMPLE CHICKEN NUGGETS RECIPE
1 1/2 - 2 pounds chicken tenderloins (about 12), 1/4 cup all-purpose flour, 1 teaspoon fresh thyme, 2 teaspoons paprika, 1 teaspoon kosher salt, 1 tablespoon butter (melted) 

SOUTHERN HOECAKES RECIPE
1 cup flour (self-rising), 1 cup buttermilk cornmeal (self-rising), 1 cup fresh corn (cut off the cob), pinch kosher salt, pinch ground black pepper, 1 tablesp

In [118]:
df["IngredientsCombined"][5]

['chicken breast, cream cheese, ranch dressing or blue cheese dressing, homemade or storebought, buffalo sauce recipe, grated cheddar cheese, stone house seasoning, bleu cheese crumble',
 'chicken, breast, cream, cheese, ranch, cheese, homemade, recipe, cheddar, cheese, stone, house, bleu, cheese, crumble']

In [119]:
data = []

# Initialize a counter for the unique ID
unique_id = 1

# Iterate through all rows in the 'IngredientsTokenized' column
for index, row in df.iterrows():
    combined_ingredients = row["IngredientsCombined"]
    for ingredient_list in combined_ingredients:
        ingredients = ingredient_list.split(', ')
        for ingredient in ingredients:
            data.append({'ID': unique_id, 'Ingredient': ingredient})
            unique_id += 1

# Create a new DataFrame from the list of ingredients and their IDs
combined_ingredients_df = pd.DataFrame(data)

In [120]:
combined_ingredients_df

Unnamed: 0,ID,Ingredient
0,1,ground beef
1,2,diced onion
2,3,egg
3,4,stone house seasoning
4,5,ground allspice
...,...,...
26817,26818,sugar
26818,26819,karo
26819,26820,kosher
26820,26821,salt


In [121]:
combined_ingredients_df.to_csv('combined_ingredients.csv', index=False)