In [1]:
import pandas as pd
import numpy as np
import string, re
import swifter
import pickle 

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import bigrams 
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer

from nltk.corpus import stopwords
# import nltk
# nltk.download('stopwords')

import spacy
!python -m spacy download en_core_web_sm

nlp = spacy.load("en_core_web_sm")
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.1/12.8 MB 1.1 MB/s eta 0:00:12
     ---------------------------------------- 0.1/12.8 MB 1.1 MB/s eta 0:00:12
     ---------------------------------------- 0.1/12.8 MB 1.1 MB/s eta 0:00:12
     ---------------------------------------- 0.1/12.8 MB 1.1 MB/s eta 0:00:12
     ---------------------------------------- 0.1/12.8 MB 1.1 MB/s eta 0:00:12
      -------------------------------------- 0.3/12.8 MB 744.2 kB/s eta 0:00:17
      -------------------------------------- 0.3/12.8 MB 744.2 kB/s eta 0:00:17
     - ------------------------------------- 0.4/12.8

## Cleaning the ingredients

In [7]:
import pandas as pd
import re

# Define measurement words and their conversions
measure_words = {
    'bottle': 'bottle', 'bottles': 'bottles', 'box': 'box', 'boxes': 'boxes', 
    'bunch': 'bunch', 'bunches': 'bunches', 'bushel': 'bushel', 'bushels': 'bushels', 
    'can': 'can', 'cans': 'cans', 'container': 'container', 'cup': '236.59 ml', 
    'cups': '236.59 ml', 'carton': 'carton', 'cartons': 'cartons', 'dash': 'dash',
    'dashes': 'dashes', 'drop': 'drop', 'drops': 'drops', 'fl': 'fl', 'fl.': 'fl',
    'fluid': 'fluid', 'jar': 'jar', 'jars': 'jars', 'ounce': '28.35 g', 'ounces': '28.35 g', 
    'oz': '28.35 g', 'g': 'g', 'gallon': '3.78541 L', 'gallons': '3.78541 L', 'glass': 'glass',
    'glasses': 'glasses', 'gram': 'g', 'grams': 'g', 'kg': 'kg', 'kgs': 'kg', 'lb': '453.592 g',
    'lbs': '453.592 g', 'liter': 'L', 'liters': 'L', 'l': 'L', 'large': 'large', 'medium': 'medium',
    'ml': 'ml', 'mls': 'ml', 'package': 'package', 'pkg': 'package', 'small': 'small', 
    'pinch': 'pinch', 'pinches': 'pinches', 'pint': '473.176 ml', 'pints': '473.176 ml', 
    'pound': '453.592 g', 'pounds': '453.592 g', 'qt': '946.353 ml', 'qts': '946.353 ml',
    'quart': '946.353 ml', 'quarts': '946.353 ml', 'scoop': 'scoop', 'scoops': 'scoops',
    'stick': 'stick', 'sticks': 'sticks', 'tablespoon': '14.787 ml', 'tablespoons': '14.787 ml',
    'tbs': '14.787 ml', 'tbsp': '14.787 ml', 'tbsps': '14.787 ml', 'teaspoon': '4.929 ml',
    'teaspoons': '4.929 ml', 'tsp': '4.929 ml', 'tsps': '4.929 ml', 'whole': 'whole'
}

# Function to convert fractions to decimals
def fraction_to_decimal(match):
    frac = match.group()
    return str(eval(frac.replace(' ', '+').replace('/', '/')))

# Function to clean ingredient text
def clean_ingredients(text):
    ingredient_list = []
    for ingredient in text.split("#item,"):
        ingredient = ingredient.replace('#item', '')  # Remove artifact
        ingredient = re.sub(r'\([^)]*\)', '', ingredient)  # Remove anything inside parentheses
        ingredient = re.sub(r'(\d+\s+\d+/\d+)', lambda x: str(eval(x.group().replace(' ', '+').replace('/', '*0.01'))), ingredient)  # Handle mixed fractions
        ingredient = re.sub(r'(\d+/\d+)', fraction_to_decimal, ingredient)  # Handle fractions
        ingredient = re.sub(r'(\d+)', lambda x: str(float(x.group())), ingredient)  # Handle whole numbers
        ingredient = ingredient.split(',')[0]  # Remove anything after a comma
        ingredient = re.sub(r'\w*\d\w*', ' ', ingredient)  # Remove numeric characters within words
        ingredient = ingredient.replace('⁄', ' ')  # Replace fraction slashes with spaces
        ingredient = ' ' + ingredient + ' '  # Add padding for easier word removal

        for measure_word, conversion in measure_words.items():
            ingredient = re.sub(rf'\b{measure_word}\b', conversion, ingredient)  # Replace measure words

        ingredient = ingredient.strip()  # Remove leading/trailing whitespace
        if not ingredient.isupper():  # Ignore fully uppercase ingredients
            ingredient_list.append(ingredient)
    
    return ', '.join(ingredient_list)  # Join cleaned ingredients

# Load the dataset
df = pd.read_csv("data/df_recipes.csv")

# Preprocess the DataFrame
df = df.set_index('ID')
df = df.dropna(subset=['Title', 'NumReviews'])

# Apply the cleaning function to the 'IngredientsRaw' column
df['Ingredients'] = df['IngredientsRaw'].apply(clean_ingredients)

# Testing the cleaning function with specific entries
test_ingredients_1 = df.loc[11627].Ingredients
test_ingredients_2 = df.iloc[1135].Ingredients

# Splitting the cleaned ingredients into a list
ingredient_list_1 = test_ingredients_1.split(', ')
ingredient_list_2 = test_ingredients_2.split(', ')

# Display the cleaned ingredients as lists
print(ingredient_list_1)
print(ingredient_list_2)

# Test example with 'cheese' and 'cup'
df[df.IngredientsRaw.str.contains('cheese')].sample().IngredientsRaw.str.split("#item,").tolist()
df['IngredientsRaw'] = df['IngredientsRaw'].apply(lambda x: re.sub(r'\b1 cup\b', '236.59 ml', x))

df[df.IngredientsRaw.str.contains('cup')].sample().IngredientsRaw.str.split("#item,").tolist()


['. . .  236.59 ml mayonnaise']
['.']


[['1 (28-ounce) can crushed tomatoes, 1/2 large onion (roughly chopped), 2 cloves garlic (roughly chopped ), 1-3 medium jalapeno peppers (roughly chopped), 1/2 cup cilantro, 1/4 teaspoon ground cumin, 1 medium lime (juice and zest), 1/2 teaspoon kosher salt, 1/4 teaspoon ground black pepper']]

: 

In [42]:
df[df.IngredientsRaw.str.contains('cheese')].sample().IngredientsRaw.str.split("#item,").tolist()

[['Dry ingredients',
  ' 1 (1/4 ounce) package yeast',
  ' 3 1⁄3 cups flour',
  ' 1⁄4 teaspoon baking soda',
  ' 1 1⁄2 teaspoons salt',
  ' 1 unbeaten egg (at room temperature)',
  ' Non-dry ingredients',
  ' 1⁄4 cup water',
  ' 3⁄4 cup cottage cheese',
  ' 3⁄4 cup sour cream',
  ' 3 tablespoons sugar',
  ' 3 tablespoons minced dried onion',
  ' 2 tablespoons dill seeds',
  ' 1 1⁄2 tablespoons butter#item']]

In [11]:
df['IngredientsRaw'] = df['IngredientsRaw'].str.replace(r'\b1 cup\b', '236.59 ml')

In [17]:
df[df.IngredientsRaw.str.contains('cup')].sample().IngredientsRaw.str.split("#item,").tolist()

[['2 (3 ounce) packages orange Jell-O',
  ' 1 (3 ounce) can frozen orange juice',
  ' 1 cup mandarin orange section',
  ' 1 (8 ounce) container whipped topping',
  ' 2 1⁄2 cups boiling water',
  ' 1 (15 ounce) can crushed pineapple',
  ' 1 (3 ounce) package instant lemon pudding',
  ' 1 cup milk',
  ' chopped nuts (optional)#item']]

## Pre-process the ingredients


In [45]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Function to preprocess the ingredient text
def preprocess_ingredients(text):
    ingredients = []
    translator = str.maketrans('', '', string.punctuation)
    for ingredient in text.split(', '):
        ingredient = re.sub(r'\w*\d\w*', ' ', ingredient)  # Remove any words containing digits
        ingredient = ingredient.translate(translator)  # Remove punctuation
        ingredient = ingredient.replace('&', '').replace('-', '').replace('⁄', ' ')  # Remove special characters
        ingredient = ingredient.lower().strip()  # Lowercase and strip
        ingredients.append(ingredient)
    return ', '.join(ingredients)

# Function to singularize words
def singularize_word(word):
    if word.endswith('s') and len(word) > 3:  # Minimal 4 karakter untuk menghindari singularisasi yang tidak diinginkan
        return word[:-1]
    return word

# Function to singularize text
def singularize_text(text):
    return ', '.join(' '.join(singularize_word(word) for word in ingredient.split()) for ingredient in text.split(', '))

# Function to extract nouns
def extract_nouns(text):
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    return ', '.join(singularize_word(word) for word in tokens if is_noun(word))

# Function to check if a word is a noun
def is_noun(word):
    return nlp(word)[0].tag_ in {'NN', 'NNS', 'NNP', 'NNPS', 'NOUN', 'PROPN', 'NE', 'NNE', 'NR'}

In [46]:
teststr = df.iloc[1965].Ingredients
print("Original text:\n", teststr, '\n')
print("Nouns extracted:\n", extract_nouns(teststr), '\n')
print("Preprocessed text:\n", preprocess_ingredients(teststr), '\n')
print("Singularized text:\n", singularize_text(preprocess_ingredients(teststr)), '\n')
print("Nouns after preprocessing:\n", extract_nouns(preprocess_ingredients(teststr)))

Original text:
 fresh green beans, butter or   olive oil, onion, cloves garlic, fresh thyme  or   oregano, salt, cayenne pepper, hazelnuts 

Nouns extracted:
 bean, butter, olive, oil, onion, garlic, thyme, salt, cayenne, pepper, hazelnut 

Preprocessed text:
 fresh green beans, butter or   olive oil, onion, cloves garlic, fresh thyme  or   oregano, salt, cayenne pepper, hazelnuts 

Singularized text:
 fresh green bean, butter or olive oil, onion, clove garlic, fresh thyme or oregano, salt, cayenne pepper, hazelnut 

Nouns after preprocessing:
 bean, butter, olive, oil, onion, garlic, thyme, salt, cayenne, pepper, hazelnut


In [50]:
df['IngredientsProcessed'] = df['Ingredients'].apply(preprocess_ingredients)
df['IngredientsProcessedNouns'] = df['IngredientsProcessed'].apply(extract_nouns) 
df['IngredientsProcessed'] = df['IngredientsProcessed'].apply(singularize_text) 
df.head()

Unnamed: 0_level_0,Title,IngredientsRaw,TotalTime,NumSteps,AvgRating,NumReviews,RecipeURL,Ingredients,IngredientsProcessed,IngredientsProcessedNouns
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10000,Tomato Paste,"48 large tomatoes#item, 2 teaspoons salt#item",3hrs 15mins,8.0,3.0,2.0,https://www.food.com/recipe/10000,"tomatoes, salt","tomatoe, salt",salt
10003,Spicy Corn Salad With Avocado Dressing,"4 cups whole kernel corn (I use frozen)#item, ...",12mins,10.0,3.67,3.0,https://www.food.com/recipe/10003,"kernel corn, red bell pepper, green bell peppe...","kernel corn, red bell pepper, green bell peppe...","kernel, corn, red, bell, pepper, bell, pepper,..."
10004,Ginger Beer,"TO MAKE THE GINGER BEER#item, 1 1⁄2 teaspoons ...",312hrs 5mins,18.0,4.0,2.0,https://www.food.com/recipe/10004,"dried yeast, ground ginger, sugar, lukewarm wa...","dried yeast, ground ginger, sugar, lukewarm wa...","yeast, ground, ginger, sugar, water, water, su..."
10005,Beau Monde Dip,"1 (16 ounce) carton sour cream#item, 16 ounces...",5mins,2.0,5.0,3.0,https://www.food.com/recipe/10005,"sour cream, mayonnaise, dried onion flakes, di...","sour cream, mayonnaise, dried onion flake, dil...","cream, mayonnaise, onion, flake, dill, weed, b..."
10006,Strawberry and Greens Salad,"mixed salad green (of your choice)#item, 3 rad...",15mins,11.0,4.0,1.0,https://www.food.com/recipe/10006,"mixed salad green, radishes, strawberry, sprin...","mixed salad green, radishe, strawberry, spring...","spring, onion, red, pepper, coriander, vinaigr..."


In [51]:
df['IngredientsCombined'] = df.apply(lambda x: [x['IngredientsProcessed'], x['IngredientsProcessedNouns']],axis=1)

df.iloc[23].IngredientsCombined[0]

'coarsely chopped hulled strawberrie, sugar, cornstarch'

In [52]:

test = df.iloc[2488].IngredientsProcessed
tokens = RegexpTokenizer(r'\w+').tokenize(test)
print(test)
print(tokens)
print([token for token in tokens if is_noun(token)])

egg white, caster sugar, vanilla essence, raspberrie, seedles raspberry jam
['egg', 'white', 'caster', 'sugar', 'vanilla', 'essence', 'raspberrie', 'seedles', 'raspberry', 'jam']
['egg', 'white', 'caster', 'sugar', 'vanilla', 'essence', 'raspberrie', 'seedles', 'jam']


In [53]:
def commatokenizer(text):
    return text.split(', ')

def mytokenizer(combinedlist):
    ingredlist = combinedlist[0].split(', ')
    nounlist = combinedlist[1].split(', ')
    ingredlist = combinedlist[0].split(', ')
    bigramlist = []
    for ingred in ingredlist:
        bigrms = [bi for bi in bigrams(ingred.split())]
        for bi in bigrms:
            if (bi[0] in nounlist) or (bi[1] in nounlist):
                bigramlist.append(' '.join((bi[0], bi[1])))
   
    return ', '.join(bigramlist + nounlist)

teststr = df.iloc[4644].IngredientsCombined
print(teststr, '\n')
print(mytokenizer(teststr),'\n')
print(commatokenizer(mytokenizer(teststr)))

['prawn, chopped scallion, soy sauce, ginger juice, salt, rice wine, oil', 'prawn, scallion, soy, ginger, juice, salt, rice, wine, oil'] 

chopped scallion, soy sauce, ginger juice, rice wine, prawn, scallion, soy, ginger, juice, salt, rice, wine, oil 

['chopped scallion', 'soy sauce', 'ginger juice', 'rice wine', 'prawn', 'scallion', 'soy', 'ginger', 'juice', 'salt', 'rice', 'wine', 'oil']


In [54]:
df['IngredientsTokenized'] = df['IngredientsCombined'].apply(mytokenizer)
df.head()

Unnamed: 0_level_0,Title,IngredientsRaw,TotalTime,NumSteps,AvgRating,NumReviews,RecipeURL,Ingredients,IngredientsProcessed,IngredientsProcessedNouns,IngredientsCombined,IngredientsTokenized
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
10000,Tomato Paste,"48 large tomatoes#item, 2 teaspoons salt#item",3hrs 15mins,8.0,3.0,2.0,https://www.food.com/recipe/10000,"tomatoes, salt","tomatoe, salt",salt,"[tomatoe, salt, salt]",salt
10003,Spicy Corn Salad With Avocado Dressing,"4 cups whole kernel corn (I use frozen)#item, ...",12mins,10.0,3.67,3.0,https://www.food.com/recipe/10003,"kernel corn, red bell pepper, green bell peppe...","kernel corn, red bell pepper, green bell peppe...","kernel, corn, red, bell, pepper, bell, pepper,...","[kernel corn, red bell pepper, green bell pepp...","kernel corn, red bell, bell pepper, green bell..."
10004,Ginger Beer,"TO MAKE THE GINGER BEER#item, 1 1⁄2 teaspoons ...",312hrs 5mins,18.0,4.0,2.0,https://www.food.com/recipe/10004,"dried yeast, ground ginger, sugar, lukewarm wa...","dried yeast, ground ginger, sugar, lukewarm wa...","yeast, ground, ginger, sugar, water, water, su...","[dried yeast, ground ginger, sugar, lukewarm w...","dried yeast, ground ginger, lukewarm water, ye..."
10005,Beau Monde Dip,"1 (16 ounce) carton sour cream#item, 16 ounces...",5mins,2.0,5.0,3.0,https://www.food.com/recipe/10005,"sour cream, mayonnaise, dried onion flakes, di...","sour cream, mayonnaise, dried onion flake, dil...","cream, mayonnaise, onion, flake, dill, weed, b...","[sour cream, mayonnaise, dried onion flake, di...","sour cream, dried onion, onion flake, dill wee..."
10006,Strawberry and Greens Salad,"mixed salad green (of your choice)#item, 3 rad...",15mins,11.0,4.0,1.0,https://www.food.com/recipe/10006,"mixed salad green, radishes, strawberry, sprin...","mixed salad green, radishe, strawberry, spring...","spring, onion, red, pepper, coriander, vinaigr...","[mixed salad green, radishe, strawberry, sprin...","spring onion, red pepper, chopped coriander, s..."


In [55]:
df.iloc[4644].IngredientsTokenized

'chopped scallion, soy sauce, ginger juice, rice wine, prawn, scallion, soy, ginger, juice, salt, rice, wine, oil'

In [56]:

df0 = df.copy()    #  keep original just in case
df1 = df.copy()    #  create a smaller one for manipulation

In [57]:
print(df[(df.AvgRating >= 4) & (df.NumReviews >= 4)].AvgRating.count())

df1 = df[(df.AvgRating >= 4) & (df.NumReviews >= 4)]

print(df[df.Title.str.contains('[Hh]ummus')].Title.count())

6074
12


In [61]:
STOP_WORDS = ['ground', 'fresh ground', 'powder', 'cream of', 'boneless', 'skinless', 'skinless chicken', 'half'
             'butter or', 'juice', 'fresh lemon', 'rind', 'salt and', 'and pepper', 'boiling water', 'cold water',
             'sauce', 'soy', 'chip', 'soda', 'oil', 'or vegetable', 'oil or', 'vegetable', 'seed',
             'shredded cheddar', 'grated cheddar', 'jack', 'monterey', 'cheese or', 
            'half', 'of chicken', 'shredded mozzarella', 'cottage', 'confectioner', 'unsalted butter', 'of chicken',
            'condensed cream', 'package', 'package cream', 'whip', 'stalk celery', 'bay', 'leaf',
            'slice', 'slice white', 'slice bacon', 'slice bread', 'slice white', 'creamy', 'butter or', 'or butter',
            'salt', 'yeast or', 'pepper or', 'white', 'lowfat', 'skim', 'milk or', 'instant', 'whip', 'light', 
            'light corn', 'flake', 'worcestershire', 'dijon', 'cream or', 'salt pepper', 'wheat', 'squeezed lemon',
             ]

vectorizer = TfidfVectorizer(tokenizer = commatokenizer,
                  stop_words=STOP_WORDS,
                  min_df=7,max_df=.4,
                            )
docs = df1['IngredientsTokenized']
doc_word = vectorizer.fit_transform(docs)
    #returns sparsemat, can use pd.DataFrame(doc_term.toarray()) vectorizer.get_feature_names()
print(doc_word.shape)

feature_names = vectorizer.get_feature_names_out() 

nmf_model = NMF(20, random_state=10, max_iter=1000)
doc_topic = nmf_model.fit_transform(doc_word)
topic_word = nmf_model.components_

def display_topics(model, feature_names, num_top_words, topic_names=None):
    for idx, topic in enumerate(model.components_):
        if not topic_names or not topic_names[idx]:
            print("\nTopic ", idx)
        else:
            print("\nTopic: '",topic_names[idx],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-num_top_words - 1:-1]]))
        
    
display_topics(nmf_model, feature_names, 8)

(6074, 1336)

Topic  0
flour, egg, baking powder, allpurpose flour, sugar, cinnamon, buttermilk, vegetable oil

Topic  1
cheese, parmesan cheese, parmesan, grated parmesan, mozzarella cheese, mozzarella, spaghetti, pasta

Topic  2
cheddar, cheddar cheese, cheese, sharp cheddar, onion, bacon, egg, green onion

Topic  3
chicken breast, breast, chicken, boneles, skinles, skinles chicken, boneles skinles, breast halve

Topic  4
garlic, clove garlic, black pepper, onion, parsley, garlic powder, paprika, cayenne

Topic  5
brown sugar, brown, sugar, cinnamon, packed brown, apple, nutmeg, raisin

Topic  6
lemon, lemon juice, lemon rind, sugar, grated lemon, mayonnaise, blueberrie, parsley

Topic  7
vanilla, sugar, vanilla extract, cocoa, powdered sugar, egg, coconut, confectioner sugar

Topic  8
water, yeast, dry yeast, flour, sugar, cornstarch, bread flour, hot water

Topic  9
butter, peanut, peanut butter, corn, melted butter, corn syrup, bread, creamy peanut

Topic  10
cream, cream cheese, 

In [62]:
# df['IngredientsProcessed'] = df['Ingredients'].apply(preprocessor)
# df['IngredientsProcNouns'] = df['IngredientsProcessed'].apply(get_nouns)  # Takes 2 min per 1000
# df['IngredientsProcessed'] = df['IngredientsProcessed'].apply(text_singularizer)  # Takes 1 min per 1000
# df['IngredientsTokenized'] = df['IngredientsCombined'].apply(mytokenizer)

def user_tokenize(ingreds):
    ingreds = preprocess_ingredients(ingreds)
    nouns = extract_nouns(ingreds)
    ingreds = singularize_text(ingreds)
    ingredscombined = [ingreds, nouns]
    ingredstokenized = mytokenizer(ingredscombined)
    return ingredstokenized
    
test = df.iloc[4634].IngredientsRaw
print(test,'\n')
print(user_tokenize(clean_ingredients(test)),'\n')

def clean_raw(text):
    text = text.IngredientsRaw
    ingredlist = []
    for ingred in  text.split("#item,"):
        ingred = ingred.replace('#item','')         # Scraping artifact on last item
        ingredlist.append(ingred.strip())
        ingredlist = [ingred for ingred in ingredlist if not ingred.isupper()]  # Important for multi-part recipes
    return ingredlist   #example: 'cucumber, vinegar, salt, black pepper'


test = df.iloc[np.random.choice(range(1,13000))]
print(test.Title)
clean_raw(test)
test.IngredientsTokenized.split(', ')

2⁄3 cup water#item, 2 cups fresh cranberries or 2 cups frozen cranberries#item, 1 cup Equal sugar substitute (or 24 packets Equal sugar substitute or 7 1/4 teaspoons Equal sugar substitute)#item 

fresh cranberrie, cranberrie or, frozen cranberrie, equal sugar, sugar substitute, water, cranberrie, cranberrie, sugar 

Frozen Chocolate Bananas


['wooden skewer',
 'skewer or',
 'chocolate candy',
 'candy bar',
 'graham cracker',
 'banana',
 'skewer',
 'chocolate',
 'candy',
 'bar',
 'nut',
 'coconut',
 'cereal',
 'graham',
 'cracker']

In [72]:
# Define a function to tokenize user input
def user_tokenize(input_text):
    tokens = commatokenizer(input_text)
    return mytokenizer((input_text, extract_nouns(input_text)))

# Define user input
input_user = "Worcestershire sauce, pepper, horseradish, chili sauce"

# Tokenize user input
usertokens = user_tokenize(input_user)
print('User Input: ', input_user)
print('Tokens Generated: ', usertokens, '\n')

# Transform user input into vector
user_vec = vectorizer.transform([usertokens])

# Transform user vector into topic vector
topic_vec = nmf_model.transform(user_vec)

# Find similar recipes based on cosine similarity
labelindexs = pairwise_distances(topic_vec, doc_topic, metric='cosine').argsort().ravel()

# Print top 10 similar recipes
for index in labelindexs[0:10]:
    print(df1.iloc[index].Title.upper())
    print(df1.iloc[index].IngredientsRaw.split("#item,"), '\n')


User Input:  Worcestershire sauce, pepper, horseradish, chili sauce
Tokens Generated:  Worcestershire sauce, chili sauce, Worcestershire, pepper, horseradish, chili 

DRESSED THREE BEAN SALAD
['1 can cut green beans, drained', ' 1 can cut waxed yellow beans, drained', ' 1 can kidney bean, drained and washed', ' 1⁄2 bermuda onion, chopped', ' 1⁄2 large green pepper, chopped', ' DRESSING', ' 3⁄4 cup sugar', ' 1⁄2 cup oil', ' 1 teaspoon salt', ' 1⁄2 cup vinegar', ' 1⁄2 teaspoon pepper#item'] 

SUPER EASY COCKTAIL SAUCE
['1⁄2 cup ketchup', ' 1⁄2 cup chili sauce', ' 2 -3 teaspoons prepared horseradish', ' 1 teaspoon Worcestershire sauce#item'] 

GREEK HAMBURGERS
['2 lbs ground beef round', ' 1⁄2 cup onion, diced', ' 2 slices bread, crumbled', ' 2 small tomatoes, chopped or 1 cup canned tomato', ' 1 teaspoon cumin, ground', ' 1 teaspoon oregano', ' 2 eggs', ' 1⁄2 cup wine, or beef broth', ' 1⁄2 teaspoon salt', ' 1⁄2 teaspoon pepper#item'] 

CRISPY FRIED MASHED POTATOES
['leftover mashed pota