In [1]:
import pandas as pd
import numpy as np
import string, re
import swifter

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import bigrams 
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer


from nltk.corpus import stopwords
nltk.download('stopwords')

import spacy
nlp = spacy.load("en_core_web_sm")
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

## Step 0:  Load Recipe Corpus and Clean It

* Original recipe ingredient list is stored in df['IngredientsRaw'] (only need to do this once)
* This cleaning is corpus-specific (we will not do it to the user-inputted ingredients)
* Store the cleaned ingredient list in df['Ingredients'] (only need to do this once)

In [20]:
df = pd.read_csv("data/df26285str.csv")
df = df.set_index('ID')
df = df.dropna(subset=['Title','NumReviews'])    #1292

In [261]:
measure_words = ['bottle', 'bottles', 'box', 'boxes', 'bunch','bunches', 'bushel','bushels', 
                 'can', 'cans', 'container', 'c', 'cup', 'cups', 'carton', 'cartons'
                 'dash','dashes', 'drop','drops','fl', 'fl.', 'fluid','jar', 'jars','ounce','ounces','oz',
                 'g', 'gallon','gallons', 'glass','glasses', 'gram','grams','kg','kgs', 'lb','lbs',
                 'liter','liters', 'l', 'large', 'medium', 'ml','mls', 'package','pkg','small', 'to taste',
                 'pinch','pinches', 'pint','pints', 'pound','pounds', 'qt', 'qts', 'quart','quarts',
                 'scoop','scoops', 'sliced','slivered','stick','sticks', 'tablespoon','tablespoons',
                 'tbs','tbsp','tbsps', 'teaspoon','teaspoons','tsp','tsps','whole']

def clean_once(text):
    ingredlist = []
    for ingred in  text.split("#item,"):
        ingred = ingred.replace('#item','')         # Scraping artifact on last item
        ingred = re.sub(r'\([^)]*\)', '', ingred)   # Remove anything inside parentheses
        ingred = ingred.split(',')[0]               # Remove anything after a comma
        ingred = re.sub('\w*\d\w*', ' ', ingred)    # Remove numeric characters
        ingred = ingred.replace('⁄',' ')            # Fraction slashes are annoying
        ingred = ' ' + ingred + ' '                 # Padding in case measure_word is first or last
        for measure_word in measure_words:          # Remove measure words by themselves
            ingred = ingred.replace(' ' + measure_word + ' ', ' ')
        ingredlist.append(ingred.strip())
        ingredlist = [ingred for ingred in ingredlist if not ingred.isupper()]  # Important for multi-part recipes
    return ', '.join(ingredlist)   #example: 'cucumber, vinegar, salt, black pepper'

teststr = df.loc[10627].Ingredients
teststr = df.iloc[9135].Ingredients
teststr.split(', ')
newtest = [item for item in teststr.split(', ')]
newtest

['graham cracker crumbs',
 'melted butter',
 'sugar',
 'packages cream cheese',
 'sugar',
 'eggs',
 'egg yolks',
 'sour cream',
 'vanilla',
 'melted butter',
 'cornstarch',
 'almond extract',
 'heavy cream']

In [238]:
df['Ingredients'] = df['IngredientsRaw'].apply(clean_once)

In [371]:
# df[df.IngredientsRaw.str.contains('zucchini')].sample().IngredientsRaw.str.split("#item,").tolist()
# df[df.IngredientsRaw.str.contains('zucchini')].Title.count()


110

## Step 1:  Pre-process the ingredients

* Store the pre-processed recipe ingredients as a string in df['IngredientsProcessed']
* Store the pre-processed recipe ingredient nouns as a list in df['IngredientsProcNouns']

In [578]:
def preprocessor(text):
    ingredlist = []
    for ingred in  text.split(', '):
        ingred = re.sub('\w*\d\w*', ' ', ingred)  # Remove any words containing digits 
        ingred = ingred.replace('"', '').replace("'", '').replace('& ', '').replace('-','')   
        ingred = re.sub('[%s]' % re.escape(string.punctuation), ' ', ingred)  # Remove punctuation
        ingred = ingred.lower().strip()        
#         new_list = []
#         for word in ingred.split():
#             new_list.append(singularizer(word))
#         ingred = ' '.join(new_list)
        ingredlist.append(ingred)        
    return ', '.join(ingredlist) 

def word_singularizer(word):
    nlp_word = nlp(word)[0]
    lemma = nlp_word.text
    if nlp_word.tag_ in {"NNS", "NNPS"}:
            lemma = nlp_word.lemma_
    return lemma

def text_singularizer(text):
    ingredlist = []
    for ingred in  text.split(', '):
        new_list = []
        for word in ingred.split():
            new_list.append(word_singularizer(word))
        ingred = ' '.join(new_list)
        ingredlist.append(ingred)        
    return ', '.join(ingredlist) 

def get_nouns(text):
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    nounlist = [word_singularizer(word) for word in tokens if is_noun(word)]
    return ', '.join(nounlist) 

def is_noun(word):
    nouns = {'NN','NNS', 'NNP', 'NNPS','NOUN', 'PROPN', 'NE', 'NNE', 'NR'}
    pos = nlp(word)[0].tag_ 
    if pos in nouns:
        return True
    return False

    
teststr = df.iloc[1049].Ingredients
# teststr = 'steak'
print(teststr,'\n')
print(get_nouns(teststr),'\n')
print(preprocessor(teststr), '\n')
print(text_singularizer(preprocessor(teststr)))
print(get_nouns(teststr))


cooking pears or   dessert pears, sugar, grated orange rind, currants, orange juice, butter, brandy, smooth apricot jam, milk, egg yolks, sugar, vanilla pod, finely grated orange, whipped cream, orange liqueur 

pear, dessert, pear, sugar, orange, rind, currant, orange, juice, butter, brandy, apricot, jam, milk, egg, yolk, sugar, vanilla, pod, orange, cream, orange, liqueur 

cooking pears or   dessert pears, sugar, grated orange rind, currants, orange juice, butter, brandy, smooth apricot jam, milk, egg yolks, sugar, vanilla pod, finely grated orange, whipped cream, orange liqueur 

cooking pear or dessert pear, sugar, grated orange rind, currant, orange juice, butter, brandy, smooth apricot jam, milk, egg yolk, sugar, vanilla pod, finely grated orange, whipped cream, orange liqueur
pear, dessert, pear, sugar, orange, rind, currant, orange, juice, butter, brandy, apricot, jam, milk, egg, yolk, sugar, vanilla, pod, orange, cream, orange, liqueur


In [536]:
# Order matters for these steps!  We want to get the nouns before singularizing them, because sometimes
# Sometimes spacy does not register the singular of a word as a noun (e.g. "raspberry" vs "raspberries")

df['IngredientsProcessed'] = df['Ingredients'].apply(preprocessor)
df['IngredientsProcNouns'] = df['IngredientsProcessed'].apply(get_nouns)  # Takes 2 min per 1000 rows
df['IngredientsProcessed'] = df['IngredientsProcessed'].apply(text_singularizer)  # Takes 1 min per 1000 rows
df.head()

CPU times: user 14min 18s, sys: 4.35 s, total: 14min 23s
Wall time: 14min 29s


Unnamed: 0_level_0,Title,IngredientsRaw,TotalTime,NumSteps,AvgRating,NumReviews,RecipeURL,Ingredients,IngredientsProcessed,IngredientsTokenized,IngredientsProcNouns
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10000,Tomato Paste,"48 large tomatoes#item, 2 teaspoons salt#item",3hrs 15mins,8.0,3.0,2.0,https://www.food.com/recipe/10000,"tomatoes, salt","tomato, salt","tomatoes, salt","[tomato, salt]"
10003,Spicy Corn Salad With Avocado Dressing,"4 cups whole kernel corn (I use frozen)#item, ...",12mins,10.0,3.67,3.0,https://www.food.com/recipe/10003,"kernel corn, red bell pepper, green bell peppe...","kernel corn, red bell pepper, green bell peppe...","kernel, corn, red, bell, pepper, green, bell, ...","[kernel, corn, bell, pepper, bell, pepper, spr..."
10004,Ginger Beer,"TO MAKE THE GINGER BEER#item, 1 1⁄2 teaspoons ...",312hrs 5mins,18.0,4.0,2.0,https://www.food.com/recipe/10004,"dried yeast, ground ginger, sugar, lukewarm wa...","dried yeast, ground ginger, sugar, lukewarm wa...","dried, yeast, ground, ginger, sugar, lukewarm,...","[yeast, ground, ginger, sugar, lukewarm, water..."
10005,Beau Monde Dip,"1 (16 ounce) carton sour cream#item, 16 ounces...",5mins,2.0,5.0,3.0,https://www.food.com/recipe/10005,"carton sour cream, mayonnaise, dried onion fla...","carton sour cream, mayonnaise, dried onion fla...","carton, sour, cream, mayonnaise, dried, onion,...","[carton, cream, mayonnaise, onion, flake, weed..."
10006,Strawberry and Greens Salad,"mixed salad green (of your choice)#item, 3 rad...",15mins,11.0,4.0,1.0,https://www.food.com/recipe/10006,"mixed salad green, radishes, strawberry, sprin...","mixed salad green, radishes, strawberry, sprin...","mixed, salad, green, radishes, strawberry, spr...","[salad, strawberry, spring, onion, pepper, vin..."


In [588]:
df.head()

Unnamed: 0_level_0,Title,IngredientsRaw,TotalTime,NumSteps,AvgRating,NumReviews,RecipeURL,Ingredients,IngredientsProcessed,IngredientsTokenized,IngredientsProcNouns
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10000,Tomato Paste,"48 large tomatoes#item, 2 teaspoons salt#item",3hrs 15mins,8.0,3.0,2.0,https://www.food.com/recipe/10000,"tomatoes, salt","tomato, salt","tomatoes, salt","tomato, salt"
10003,Spicy Corn Salad With Avocado Dressing,"4 cups whole kernel corn (I use frozen)#item, ...",12mins,10.0,3.67,3.0,https://www.food.com/recipe/10003,"kernel corn, red bell pepper, green bell peppe...","kernel corn, red bell pepper, green bell peppe...","kernel, corn, red, bell, pepper, green, bell, ...","kernel, corn, bell, pepper, bell, pepper, spri..."
10004,Ginger Beer,"TO MAKE THE GINGER BEER#item, 1 1⁄2 teaspoons ...",312hrs 5mins,18.0,4.0,2.0,https://www.food.com/recipe/10004,"dried yeast, ground ginger, sugar, lukewarm wa...","dried yeast, ground ginger, sugar, lukewarm wa...","dried, yeast, ground, ginger, sugar, lukewarm,...","yeast, ground, ginger, sugar, lukewarm, water,..."
10005,Beau Monde Dip,"1 (16 ounce) carton sour cream#item, 16 ounces...",5mins,2.0,5.0,3.0,https://www.food.com/recipe/10005,"carton sour cream, mayonnaise, dried onion fla...","carton sour cream, mayonnaise, dried onion fla...","carton, sour, cream, mayonnaise, dried, onion,...","carton, cream, mayonnaise, onion, flake, weed,..."
10006,Strawberry and Greens Salad,"mixed salad green (of your choice)#item, 3 rad...",15mins,11.0,4.0,1.0,https://www.food.com/recipe/10006,"mixed salad green, radishes, strawberry, sprin...","mixed salad green, radishes, strawberry, sprin...","mixed, salad, green, radishes, strawberry, spr...","salad, strawberry, spring, onion, pepper, vina..."


In [612]:
df['IngredientsCombined'] = df.apply(lambda x: [x['IngredientsProcessed'], x['IngredientsProcNouns']],axis=1)

# df['IngredientsCombinedSTR'] = df.apply(lambda x: [x['IngredientsProcessed'], x['IngredientsProcNouns']],axis=1).astype(str)
# df.to_csv('data/df_justincase.csv',index=True)


df.iloc[23].IngredientsCombined[0]

'coarsely chopped hulled strawberry, sugar, cornstarch'

In [606]:
# test = pd.DataFrame({'foo':['a, b','b, c','c, d'], 'bar':['avs, sdf f', 'afc, sdf', 'ss sdf'], 'new':['apple', 'banana', 'pear']})


# test['combined']=test.apply(lambda x: [x['foo'], x['bar']],axis=1).astype(str)

# test['combined']=test.apply(lambda x: [x['foo'], x['bar']],axis=1).astype(str)
# display(test)
# test.iloc[0].combined.split("', ")[1]

Unnamed: 0,foo,bar,new,combined
0,"a, b","avs, sdf f",apple,"['a, b', 'avs, sdf f']"
1,"b, c","afc, sdf",banana,"['b, c', 'afc, sdf']"
2,"c, d",ss sdf,pear,"['c, d', 'ss sdf']"


"'avs, sdf f']"

## Step 2:  Tokenize the ingredients

* Store the recipe tokens as df['IngredientsTokenized'] 

In [647]:
# test = df.iloc[5444].IngredientsCombined
# # print(test)
# [bi for bi in bigrams(test[0].split(', ')[1].split())]
# # bigrm = [bi for bi in bigrams((test[0].split(', ')[0]).split())]
# # ', '.join(' '.join((a, b)) for a, b in bigrm if (is_noun(a) or is_noun(b)))


# ingr = 'your favorite barbecue and rub patty hamburger is the best ball'
# bglist = [bi for bi in bigrams(ingr.split())]
# print(bglist)
# for bi in bglist:
#     if is_noun(bi[0]) or is_noun(bi[1]):
#         print(' '.join((bi[0], bi[1])))


[('your', 'favorite'), ('favorite', 'barbecue'), ('barbecue', 'and'), ('and', 'rub'), ('rub', 'patty'), ('patty', 'hamburger'), ('hamburger', 'is'), ('is', 'the'), ('the', 'best'), ('best', 'ball')]
favorite barbecue
barbecue and
patty hamburger
hamburger is
best ball


In [508]:
# test = 'loaf wheat bread, cream, prepared horseradish, knorr dry onion soup mix, deli roast beef'
# test = df.iloc[2488].IngredientsProcessed

# tokens = RegexpTokenizer(r'\w+').tokenize(test)
# print(tokens)
# [token for token in tokens if is_noun(token)]

['egg', 'white', 'caster', 'sugar', 'vanilla', 'essence', 'raspberry', 'seedless', 'raspberry', 'jam']


['egg', 'white', 'caster', 'sugar', 'vanilla', 'essence', 'jam']

In [664]:
def commatokenizer(text):
    return text.split(', ')

def mytokenizer(combinedlist):
    ingredlist = combinedlist[0].split(', ')
    nounlist = combinedlist[1].split(', ')
    ingredlist = combinedlist[0].split(', ')
    bigramlist = []
    for ingred in ingredlist:
        bigrms = [bi for bi in bigrams(ingred.split())]
        for bi in bigrms:
            if (bi[0] in nounlist) or (bi[1] in nounlist):
                bigramlist.append(' '.join((bi[0], bi[1])))
   
    return ', '.join(bigramlist + nounlist)
  
#     tokens = []
#     for ingred in text.split(', '):
#         tokens.append(sent_tokenize(ingred)[0])
#         tokens = [stemmer.stem(token) for token in tokens]
#     return tokens

#     tokens = RegexpTokenizer(r'\w+').tokenize(text)   # just gets single words
# #     tokens = [token for token in tokens]
#     return ', '.join(tokens) 

teststr = df.iloc[4644].IngredientsCombined
print(teststr, '\n')
print(mytokenizer(teststr),'\n')
print(commatokenizer(mytokenizer(teststr)))


# text_tokens = word_tokenize(text)
# tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()]
# print(tokens_without_sw)



['prawn, chopped scallion, soy sauce, ginger juice, salt, rice wine, oil', 'prawn, scallion, soy, sauce, ginger, juice, salt, rice, wine, oil'] 

chopped scallion, soy sauce, ginger juice, rice wine, prawn, scallion, soy, sauce, ginger, juice, salt, rice, wine, oil 

['chopped scallion', 'soy sauce', 'ginger juice', 'rice wine', 'prawn', 'scallion', 'soy', 'sauce', 'ginger', 'juice', 'salt', 'rice', 'wine', 'oil']


In [666]:
df['IngredientsTokenized'] = df['IngredientsCombined'].apply(mytokenizer)
df.head()

Unnamed: 0_level_0,Title,IngredientsRaw,TotalTime,NumSteps,AvgRating,NumReviews,RecipeURL,Ingredients,IngredientsProcessed,IngredientsTokenized,IngredientsProcNouns,IngredientsCombined,IngredientsCombinedSTR
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
10000,Tomato Paste,"48 large tomatoes#item, 2 teaspoons salt#item",3hrs 15mins,8.0,3.0,2.0,https://www.food.com/recipe/10000,"tomatoes, salt","tomato, salt","tomato, salt","tomato, salt","[tomato, salt, tomato, salt]","['tomato, salt', 'tomato, salt']"
10003,Spicy Corn Salad With Avocado Dressing,"4 cups whole kernel corn (I use frozen)#item, ...",12mins,10.0,3.67,3.0,https://www.food.com/recipe/10003,"kernel corn, red bell pepper, green bell peppe...","kernel corn, red bell pepper, green bell peppe...","kernel corn, red bell, bell pepper, green bell...","kernel, corn, bell, pepper, bell, pepper, spri...","[kernel corn, red bell pepper, green bell pepp...","['kernel corn, red bell pepper, green bell pep..."
10004,Ginger Beer,"TO MAKE THE GINGER BEER#item, 1 1⁄2 teaspoons ...",312hrs 5mins,18.0,4.0,2.0,https://www.food.com/recipe/10004,"dried yeast, ground ginger, sugar, lukewarm wa...","dried yeast, ground ginger, sugar, lukewarm wa...","dried yeast, ground ginger, lukewarm water, ye...","yeast, ground, ginger, sugar, lukewarm, water,...","[dried yeast, ground ginger, sugar, lukewarm w...","['dried yeast, ground ginger, sugar, lukewarm ..."
10005,Beau Monde Dip,"1 (16 ounce) carton sour cream#item, 16 ounces...",5mins,2.0,5.0,3.0,https://www.food.com/recipe/10005,"carton sour cream, mayonnaise, dried onion fla...","carton sour cream, mayonnaise, dried onion fla...","carton sour, sour cream, dried onion, onion fl...","carton, cream, mayonnaise, onion, flake, weed,...","[carton sour cream, mayonnaise, dried onion fl...","['carton sour cream, mayonnaise, dried onion f..."
10006,Strawberry and Greens Salad,"mixed salad green (of your choice)#item, 3 rad...",15mins,11.0,4.0,1.0,https://www.food.com/recipe/10006,"mixed salad green, radishes, strawberry, sprin...","mixed salad green, radishes, strawberry, sprin...","mixed salad, salad green, spring onion, red pe...","salad, strawberry, spring, onion, pepper, vina...","[mixed salad green, radishes, strawberry, spri...","['mixed salad green, radishes, strawberry, spr..."


In [667]:
df.iloc[4644].IngredientsTokenized

'chopped scallion, soy sauce, ginger juice, rice wine, prawn, scallion, soy, sauce, ginger, juice, salt, rice, wine, oil'

## Step 3:  Vectorize the ingredient tokens and Train a model

* Up until now, we've been using the full recipe dataset.  Now we can take a subset.
* Create a document-term matrix
* Train a model

In [668]:
df0 = df.copy()    #  keep original just in case
df1 = df.copy()    #  create a smaller one for manipulation

In [669]:
print(df[(df.AvgRating >= 4) & (df.NumReviews >= 4)].AvgRating.count())

df1 = df[(df.AvgRating >= 4) & (df.NumReviews >= 4)]

# df[df.IngredientsRaw.str.contains(' scoop ')].sample().IngredientsRaw.str.split("#item,").tolist()
# df[df.Title.str.contains('[Hh]ummus')].count()

6074


In [827]:
STOP_WORDS = ['ground', 'fresh ground', 'powder', 'cream of', 'boneless', 'skinless', 'skinless chicken', 'half'
             'butter or', 'juice', 'fresh lemon', 'rind', 'salt and', 'and pepper', 'boiling water', 'cold water',
             'sauce', 'soy', 'chip', 'soda', 'oil', 'or vegetable', 'oil or', 'vegetable', 'seed',
             'shredded cheddar', 'grated cheddar', 'jack', 'monterey', 'cheese or', 
            'half', 'of chicken', 'shredded mozzarella', 'cottage', 'confectioner', 'unsalted butter', 'of chicken',
            'condensed cream', 'package', 'package cream', 'whip', 'stalk celery', 'bay', 'leaf',
            'slice', 'slice white', 'slice bacon', 'slice bread', 'slice white', 'creamy', 'butter or', 'or butter',
            'salt', 'yeast or', 'pepper or', 'white', 'lowfat', 'skim', 'milk or', 'instant', 'whip', 'light', 
            'light corn', 'flake', 'worcestershire', 'dijon', 'cream or', 'salt pepper', 'wheat', 'squeezed lemon',
             ]


vectorizer = TfidfVectorizer(tokenizer = commatokenizer,
                  stop_words=STOP_WORDS,
                  min_df=7,max_df=.4,
                            )
docs = df1['IngredientsTokenized']
doc_word = vectorizer.fit_transform(docs)
    #returns sparsemat, can use pd.DataFrame(doc_term.toarray()) vectorizer.get_feature_names()
print(doc_word.shape)

nmf_model = NMF(20, random_state=10, max_iter=1000)
doc_topic = nmf_model.fit_transform(doc_word)
topic_word = nmf_model.components_

def display_topics(model, feature_names, num_top_words, topic_names=None):
    for idx, topic in enumerate(model.components_):
        if not topic_names or not topic_names[idx]:
            print("\nTopic ", idx)
        else:
            print("\nTopic: '",topic_names[idx],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-num_top_words - 1:-1]]))
        
    
display_topics(nmf_model, vectorizer.get_feature_names(), 8 )




(6074, 1340)

Topic  0
flour, baking powder, egg, allpurpose flour, baking soda, sugar, buttermilk, vegetable oil

Topic  1
olive, olive oil, virgin olive, basil, tomato, fresh basil, oregano, wine

Topic  2
cheese, parmesan cheese, mozzarella, mozzarella cheese, spaghetti, pasta, breadcrumb, basil

Topic  3
chicken, breast, chicken breast, boneless skinless, chicken broth, broth, breast half, chicken stock

Topic  4
potato, onion, carrot, celery, bacon, parsley, paprika, red potato

Topic  5
chocolate, chocolate chip, semisweet, semisweet chocolate, walnut, cake, cake mix, condensed milk

Topic  6
lemon, lemon juice, lemon rind, grated lemon, parsley, mayonnaise, sugar, fresh parsley

Topic  7
butter, peanut, peanut butter, melted butter, bread, flour, sugar, creamy peanut

Topic  8
water, yeast, dry yeast, flour, sugar, cornstarch, hot water, bread flour

Topic  9
cream, cream cheese, sour cream, cheese, whipping cream, heavy cream, pie, chicken soup

Topic  10
tomato, beef, ground b

## Step 5:  Generate a recipe recommendations based on user input


In [727]:
# df['IngredientsProcessed'] = df['Ingredients'].apply(preprocessor)
# df['IngredientsProcNouns'] = df['IngredientsProcessed'].apply(get_nouns)  # Takes 2 min per 1000
# df['IngredientsProcessed'] = df['IngredientsProcessed'].apply(text_singularizer)  # Takes 1 min per 1000
# df['IngredientsTokenized'] = df['IngredientsCombined'].apply(mytokenizer)

def user_tokenize(ingreds):
    ingreds = preprocessor(ingreds)
    nouns = get_nouns(ingreds)
    ingreds = text_singularizer(ingreds)
    ingredscombined = [ingreds, nouns]
    ingredstokenized = mytokenizer(ingredscombined)
    return ingredstokenized
    
test = df.iloc[4634].IngredientsRaw
print(test)
user_preprocess(clean_once(test))
    


2⁄3 cup water#item, 2 cups fresh cranberries or 2 cups frozen cranberries#item, 1 cup Equal sugar substitute (or 24 packets Equal sugar substitute or 7 1/4 teaspoons Equal sugar substitute)#item


'fresh cranberry, cranberry or, frozen cranberry, equal sugar, sugar substitute, water, cranberry, cranberry, sugar'

### User input goes here!  Type desired ingredients as a comma-separated string:

In [835]:
###
useringreds = "broccoli, cheddar cheese, butter, breadcrumbs"
###

usertokens = user_tokenize(useringreds)

print('User Input: ', useringreds)
print('Tokens Generated: ', usertokens, '\n')

user_vec = vectorizer.transform([usertokens])
topic_vec = nmf_model.transform(user_vec)

indices = pairwise_distances(topic_vec,doc_topic,metric='cosine').argsort().ravel()
for index in indices[0:5]:
    print(df1.iloc[index].Title.upper())
    print(df1.iloc[index].IngredientsRaw.split("#item,"), '\n')

User Input:  broccoli, cheddar cheese, butter, breadcrumbs
Tokens Generated:  cheddar cheese, broccoli, cheddar, cheese, butter, breadcrumb 

CHEDDAR BAKED BAGELS AND EGGS
['4 bagels, halved', ' 2 tablespoons butter', ' 8 eggs', ' 1⁄2 teaspoon salt', ' 1⁄2 teaspoon pepper', ' 1⁄2 cup shredded cheddar cheese#item'] 

BROCCOLI CASSEROLE
['2 packages chopped frozen broccoli', ' 1 cup mayonnaise', ' 1 small onion, chopped', ' 2 eggs, beaten', ' 2 cups grated cheddar cheese', ' 1⁄4 - 1⁄2 cup butter, melted', ' 1 cup breadcrumbs#item'] 

SUMMER SQUASH CASSEROLE
['1 1⁄2 lbs summer squash', ' 1⁄2 cup butter or 1/2 cup margarine', ' 1 egg', ' 1 medium onion, chopped', ' salt and pepper', ' 12 -15 Ritz crackers, crushed', ' 3⁄4 cup grated cheddar cheese or 3/4 cup longhorn cheese#item'] 

CHEESE SQUARES
['1 cup butter or 1 cup margarine, softened', ' 2 (5 ounce) jars Kraft Old English cheese spread, softened', ' 1 egg', ' 1 (4 ounce) can chopped green chilies', ' 1⁄4 cup salsa', ' 2 cups shredde

In [821]:
def clean_raw(text):
    text = text.IngredientsRaw
    ingredlist = []
    for ingred in  text.split("#item,"):
        ingred = ingred.replace('#item','')         # Scraping artifact on last item
        ingredlist.append(ingred.strip())
        ingredlist = [ingred for ingred in ingredlist if not ingred.isupper()]  # Important for multi-part recipes
    return ingredlist   #example: 'cucumber, vinegar, salt, black pepper'


# test = df.iloc[np.random.choice(range(1,13000))]
print(test.Title)
# clean_raw(test)
# test.IngredientsTokenized.split(', ')


Cranberry Butter


['2 1⁄2 lbs cranberries',
 '2⁄3 cup apple juice',
 '1 cup pure maple syrup',
 '1⁄2 cup liquid honey',
 '1⁄2 teaspoon ground cinnamon',
 '1 dash ground ginger']