In [1]:
import pandas as pd
import numpy as np
import re
from ast import literal_eval

### Load data frame from csv file

In [2]:
data_file = 'MasterCSV.csv'
master_data_df = pd.read_csv(data_file, converters = {'cuisine':  literal_eval,
                                              'aisle_SP':  literal_eval,
                                              'ingredients_SP':  literal_eval})
master_data_df

Unnamed: 0,recipe_name,cuisine_SP,aisle_SP,ingredients_SP
0,Pear-ginger upside-down cake,[],"[Baking, Baking, Milk, Eggs, Other Dairy, Baki...","[low sodium baking powder, baking soda, butter..."
1,Easy Chicken Cordon Bleu,[],"[Meat, Meat, Cheese, Spices and Seasonings, Sp...","[boneless skinless chicken breast, ham, chedda..."
2,Chicken 65,[],"[Meat, Spices and Seasonings, Ethnic Foods, Sp...","[chicken breast, chili powder, ginger garlic p..."
3,Herb Roasted Chicken,[],"[Spices and Seasonings, Baking, Milk, Eggs, Ot...","[bay leaves, golden brown sugar, butter, dried..."
4,Meatball Sliders,[],"[Spices and Seasonings, Pasta and Rice, Produc...","[bay leaves, breadcrumbs, marjoram, egg, parsl..."
...,...,...,...,...
9995,Cinnamon Twists,[],"[Baking, Milk, Eggs, Other Dairy, Milk, Eggs, ...","[dry yeast, butter, egg, milk, salt, sugar]"
9996,Fluffy frittata with spinach,[],"[Milk, Eggs, Other Dairy, Produce, Spices and ...","[egg, garlic, black pepper, nutmeg, olive oil,..."
9997,Protein Packed Carrot Muffins,[],"[Spices and Seasonings, Gluten Free;Health Foo...","[dry seasoning rub, almond meal, low sodium ba..."
9998,BLT Sandwich,[],"[Produce, Bakery/Bread, Produce, Condiments, M...","[bell pepper, bread, lettuce, mayonnaise, thic..."


### Define functions required to clean dataframe

In [3]:
# Converting strings to lists then remove [] and ()
def ingredients_cleanup (data_df, column):
    row_list = []
    indexes = data_df.index.values.tolist()
    counter = 0
    for row in data_df[column]:
        row = row.replace("[", '')
        row = row.replace("'", '')
        row = row.replace("]", '')
        row = row.replace('"', '')
        row_list = row.split( ',')
        data_df.loc[indexes[counter],column] = row_list
        counter += 1
    
    return data_df

In [4]:
# Removing leading white spaces 
def remove_leading_ws(data_df, column):
    indexes = data_df.index.values.tolist()
    counter = 0
    for row in data_df[column]:
        for i, element in enumerate(row):
            element = element.strip()
            data_df.loc[indexes[counter],column][i] = element
        counter += 1
    return data_df


In [5]:
# Creating a single list from all the ingredents 
def total_ing_list_from_df(data_df, ingred_col):
    composite_ingredients = []
    for row in data_df[ingred_col]:
        for element in row:
            composite_ingredients.append(element)
    return composite_ingredients

In [6]:
# Creating list of composite list and creat count of ingredients
def sort_and_count_ingredient_list(ingredient_list):
    ingredient_df = pd.DataFrame(ingredient_list)
    ingredient_df = ingredient_df.rename(columns = {0:  'ingredient'})
    ingredient_df['count'] = 1
    ingredient_counts = ingredient_df.groupby('ingredient').agg({'count':  'count'})
    ingredient_counts.sort_values('count', ascending=False, inplace=True)
    return ingredient_counts


In [11]:
# Replacing different verisons of ingredients
def ingredient_replacement (data_df):
    
    indexes = data_df.index.values.tolist()
    ingredients_raw = data_df['ingredients_SP'].tolist()
    corrected_ingredient_list = []
    for row in ingredients_raw:
        one_row_ing_list = []
        for element in row:
            element=str(element)
            if re.search('(?:^|\s)chicken(?!\sstock|\sbroth|\sbouillon|\sbase)',element):
                one_row_ing_list.append('chicken')
            elif re.search('(?:^|\s)pork|baby\sback|boston\sbutt|ham|spare\srib(?:\s|$|s)',element):
                one_row_ing_list.append('pork')
            elif re.search('(?:^|\s)bean(?!\ssauce)',element):
                one_row_ing_list.append('beans') 
            elif re.search('(?:(?<=acorn)|(?<=butternut)|(?<=kabocha))\ssquash(?:$|\s)', element):
                one_row_ing_list.append('winter squash')
            elif re.search('(?:(?<=american\s)|(?<=sheep\s)|(?<=string\s)|(?<=cottage\s)|(?<=sharp\s)|(?<=pepperjack\s)|(?<=fontina\s)|(?<=monterey\sjack\s)|(?<=cheddar\s)|(?<=shredded\s)|(?<=swiss\s)|(?<=cream\s)|(?<=^))cheese|gouda(?:$|\s)', element):
                one_row_ing_list.append('cheese')
            elif re.search('(?:(?<=asiago\s)|(?<=ricotta\s)|(?<=mozzarella\s)|(?<=parmessan\s)|(?<=^))cheese(?:$|\s)', element):
                one_row_ing_list.append('italian cheese')
            elif re.search('(?:(?<=black\s)|(?<=green\s)|(?<=pink\s))pepper(?:corns|\s|$)', element):
                one_row_ing_list.append('peppercorns')
            elif re.search('(?:(?<=blue\s)|(?<=feta\s)|(?<=goat\s)|(?<=^))cheese|gorgonzola(?:$|\s)', element):
                one_row_ing_list.append('mediterranean cheese')
            elif re.search('(?:^|\s)cotija\scheese|queso\sfresco|mexican\scheese(?:$|\s)', element):
                one_row_ing_list.append('mexican cheese')
            elif re.search('(?:(?<=green)|(?<=yellow)|(?<=orange)|(?<=red)|(?<=bell))\spepper(?:$|\s|s)', element):
                one_row_ing_list.append('bell pepper')
            elif re.search('(?:^|\s)amaranth|barley|quinoa(?:$|\s)',element):
                one_row_ing_list.append('grain')
            elif re.search('(?:^|\s)ancho|dried\schile(?:$|\s)', element):
                one_row_ing_list.append('dried chile')
            elif re.search('(?:^|\s)anchovy|anchovies(?:$|\s)', element):
                one_row_ing_list.append('anchovy')
            elif re.search('(?:^|\s)apple(?:$|s)', element):
                one_row_ing_list.append('apple')
            elif re.search('(?:^|\s)apple\s(?:cider|juice)(?!\svinegar)', element):
                one_row_ing_list.append('apple juice')
            elif re.search('(?:^|\s)apricot(?:$|\s|s)', element):
                one_row_ing_list.append('apricot')
            elif re.search('(?:^|\s)balsamic(?:\sglaze|$)', element):
                one_row_ing_list.append('balsamic')
            elif re.search('(?:^|\s)bacon(?:$|\s)', element):
                one_row_ing_list.append('bacon')
            elif re.search('(?:^|\s)banana\spepper|jalapeno|pimenton\sde\sla\svera|poblano|serrano|scotch\sbonnet|habanero|green\schili\spepper(?:$|\s|s)', element):
                one_row_ing_list.append('fresh pepper')
            elif re.search('(?:^|\s)base|stock|bouillon|broth(?:$|\s)', element):
                one_row_ing_list.append('broth')
            elif re.search('(?:^|\s)basil(?!\spesto)', element):
                one_row_ing_list.append('basil')
            elif re.search('(?:^|\s)beef|steak|ground\schuck|ground\selk|ground\ssirloin|veal|pot\sroast(?!\sbroth|\sbase|\sstock|\sbouillon|steak|\sgravy)', element):
                one_row_ing_list.append('beef')
            elif re.search('(?:^|\s)beet(?:$|\s|s)', element):
                one_row_ing_list.append('beets')
            elif re.search('(?:(?:^)|(?:\s))carrot(?:$|\s|s)', element):
                one_row_ing_list.append('carrots')
            elif re.search('(?:^|\s)cardamom(?:$|\s|s)', element):
                one_row_ing_list.append('cardomom')
            elif re.search('(?:^|\s)chocolate(?:$|\s)',element):
                one_row_ing_list.append('chocolate')
            elif re.search('(?:^|\s)corn(?:$|\s)', element):
                one_row_ing_list.append('corn')
            elif re.search('(?:^|\s)eggplant(?:$|\s|s)', element):
                one_row_ing_list.append('eggplant')
            elif re.search('(?:^|\s)gravy(?:$|\s)', element):
                one_row_ing_list.append('gravy')
            elif re.search('(?:^|\s)lentil(?:$|\s|s)', element):
                one_row_ing_list.append('lentils')
            elif re.search('(?:^|\s)olive(?!\soil)', element):
                one_row_ing_list.append('olives')
            elif re.search('(?:^|\s)pasta|penne|rigatoni|pappardelle|macaroni|lasagne\snoodles|orzo|tortellini|farfalle|ravioli|fettuccine|linguine|spaghetti(?:$|\s)', element):
                one_row_ing_list.append('pasta')
            elif re.search('(?:^|\s)pear(?:$|\s|s)', element):
                one_row_ing_list.append('pear')
            elif re.search('(?:^|\s)pesto(?:$|\s)', element):
                one_row_ing_list.append('pesto')
            elif re.search('(?:^|\s)rice(?!\sflour|\spudding|d)(?:$|\s)', element):
                one_row_ing_list.append('rice')
            elif re.search('(?:^|\s)spinach(?:$|\s)', element):
                one_row_ing_list.append('spinach')
            elif re.search('(?:^|\s)tuna|catfish|cod|tilapia|white\sfish|swordfish|clams|mackerel|halibut|crawfish|salmon|crab|shrimp|scallop|oyster(?:$|\s|s)',element):
                one_row_ing_list.append('seafood')
            elif re.search('(?:^|\s)tomato(?!\spaste|\ssauce)', element):
                one_row_ing_list.append('tomato')
            elif re.search('(?:^|\s)vinegar(?:$|\s)', element):
                one_row_ing_list.append('vinegar')
            elif re.search('(?:^|\s)vanilla(?!\sice|\scake)', element):
                one_row_ing_list.append('vanilla')
            elif re.search('(?:^|\s)masala(?:$|\s)', element):
                one_row_ing_list.append('masala')
            elif re.search('(?:^|\s|\w*)berries|berry(?:$|\s)', element):
                one_row_ing_list.append('berries')
            elif re.search('(?:(?:black\s)|(?:green\s)|(?:pink\s)|(?:fresh\s))[Pp]epper(?:corns|\s|$)', element):
                one_row_ing_list.append('black pepper')
            elif re.search('(?:^|\s)almond(?!\smilk)', element):
                one_row_ing_list.append('almonds')
            elif re.search('(?:^|\s)oil(?:\s|$)', element):
                one_row_ing_list.append('oil')
            elif re.search('(?:^|\s)avocado(?!\soil)', element):
                one_row_ing_list.append('avocado')
            elif re.search('(?:^|\s)turkey(?!\sstock|\sbroth)', element):
                one_row_ing_list.append('turkey')
            elif re.search('(?:^|\s)bread|croissant|baguette|crescent\sroll|bun|roll|crouton|english\smuffin(?!\sflour|crumbs)', element):
                one_row_ing_list.append('bread')
            elif re.search('(?:^|\s)bratwurst|chorizo|sausage(?:$|\s)', element):
                one_row_ing_list.append('sausage')
            elif re.search('(?:^|\s)broccoli(?:$|\sfloret)', element):
                one_row_ing_list.append('broccoli')
            elif re.search('(?:^|\s)mustard\sseed|mustard\spowder(?:$|s)', element):
                one_row_ing_list.append('mustard seed')
            elif re.search('(?:(?:^)|(?:clarified\s)|(?:unsalted\s)|(?:salted\s))butter(?!\slettuce)', element):
                one_row_ing_list.append('butter')
            elif re.search('(?:^|\s)cabbage(?:$|/s)', element):
                one_row_ing_list.append('cabbage')
            elif re.search('(?:^|\s)curry(?:\spaste|\spowder|\sleaves)', element):
                one_row_ing_list.append('curry')
            elif re.search('(?:^|\s)pineapple(?:$|/s)', element):
                one_row_ing_list.append('pineapple')
            elif re.search('(?:(?:himalayan\s)|(?:coarse\s)|(?:sea\s)|(?:kosher\s)|(?:^))salt(?:$|/s)', element):
                one_row_ing_list.append('salt')
            elif re.search('(?:(?:\s)|(?:^))mozzarella|parmesan|pecorino\sromano|grana\spadano|mascarpone(?:$|/s)', element):
                one_row_ing_list.append('italian cheese')
            elif re.search('(?:^|\s)butterscotch(?:$|/s)', element):
                one_row_ing_list.append('butterscotch')
            elif re.search('(?:(?:^)|(?:bread\s)|(?:unbleached\sall\spurpose\s)|(?:self\srising\s)|(?:cake\s)|(?:unbleached\s)|(?:wheat\s)|(?:pastry\s))flour(?:$|/s)', element):
                one_row_ing_list.append('wheat flour')
            elif re.search('(?:(?:^)|(?:\s))ginger(?:$|\spaste|\spowder|\sjuice)', element):
                one_row_ing_list.append('ginger')
            elif re.search('(?:(?:^)|(?:\s))blackeyed\speas(?:$|/s)', element):
                one_row_ing_list.append('blackeyed peas')
            elif re.search('(?:(?:^)|(?:\s))onion(?!\ssoup)', element):
                one_row_ing_list.append('onion')
            elif re.search('(?:(?:green\s)|(?:chipotle\s))chile(?:\s|s|$)', element):
                one_row_ing_list.append('fresh chile')
            elif re.search('(?:(?:^)|(?:\s))cauliflower(?:\sfloret|$)', element):
                one_row_ing_list.append('cauliflower')
            elif re.search('(?:(?:^)|(?:\s))celery(?!\ssalt|\sseed)', element):
                one_row_ing_list.append('celery')
            elif re.search('(?:(?:^)|(?:\s))cherry|cherries(?:\s|$)', element):
                one_row_ing_list.append('cherry')
            elif re.search('(?:(?:chickpea\s)|(?:rice\s)|(?:almond\s)|(?:coconut\s)|(?:gluten\sfree\s)|(?:gluten\sfree\sall\spurpose\s))flour(?:\s|$)', element):
                one_row_ing_list.append('alt flour')
            elif re.search('(?:(?:^)|(?:\s))cinnamon(?:\sstick|$)', element):
                one_row_ing_list.append('cinnamon')
            elif re.search('(?:(?:^)|(?:\s))tortilla(?:\s|$)', element):
                one_row_ing_list.append('tortilla')
            elif re.search('(?:(?:^)|(?:\s))lettuce(?:\s|$)', element):
                one_row_ing_list.append('lettuce')
            elif re.search('(?:(?:^)|(?:\s))chickpea|garbonzo|hummus|chana\sdal(?:\s|$)', element):
                one_row_ing_list.append('chickpea')
            elif re.search('(?:(?:^)|(?:\s))tomato(?:\ssauce|\spaste|\sjuice)', element):
                one_row_ing_list.append('tomato sauce')
            elif re.search('(?:(?:^)|(?:\s))coconut(?:\smeat|\sextract|\sflake|$)', element):
                one_row_ing_list.append('coconut')
            elif re.search('(?:(?:^)|(?:\s))mustard(?!\spowder|\sseed)', element):
                one_row_ing_list.append('prepared mustard')
            elif re.search('(?:(?:^)|(?:\s))mushroom|mushrooms(?!\ssoup)', element):
                one_row_ing_list.append('mushrooms')
            elif re.search('(?:(?:^)|(?:\s))cumin(?:\sseeds|$)', element):
                one_row_ing_list.append('cumin')
            elif re.search('(?:(?:^)|(?:\s))[Ss]ugar(?:\s|$)', element):
                one_row_ing_list.append('sugar')
            elif re.search('(?:(?:^)|(?:\s))[Gg]arlic(?!\ssauce|\schili)', element):
                one_row_ing_list.append('garlic')
            elif re.findall('dried.*?chile',element):
                one_row_ing_list.append('dried chile')
            elif re.search('(?:(?:^)|(?:dried\s))[Cc]ilantro(?:\s|$)', element):
                one_row_ing_list.append('cilantro')
            elif re.search('(?:(?:^)|(?:dried\s))[Dd]ill(?:\s|weed|$)', element):
                one_row_ing_list.append('dill')
            elif re.search('(?:(?:^)|(?:\s))[Ff]enugreek(?:\s|$)', element):
                one_row_ing_list.append('fenugreek')
            elif re.search('(?:(?:^)|(?:dried\s))[Mm]int(?:\s|$)', element):
                one_row_ing_list.append('mint')
            elif re.search('(?:(?:^)|(?:dried\s))[Pp]arsley(?:\s|$)', element):
                one_row_ing_list.append('parsley')
            elif re.search('(?:(?:^)|(?:dried\s))[Tt]hyme(?:\s|$)', element):
                one_row_ing_list.append('thyme')
            elif re.search('(?:(?:^)|(?:\s))[Ww]ine|sherry|brandy(?:\s|$)', element):
                one_row_ing_list.append('wine')
            elif re.search('(?:(?:^)|(?:\s))[Ee]gg(?:\sreplacer|\swhite|\syolk|\ssubstitute|s|$)', element):
                one_row_ing_list.append('egg')
            elif re.search('(?:(?:^)|(?:\s))[Cc]ucumber(?:\s|s|$)', element):
                one_row_ing_list.append('cucumber')
            elif re.search('(?:(?:^)|(?:\s))[Tt]ofu(?:\s|$)', element):
                one_row_ing_list.append('tofu')
            elif re.search('(?:(?:^)|(?:\s))[Yy]ogurt(?:\s|$)', element):
                one_row_ing_list.append('yogurt')
            elif re.search('(?:(?:^)|(?:\s))[Mm]ayonnaise(?:\s|$)', element):
                one_row_ing_list.append('mayonnaise')
            elif re.search('(?:(?:^)|(?:\s))[Ff]ennel(?:\s|$)', element):
                one_row_ing_list.append('fennel')
            elif re.search('(?:(?:^)|(?:\s))[Cc]offee|espresso(?:\s|$)', element):
                one_row_ing_list.append('coffee')
            elif re.search('(?:(?:^)|(?:\s))[Gg]rits(?:\s|$)', element):
                one_row_ing_list.append('grits')
            elif re.search('(?:(?:^)|(?:\s))[Cc]love(?:\s|$)', element):
                one_row_ing_list.append('clove')
            elif re.search('(?:(?:^)|(?:\s))[Cc]oriander(?:\s|$)', element):
                one_row_ing_list.append('coriander')
            elif re.search('(?:(?:^)|(?:\s))[Ll]amb|mutton(?:\s|$)', element):
                one_row_ing_list.append('lamb')
            elif re.search('(?:(?:^)|(?:\s))[Ss]age(?:\s|$)', element):
                one_row_ing_list.append('sage')
            elif re.search('(?:(?:^)|(?:\s))[Ss]avory(?:\s|$)', element):
                one_row_ing_list.append('savory')
            elif re.search('(?:(?:^)|(?:\s))[Hh]avarti|gruyere|provolone|colby\sjack(?:\s|$)', element):
                one_row_ing_list.append('cheese')
            elif re.search('(?:(?:^)|(?:\s))[Oo]at(?:meal|s|\s|$)', element):
                one_row_ing_list.append('oats')
            elif re.search('(?:(?:^)|(?:\s))[Cc]ouscous(?:\s|$)', element):
                one_row_ing_list.append('couscous')
            elif re.search('(?:(?:^)|(?:\s))[Dd]ate(?:s|\s|$)', element):
                one_row_ing_list.append('dates')
            elif re.search('(?:(?:^)|(?:\s))[Ss]yrup(?:\s|$)', element):
                one_row_ing_list.append('syrup')
            elif re.search('(?:(?:^)|(?:\s))[Mm]arjoram(?:\s|$)', element):
                one_row_ing_list.append('marjoram')
            elif re.search('(?:(?:^)|(?:\s))[Rr]osemary(?:\s|$)', element):
                one_row_ing_list.append('rosemary')
            elif re.search('(?:(?:^)|(?:\s))[Aa]rtichoke(?:\s|$|s)', element):
                one_row_ing_list.append('artichoke')
            elif re.search('(?:(?:^)|(?:\s))[Ee]nchilada\ssauce(?:\s|$)', element):
                one_row_ing_list.append('enchilada sauce')
            elif re.search('(?:(?:^)|(?:\s))half\sand\shalf|cream|milk(?!cheese)', element):
                one_row_ing_list.append('milk or cream')
            elif re.search('(?:(?:^)|(?:\s))cocoa(?:\s|$)', element):
                one_row_ing_list.append('cocoa')
            elif re.search('(?:(?:^)|(?:\s))hollandaise(?:\s|$)', element):
                one_row_ing_list.append('hollandaise sauce')
            elif re.search('(?:(?:^)|(?:\s))potato|hash\sbrowns(?:es|\s|$)', element):
                one_row_ing_list.append('potato')
            elif re.search('(?:(?:^)|(?:\s))lemon(?:\sjuice|\speel|\swedge|\sextract|$)', element):
                one_row_ing_list.append('lemon')
            elif re.search('(?:(?:^)|(?:light\s)|(?:rye\s)|(?:wheat\s))beer|stout(?:\s|$)', element):
                one_row_ing_list.append('beer')
            elif re.search('(?:(?:^)|(?:\s))sour\scream(?:\s|$)', element):
                one_row_ing_list.append('sour cream')
            elif re.search('(?:(?:^)|(?:\s))soy\ssauce(?:\s|$)', element):
                one_row_ing_list.append('soy sauce')
            elif re.search('(?:(?:^)|(?:\s))lime(?:\sjuice|\speel|\swedge|\sextract|$)', element):
                one_row_ing_list.append('lime')
            elif re.search('(?:(?:^)|(?:\s))maple(?:\sextract|\sflavoring|\ssyrup|$)', element):
                one_row_ing_list.append('maple')
            elif re.search('(?:(?:^)|(?:\s))lemonade(?:\s|$)', element):
                one_row_ing_list.append('lemonade')
            elif re.search('(?:(?:^)|(?:\s))baking\spowder(?:\s|$)', element):
                one_row_ing_list.append('baking powder')
            elif re.search('(?:(?:^)|(?:\s))orange(?:\speel|\szest|\sslice|\swedge|\sextract|\sjuice|\ssection|$)', element):
                one_row_ing_list.append('orange')
            elif re.search('(?:(?:^)|(?:\s))breadcrumbs|panko(?:\s|$)', element):
                one_row_ing_list.append('breadcrumbs')
            elif re.search('(?:(?:^)|(?:\s))cayenne(?:\s|$)', element):
                one_row_ing_list.append('cayenne')
            elif re.search('(?:(?:^)|(?:\s))hot\ssauce|picante\ssauce|salsa|sriracha(?:\s|$)', element):
                one_row_ing_list.append('hot sauce or salsa')
            elif re.search('(?:(?:^)|(?:\s))pineapple(?:\s|$)', element):
                one_row_ing_list.append('pineapple')
            elif re.search('(?:(?:^)|(?:\s))polenta(?:\s|$)', element):
                one_row_ing_list.append('polenta')
            elif re.search('(?:(?:^)|(?:\s))peanut\sbutter|almond\sbutter|nut\sbutter(?:\s|$)', element):
                one_row_ing_list.append('peanut butter')
            elif re.search('(?:(?:^)|(?:\s))banana(?!\sliqueur)', element):
                one_row_ing_list.append('banana')
            elif re.search('(?:(?:^)|(?:\s))pumpkin\sseed(?:s|\s|$)', element):
                one_row_ing_list.append('pumpkin seeds')
            elif re.search('(?:(?:^)|(?:\s))paprika(?:\s|$)', element):
                one_row_ing_list.append('paprika')
            elif re.search('(?:(?:^)|(?:\s))duck(?:\s|$)', element):
                one_row_ing_list.append('duck')
            elif re.search('(?:(?:^)|(?:\s))fig(?:\s|$|s)', element):
                one_row_ing_list.append('fig')
            elif re.search('(?:(?:^)|(?:\s))filo|puff\spastry|mini\sfilo\sshells(?:\s|$)', element):
                one_row_ing_list.append('puff pastry')
            elif re.search('(?:(?:^)|(?:\s))salami|pancetta|pepperoni|prosciutto(?:\s|$)', element):
                one_row_ing_list.append('italian cured meat')
            elif re.search('(?:(?:^)|(?:\s))macadamia|peanuts|pine\snuts|pecans|cashews|pistachio(?:\s|$|s)', element):
                one_row_ing_list.append('nuts')
            elif re.search('(?:(?:^)|(?:\s))summer\ssquash|zucchini(?:\s|$)', element):
                one_row_ing_list.append('summer squash')
            elif re.search('(?:(?:^)|(?:\s))yeast(?:\s|$)', element):
                one_row_ing_list.append('yeast')
            elif re.search('(?:(?:^)|(?:\s))raisin(?:\s|$|s)', element):
                one_row_ing_list.append('raisin')
            elif re.search('(?:(?:^)|(?:\s))flaxseed(?:\s|$|s)', element):
                one_row_ing_list.append('flaxseed')
            elif re.search('(?:(?:^)|(?:\s))water\schestnut(?:\s|$|s)', element):
                one_row_ing_list.append('water chestnuts')
            elif re.search('(?:(?:^)|(?:\s))chili\spowder|chili\spepper(?:\s|$|s)', element):
                one_row_ing_list.append('chili powder')
            elif re.search('(?:(?:^)|(?:\s))seasoning|seasoned|spices(?:\s|$|s)', element):
                one_row_ing_list.append('seasoning mix')
            else:
                 one_row_ing_list.append(element)
        corrected_ingredient_list.append(one_row_ing_list)
        
    data_df['ingredients_corrected'] = corrected_ingredient_list
    

    return data_df

In [12]:
ing_list = total_ing_list_from_df(master_data_df, 'ingredients_SP') 
sorted_ing_list = sort_and_count_ingredient_list(ing_list)
# sorted_ing_list.index.values.tolist()

In [13]:
master_data_df

Unnamed: 0,recipe_name,cuisine_SP,aisle_SP,ingredients_SP
0,Pear-ginger upside-down cake,[],"[Baking, Baking, Milk, Eggs, Other Dairy, Baki...","[low sodium baking powder, baking soda, butter..."
1,Easy Chicken Cordon Bleu,[],"[Meat, Meat, Cheese, Spices and Seasonings, Sp...","[boneless skinless chicken breast, ham, chedda..."
2,Chicken 65,[],"[Meat, Spices and Seasonings, Ethnic Foods, Sp...","[chicken breast, chili powder, ginger garlic p..."
3,Herb Roasted Chicken,[],"[Spices and Seasonings, Baking, Milk, Eggs, Ot...","[bay leaves, golden brown sugar, butter, dried..."
4,Meatball Sliders,[],"[Spices and Seasonings, Pasta and Rice, Produc...","[bay leaves, breadcrumbs, marjoram, egg, parsl..."
...,...,...,...,...
9995,Cinnamon Twists,[],"[Baking, Milk, Eggs, Other Dairy, Milk, Eggs, ...","[dry yeast, butter, egg, milk, salt, sugar]"
9996,Fluffy frittata with spinach,[],"[Milk, Eggs, Other Dairy, Produce, Spices and ...","[egg, garlic, black pepper, nutmeg, olive oil,..."
9997,Protein Packed Carrot Muffins,[],"[Spices and Seasonings, Gluten Free;Health Foo...","[dry seasoning rub, almond meal, low sodium ba..."
9998,BLT Sandwich,[],"[Produce, Bakery/Bread, Produce, Condiments, M...","[bell pepper, bread, lettuce, mayonnaise, thic..."


In [14]:
master_data_df_edited = ingredient_replacement(master_data_df)
master_data_df_edited

Unnamed: 0,recipe_name,cuisine_SP,aisle_SP,ingredients_SP,ingredients_corrected
0,Pear-ginger upside-down cake,[],"[Baking, Baking, Milk, Eggs, Other Dairy, Baki...","[low sodium baking powder, baking soda, butter...","[baking powder, baking soda, butter, sugar, su..."
1,Easy Chicken Cordon Bleu,[],"[Meat, Meat, Cheese, Spices and Seasonings, Sp...","[boneless skinless chicken breast, ham, chedda...","[chicken, pork, cheese, salt, peppercorns, bre..."
2,Chicken 65,[],"[Meat, Spices and Seasonings, Ethnic Foods, Sp...","[chicken breast, chili powder, ginger garlic p...","[chicken, chili powder, garlic, salt, turmeric..."
3,Herb Roasted Chicken,[],"[Spices and Seasonings, Baking, Milk, Eggs, Ot...","[bay leaves, golden brown sugar, butter, dried...","[bay leaves, sugar, butter, basil, rosemary, t..."
4,Meatball Sliders,[],"[Spices and Seasonings, Pasta and Rice, Produc...","[bay leaves, breadcrumbs, marjoram, egg, parsl...","[bay leaves, bread, marjoram, egg, parsley, ga..."
...,...,...,...,...,...
9995,Cinnamon Twists,[],"[Baking, Milk, Eggs, Other Dairy, Milk, Eggs, ...","[dry yeast, butter, egg, milk, salt, sugar]","[yeast, butter, egg, milk or cream, salt, sugar]"
9996,Fluffy frittata with spinach,[],"[Milk, Eggs, Other Dairy, Produce, Spices and ...","[egg, garlic, black pepper, nutmeg, olive oil,...","[egg, garlic, peppercorns, nutmeg, oil, italia..."
9997,Protein Packed Carrot Muffins,[],"[Spices and Seasonings, Gluten Free;Health Foo...","[dry seasoning rub, almond meal, low sodium ba...","[seasoning mix, almonds, baking powder, baking..."
9998,BLT Sandwich,[],"[Produce, Bakery/Bread, Produce, Condiments, M...","[bell pepper, bread, lettuce, mayonnaise, thic...","[bell pepper, bread, lettuce, mayonnaise, baco..."


### Create a list of all ingredients from the cuisine 

In [15]:
composite_ingredients = total_ing_list_from_df(master_data_df_edited, 'ingredients_corrected')

In [16]:
composite_ingredients

['baking powder',
 'baking soda',
 'butter',
 'sugar',
 'sugar',
 'egg',
 'wheat flour',
 'ginger',
 'cinnamon',
 'syrup',
 'pear',
 'salt',
 'butter',
 'chicken',
 'pork',
 'cheese',
 'salt',
 'peppercorns',
 'bread',
 'wheat flour',
 'egg',
 'butter',
 'chicken',
 'chili powder',
 'garlic',
 'salt',
 'turmeric',
 'yogurt',
 'bay leaves',
 'sugar',
 'butter',
 'basil',
 'rosemary',
 'thyme',
 'fennel',
 'water',
 'salt',
 'peppercorns',
 'water',
 'chicken',
 'bay leaves',
 'bread',
 'marjoram',
 'egg',
 'parsley',
 'garlic',
 'garlic',
 'pork',
 'beef',
 'oil',
 'oregano',
 'italian cheese',
 'bread',
 'sugar',
 'tomato',
 'oil',
 'milk or cream',
 'onion',
 'chicken',
 'cinnamon',
 'seeds',
 'seeds',
 'clove',
 'fresh pepper',
 'garlic',
 'ginger',
 'peppercorns',
 'saffron',
 'milk or cream',
 'salt',
 'None',
 'arugula',
 'vinegar',
 'capers',
 'olives',
 'mayonnaise',
 'oil',
 'parsley',
 'tomato',
 'mesclun',
 'shallot',
 'seafood',
 'sugar',
 'vinegar',
 'beets',
 'oil',
 'vine

### Create a dataframe from the composite ingredients list, group by ingredient name with count(), and sort descending

In [17]:
ing_groups = sort_and_count_ingredient_list(composite_ingredients)
# ing_groups.to_csv('master_ingredient_dataframe.csv')
ing_groups.index.values

array(['sugar', 'salt', 'milk or cream', 'oil', 'egg', 'butter', 'onion',
       'garlic', 'wheat flour', 'vanilla', 'lemon', 'peppercorns',
       'water', 'cheese', 'bell pepper', 'tomato', 'salt and pepper',
       'chocolate', 'bread', 'chicken', 'baking powder', 'italian cheese',
       'broth', 'parsley', 'cinnamon', 'berries', 'corn', 'seafood',
       'pork', 'nuts', 'ginger', 'lime', 'carrots', 'baking soda',
       'vinegar', 'thyme', 'beans', 'beef', 'basil', 'rice',
       'fresh pepper', 'cilantro', 'almonds', 'cumin', 'honey', 'potato',
       'wine', 'mushrooms', 'apple', 'pasta', 'orange', 'cocoa',
       'chili powder', 'yogurt', 'bacon', 'hot sauce or salsa', 'banana',
       'nutmeg', 'soy sauce', 'celery', 'cayenne', 'None', 'shallot',
       'syrup', 'walnuts', 'seasoning mix', 'mediterranean cheese',
       'mayonnaise', 'coconut', 'coffee', 'bay leaves', 'oregano',
       'tomato sauce', 'paprika', 'prepared mustard', 'mint', 'spinach',
       'avocado', 'rosemar

## List of top 100 ingredients

In [18]:
top_100_ingredients = ['cumin','onion','garlic', 'vanilla', 'lemon','bell pepper', 'tomato', 'chocolate','mushrooms', 'italian cheese', 'parsley', 'cinnamon', 'berries','corn', 'broth', 'ginger', 'lime','carrots', 'vinegar', 
'thyme','basil', 'rice', 'fresh pepper', 'almonds', 'cilantro', 'potato','honey', 'wine', 'apple', 'pasta', 'orange', 'cocoa','chili powder', 'yogurt', 'bacon', 'hot sauce or salsa', 'banana',
'nutmeg', 'soy sauce', 'celery', 'cayenne', 'shallot','syrup', 'mediterranean cheese','mayonnaise', 'coconut', 'coffee', 'bay leaves', 'oregano','tomato sauce', 'paprika', 'prepared mustard', 
'spinach', 'mint','avocado', 'rosemary', 'chives', 'pineapple','peanut butter', 'worcestershire sauce', 'cherry', 'curry','cabbage', 'turmeric','summer squash', 'barbecue sauce', 'dill',
'pear', 'mango', 'mustard seed', 'chickpea', 'cucumber','broccoli', 'coriander', 'cornmeal','eggplant', 'sage', 'cauliflower', 'grain', 'dates', 'olives','raisin', 'ketchup', 'italian cured meat',
'fresh chile', 'sesame seeds', 'clove', 'flaxseed','winter squash', 'kale', 'beer', 'cardomom','asparagus', 'allspice', 'capers', 'petite peas','agave', 'fennel', 'molasses','masala', 'watermelon', 
'tofu', 'dried chile', 'puff pastry','artichoke', 'plantain', 'apricot','canned pumpkin','brussels sprouts', 'swiss chard','beets', 'peach', 'grapes', 'fish sauce', 'couscous', 'enchilada sauce', 
'grits', 'seeds' 'arugula', 'tarragon', 'leek', 'marjoram','liquid smoke', 'oats', 'brie', 'purple plum', 'snow peas','peppermint extract', 'sunflower kernels', 'rhubarb', 'wonton wrappers', 
'polenta', 'lentils', 'hazelnuts', 'tahini','pumpkin pie spice', 'celery salt', 'pumpkin seeds','poppy seeds','horseradish', 'daikon radish', 'teriyaki sauce','apple juice', 'parsnip','blackeyed peas',
'hoisin sauce', 'saffron','asafoetida', 'creme fraiche','mexican cheese', 'celery seed', 'maple','tamari', 'marsala', 'herbes de provence', 'port','dulce de leche', 'fig', 'yam', 'chinese five spice', 
'arrowroot','fenugreek', 'amaretto liqueur', 'water chestnuts', 'bran','quail', 'pita','whiskey', 'legumes', 'gochujang', 'xylitol', 'farro', 'mexican crema', 'hollandaise sauce','caramel sauce', 
'green tea bag', 'masa harina','thai chili', 'coconut rum','tomatillos', 'dried currants', 'lemon grass', 'boquerones','bitters', 'chai spice', 'broccolini', 'guacamole', 'bourbon','amchoor', 
'radish', 'ladyfingers', 'black bean sauce', 'tea', 'star anise', 'ajwain', 'watercress',  'chili sauce', 'anise', 'broccoli rabe', 'kelp', 'bok choy', 'guava', 'vodka', 'crepes', 'green grapes',
'caraway seeds',  'vegeta','egg noodles', 'banana liqueur', 'black tea bag', 'wasabi powder', 'miso','white grape juice', 'fresh lavender', 'korean bbq marinade','bottle gourd','grand marnier', 
'sweet chili sauce', 'escarole', 'cassava','radicchio', 'moscato', 'lemon pepper','paneer', 'prunes', 'balsamic','hibiscus', 'tequila', 'sardines', 'turnip', 'pickle juice', 'gherkins', 
'orange marmalade','stinging nettle', 'mini peppers', 'carob powder', 'soba noodles', 'zaatar', 'marzipan', 'orange bitters', 'matcha', 'pesto', 'rose water','matzo meal', 'quark', 'okra',
'pico de gallo', 'chili garlic sauce','tangerine', 'file powder', 'cracked wheat','eggnog',  'kefir', 'dandelion greens', 'harissa', 'sprouts','squash blossoms', 'grapefruit', 'tamarind pulp', 
'savory','baharat', 'douchi','sucralose', 'jicama']

### Code to apply regex will go below here

## Code to create dataframe of 1s and 0s below.  Will need to be edited

In [21]:
for row in (top_100_ingredients):
    master_data_df_edited[row] = 0
master_data_df_edited.columns   


Index(['recipe_name', 'cuisine_SP', 'aisle_SP', 'ingredients_SP',
       'ingredients_corrected', 'cumin', 'onion', 'garlic', 'vanilla', 'lemon',
       ...
       'harissa', 'sprouts', 'squash blossoms', 'grapefruit', 'tamarind pulp',
       'savory', 'baharat', 'douchi', 'sucralose', 'jicama'],
      dtype='object', length=268)

In [22]:
master_data_df_edited

Unnamed: 0,recipe_name,cuisine_SP,aisle_SP,ingredients_SP,ingredients_corrected,cumin,onion,garlic,vanilla,lemon,...,harissa,sprouts,squash blossoms,grapefruit,tamarind pulp,savory,baharat,douchi,sucralose,jicama
0,Pear-ginger upside-down cake,[],"[Baking, Baking, Milk, Eggs, Other Dairy, Baki...","[low sodium baking powder, baking soda, butter...","[baking powder, baking soda, butter, sugar, su...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Easy Chicken Cordon Bleu,[],"[Meat, Meat, Cheese, Spices and Seasonings, Sp...","[boneless skinless chicken breast, ham, chedda...","[chicken, pork, cheese, salt, peppercorns, bre...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Chicken 65,[],"[Meat, Spices and Seasonings, Ethnic Foods, Sp...","[chicken breast, chili powder, ginger garlic p...","[chicken, chili powder, garlic, salt, turmeric...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Herb Roasted Chicken,[],"[Spices and Seasonings, Baking, Milk, Eggs, Ot...","[bay leaves, golden brown sugar, butter, dried...","[bay leaves, sugar, butter, basil, rosemary, t...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Meatball Sliders,[],"[Spices and Seasonings, Pasta and Rice, Produc...","[bay leaves, breadcrumbs, marjoram, egg, parsl...","[bay leaves, bread, marjoram, egg, parsley, ga...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,Cinnamon Twists,[],"[Baking, Milk, Eggs, Other Dairy, Milk, Eggs, ...","[dry yeast, butter, egg, milk, salt, sugar]","[yeast, butter, egg, milk or cream, salt, sugar]",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,Fluffy frittata with spinach,[],"[Milk, Eggs, Other Dairy, Produce, Spices and ...","[egg, garlic, black pepper, nutmeg, olive oil,...","[egg, garlic, peppercorns, nutmeg, oil, italia...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,Protein Packed Carrot Muffins,[],"[Spices and Seasonings, Gluten Free;Health Foo...","[dry seasoning rub, almond meal, low sodium ba...","[seasoning mix, almonds, baking powder, baking...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,BLT Sandwich,[],"[Produce, Bakery/Bread, Produce, Condiments, M...","[bell pepper, bread, lettuce, mayonnaise, thic...","[bell pepper, bread, lettuce, mayonnaise, baco...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Assigning 1 or 0 if ingredient is in ingredient list
indexes = master_data_df_edited.index.values.tolist()
for index in indexes:
    for col_name in master_data_df_edited.columns:
        if col_name in master_data_df_edited.ingredients_corrected[index]:
            master_data_df_edited.loc[index,col_name] = 1
master_data_df_edited

Unnamed: 0,recipe_name,cuisine_SP,aisle_SP,ingredients_SP,ingredients_corrected,cumin,onion,garlic,vanilla,lemon,...,harissa,sprouts,squash blossoms,grapefruit,tamarind pulp,savory,baharat,douchi,sucralose,jicama
0,Pear-ginger upside-down cake,[],"[Baking, Baking, Milk, Eggs, Other Dairy, Baki...","[low sodium baking powder, baking soda, butter...","[baking powder, baking soda, butter, sugar, su...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Easy Chicken Cordon Bleu,[],"[Meat, Meat, Cheese, Spices and Seasonings, Sp...","[boneless skinless chicken breast, ham, chedda...","[chicken, pork, cheese, salt, peppercorns, bre...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Chicken 65,[],"[Meat, Spices and Seasonings, Ethnic Foods, Sp...","[chicken breast, chili powder, ginger garlic p...","[chicken, chili powder, garlic, salt, turmeric...",0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Herb Roasted Chicken,[],"[Spices and Seasonings, Baking, Milk, Eggs, Ot...","[bay leaves, golden brown sugar, butter, dried...","[bay leaves, sugar, butter, basil, rosemary, t...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Meatball Sliders,[],"[Spices and Seasonings, Pasta and Rice, Produc...","[bay leaves, breadcrumbs, marjoram, egg, parsl...","[bay leaves, bread, marjoram, egg, parsley, ga...",0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,Cinnamon Twists,[],"[Baking, Milk, Eggs, Other Dairy, Milk, Eggs, ...","[dry yeast, butter, egg, milk, salt, sugar]","[yeast, butter, egg, milk or cream, salt, sugar]",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,Fluffy frittata with spinach,[],"[Milk, Eggs, Other Dairy, Produce, Spices and ...","[egg, garlic, black pepper, nutmeg, olive oil,...","[egg, garlic, peppercorns, nutmeg, oil, italia...",0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,Protein Packed Carrot Muffins,[],"[Spices and Seasonings, Gluten Free;Health Foo...","[dry seasoning rub, almond meal, low sodium ba...","[seasoning mix, almonds, baking powder, baking...",0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
9998,BLT Sandwich,[],"[Produce, Bakery/Bread, Produce, Condiments, M...","[bell pepper, bread, lettuce, mayonnaise, thic...","[bell pepper, bread, lettuce, mayonnaise, baco...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Dropping columns. Cleaning data for machine learning
master_data_df_edited = master_data_df_edited.drop(columns = ["recipe_name", "aisle_SP",  "ingredients_SP", "ingredients_corrected"])
master_data_df_edited

Unnamed: 0,cuisine_SP,cumin,onion,garlic,vanilla,lemon,bell pepper,tomato,chocolate,mushrooms,...,harissa,sprouts,squash blossoms,grapefruit,tamarind pulp,savory,baharat,douchi,sucralose,jicama
0,[],0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,[],0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,[],0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,[],0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,[],0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,[],0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,[],0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,[],0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,[],0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
data_export_file = 'Exported_datafiles/Master10000_ing_cleaned_parsed.csv'
master_data_df_edited.to_csv(data_export_file)