In [1]:
import pandas as pd
import numpy as np
import re
from ast import literal_eval

### Load data frame from csv file

In [6]:
data_file = 'MasterCSV.csv'
data_df = pd.read_csv(data_file, converters = {'cuisine':  literal_eval,
                                              'aisle_SP':  literal_eval,
                                              'ingredients_SP':  literal_eval})
data_df

Unnamed: 0,recipe_name,cuisine_SP,aisle_SP,ingredients_SP
0,Pear-ginger upside-down cake,[],"[Baking, Baking, Milk, Eggs, Other Dairy, Baki...","[low sodium baking powder, baking soda, butter..."
1,Easy Chicken Cordon Bleu,[],"[Meat, Meat, Cheese, Spices and Seasonings, Sp...","[boneless skinless chicken breast, ham, chedda..."
2,Chicken 65,[],"[Meat, Spices and Seasonings, Ethnic Foods, Sp...","[chicken breast, chili powder, ginger garlic p..."
3,Herb Roasted Chicken,[],"[Spices and Seasonings, Baking, Milk, Eggs, Ot...","[bay leaves, golden brown sugar, butter, dried..."
4,Meatball Sliders,[],"[Spices and Seasonings, Pasta and Rice, Produc...","[bay leaves, breadcrumbs, marjoram, egg, parsl..."
...,...,...,...,...
9995,Cinnamon Twists,[],"[Baking, Milk, Eggs, Other Dairy, Milk, Eggs, ...","[dry yeast, butter, egg, milk, salt, sugar]"
9996,Fluffy frittata with spinach,[],"[Milk, Eggs, Other Dairy, Produce, Spices and ...","[egg, garlic, black pepper, nutmeg, olive oil,..."
9997,Protein Packed Carrot Muffins,[],"[Spices and Seasonings, Gluten Free;Health Foo...","[dry seasoning rub, almond meal, low sodium ba..."
9998,BLT Sandwich,[],"[Produce, Bakery/Bread, Produce, Condiments, M...","[bell pepper, bread, lettuce, mayonnaise, thic..."


In [4]:
data_df.loc[0,'ingredients_SP'][3]

'dark brown sugar'

### Define functions required to clean dataframe

In [78]:
# Converting strings to lists then remove [] and ()
def ingredients_cleanup (data_df, column):
    row_list = []
    indexes = data_df.index.values.tolist()
    counter = 0
    for row in data_df[column]:
        row = row.replace("[", '')
        row = row.replace("'", '')
        row = row.replace("]", '')
        row = row.replace('"', '')
        row_list = row.split( ',')
        data_df.loc[indexes[counter],column] = row_list
        counter += 1
    
    return data_df

In [79]:
# Removing leading white spaces 
def remove_leading_ws(data_df, column):
    indexes = data_df.index.values.tolist()
    counter = 0
    for row in data_df[column]:
        for i, element in enumerate(row):
            element = element.strip()
            data_df.loc[indexes[counter],column][i] = element
        counter += 1
    return data_df


In [8]:
# Creating a single list from all the ingredents 
def total_ing_list_from_df(data_df):
    composite_ingredients = []
    for row in data_df.ingredients_SP:
        for element in row:
            composite_ingredients.append(element)
    return composite_ingredients

In [10]:
# Creating list of composite list and creat count of ingredients
def sort_and_count_ingredient_list(ingredient_list):
    ingredient_df = pd.DataFrame(ingredient_list)
    ingredient_df = ingredient_df.rename(columns = {0:  'ingredient'})
    ingredient_df['count'] = 1
    ingredient_counts = ingredient_df.groupby('ingredient').agg({'count':  'count'})
    ingredient_counts.sort_values('count', ascending=False, inplace=True)
    return ingredient_counts


In [82]:
# Replacing different verisons of ingredients
def ingredient_replacement (data_df, replacement_dict):
    
    
    axises = data_df.index.values.tolist()
    counter = 0
    for row in data_df.ingredients_SP:
        for i, element in enumerate(row):
            if element in replacement_dict.keys():
                element = replacement_dict[element]
                data_df.loc[axises[counter],'ingredients_SP'][i] = element
        counter += 1
    return data_df

### Create a list of all ingredients from the cuisine 

In [9]:
composite_ingredients = total_ing_list_from_df(data_df)

### Create a dataframe from the composite ingredients list, group by ingredient name with count(), and sort descending

In [11]:
ing_groups = sort_and_count_ingredient_list(composite_ingredients)
ing_groups.to_csv('master_ingredient_dataframe.csv')

### Code to apply regex will go below here

## Code to create dataframe of 1s and 0s below.  Will need to be edited

In [100]:
for index, row in enumerate(ing_groups.index):
    if index < 20:
        data_df[row] = 0
data_df.columns   


Index(['recipe_name', 'cuisine_SP', 'aisle_SP', 'ingredients_SP', 'garlic',
       'chocolate', 'tomato', 'vanilla', 'lemon', 'parsley', 'honey',
       'vinegar', 'flour', 'cream', 'broth', 'basil', 'shallot', 'parmesan',
       'buttermilk', 'fresh pepper', 'almonds', 'chili powder', 'dry yeast',
       'orange'],
      dtype='object')

In [102]:
data_df

Unnamed: 0,recipe_name,cuisine_SP,aisle_SP,ingredients_SP,garlic,chocolate,tomato,vanilla,lemon,parsley,...,broth,basil,shallot,parmesan,buttermilk,fresh pepper,almonds,chili powder,dry yeast,orange
0,White Chocolate Macadamia Banana Bread,[],"[Baking, Milk, Eggs, Other Dairy, Produce,...","[sugar, butter, banana, milk, cheese, egg, whe...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Asian Chickpea Lettuce Wraps,['Asian'],"[Canned and Jarred, Ethnic Foods, Condiments...","[chickpeas, chili sauce, barbecue sauce, soy s...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Salsa Verde Chicken Tamales,['Mexican'],"[Meat, Meat, Produce, Produce, Spices and ...","[chicken, chicken, garlic, onion, cumin, salt ...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Cheddar Polenta With Bacon Wrapped Asparagus,[],"[Produce, Meat, Canned and Jarred, Cereal;B...","[asparagus, bacon, broth, grits, cheese, bell ...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Blasian's Deviled Eggs,['American'],"[Milk, Eggs, Other Dairy, Condiments, Cond...","[egg, mayonnaise, mustard, salt and pepper, ga...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Authentic Mexican Wedding Cookies,['Mexican'],"[Milk, Eggs, Other Dairy, Baking, Baking, ...","[butter, sugar, vanilla, wheat flour, pecans, ...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,Corned Beef Ribs With Brown Sugar and Mustard ...,[],"[Spices and Seasonings, Spices and Seasonings...","[bay leaves, salt and pepper, cabbage, carrot,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,Green Monster Ice Pops,[],"[Milk, Eggs, Other Dairy, Produce, Produce...","[almond milk, avocado, spinach, banana, honey,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,Orange Zest Maple Date Bars,[],"[Dried Fruits;Produce, Beverages, Cereal, P...","[dates, water, maple syrup, orange, sugar, but...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [103]:
# Assigning 1 or 0 if ingredient is in ingredient list
indexes = data_df.index.values.tolist()
for index in indexes:
    for col_name in data_df.columns:
        if col_name in data_df.ingredients_SP[index]:
            data_df.loc[index,col_name] = 1
data_df

Unnamed: 0,recipe_name,cuisine_SP,aisle_SP,ingredients_SP,garlic,chocolate,tomato,vanilla,lemon,parsley,...,broth,basil,shallot,parmesan,buttermilk,fresh pepper,almonds,chili powder,dry yeast,orange
0,White Chocolate Macadamia Banana Bread,[],"[Baking, Milk, Eggs, Other Dairy, Produce,...","[sugar, butter, banana, milk, cheese, egg, whe...",0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Asian Chickpea Lettuce Wraps,['Asian'],"[Canned and Jarred, Ethnic Foods, Condiments...","[chickpeas, chili sauce, barbecue sauce, soy s...",0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,Salsa Verde Chicken Tamales,['Mexican'],"[Meat, Meat, Produce, Produce, Spices and ...","[chicken, chicken, garlic, onion, cumin, salt ...",1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3,Cheddar Polenta With Bacon Wrapped Asparagus,[],"[Produce, Meat, Canned and Jarred, Cereal;B...","[asparagus, bacon, broth, grits, cheese, bell ...",0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,Blasian's Deviled Eggs,['American'],"[Milk, Eggs, Other Dairy, Condiments, Cond...","[egg, mayonnaise, mustard, salt and pepper, ga...",1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Authentic Mexican Wedding Cookies,['Mexican'],"[Milk, Eggs, Other Dairy, Baking, Baking, ...","[butter, sugar, vanilla, wheat flour, pecans, ...",0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
96,Corned Beef Ribs With Brown Sugar and Mustard ...,[],"[Spices and Seasonings, Spices and Seasonings...","[bay leaves, salt and pepper, cabbage, carrot,...",1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,Green Monster Ice Pops,[],"[Milk, Eggs, Other Dairy, Produce, Produce...","[almond milk, avocado, spinach, banana, honey,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,Orange Zest Maple Date Bars,[],"[Dried Fruits;Produce, Beverages, Cereal, P...","[dates, water, maple syrup, orange, sugar, but...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [104]:
# Dropping columns. Cleaning data for machine learning
data_df = data_df.drop(columns = ["recipe_name", "aisle_SP",  "ingredients_SP"])
data_df

Unnamed: 0,cuisine_SP,garlic,chocolate,tomato,vanilla,lemon,parsley,honey,vinegar,flour,...,broth,basil,shallot,parmesan,buttermilk,fresh pepper,almonds,chili powder,dry yeast,orange
0,[],0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,['Asian'],0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
2,['Mexican'],1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3,[],0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,['American'],1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,['Mexican'],0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,[],1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
97,[],0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
98,[],0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [105]:
data_export_file = '100_random_cleaned_0908.csv'
data_df.to_csv(data_export_file)