In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
df = pd.read_json('raw/total.json')

In [2]:
df.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [3]:
df = df.dropna()

In [4]:
def has_num(s):
    for c in s:
        if c.isdigit():
            return True
    return False

In [5]:
units = ['cup','cups','lb','oz','tablespoon','tablespoons', 'teaspoon', 'teaspoons', 'clove', 'cloves', 'small', 'large']
adjs = ['range', 'extra', 'corned', 'cooked', 'steamed', 'toasted', 'unseasoned','waxy','smoked','skim', 'shredded','seasoned', 'processed', 'peeled', 'organic', 'minced', 'chopped', 'peeled', 'drained', 'cut', 'ground', 'light', 'medium', 'melted', 'firm', 'neutral','lean', 'skinless', 'sliced', 'free', 'fine', 'granulated', 'packed', 'firmly', 'fresh', 'freshly']
stopwords = units + adjs + ['style', 'and', 'such', 'as', 'or', 'not', 'into', 'other', 'in', 'to']
def filter_ingredients(row):
    filtered = []
    ingredients = row['ingredients']
    for raw_ingrd in ingredients:
        ingrd = ''
        for word in raw_ingrd.split(' '):
            if not (word in stopwords or has_num(word)):
                ingrd += ' ' + word
        filtered.append(ingrd.strip())
    return filtered

In [6]:
def combine_array(row):
    ingredients = row['ingredientsFiltered']
    if len(ingredients) == 0:
        return ''
    else:
        recipe = ingredients[0]
        for i in range(1,len(ingredients)):
            recipe = recipe + ' ' + ingredients[i].strip().replace(' ','_').strip('_')
        return recipe

In [7]:
label_mapping = {}
label = 0
for cuisine in df['cuisine'].unique():
    label_mapping[cuisine] = label
    label += 1
def label_cuisine(row):
    return label_mapping[row['cuisine']]

In [8]:
df['ingredientsFiltered'] = df.apply(filter_ingredients, axis=1)
df['recipe'] = df.apply(combine_array, axis=1)
df['label'] = df.apply(label_cuisine, axis=1)

In [9]:
df.head()

Unnamed: 0,cuisine,id,ingredients,ingredientsFiltered,recipe,label
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...","[romaine lettuce, black olives, grape tomatoes...",romaine lettuce black_olives grape_tomatoes ga...,0
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...","[plain flour, pepper, salt, tomatoes, black pe...",plain flour pepper salt tomatoes black_pepper ...,1
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[eggs, pepper, salt, mayonaise, cooking oil, g...",eggs pepper salt mayonaise cooking_oil green_c...,2
3,indian,22213,"[water, vegetable oil, wheat, salt]","[water, vegetable oil, wheat, salt]",water vegetable_oil wheat salt,3
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...","[black pepper, shallots, cornflour, cayenne pe...",black pepper shallots cornflour cayenne_pepper...,3


In [10]:
df.to_json('processed-total.json', orient='records', lines=True)

In [11]:
train_df = pd.DataFrame()
test_df = pd.DataFrame()

In [12]:
TRAIN_TEST_SPLIT = 0.75

for i in df['label'].unique():
    temp = df.loc[df['label']==i]
    split = int(TRAIN_TEST_SPLIT*len(temp))
    train_df = train_df.append(temp.iloc[:split])
    test_df = test_df.append(temp.iloc[split:])

In [13]:
train_df.index = range(len(train_df.index))
test_df.index = range(len(test_df.index))

In [14]:
train_df

Unnamed: 0,cuisine,id,ingredients,ingredientsFiltered,recipe,label
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...","[romaine lettuce, black olives, grape tomatoes...",romaine lettuce black_olives grape_tomatoes ga...,0
1,greek,34471,"[ground pork, finely chopped fresh parsley, on...","[pork, finely parsley, onions, salt, vinegar, ...",pork finely_parsley onions salt vinegar caul_fat,0
2,greek,4635,"[minced garlic, dried oregano, red wine vinega...","[garlic, dried oregano, red wine vinegar, oliv...",garlic dried_oregano red_wine_vinegar olive_oi...,0
3,greek,5980,"[orange, anise, cinnamon sticks, unflavored ge...","[orange, anise, cinnamon sticks, unflavored ge...",orange anise cinnamon_sticks unflavored_gelati...,0
4,greek,18031,"[fresh dill, yoghurt, salt, myzithra, large eg...","[dill, yoghurt, salt, myzithra, eggs, cheese, ...",dill yoghurt salt myzithra eggs cheese feta_ch...,0
5,greek,24338,"[olive oil, salt, hamburger buns, paprika, cho...","[olive oil, salt, hamburger buns, paprika, min...",olive oil salt hamburger_buns paprika mint cin...,0
6,greek,22678,"[pepper, dried mint flakes, salt, dried oregan...","[pepper, dried mint flakes, salt, dried oregan...",pepper dried_mint_flakes salt dried_oregano to...,0
7,greek,35408,"[garbanzo beans, liquid, black pepper, garlic,...","[garbanzo beans, liquid, black pepper, garlic,...",garbanzo beans liquid black_pepper garlic tahi...,0
8,greek,32480,"[dry red wine, cinnamon sticks, Turkish bay le...","[dry red wine, cinnamon sticks, Turkish bay le...",dry red wine cinnamon_sticks Turkish_bay_leave...,0
9,greek,11665,"[mint leaves, sliced almonds, vanilla lowfat y...","[mint leaves, almonds, vanilla lowfat yogurt, ...",mint leaves almonds vanilla_lowfat_yogurt honey,0


In [15]:
test_df

Unnamed: 0,cuisine,id,ingredients,ingredientsFiltered,recipe,label
0,greek,Slow-Cooker-Greek-Rice-Recipe-with-Red-Bell-Pe...,"[olive oil, Uncle Ben's Original Converted Bra...","[olive oil, Uncle Ben's Original Converted Bra...",olive oil Uncle_Ben's_Original_Converted_Brand...,0
1,greek,Frozen-Yogurt-1241082,"[whole milk greek yogurt, sugar, vanilla extract]","[whole milk greek yogurt, sugar, vanilla extract]",whole milk greek yogurt sugar vanilla_extract,0
2,greek,Braised-greek-green-beans-332127,"[olive oil, white onion, tomatoes, garlic, sal...","[olive oil, white onion, tomatoes, garlic, sal...",olive oil white_onion tomatoes garlic salt bla...,0
3,greek,Margaret_s-Keftedes-_greek-Meatballs_-Allrecipes,"[white bread, milk, garlic, onions, dried mint...","[white bread, milk, garlic, onions, dried mint...",white bread milk garlic onions dried_mint salt...,0
4,greek,Homemade-Caesar-Dressing-_Easy-_-Healthy_-1563691,"[nonfat plain greek yogurt, grated parmesan ch...","[nonfat plain greek yogurt, grated parmesan ch...",nonfat plain greek yogurt grated_parmesan_chee...,0
5,greek,Blueberry-Mango-Smoothie-1571624,"[water, greek yogurt, bananas, lemon, frozen m...","[water, greek yogurt, bananas, lemon, frozen m...",water greek_yogurt bananas lemon frozen_mango ...,0
6,greek,Greek-Seasoned-Spatchcock-Chicken-_AIP-friendl...,"[garlic, lemon, fresh oregano, chopped fresh t...","[garlic, lemon, oregano, thyme, rosemary, salt...",garlic lemon oregano thyme rosemary salt avoca...,0
7,greek,Greek-Baked-Scallops-Santorini-1545890,"[bay scallops, olive oil, chopped onion, diced...","[bay scallops, olive oil, onion, diced tomatoe...",bay scallops olive_oil onion diced_tomatoes dr...,0
8,greek,Greek-Yogurt-Double-Chocolate-Muffins-1106559,"[whole wheat flour, all-purpose flour, granula...","[whole wheat flour, all-purpose flour, sugar, ...",whole wheat flour all-purpose_flour sugar unsw...,0
9,greek,Orange-Creamsicle-Smoothie-1578683,"[bananas, vanilla extract, orange juice, greek...","[bananas, vanilla extract, orange juice, greek...",bananas vanilla_extract orange_juice greek_yog...,0


In [16]:
train_df.to_json('train/trainText-75-25.json', orient='records', lines=True)
test_df.to_json('test/testText-75-25.json', orient='records', lines=True)