In [2]:
import pandas as pd
import nltk

### Tokenize and simplify the ingredients

In [2]:
# Upload trained dataset
train_ingr=pd.read_csv('data/shortened_train.csv')

train_ingr['full form']=pd.Series([x.lstrip() for x in train_ingr['full form']])
train_ingr['short form']=pd.Series([x.lstrip() for x in train_ingr['short form']])
train_ingr.index=train_ingr['full form']

groups = train_ingr.groupby(level=train_ingr.index.names) 
train_ingr=groups.last()


In [112]:
# read the full list of 3700 ingredients
ingredients=pd.read_csv('data/ingredients.csv')
ingredients.drop('Column1', axis=1, inplace=True)

In [113]:
ingredients['full name'].head()

0    cremini mushrooms
1              caramel
2      maldon sea salt
3          orecchiette
4        baby bok choy
Name: full name, dtype: object

In [None]:
# create the dataset of ingredients

ingredients['split']=pd.Series([x.split() for x in new_ingredients])

ingredients['pos']=pd.Series([[xx[1] for xx in nltk.pos_tag(x)] for x in ingredients['split']])

ingredients['train']=pd.Series([int(x in list(train_ingr['full form'])) for x in ingredients['full name']])

ingredients['short']=pd.Series([train_ingr.loc[x]['short form'] 
                                if (x in train_ingr.index)&(x in new_ingredients) else 'na' for x in ingredients['full name'] ])

ingredients['short_split']=pd.Series([ingredients['short'][i].split() for i in range(ingredients.shape[0])])


### Fill the non-train short values based either on word, or the POS probability

In [47]:
samples=[]
idx=ingredients[ingredients.train==1].index

In [48]:
for i in idx:
    split=ingredients.loc[i]['split']
    pos=ingredients.loc[i]['pos']
    short_split=ingredients.loc[i]['short_split']
    for j in range(len(split)):
        samples.append([split[j], pos[j], int(split[j] in short_split)])

In [49]:
samples=pd.DataFrame(samples, columns=['word', 'pos','keep'])

In [50]:
samples.shape

(1538, 3)

In [51]:
samples.head()

Unnamed: 0,word,pos,keep
0,cremini,NN,1
1,mushrooms,NNS,1
2,sliced,VBN,0
3,mushrooms,NNS,0
4,rice,NN,1


In [None]:
### Simple probabiity model

def keep_prob(x):
    if x[0] in list(samples.word):
        idx=samples[samples.word==x[0]].index
        if x[1] in list(samples.loc[idx]['pos']):
            idx=samples.loc[idx][samples.pos==x[1]].index
        else:
            idx=samples[samples.pos==x[1]].index
    else: 
        idx=samples[samples.pos==x[1]].index
    
    try: return sum(samples.loc[idx]['keep'])/len(samples.loc[idx]['keep']) 
    except: return 0

In [None]:
# Fille the 'short_spit' for non-train data

ingredients['short_split']=pd.Series([ingredients['short'][i].split() for i in range(ingredients.shape[0])])

for i in ingredients[ingredients.train==0].index:
    ss=[]
    for j in range(len(ingredients.loc[i]['split'])):
        x=[ingredients.loc[i]['split'][j], ingredients.loc[i]['pos'][j]]
        if keep_prob(x)>0.5:
            ss.extend([x[0]])
    ingredients.loc[i, 'short_split'].extend(ss)


In [None]:
# Add 'short' name and 'stemmed' name columns from short_split

ingredients['short']=pd.Series([' '.join(x[1:]) if x[0]=='na' else ' '.join(x) 
                               for x in ingredients['short_split'] ])

def stem(x):
    try: return lemmatizer.lemmatize(x)
    except: return None
    

ingredients['stem']=pd.Series([stem(x) for x in ingredients['short']])

In [53]:
ingredients.head()

Unnamed: 0,full name,stem,short,split,pos,train,short_split
0,cremini mushrooms,cremini mushrooms,cremini mushrooms,"[cremini, mushrooms]","[NN, NNS]",1,"[cremini, mushrooms]"
1,caramel,caramel,caramel,[caramel],[NN],0,"[na, caramel]"
2,maldon sea salt,maldon sea salt,maldon sea salt,"[maldon, sea, salt]","[NNS, NN, NN]",0,"[na, maldon, sea, salt]"
3,orecchiette,orecchiette,orecchiette,[orecchiette],[NN],0,"[na, orecchiette]"
4,baby bok choy,baby bok choy,baby bok choy,"[baby, bok, choy]","[NN, NNS, VBP]",0,"[na, baby, bok, choy]"


In [None]:
ingredients.to_csv('ingredients.csv')

In [141]:
short_ingredients=list(set(ingredients['stem']))
short_ingredients=[x for x in short_ingredients if type(x)==str]

In [142]:
len(short_ingredients)

2640

In [143]:
short_ingredients.remove('meal')
short_ingredients.remove('fat')

In [144]:
pd.DataFrame(short_ingredients, columns=['ingredient']).to_csv("data/ingr_list.csv", encoding="UTF-8")

In [None]:
# Further upload of ingredients dataset

In [114]:
ingredients=pd.read_csv('data/ingredients.csv')
ingredients.drop('Column1', axis=1, inplace=True)

In [222]:
from ast import literal_eval

In [115]:
for column in list(['split', 'pos', 'short_split']):
    ingredients[column]=pd.Series(literal_eval(x) for x in ingredients[column])


### Create the dataset out of 2.7K unified ingredients and recipes

In [None]:
data=pd.read_csv('data/data.csv')

data_short=data[['title','calories','protein','carbs','fat','sodium','steps', 'meal']]


In [None]:
len(short_ingredients)

for ingr in short_ingredients:
    items=[ingredients.loc[i]['full name'] for i in ingredients.index if ingredients.loc[i]['stem']==ingr]
    items=[x for x in items if x in data.columns]
    data_short[ingr]=data[items].sum(axis=1)


In [None]:
data_short[short_ingredients]=data_short[short_ingredients].applymap(lambda x: int(x>0))

In [None]:

data_short.to_csv('data_short.csv', encoding="UTF-8")

### Categorize ingredients

#### Categorize vegetarian and vegan products

In [129]:
data_short=pd.read_csv('data/data_short.csv', encoding="UTF-8")

In [217]:
short_ingredients=pd.read_csv('data/ingr_list.csv', encoding="UTF-8")

In [218]:
short_ingredients.drop('Unnamed: 0', axis=1, inplace=True)

In [151]:
red_meat=['lamb', 'beef', 'meat', 'mutton', 'veal', 'venison', 'pork',
      'turkey', 'bacon', 'ham', 'hot dogs', 'jamon', 'prosciutto', 'salami', 'sausages', 'rabbit']

In [None]:
short_ingredients[str(column)]=pd.Series([1 if len(set(column).intersection(x.split(' ')))>0 else 0 
                                     for x in short_ingredients.ingredient])

In [152]:
poultry=['chicken', 'turkey', 'duck', 'grouse', 'pheasant']

In [153]:
fish=['fish', 'salmon', 'cod', 'tilapia', 'catfish', 'halibut', 'mahi', 'tuna', 'mackerel',  
      'swordfish', 'sole', 'bass', 'anchovy', 'herring', 'haddock', 'monkfish', 'mullet', 'sardines', 'swordfish', 
     'trout', 'caviar', 'turbot', 'snapper', 'sturgeon', 'flounder', 'barramundi']

In [154]:
seafood=['crabs', 'shells', 'shrimp', 'clam', 'lobster', 'octopus', 'scallops', 'squid', 'eel']

In [155]:
dairy=['cheese', 'milk', 'mascarpone', 'butter' 'buttermilk', 'ice cream', 'sour cream', 'yogurt', 'yoghurt', 'kefir', 'custard']

In [156]:
milk_exclude=['coconut', 'soymilk', 'soy', 'cashew', 'almond', 'rice']

In [157]:
egg=['egg', 'eggs', 'yolk', 'whites']

In [160]:
#vegetables
vegetables=list(pd.read_csv('data/vegetables.csv')['Vegetables'])
vegetables=[x.lower() for x in vegetables]
vegetables=[x.split('/') for x in vegetables]
vegetables=[y for x in vegetables for y in x]

In [197]:
# fruit
fruits=list(pd.read_csv('data/fruits.csv')['Fruits'])
fruits=[x.lower() for x in fruits]
fruits=[x.split('/') for x in fruits]
fruits=[y for x in fruits for y in x]

In [None]:
d=dict(zip(['red_meat', 'poultry', 'fish', 'seafood', 'egg', 'vegetables', 'fruits'], 
     [red_meat, poultry, fish, seafood, egg, vegetables, fruits]))

In [229]:
for column in d.keys():
    short_ingredients[column]=pd.Series([1 if len(set(d[column]).intersection(x.split(' ')))>0 else 0 
                                     for x in short_ingredients.ingredient])

In [233]:
short_ingredients['dairy']=pd.Series([1 if (len(set(dairy).intersection(x.split(' ')))>0)&
                                            (len(set(milk_exclude).intersection(x.split(' ')))==0) else 0 
                                     for x in short_ingredients.ingredient])

In [234]:
short_ingredients.head()

Unnamed: 0,ingredient,red_meat,poultry,fish,seafood,egg,vegetables,fruits,dairy
0,maitake mushrooms,0,0,0,0,0,1,0,0
1,america,0,0,0,0,0,0,0,0
2,shiro miso,0,0,0,0,0,0,0,0
3,hash brown,0,0,0,0,0,0,0,0
4,jicama,0,0,0,0,0,0,0,0


In [240]:
short_ingredients.to_csv('data/ing_list.csv', encoding='UTF-8')

In [239]:
[(x, sum(short_ingredients[x])) for x in ['red_meat', 'poultry', 'fish', 'seafood', 'egg', 'vegetables', 'fruits', 'dairy']]

[('red_meat', 137),
 ('poultry', 46),
 ('fish', 47),
 ('seafood', 33),
 ('egg', 12),
 ('vegetables', 320),
 ('fruits', 227),
 ('dairy', 67)]

### Separate desserts into category (in process)

In [None]:
# Identify ingredients which often appear in dessets

In [73]:
idx=[i for i in data_short.index if 'cake' in data_short.loc[i]['title']]

In [98]:
data_short.drop('Unnamed: 0', axis=1, inplace=True)

In [99]:
x=data_short.loc[idx]

In [105]:
x=[(y, sum(x[y])) for y in data_short.columns if y not in ['title','calories','protein','carbs','fat','sodium','steps', 'meal']]

In [109]:
x.sort(key=lambda tup: tup[1])

In [121]:
x[::-1][:10]

[('egg', 236),
 ('butter', 213),
 ('sugar', 198),
 ('flour', 196),
 ('salt', 152),
 ('vanilla extract', 113),
 ('baking powder', 111),
 ('cream cheese', 89),
 ('sour cream', 76),
 ('baking soda', 68)]

In [39]:
import json
from pprint import pprint

In [40]:
with open('epicurious/full_format_recipes.json') as f:
    recipes = json.load(f)