In [1]:
import pandas as pd
import nltk

### Tokenize and simplify the ingredients

In [2]:
# Load the training dataset (full ingredient names and their hand-made short forms)
train_ingr = pd.read_csv('shortened_train.csv')

# strip leading whitespace from both name columns
for name_column in ('full form', 'short form'):
    train_ingr[name_column] = pd.Series([text.lstrip() for text in train_ingr[name_column]])

# index the table by the full form ...
train_ingr.index = train_ingr['full form']

# ... and collapse duplicated full forms, keeping the last entry of each group
groups = train_ingr.groupby(level=train_ingr.index.names)
train_ingr = groups.last()


In [40]:
# read the full list of 3700 ingredients
ingredients = pd.read_csv('ingredients.csv')

# Drop the stray index column. The source export calls it 'Column1'; once a later cell
# has re-saved the table over this same file it may instead be 'Unnamed: 0' or absent,
# so tolerate all three cases to keep the cell re-runnable.
ingredients = ingredients.drop(columns=['Column1', 'Unnamed: 0'], errors='ignore')

In [45]:
# Preview the first entries of the 'full name' column
ingredients['full name'].head()

0    cremini mushrooms
1              caramel
2      maldon sea salt
3          orecchiette
4        baby bok choy
Name: full name, dtype: object

In [None]:
# Create the per-ingredient feature columns.
# NOTE(review): this cell originally read a variable `new_ingredients` that is not defined
# anywhere in the notebook; the 'full name' column is used in its place, which matches the
# displayed 'split' output — confirm this is what the lost variable held.

full_names = ingredients['full name']

# whitespace-separated tokens of each full name
ingredients['split'] = pd.Series([name.split() for name in full_names])

# POS tag of every token (nltk.pos_tag yields (token, tag) pairs; only the tag is kept)
ingredients['pos'] = pd.Series([[tag for _, tag in nltk.pos_tag(tokens)]
                                for tokens in ingredients['split']])

# 1 when the full name appears in the training data, else 0
train_full_forms = set(train_ingr['full form'])  # built once instead of one list per row
ingredients['train'] = pd.Series([int(name in train_full_forms) for name in full_names])

# training short form where available, the placeholder 'na' otherwise
ingredients['short'] = pd.Series([train_ingr.loc[name, 'short form'] if name in train_ingr.index else 'na'
                                  for name in full_names])

# tokens of the short form (['na'] for rows without a training short form)
ingredients['short_split'] = pd.Series([short.split() for short in ingredients['short']])


### Fill the short values of the non-train rows, based on either the word itself or the keep probability of its POS tag

In [47]:
# accumulator for the [word, pos, keep] records built from the training rows
samples = []

# row labels of the ingredients flagged as training rows
idx = ingredients.loc[ingredients['train'] == 1].index

In [48]:
# One record per token of every training ingredient:
# [token, its POS tag, 1 if the token also occurs in the short form else 0]
for row_label in idx:
    row = ingredients.loc[row_label]
    tokens, tags, kept_tokens = row['split'], row['pos'], row['short_split']
    samples.extend([token, tag, int(token in kept_tokens)]
                   for token, tag in zip(tokens, tags))

In [49]:
# Turn the accumulated [word, pos, keep] records into a DataFrame.
# Note: this rebinds `samples` from a list to a DataFrame.
samples=pd.DataFrame(samples, columns=['word', 'pos','keep'])

In [50]:
# (number of rows, number of columns) of the samples frame
samples.shape

(1538, 3)

In [51]:
# Preview the first word / pos / keep samples
samples.head()

Unnamed: 0,word,pos,keep
0,cremini,NN,1
1,mushrooms,NNS,1
2,sliced,VBN,0
3,mushrooms,NNS,0
4,rice,NN,1


In [None]:
### Simple probability model

def keep_prob(x):
    """Estimate the probability that a word is kept in an ingredient's short form.

    The estimate is the mean of ``keep`` over the most specific matching rows of the
    module-level ``samples`` frame: rows with the same word *and* POS tag when any
    exist, otherwise all rows with the same POS tag.

    Parameters
    ----------
    x : sequence
        ``x[0]`` is the word and ``x[1]`` is its POS tag.

    Returns
    -------
    float or int
        Fraction of the matching samples whose ``keep`` is 1; ``0`` when no sample
        carries the given POS tag.
    """
    word, tag = x[0], x[1]
    tag_matches = samples['pos'] == tag
    word_tag_matches = tag_matches & (samples['word'] == word)

    # back off from (word, tag) to the tag alone when that exact pair was never observed
    selected = word_tag_matches if word_tag_matches.any() else tag_matches
    if not selected.any():
        # unseen tag: nothing to average (previously hidden behind a bare except)
        return 0

    # .mean() is true division on every Python version, unlike sum(...)/len(...)
    return float(samples.loc[selected, 'keep'].mean())

In [None]:
# Fill 'short_split' for the non-train rows

# rebuild the token lists from 'short' first, since the loop below extends them in place
ingredients['short_split'] = pd.Series([ingredients['short'][pos].split()
                                        for pos in range(len(ingredients))])

non_train_rows = ingredients[ingredients['train'] == 0].index
for row_label in non_train_rows:
    tokens = ingredients.loc[row_label]['split']
    tags = ingredients.loc[row_label]['pos']
    # keep the tokens whose estimated keep probability exceeds one half
    kept = [token for token, tag in zip(tokens, tags) if keep_prob([token, tag]) > 0.5]
    ingredients.loc[row_label, 'short_split'].extend(kept)


In [None]:
# Add the 'short' name column from 'short_split'.
# A leading 'na' placeholder token is dropped before joining. The slice comparison
# (rather than tokens[0]) also copes with an empty token list, which yields ''.

ingredients['short'] = pd.Series([' '.join(tokens[1:] if tokens[:1] == ['na'] else tokens)
                                  for tokens in ingredients['short_split']])

# Lazily created lemmatizer shared by all stem() calls.
# NOTE(review): the notebook never defined the `lemmatizer` it used; this assumes it was
# nltk's WordNetLemmatizer — confirm.
_lemmatizer = None


def stem(x):
    """Return the lemmatized form of ``x``, or ``None`` when ``x`` is not a string.

    The whole string is passed to the lemmatizer in a single call.

    Previously a bare ``except`` turned every failure into ``None``, including the
    NameError caused by the missing ``lemmatizer`` on a fresh kernel. Now only
    non-string input maps to ``None``; real failures (for example a missing WordNet
    corpus) propagate instead of silently emptying the column.
    """
    global _lemmatizer
    if not isinstance(x, str):
        return None
    if _lemmatizer is None:
        _lemmatizer = nltk.stem.WordNetLemmatizer()
    return _lemmatizer.lemmatize(x)
    

# 'stem' column: stem() applied to every short name
ingredients['stem'] = pd.Series(list(map(stem, ingredients['short'])))

In [53]:
# Preview the first rows of the assembled ingredients table
ingredients.head()

Unnamed: 0,full name,stem,short,split,pos,train,short_split
0,cremini mushrooms,cremini mushrooms,cremini mushrooms,"[cremini, mushrooms]","[NN, NNS]",1,"[cremini, mushrooms]"
1,caramel,caramel,caramel,[caramel],[NN],0,"[na, caramel]"
2,maldon sea salt,maldon sea salt,maldon sea salt,"[maldon, sea, salt]","[NNS, NN, NN]",0,"[na, maldon, sea, salt]"
3,orecchiette,orecchiette,orecchiette,[orecchiette],[NN],0,"[na, orecchiette]"
4,baby bok choy,baby bok choy,baby bok choy,"[baby, bok, choy]","[NN, NNS, VBP]",0,"[na, baby, bok, choy]"


In [None]:
# Save the enriched table. index=False keeps the positional index out of the file, so it
# does not come back as an extra unnamed column when the CSV is read again.
# NOTE(review): this overwrites the same 'ingredients.csv' that the notebook reads as its
# input — consider writing to a separate file.
ingredients.to_csv('ingredients.csv', index=False)

In [56]:
# Unique string stems; non-string entries (e.g. the None that stem() can return) are skipped.
# Sorted so that the list, and every file or column order derived from it, is the same on
# each run (iteration order of a set of strings can differ between interpreter sessions).
short_ingredients = sorted({x for x in ingredients['stem'] if isinstance(x, str)})
len(short_ingredients)

2705

In [57]:
# Save the unified ingredient names as a one-column table
ingr_table = pd.DataFrame({'ingredient': short_ingredients})
ingr_table.to_csv("ingr_list.csv")

In [None]:
# Reload the saved ingredients dataset from disk

In [None]:
ingredients = pd.read_csv('ingredients.csv')

# The re-saved file no longer has 'Column1'; when it was written with the default
# to_csv settings the old index comes back as 'Unnamed: 0'. Drop whichever is present
# instead of raising KeyError on the missing one.
ingredients = ingredients.drop(columns=['Column1', 'Unnamed: 0'], errors='ignore')

In [None]:
from ast import literal_eval

# These columns are read back from the CSV as text; literal_eval turns each cell's
# Python-literal text back into the corresponding object.
for column in ('split', 'pos', 'short_split'):
    parsed_cells = [literal_eval(cell_text) for cell_text in ingredients[column]]
    ingredients[column] = pd.Series(parsed_cells)


### Create the dataset out of 2.7K unified ingredients and recipes

In [None]:
data = pd.read_csv('data.csv')

# Recipe-level columns carried over unchanged.
# .copy() makes data_short an independent frame, so the ingredient columns assigned to it
# later are not written into a slice of `data` (avoids SettingWithCopyWarning).
data_short = data[['title','calories','protein','carbs','fat','sodium','steps', 'meal']].copy()


In [None]:
# 'meal' and 'fat' are also column names of data_short; take them out of the ingredient
# list so the per-ingredient columns do not overwrite those columns.
# Filtering in place (instead of list.remove) keeps the cell re-runnable: it does not
# raise when a name is already absent.
short_ingredients[:] = [name for name in short_ingredients if name not in ('meal', 'fat')]

In [None]:
# Map every stem to the full-name columns of `data` that belong to it, in a single pass
# over `ingredients` (previously the whole table was rescanned once per stem).
data_columns = set(data.columns)
full_names_by_stem = {}
for full_name, stem_name in zip(ingredients['full name'], ingredients['stem']):
    if full_name in data_columns:
        full_names_by_stem.setdefault(stem_name, []).append(full_name)

# Per recipe, sum the columns of all full names that share the stem; a stem with no
# matching column gets an all-zero column.
for ingr in short_ingredients:
    items = full_names_by_stem.get(ingr, [])
    data_short[ingr] = data[items].sum(axis=1)


In [None]:
# Binarise the ingredient columns: 1 when the summed value is positive, 0 otherwise.
# A vectorised comparison replaces applymap, which is deprecated in recent pandas.
data_short[short_ingredients] = (data_short[short_ingredients] > 0).astype(int)

In [None]:

# Write the recipe x ingredient table to disk as UTF-8 (the index is written as well)
data_short.to_csv('data_short.csv', encoding="UTF-8")