# Welcome to the first step of this project

This notebook covers:

 - Loading the datasets
 - Exploratory data analysis of some of their characteristics
 - Preprocessing the data

## imports

In [2]:
import datetime
import json
import re
import unicodedata
from collections import Counter, defaultdict
from pprint import pprint
from string import ascii_lowercase

import numpy as np
import pandas as pd
from googlesearch import search

In [3]:
import spacy
from spacy import displacy
# Load the en_core_web_sm English pipeline once; the resulting `nlp` object is
# reused by remove_plurals() for tagging and lemmatisation.
nlp = spacy.load("en_core_web_sm")


## Preprocessing datasets

 - Dataset of 20k recipes from the food website [Epicurious.com](https://www.epicurious.com/)
 - Recipe Ingredient Kaggle dataset

In [4]:
# Path to the Epicurious recipe dump; read into a dataframe with pandas.
epi_json = 'datasets/full_format_recipes.json'
df=pd.read_json(epi_json)

In [6]:
# Discard rows in which every field is missing.
df = df.dropna(how='all')

# Trim surrounding whitespace from the recipe titles.
df['title'] = df['title'].apply(lambda title: title.strip())

# Record how many entries each list-valued column holds.
for list_col, count_col in (
    ('directions', 'num_dir'),
    ('ingredients', 'num_ing'),
    ('categories', 'num_cat'),
):
    df[count_col] = df[list_col].apply(len)

# Lower-case every ingredient line and split the title into lower-cased,
# comma-free words.
df['ingredients'] = df['ingredients'].apply(
    lambda lines: [line.lower() for line in lines]
)
df['title_words'] = df['title'].apply(
    lambda title: title.lower().replace(',', '').split()
)

In [7]:
# Show the dataframe with the derived columns (num_dir, num_ing, num_cat, title_words).
df

Unnamed: 0,directions,fat,date,categories,calories,desc,protein,rating,title,ingredients,sodium,num_dir,num_ing,num_cat,title_words
0,"[1. Place the stock, lentils, celery, carrot, ...",7.0,2006-09-01 04:00:00+00:00,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",426.0,,30.0,2.500,"Lentil, Apple, and Turkey Wrap","[4 cups low-sodium vegetable or chicken stock,...",559.0,3,15,11,"[lentil, apple, and, turkey, wrap]"
1,[Combine first 9 ingredients in heavy medium s...,23.0,2004-08-20 04:00:00+00:00,"[Food Processor, Onion, Pork, Bake, Bastille D...",403.0,This uses the same ingredients found in boudin...,18.0,4.375,Boudin Blanc Terrine with Red Onion Confit,"[1 1/2 cups whipping cream, 2 medium onions, c...",1439.0,5,28,11,"[boudin, blanc, terrine, with, red, onion, con..."
2,[In a large heavy saucepan cook diced fennel a...,7.0,2004-08-20 04:00:00+00:00,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",165.0,,6.0,3.750,Potato and Fennel Soup Hodge,"[1 fennel bulb (sometimes called anise), stalk...",165.0,2,6,7,"[potato, and, fennel, soup, hodge]"
3,[Heat oil in heavy large skillet over medium-h...,,2009-03-27 04:00:00+00:00,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",,The Sicilian-style tomato sauce has tons of Me...,,5.000,Mahi-Mahi in Tomato Olive Sauce,"[2 tablespoons extra-virgin olive oil, 1 cup c...",,2,10,17,"[mahi-mahi, in, tomato, olive, sauce]"
4,[Preheat oven to 350°F. Lightly grease 8x8x2-i...,32.0,2004-08-20 04:00:00+00:00,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",547.0,,20.0,3.125,Spinach Noodle Casserole,"[1 12-ounce package frozen spinach soufflé, th...",452.0,1,6,11,"[spinach, noodle, casserole]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20125,[Beat whites in a bowl with an electric mixer ...,2.0,2004-08-20 04:00:00+00:00,"[Mixer, Cheese, Egg, Fry, Cocktail Party, Parm...",28.0,,2.0,3.125,Parmesan Puffs,"[2 large egg whites, 3 oz parmigiano-reggiano,...",64.0,2,3,8,"[parmesan, puffs]"
20126,[Bring broth to simmer in saucepan.Remove from...,28.0,2008-02-28 22:06:54+00:00,"[Side, Kid-Friendly, High Fiber, Dinner, Parme...",671.0,Cooking the artichokes with the rice infuses t...,22.0,4.375,Artichoke and Parmesan Risotto,"[5 1/2 cups (or more) low-salt chicken broth, ...",583.0,2,8,16,"[artichoke, and, parmesan, risotto]"
20127,"[Using a sharp knife, cut a shallow X in botto...",38.0,2005-10-21 18:21:20+00:00,"[Onion, Poultry, turkey, Vegetable, Bake, Kid-...",563.0,,31.0,4.375,Turkey Cream Puff Pie,"[1 small tomato, 1 small onion, finely chopped...",652.0,6,17,15,"[turkey, cream, puff, pie]"
20128,[Heat 2 tablespoons oil in heavy medium skille...,24.0,2004-08-20 04:00:00+00:00,"[Milk/Cream, Citrus, Dairy, Fish, Garlic, Past...",631.0,"Sharon Hooykaas of Los Alamitos, California, w...",45.0,4.375,Snapper on Angel Hair with Citrus Cream,"[4 tablespoons olive oil, 4 shallots, thinly s...",517.0,5,18,13,"[snapper, on, angel, hair, with, citrus, cream]"


In [8]:
# Measurement words stripped from ingredient lines by preprocess_df().
# Only singular forms are listed: preprocess_df() also matches "<term>s".
# 'inche' is present so that the plural "inches" is caught by that rule.
units = [
    'ounce', 'oz',
    'cup',
    'teaspoon', 'tablespoon', 'tbsp', 'tsp',
    'quart', 'qt',
    'lb', 'pound',
    'l', 'ml',
    'g', 'gram',
    'inch', 'inche', 'cm',
]

# Size / packaging words stripped the same way as the units above.
descriptions = [
    'large', 'medium', 'small',
    'quarter',
    'pkg', 'package',
    'bottle', 'stick', 'cube', 'can',
    'piece', 'pinch of', 'bit',
]


def remove_plurals(ing):
    """
    Replace plural nouns in an ingredient string by their singular lemma.

    The string is parsed with the module-level spaCy pipeline ``nlp``; every
    token tagged ``'NNS'`` is swapped for its lemma and all other tokens are
    kept verbatim, including their trailing whitespace.

    The string is rebuilt token by token rather than with ``re.sub`` on the
    token text: a regex substitution treats the token as a pattern and also
    rewrites the same letters inside other words (e.g. "eggs" inside
    "eggshells").

    :param ing: ingredient text
    :return: the text with plural-noun tokens singularised
    """
    doc = nlp(ing)
    return ''.join(
        token.lemma_ + token.whitespace_ if token.tag_ == 'NNS' else token.text_with_ws
        for token in doc
    )


def preprocess_df(s):
    """
    Reduce a raw ingredient line to its bare ingredient words.

    Steps, in order: strip the words listed in ``units`` and ``descriptions``
    (singular or with a trailing "s"), turn hyphens into spaces, drop
    parenthesised remarks, fold accented letters to their base letter, remove
    every remaining non-letter character, collapse whitespace and finally
    singularise plural nouns with ``remove_plurals``.

    Unit matching is case-sensitive, so the input should already be
    lower-cased (as is done for ``df.ingredients``).

    :param s: one ingredient line
    :return: cleaned ingredient string without leading/trailing whitespace
    """
    for u in units + descriptions:
        # Whole-word match of the term or its plural ("cup" / "cups").
        s = re.sub(r'\b{}s?\b'.format(u), '', s)

    s = s.replace('-', ' ')
    s = re.sub(r'\(.*?\)', '', s)

    # Fold accents before the ASCII-only filter below; otherwise accented
    # letters are deleted outright ("soufflé" -> "souffl", "jalapeño" -> "jalapeo").
    s = ''.join(
        ch for ch in unicodedata.normalize('NFD', s)
        if not unicodedata.combining(ch)
    )

    s = re.sub(r'[^A-Za-z\s]', '', s)
    s = re.sub(r'\s+', ' ', s)

    s = remove_plurals(s)
    return s.strip()

In [None]:
# Clean every ingredient line of each recipe and store the results sorted.
# The cleaning function is preprocess_df(); the previous call to `pre_process`
# referred to a name that is not defined and raised NameError.
df['ing_cleaned'] = df['ingredients'].apply(
    lambda ingredient_lines: sorted(preprocess_df(line) for line in ingredient_lines)
)

In [None]:
# One space-separated string per recipe holding all of its cleaned ingredients.
df['ing_cleaned_all'] = df['ing_cleaned'].apply(" ".join)

In [None]:
# Expose the row index as an explicit recipe identifier column.
df['recipe_id'] = df.index.tolist()
df.head()

In [None]:
# Persist the cleaned dataframe as both CSV and JSON.
# NOTE(review): `save_dir` is not defined in any cell visible here — confirm it
# is set before running, and that it ends with a path separator since it is
# concatenated directly with the file name.
df.to_csv(save_dir+'epicurious_ing_cleaned.csv')
df.to_json(save_dir+'epicurious_ing_cleaned.json')

In [None]:
# Count how many recipes carry each cuisine label.
# NOTE(review): no visible cell creates a `cuisine` column on this dataframe —
# presumably it is added by a step not shown here; confirm before running.
df.cuisine.value_counts()

# Building Flask app functions

In [None]:
def load_recipes():
    """Read ``epicurious_cuisine.json`` from ``save_dir`` into a dataframe.

    :return: the DataFrame produced by ``pd.read_json``
    """
    recipes_path = save_dir + 'epicurious_cuisine.json'
    return pd.read_json(recipes_path)


df = load_recipes()

In [None]:
def match_string(keywords, title, how='any'):
    """
    Test whether regex keywords occur in a piece of text, ignoring case.

    :param keywords: sized collection of regular-expression patterns
    :param title: text to search
    :param how: ``'any'`` returns True at the first matching pattern;
        ``'all'`` requires every pattern to match
    :return: True or False. When no pattern matched under ``'any'``, or when
        ``how`` is neither value, the result is True only for empty ``keywords``.
    """
    hits = 0
    for pattern in keywords:
        if re.search(pattern, title, re.IGNORECASE) is None:
            continue
        if how == 'any':
            return True
        if how == 'all':
            hits += 1
    # Under 'all' this holds exactly when every pattern matched.
    return hits == len(keywords)

In [None]:
def get_recipes(df, rec_id=None, kw=None, cuis=None):
    """
    Query the recipe dataframe by recipe id or by ingredient keywords.

    :param df: recipe dataframe; needs a ``recipe_id`` column for id lookups
        and an ``ing_cleaned_all`` column for keyword lookups
    :param rec_id: Recipe Id to retrieve a single recipe to be displayed on its
        page; takes precedence over ``kw`` when both are given
    :param kw: whitespace-separated ingredient keywords, all of which must
        match (each keyword is used as a case-insensitive regex by
        ``match_string``)
    :param cuis: List of cuisine selected; when non-empty the result is
        reordered with ``sort_cuis``
    :return: DataFrame of the selected recipes; ``df`` unfiltered when neither
        ``rec_id`` nor ``kw`` is given
    """
    if rec_id is not None:
        df_sel = df[df.recipe_id == int(rec_id)]
    elif kw is not None:
        # split() without an argument drops the empty strings that repeated
        # or leading/trailing spaces would otherwise produce.
        keywords = kw.split()
        mask = df.ing_cleaned_all.apply(lambda text: match_string(keywords, text, 'all'))
        # astype(bool) keeps the mask a valid boolean indexer for an empty frame.
        df_sel = df[mask.astype(bool)]
    else:
        # No filter requested (previously an UnboundLocalError on `mask`).
        df_sel = df

    if cuis:
        df_sel = sort_cuis(df_sel, cuis)
    return df_sel


# Columns of a query result used when presenting recipes.
col = ['recipe_id', 'title', 'ing_cleaned', 'ingredients', 'directions','cuisine']

In [None]:
def sort_cuis(df, cuis_list):
    """
    Reorder recipes so that those of the selected cuisines come first.

    Rows whose ``cuisine`` equals a selected cuisine are placed first, taken
    round-robin across the selected cuisines, each cuisine's rows ordered by
    descending value of the column named after that cuisine. The remaining
    rows follow, again round-robin across the per-cuisine rankings of the
    whole dataframe.

    :param df: recipe dataframe with a ``cuisine`` column and one sortable
        column per selected cuisine name (presumably a per-cuisine score —
        confirm against the data)
    :param cuis_list: list of selected cuisine names
    :return: the reordered dataframe; ``df`` itself when ``cuis_list`` is empty
    """
    if not cuis_list:
        # Nothing to prioritise (max() over no cuisines used to raise ValueError).
        return df

    matched, ranked = {}, {}
    for cui in cuis_list:
        matched[cui] = list(df[df.cuisine == cui].sort_values(cui, ascending=False).index)
        ranked[cui] = list(df.sort_values(cui, ascending=False).index)

    ordered = []
    # Set mirror of `ordered` so membership tests are O(1) instead of O(n).
    seen = set()

    # First the rows that belong to a selected cuisine, one per cuisine per round.
    rounds = max(len(labels) for labels in matched.values())
    for rank in range(rounds):
        for cui in cuis_list:
            labels = matched[cui]
            if rank < len(labels):
                ordered.append(labels[rank])
                seen.add(labels[rank])

    # Then every row not placed yet, following the per-cuisine rankings in turn.
    for candidates in zip(*(ranked[cui] for cui in cuis_list)):
        for label in candidates:
            if label not in seen:
                ordered.append(label)
                seen.add(label)

    return df.loc[ordered]