In [44]:
import numpy as np
import pandas as pd
import re
from collections import Counter

In [45]:
flatten = lambda l: [item for sublist in l for item in sublist]

## Filtering / cleaning functions

In [46]:
def not_has_numbers(inputString):
    return not any(char.isdigit() for char in inputString)

def not_one_or_two_letter(inputString):
    return len(inputString) > 2

In [47]:
def change_to_singular(inputString):
    inputString = re.sub('ies$', 'y', inputString)
    inputString = re.sub('es$', '', inputString)
    inputString = re.sub('s$', '', inputString)
    
    return inputString

In [48]:
def filter_and_normalize_list(words_list):
    words_list = filter(not_has_numbers, words_list)
    words_list = filter(not_one_or_two_letter, words_list)
    words_list = map(change_to_singular, words_list)
    words_list = map(lambda x: x.lower(), words_list)
    
    return list(words_list)

## Products bought in supermarket

In [49]:
exec(open("allowed_aisles.txt").read())
products = pd.read_csv("../../data/instacart/products.csv")
products = products[products.aisle_id.isin(allowed_aisles)]
names = products.product_name.str.lower().tolist()

In [50]:
products

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
4,5,Green Chile Anytime Sauce,5,13
6,7,Pure Coconut Water With Orange,98,7
7,8,Cut Russet Potatoes Steam N' Mash,116,1
8,9,Light Strawberry Blueberry Yogurt,120,16
10,11,Peach Mango Juice,31,7
16,17,Rendered Duck Fat,35,12
18,19,Gluten Free Quinoa Three Cheese & Mushroom Blend,63,9
19,20,Pomegranate Cranberry & Aloe Vera Enrich Drink,98,7


In [51]:
concat = " ".join(names)
single_words = re.sub("[^\w]", " ",  concat).split()
bought_names = np.unique(single_words)

bought_names = filter_and_normalize_list(bought_names)
print(bought_names)

['aaa', 'abate', 'abc', 'abl', 'abruzzese', 'absorbency', 'absorbenncy', 'absorber', 'abuelita', 'abundant', 'acacia', 'acai', 'acaí', 'accent', 'aceita', 'ache', 'achiote', 'acid', 'acidophilu', 'acini', 'acme', 'acorn', 'acting', 'active', 'activedry', 'activia', 'activity', 'add', 'added', 'ade', 'adhesive', 'adige', 'adobo', 'aduki', 'advanced', 'adzuki', 'aerosol', 'affi', 'african', 'after', 'agar', 'agave', 'age', 'aged', 'agen', 'ag', 'ahi', 'ahoy', 'aid', 'ailoi', 'aioli', 'aiolo', 'air', 'aji', 'aka', 'akoma', 'alaria', 'alarm', 'alaska', 'alaskan', 'alba', 'albacore', 'albacoretuna', 'albarino', 'alcaparrado', 'alcohol', 'alcoholic', 'alder', 'alderwood', 'ale', 'alehouse', 'aleppo', 'aleve', 'alexander', 'alfabeto', 'alfalfa', 'alfredo', 'alfresco', 'algae', 'alice', 'alive', 'alkaline', 'alkalize', 'all', 'alla', 'allspice', 'allure', 'almnd', 'almond', 'almondmilk', 'almond', 'almonette', 'almost', 'aloe', 'aloha', 'aloo', 'alpha', 'alphabet', 'alphabet', 'alphatot', 'alp

In [52]:
products['product_name'] = products['product_name'].apply(lambda x: x.split(' '))

In [53]:
products['product_name'] = products['product_name'].apply(filter_and_normalize_list)

## Products that appear in recipes

In [54]:
recipes = pd.read_json("../../data/recipes/train.json")
fl = flatten(list(recipes["ingredients"]))
ingr = " ".join(fl)

recipes_products = re.sub("[^\w]", " ",  ingr).split()
recipes_products = filter_and_normalize_list(recipes_products)

In [55]:
Counter(recipes_products).most_common(2000)

[('pepper', 27187),
 ('salt', 24462),
 ('oil', 23344),
 ('onion', 19242),
 ('garlic', 18941),
 ('ground', 18271),
 ('fresh', 17853),
 ('sauce', 13129),
 ('sugar', 12502),
 ('cheese', 11776),
 ('chicken', 11557),
 ('tomato', 11205),
 ('olive', 10920),
 ('black', 10753),
 ('water', 9790),
 ('red', 9189),
 ('green', 9089),
 ('egg', 9039),
 ('flour', 8851),
 ('butter', 8648),
 ('powder', 8290),
 ('chopped', 7884),
 ('clov', 7557),
 ('juice', 7146),
 ('white', 6915),
 ('cream', 6096),
 ('rice', 6073),
 ('cilantro', 5950),
 ('milk', 5872),
 ('lemon', 5796),
 ('vegetable', 5645),
 ('leav', 5580),
 ('large', 5396),
 ('ginger', 5388),
 ('corn', 5325),
 ('dried', 5316),
 ('vinegar', 4972),
 ('lime', 4942),
 ('soy', 4888),
 ('all', 4839),
 ('purpose', 4831),
 ('cumin', 4644),
 ('broth', 4504),
 ('chili', 4318),
 ('wine', 4315),
 ('bell', 4275),
 ('parsley', 4150),
 ('seed', 4038),
 ('bean', 3994),
 ('sesame', 3518),
 ('breast', 3335),
 ('grated', 3280),
 ('carrot', 3232),
 ('kosher', 3180),
 ('ba

In [56]:
prohibited_words = ["salt", "fresh", "chopped", "large", "all", "purpose", "unsalted", "virgin", "traditional", "natural", "whole", "frozen", "half", "evaporated"]
recipes_products = filter(lambda x: x not in prohibited_words, recipes_products)

In [57]:
recipes_products = list(recipes_products)
recipes_products = np.unique(recipes_products)
recipes_products

array(['abalone', 'abbamele', 'absinthe', ..., 'ziti', 'zucchini', 'épic'],
      dtype='<U16')

In [58]:
common = list(set(bought_names) & set(recipes_products))
prohibited_words = ['organic', 'original', 'water', 'with', 'of', 'and', 'no', 'a', 'mr', 'style']
common = filter(lambda x: x not in prohibited_words, common)
common = list(common)

In [59]:
len(common)

1839

In [60]:
def filer_products(words):
    words = [word for word in words if word in common]
    return ' '.join(words)

In [61]:
products.product_name = products.product_name.map(filer_products)
products = products[products.product_name != ""]
products.to_csv("../../data/instacart/products_cropped.csv")

In [62]:
recipes.ingredients = recipes.ingredients.map(lambda x: " ".join(x))
recipes.ingredients = recipes.ingredients.map(lambda x: x.split(" "))
recipes.ingredients = recipes.ingredients.map(filer_products)
recipes = recipes[recipes.ingredients != ""]
recipes.to_csv("../../data/instacart/recipes_cropped.csv")