### Housekeeping

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import unicodedata
import re

from IPython.display import display

%matplotlib inline

## Read test / train data

In [2]:
# Load the train and test recipe sets from the JSON files under ../data.
train_df, test_df = [
    pd.read_json(json_path)
    for json_path in ('../data/train.json', '../data/test.json')
]

## Do some general data cleaning
### Split the ingredient list of each train and test recipe into separate records (a record with 3 ingredients becomes 3 records with 1 ingredient each)

In [3]:
def _one_row_per_ingredient(recipes):
    """Return a two-column frame ('id', 'ingridient') with one row per ingredient.

    `recipes` is grouped by 'id'; for each group the first `ingredients`
    value is turned into a small DataFrame, so every element of that list
    becomes its own row.
    """
    exploded = recipes.groupby('id').ingredients.apply(
        lambda ingredient_lists: pd.DataFrame(ingredient_lists.values[0]))
    # reset_index() turns the group key and the position inside each list
    # ('level_1') into columns; only the group key is kept.
    exploded = exploded.reset_index().drop('level_1', axis=1)
    exploded.columns = ['id', 'ingridient']
    return exploded


train_ingrs = _one_row_per_ingredient(train_df)
test_ingrs = _one_row_per_ingredient(test_df)
print("%s %d %s %d" % ("TO START: Train -", len(train_ingrs.ingridient.unique()),
                       "--- Test -", len(test_ingrs.ingridient.unique())))

TO START: Train - 6714 --- Test - 4484


### Convert all ingredients to lower case

In [4]:
# Lower-case every ingredient name so that names differing only in
# capitalisation count as the same value.
for frame in (train_ingrs, test_ingrs):
    frame['ingridient'] = frame['ingridient'].str.lower()
print("%s %d %s %d" % ("LOWERED CASE: Train -", len(train_ingrs.ingridient.unique()),
                       "--- Test -", len(test_ingrs.ingridient.unique())))

LOWERED CASE: Train - 6703 --- Test - 4479


### Strip accents and remove all non-ASCII characters

In [5]:
def _strip_non_ascii(text):
    """Return `text` with accents stripped and other non-ASCII characters dropped.

    The result is always a text string, never bytes.
    """
    # NFD decomposes an accented letter into its base letter plus a combining
    # mark; encoding to ASCII with 'ignore' then discards the mark together
    # with any other character that has no ASCII representation.
    ascii_bytes = unicodedata.normalize('NFD', text).encode('ascii', 'ignore')
    # encode() yields a bytes object on Python 3; decode it so the column
    # keeps holding text that the later string replacements can process.
    return ascii_bytes.decode('ascii')


for frame in (train_ingrs, test_ingrs):
    frame['ingridient'] = frame['ingridient'].apply(_strip_non_ascii)
print("%s %d %s %d" % ("CLEANED UNICODE: Train -", len(train_ingrs.ingridient.unique()),
                       "--- Test -", len(test_ingrs.ingridient.unique())))

CLEANED UNICODE: Train - 6701 --- Test - 4478


## Fix some obvious misspellings
### sauc, chees, yoghurt, ic cream

In [6]:
# Repair truncated / variant spellings.
#
# Each truncated word is matched only as a whole word (\b ... \b). A plain
# substring replacement would also rewrite correct names that merely contain
# the fragment, e.g. 'ic cream' inside "garlic cream" or 'sauc' inside
# "saucisson". The optional "ee" suffix also repairs the doubled-e spellings
# ("saucee", "cheesee") that the earlier two-pass approach targeted.
# re.sub is applied directly so the patterns are always treated as regular
# expressions, independent of the pandas default for Series.str.replace.
spelling_fixes = [
    ("SAUC/EE & SAUCE: Train -", r'\bsauc(?:ee)?\b', 'sauce'),
    ("CHEES/EE & CHEESE: Train -", r'\bchees(?:ee)?\b', 'cheese'),
    # "yoghurt" is a spelling variant, not a truncation, so it is safe to
    # replace wherever it occurs (this also covers e.g. "yoghurts").
    ("YOGHURT: Train -", r'yoghurt', 'yogurt'),
    ("IC CREAM: Train -", r'\bic cream\b', 'ice cream'),
]
for label, pattern, replacement in spelling_fixes:
    fixer = re.compile(pattern)
    for frame in (train_ingrs, test_ingrs):
        frame['ingridient'] = frame['ingridient'].map(
            lambda name: fixer.sub(replacement, name))
    # Report the number of distinct names after each fix, as before.
    print("%s %d %s %d" % (label, len(train_ingrs.ingridient.unique()),
                           "--- Test -", len(test_ingrs.ingridient.unique())))

SAUC/EE & SAUCE: Train - 6701 --- Test - 4478
CHEES/EE & CHEESE: Train - 6701 --- Test - 4478
YOGHURT: Train - 6701 --- Test - 4478
IC CREAM: Train - 6701 --- Test - 4478


### I can't believe it's not butter

In [7]:
# List the distinct ingredient names containing "believ" in each data set.
train_hits = train_ingrs.ingridient.str.contains("believ")
test_hits = test_ingrs.ingridient.str.contains("believ")
gb_train = train_ingrs[train_hits].groupby('ingridient')
gb_test = test_ingrs[test_hits].groupby('ingridient')
# Grouping by name and reducing with .all() gives one row per distinct name.
display(gb_train.all(), gb_test.all())

Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
i can't believ it' not butter! made with olive oil spread,True
i can't believe it's not butter! all purpose sticks,True
i can't believe it's not butter! spread,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
i can't believe it's not butter! spread,True


In [8]:
# Collapse both spellings of the brand name into one short token.
for brand_spelling in ("i can't believ it' not butter!",
                       "i can't believe it's not butter!"):
    for frame in (train_ingrs, test_ingrs):
        frame['ingridient'] = frame['ingridient'].str.replace(brand_spelling, "ictbisnb")
print("%s %d %s %d" % ("CAN'T BELIEVE: Train -", len(train_ingrs.ingridient.unique()),
                       "--- Test -", len(test_ingrs.ingridient.unique())))

CAN'T BELIEVE: Train - 6701 --- Test - 4478


## Look at how the single apostrophe is used and clean it up

In [9]:
# Show every row whose ingredient name contains an apostrophe,
# first for the training set, then for the test set.
for frame in (train_ingrs, test_ingrs):
    has_apostrophe = frame.ingridient.str.contains("'")
    display(frame[has_apostrophe])

Unnamed: 0,id,ingridient
1639,180,hellmann' or best food real mayonnais
4179,478,m&m's candy
4797,551,hellmann's real mayonnaise
5722,662,soft goat's cheese
12886,1514,soft goat's cheese
13030,1528,hellmann' or best food real mayonnais
23194,2672,hellmann' or best food real mayonnais
23200,2673,piment d'espelette
28782,3317,breakstone's sour cream
31033,3569,soft goat's cheese


Unnamed: 0,id,ingridient
3578,1570,piment d'espelette
6168,2990,piment d'espelette
6509,3133,pig's ear
10956,5114,campbell's condensed cheddar cheese soup
13596,6376,campbell's condensed cream of chicken soup
18873,9030,soft goat's cheese
22526,10609,hellmann' or best food real mayonnais
23661,11154,hellmann's mayonnaise with a hint of wasabi
23884,11258,hellmann''s light mayonnaise
23894,11263,hellmann's real mayonnaise


In [10]:
# Drop the possessive from "goat's", then merge the "soft fresh" variant
# into "soft goat cheese". The order matters: the second rule only looks
# for the already de-possessived spelling.
goat_rules = [
    ("goat's", "goat"),
    ("soft fresh goat cheese", "soft goat cheese"),
]
for old, new in goat_rules:
    for frame in (train_ingrs, test_ingrs):
        frame['ingridient'] = frame['ingridient'].str.replace(old, new)
print("%s %d %s %d" % ("GOAT'S & GOAT: Train -", len(train_ingrs.ingridient.unique()),
                       "--- Test -", len(test_ingrs.ingridient.unique())))



GOAT'S & GOAT: Train - 6700 --- Test - 4477


In [11]:
# Reduce every observed spelling of the brand to the bare word "hellmann".
for brand_variant in ("hellmann' or best food", "hellmann's", "hellmann''s"):
    for frame in (train_ingrs, test_ingrs):
        frame['ingridient'] = frame['ingridient'].str.replace(brand_variant, "hellmann")
print("%s %d %s %d" % ("HELLMANN: Train -", len(train_ingrs.ingridient.unique()),
                       "--- Test -", len(test_ingrs.ingridient.unique())))

HELLMANN: Train - 6700 --- Test - 4477


In [12]:
# Remove the apostrophe from the "uncle ben's" brand name.
for frame in (train_ingrs, test_ingrs):
    frame['ingridient'] = frame['ingridient'].str.replace("uncle ben's", "uncle bens")
print("%s %d %s %d" % ("UNCLE BENS: Train -", len(train_ingrs.ingridient.unique()),
                       "--- Test -", len(test_ingrs.ingridient.unique())))

UNCLE BENS: Train - 6700 --- Test - 4477


In [13]:
# Strip the brand prefix so the branded mustard merges with the generic one.
for frame in (train_ingrs, test_ingrs):
    frame['ingridient'] = frame['ingridient'].str.replace(
        "french's spicy brown mustard", "spicy brown mustard")
print("%s %d %s %d" % ("FRENCH'S: Train -", len(train_ingrs.ingridient.unique()),
                       "--- Test -", len(test_ingrs.ingridient.unique())))

FRENCH'S: Train - 6700 --- Test - 4476


## Sodium ...   Fat ... Stock ...
### Sodium

In [14]:
# List the distinct ingredient names containing "sodium" in each data set.
train_hits = train_ingrs.ingridient.str.contains("sodium")
test_hits = test_ingrs.ingridient.str.contains("sodium")
gb_train = train_ingrs[train_hits].groupby('ingridient')
gb_test = test_ingrs[test_hits].groupby('ingridient')
# Grouping by name and reducing with .all() gives one row per distinct name.
display(gb_train.all(), gb_test.all())

Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
25% less sodium chicken broth,True
33% less sodium cooked deli ham,True
33% less sodium cooked ham,True
33% less sodium ham,True
33% less sodium smoked fully cooked ham,True
40% less sodium taco seasoning,True
40% less sodium taco seasoning mix,True
bottled low sodium salsa,True
canned low sodium chicken broth,True
condensed reduced fat reduced sodium cream of chicken soup,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
33% less sodium cooked deli ham,True
33% less sodium smoked ham,True
40% less sodium taco seasoning,True
50% less sodium black beans,True
canned low sodium chicken broth,True
condensed reduced fat reduced sodium cream of mushroom soup,True
fat free less sodium beef broth,True
fat free less sodium chicken broth,True
fat free reduced sodium chicken broth,True
fat-free reduced-sodium chicken broth,True


In [15]:
# Map every "reduced sodium" phrasing onto the single token "low_sodium".
# The rules run strictly in this order: the percentage-qualified phrases
# come before their shorter counterparts, and "freelow_sodium" re-inserts
# the space lost when a word was glued onto an earlier match.
sodium_rules = [
    ("25% less sodium", "low_sodium"),
    ("33% less sodium", "low_sodium"),
    ("40% less sodium", "low_sodium"),
    ("50% less sodium", "low_sodium"),
    ("reduced sodium", "low_sodium"),
    ("reduc sodium", "low_sodium"),
    ("lower sodium", "low_sodium"),
    ("low-sodium", "low_sodium"),
    ("lowsodium", "low_sodium"),
    ("less sodium", "low_sodium"),
    ("reduced-sodium", "low_sodium"),
    ("sodium reduced", "low_sodium"),
    ("25% low sodium", "low_sodium"),
    ("33% low sodium", "low_sodium"),
    ("40% low sodium", "low_sodium"),
    ("50% low sodium", "low_sodium"),
    ("low sodium", "low_sodium"),
    ("freelow_sodium", "free low_sodium"),
    ("low salt", "low_sodium"),
]
for old, new in sodium_rules:
    for frame in (train_ingrs, test_ingrs):
        frame['ingridient'] = frame['ingridient'].str.replace(old, new)
print("%s %d %s %d" % ("SODIUM: Train -", len(train_ingrs.ingridient.unique()),
                       "--- Test -", len(test_ingrs.ingridient.unique())))

# Show the "sodium" names again to check the result of the clean-up.
train_hits = train_ingrs.ingridient.str.contains("sodium")
test_hits = test_ingrs.ingridient.str.contains("sodium")
gb_train = train_ingrs[train_hits].groupby('ingridient')
gb_test = test_ingrs[test_hits].groupby('ingridient')
display(gb_train.all(), gb_test.all())

SODIUM: Train - 6676 --- Test - 4460


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
bottled low_sodium salsa,True
canned low_sodium chicken broth,True
condensed reduced fat low_sodium cream of chicken soup,True
condensed reduced fat low_sodium cream of mushroom soup,True
condensed reduced fat low_sodium tomato soup,True
fat free low_sodium beef broth,True
fat free low_sodium chicken broth,False
fat free low_sodium vegetable broth,True
fat skimmed low_sodium chicken broth,True
fat-free low_sodium chicken broth,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
canned low_sodium chicken broth,True
condensed reduced fat low_sodium cream of mushroom soup,True
fat free low_sodium beef broth,True
fat free low_sodium chicken broth,True
fat-free low_sodium chicken broth,True
fatfree low_sodium chicken broth,True
knorr low_sodium chicken flavor bouillon,True
low_sodium beef broth,True
low_sodium beef stock,True
low_sodium black beans,True


### stock vs broth

In [16]:
# Show the distinct names mentioning "stock", then those mentioning "broth",
# for the training set and the test set in turn.
for keyword in ("stock", "broth"):
    gb_train = train_ingrs[train_ingrs.ingridient.str.contains(keyword)].groupby('ingridient')
    gb_test = test_ingrs[test_ingrs.ingridient.str.contains(keyword)].groupby('ingridient')
    display(gb_train.all(), gb_test.all())

# Treat "stock" as "broth" and "store bought" as "canned".
for old, new in (("stock", "broth"), ("store bought", "canned")):
    for frame in (train_ingrs, test_ingrs):
        frame['ingridient'] = frame['ingridient'].str.replace(old, new)
print("%s %d %s %d" % ("BROTH: Train -", len(train_ingrs.ingridient.unique()),
                       "--- Test -", len(test_ingrs.ingridient.unique())))

Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
beef stock,True
beef stock cubes,True
brown chicken stock,True
chicken stock,True
chicken stock cubes,True
dark chicken stock,True
duck stock,True
fish stock,True
fresh chicken stock,True
gluten-free chicken stock,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
beef stock,True
beef stock cubes,True
brown chicken stock,True
chicken stock,True
chicken stock cubes,True
duck stock,True
fish stock,True
fresh chicken stock,True
ham stock cube,True
homemade beef stock,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
beef broth,True
bone broth,True
broth,True
canned beef broth,True
canned chicken broth,True
canned low_sodium chicken broth,True
chicken broth,True
chicken broth low fat,True
condensed chicken broth,True
fat free beef broth,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
beef broth,True
broth,True
canned beef broth,True
canned chicken broth,True
canned low_sodium chicken broth,True
chicken broth,True
chicken broth low fat,True
condensed chicken broth,True
fat free beef broth,True
fat free low_sodium beef broth,True


BROTH: Train - 6658 --- Test - 4447


### fat

In [17]:
# List the distinct ingredient names containing "fat" in each data set.
train_hits = train_ingrs.ingridient.str.contains("fat")
test_hits = test_ingrs.ingridient.str.contains("fat")
gb_train = train_ingrs[train_hits].groupby('ingridient')
gb_test = test_ingrs[test_hits].groupby('ingridient')
# Grouping by name and reducing with .all() gives one row per distinct name.
display(gb_train.all(), gb_test.all())

Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
1% low-fat buttermilk,True
1% low-fat chocolate milk,True
1% low-fat cottage cheese,True
1% low-fat milk,True
2% low fat cheddar cheese,True
2% low-fat cottage cheese,True
2% lowfat greek yogurt,True
2% reduced-fat milk,True
and fat free half half,True
bacon fat,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
1% low-fat buttermilk,True
1% low-fat cottage cheese,True
1% low-fat milk,True
2% low-fat cottage cheese,True
2% lowfat greek yogurt,True
2% reduced fat chocolate milk,True
2% reduced-fat milk,True
and fat free half half,True
bacon fat,True
beef fat,True


In [18]:
# Normalise the many fat-content phrasings to "low_fat" / "non_fat" and drop
# qualifiers that carry no information. Rules are applied in list order.
t_list = [('evapor low-fat milk', 'evaporated low_fat milk'),
          ('evaporated low-fat 2% milk', 'evaporated low_fat milk'),
          ('1% low-fat', 'low_fat'),
          ('2% low fat', 'low_fat'),
          ('2% low-fat', 'low_fat'),
          ('2% lowfat', 'low_fat'),
          ('2% reduced fat', 'low_fat'),
          ('2% reduced-fat', 'low_fat'),
          ('2% milkfat', ''),
          ('2%milkfat', ''),
          ('fat free', 'non_fat'),
          ('fat-free', 'non_fat'),
          ('fatfree', 'non_fat'),
          ('fatfre', 'non_fat'),
          ('nonfat', 'non_fat'),
          ('lowfat', 'low_fat'),
          ('low fat', 'low_fat'),
          ('low-fat', 'low_fat'),
          ('reduced fat', 'low_fat'),
          ('reduced-fat', 'low_fat'),
          ('fat skimmed', 'low_fat'),
          ('full fat ', ''),
          ('full-fat ', ''),
          (' (not low_fat)', ''),
         ]
for old, new in t_list:
    for frame in (train_ingrs, test_ingrs):
        # Use Python's literal str.replace rather than Series.str.replace:
        # the latter may interpret the pattern as a regular expression, in
        # which case the parentheses in ' (not low_fat)' form a group and
        # the literal "(not low_fat)" suffix is never matched.
        frame['ingridient'] = frame['ingridient'].map(
            lambda name: name.replace(old, new))
print("%s %d %s %d" % ("FAT: Train -", len(train_ingrs.ingridient.unique()),
                       "--- Test -", len(test_ingrs.ingridient.unique())))

# Show the "fat" names again to check the result of the clean-up.
gb_train = train_ingrs[train_ingrs.ingridient.str.contains("fat")].groupby('ingridient')
gb_test = test_ingrs[test_ingrs.ingridient.str.contains("fat")].groupby('ingridient')
display(gb_train.all(), gb_test.all())

FAT: Train - 6620 --- Test - 4419


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
and non_fat half half,True
bacon fat,True
caul fat,True
chicken broth low_fat,True
coffee low_fat frozen yogurt,True
condensed low_fat low_sodium cream of chicken soup,True
condensed low_fat low_sodium cream of mushroom soup,True
condensed low_fat low_sodium tomato soup,True
cooking fat,True
cream cheese low_fat,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
and non_fat half half,True
bacon fat,True
beef fat,True
cheese fat grate parmesan reduc,True
chicken broth low_fat,True
condensed low_fat low_sodium cream of mushroom soup,True
cooking fat,True
crawfish fat,True
cream cheese low_fat,True
cream low_fat,True


## Sugar, flour, salt, butter, oil, dressing, cheese, mayo
### sugar

In [19]:
# List the distinct ingredient names containing "sugar" in each data set.
train_hits = train_ingrs.ingridient.str.contains("sugar")
test_hits = test_ingrs.ingridient.str.contains("sugar")
gb_train = train_ingrs[train_hits].groupby('ingridient')
gb_test = test_ingrs[test_hits].groupby('ingridient')
# Grouping by name and reducing with .all() gives one row per distinct name.
display(gb_train.all(), gb_test.all())

Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
baking sugar,True
brown sugar,True
cane sugar,True
caster sugar,True
chinese rock sugar,True
cinnamon sugar,True
coarse sugar,True
coconut sugar,True
confectioners sugar,True
dark brown sugar,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
brown dark firmli pack sugar,True
brown sugar,True
cane sugar,True
caster sugar,True
cinnamon sugar,True
coarse sugar,True
coconut sugar,True
confectioners sugar,True
dark brown sugar,True
dark muscovado sugar,True


In [20]:
# Merge brand-specific and synonymous sugar names. Rules are applied in list
# order, so a later rule sees the output of the earlier ones.
t_list = [
    ('baking sugar', 'sugar'),
    ('caster sugar', 'superfine sugar'),
    ('superfine white sugar', 'superfine sugar'),
    ('confectioners sugar', 'powdered sugar'),
    ('decorating sugars', 'powdered sugar'),
    ('domino confectioners sugar', 'powdered sugar'),
    ('domino powdered sugar', 'powdered sugar'),
    ('powdered sugar icing', 'powdered sugar'),
    ('domino light brown sugar', 'light brown sugar'),
    ('extra fine granulated sugar', 'superfine sugar'),
    ('fine granulated sugar', 'superfine sugar'),
    ('granulated sugar', 'sugar'),
    ('granulated white sugar', 'sugar'),
    ('imperial sugar light brown sugar', 'light brown sugar'),
    ('refined sugar', 'sugar'),
    ('regular sugar', 'sugar'),
    ('white sugar', 'sugar'),
    ('brown dark firmli pack sugar', 'dark brown sugar'),
]
for old, new in t_list:
    for frame in (train_ingrs, test_ingrs):
        frame['ingridient'] = frame['ingridient'].str.replace(old, new)
print("%s %d %s %d" % ("SUGAR: Train -", len(train_ingrs.ingridient.unique()),
                       "--- Test -", len(test_ingrs.ingridient.unique())))

# Show the "sugar" names again to check the result of the clean-up.
train_hits = train_ingrs.ingridient.str.contains("sugar")
test_hits = test_ingrs.ingridient.str.contains("sugar")
gb_train = train_ingrs[train_hits].groupby('ingridient')
gb_test = test_ingrs[test_hits].groupby('ingridient')
display(gb_train.all(), gb_test.all())

SUGAR: Train - 6603 --- Test - 4411


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
brown sugar,True
cane sugar,True
chinese rock sugar,True
cinnamon sugar,True
coarse sugar,True
coconut sugar,True
dark brown sugar,True
dark muscovado sugar,True
date sugar,True
demerara sugar,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
brown sugar,True
cane sugar,True
cinnamon sugar,True
coarse sugar,True
coconut sugar,True
dark brown sugar,True
dark muscovado sugar,True
firmly packed brown sugar,True
firmly packed light brown sugar,True
golden brown sugar,True


### flour

In [21]:
# List the distinct ingredient names containing "flour" in each data set.
train_hits = train_ingrs.ingridient.str.contains("flour")
test_hits = test_ingrs.ingridient.str.contains("flour")
gb_train = train_ingrs[train_hits].groupby('ingridient')
gb_test = test_ingrs[test_hits].groupby('ingridient')
# Grouping by name and reducing with .all() gives one row per distinct name.
display(gb_train.all(), gb_test.all())

Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
all purpose unbleached flour,True
all-purpose flour,True
almond flour,True
arepa flour,True
arrowroot flour,True
azteca flour tortillas,True
barley flour,True
besan (flour),True
blanched almond flour,True
bread flour,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
all purpose unbleached flour,True
all-purpose flour,True
almond flour,True
arepa flour,True
barley flour,True
blanched almond flour,True
bread flour,True
brown rice flour,True
buckwheat flour,True
cake flour,True


In [22]:
# Merge spelling variants and brand-specific names of the same flour.
# Rules are applied in list order.
t_list = [
    ('all purpose unbleached flour', 'all_purpose_flour'),
    ('gold medal all purpose flour', 'all_purpose_flour'),
    ('all-purpose flour', 'all_purpose_flour'),
    ('chapati flour', 'chapatti flour'),
    ('cornflour', 'corn flour'),
    ('self raising flour', 'self rising flour'),
    ('white bread flour', 'white flour'),
    ('whole wheat bread flour', 'whole wheat flour'),
    ('wholemeal flour', 'whole wheat flour'),
    ('plain flour', 'flour'),
]
for old, new in t_list:
    for frame in (train_ingrs, test_ingrs):
        frame['ingridient'] = frame['ingridient'].str.replace(old, new)
print("%s %d %s %d" % ("FLOUR: Train -", len(train_ingrs.ingridient.unique()),
                       "--- Test -", len(test_ingrs.ingridient.unique())))

# Show the "flour" names again to check the result of the clean-up.
train_hits = train_ingrs.ingridient.str.contains("flour")
test_hits = test_ingrs.ingridient.str.contains("flour")
gb_train = train_ingrs[train_hits].groupby('ingridient')
gb_test = test_ingrs[test_hits].groupby('ingridient')
display(gb_train.all(), gb_test.all())

FLOUR: Train - 6594 --- Test - 4405


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
all_purpose_flour,True
almond flour,True
arepa flour,True
arrowroot flour,True
azteca flour tortillas,True
barley flour,True
besan (flour),True
blanched almond flour,True
bread flour,True
brown rice flour,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
all_purpose_flour,True
almond flour,True
arepa flour,True
barley flour,True
blanched almond flour,True
bread flour,True
brown rice flour,True
buckwheat flour,True
cake flour,True
chapatti flour,True


### salt

In [23]:
# List the distinct ingredient names containing "salt" in each data set.
train_hits = train_ingrs.ingridient.str.contains("salt")
test_hits = test_ingrs.ingridient.str.contains("salt")
gb_train = train_ingrs[train_hits].groupby('ingridient')
gb_test = test_ingrs[test_hits].groupby('ingridient')
# Grouping by name and reducing with .all() gives one row per distinct name.
display(gb_train.all(), gb_test.all())

Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
bacon salt,True
black salt,True
butter salt,True
canning salt,True
cashew chop unsalt,True
celery salt,True
celtic salt,True
coarse kosher salt,True
coarse salt,True
coarse sea salt,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
black salt,True
celery salt,True
coarse kosher salt,True
coarse salt,True
coarse sea salt,True
coarse-grain salt,True
curing salt,True
curry guy smoked spicy salt,True
diamond crystal kosher salt,True
dried salted codfish,True


In [24]:
# Spell both "no salt added" variants the same way: "salt free".
t_list = [
    ('no salt added', 'salt free'),
    ('no-salt-added', 'salt free'),
]
for old, new in t_list:
    for frame in (train_ingrs, test_ingrs):
        frame['ingridient'] = frame['ingridient'].str.replace(old, new)
print("%s %d %s %d" % ("SALT: Train -", len(train_ingrs.ingridient.unique()),
                       "--- Test -", len(test_ingrs.ingridient.unique())))

# Show the "salt" names again to check the result of the clean-up.
train_hits = train_ingrs.ingridient.str.contains("salt")
test_hits = test_ingrs.ingridient.str.contains("salt")
gb_train = train_ingrs[train_hits].groupby('ingridient')
gb_test = test_ingrs[test_hits].groupby('ingridient')
display(gb_train.all(), gb_test.all())

SALT: Train - 6594 --- Test - 4405


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
bacon salt,True
black salt,True
butter salt,True
canning salt,True
cashew chop unsalt,True
celery salt,True
celtic salt,True
coarse kosher salt,True
coarse salt,True
coarse sea salt,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
black salt,True
celery salt,True
coarse kosher salt,True
coarse salt,True
coarse sea salt,True
coarse-grain salt,True
curing salt,True
curry guy smoked spicy salt,True
diamond crystal kosher salt,True
dried salted codfish,True


### butter

In [25]:
# List the distinct ingredient names containing "butter" in each data set.
train_hits = train_ingrs.ingridient.str.contains("butter")
test_hits = test_ingrs.ingridient.str.contains("butter")
gb_train = train_ingrs[train_hits].groupby('ingridient')
gb_test = test_ingrs[test_hits].groupby('ingridient')
# Grouping by name and reducing with .all() gives one row per distinct name.
display(gb_train.all(), gb_test.all())

Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
almond butter,True
apple butter,True
bread and butter pickle slices,True
bread and butter pickles,True
brown butter,True
butter,True
butter beans,True
butter cake,True
butter cooking spray,True
butter crackers,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
almond butter,True
apple butter,True
bread and butter pickles,True
brown butter,True
butter,True
butter beans,True
butter cake mix,True
butter cookies,True
butter cooking spray,True
butter crackers,True


### oil

In [26]:

# List the distinct ingredient names containing "oil" in each data set.
# The match is on the raw character sequence, so names such as
# "boiled eggs" are listed as well.
train_hits = train_ingrs.ingridient.str.contains("oil")
test_hits = test_ingrs.ingridient.str.contains("oil")
gb_train = train_ingrs[train_hits].groupby('ingridient')
gb_test = test_ingrs[test_hits].groupby('ingridient')
# Grouping by name and reducing with .all() gives one row per distinct name.
display(gb_train.all(), gb_test.all())

Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
almond oil,True
anise oil,True
annatto oil,True
avocado oil,True
basil olive oil,True
bertolli classico olive oil,True
black truffle oil,True
boiled eggs,True
boiled ham,True
boiling onions,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
annatto oil,True
avocado oil,True
bertolli classico olive oil,True
bertolli olive oil & garlic sauce,True
best foods mayonnaise dressing with extra virgin olive oil,True
boiled eggs,True
boiler,True
boiling onions,True
boiling potatoes,True
boiling water,True


In [27]:
# Shorten the cooking-spray name and abbreviate extra virgin olive oil
# (with or without the hyphen) to "evoo".
t_list = [
    ('vegetable oil cooking spray', 'vegetable oil spray'),
    ('extra-virgin olive oil', 'evoo'),
    ('extra virgin olive oil', 'evoo'),
]
for old, new in t_list:
    for frame in (train_ingrs, test_ingrs):
        frame['ingridient'] = frame['ingridient'].str.replace(old, new)
print("%s %d %s %d" % ("OIL: Train -", len(train_ingrs.ingridient.unique()),
                       "--- Test -", len(test_ingrs.ingridient.unique())))

# Show the "oil" names again to check the result of the clean-up.
train_hits = train_ingrs.ingridient.str.contains("oil")
test_hits = test_ingrs.ingridient.str.contains("oil")
gb_train = train_ingrs[train_hits].groupby('ingridient')
gb_test = test_ingrs[test_hits].groupby('ingridient')
display(gb_train.all(), gb_test.all())

OIL: Train - 6593 --- Test - 4404


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
almond oil,True
anise oil,True
annatto oil,True
avocado oil,True
basil olive oil,True
bertolli classico olive oil,True
black truffle oil,True
boiled eggs,True
boiled ham,True
boiling onions,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
annatto oil,True
avocado oil,True
bertolli classico olive oil,True
bertolli olive oil & garlic sauce,True
boiled eggs,True
boiler,True
boiling onions,True
boiling potatoes,True
boiling water,True
broiler-fryer chicken,True


### dressing

In [28]:
# List the distinct ingredient names containing "dressing" in each data set.
train_hits = train_ingrs.ingridient.str.contains("dressing")
test_hits = test_ingrs.ingridient.str.contains("dressing")
gb_train = train_ingrs[train_hits].groupby('ingridient')
gb_test = test_ingrs[test_hits].groupby('ingridient')
# Grouping by name and reducing with .all() gives one row per distinct name.
display(gb_train.all(), gb_test.all())

Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
asian dressing,True
avocado dressing,True
balsamic vinaigrette salad dressing,True
blue cheese dressing,True
bottled italian dressing,True
buttermilk ranch dressing,True
caesar salad dressing,True
catalina dressing,True
coleslaw dressing,True
dressing,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
balsamic vinaigrette salad dressing,True
best foods mayonnaise dressing with evoo,True
blue cheese dressing,True
caesar salad dressing,True
dressing,True
french dressing,True
greek dressing,True
herb dressing,True
hidden valley original ranch salad dressing & seasoning mix,True
honey mustard dressing,True


### cheese

In [29]:
# List the distinct ingredient names containing "cheese" in each data set.
train_hits = train_ingrs.ingridient.str.contains("cheese")
test_hits = test_ingrs.ingridient.str.contains("cheese")
gb_train = train_ingrs[train_hits].groupby('ingridient')
gb_test = test_ingrs[test_hits].groupby('ingridient')
# Grouping by name and reducing with .all() gives one row per distinct name.
display(gb_train.all(), gb_test.all())

Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
2% milk shredded mozzarella cheese,True
aged cheddar cheese,True
aged manchego cheese,True
american cheese,True
american cheese food,True
american cheese slices,True
bertolli four cheese rosa sauce,True
blanco cheese queso,True
blue cheese,True
blue cheese dressing,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
american cheese,True
american cheese slices,True
bertolli four cheese rosa sauce,True
blue cheese,True
blue cheese dressing,True
boursin cheese with garlic and herbs,True
brie cheese,True
campbell's condensed cheddar cheese soup,True
cheddar cheese,True
cheddar cheese soup,True


In [30]:
# Express the "2% milk" qualifier as low_fat and restore the scrambled
# reduced-fat parmesan name to a readable word order.
t_list = [
    ('2% milk shredded', 'shredded low_fat'),
    ('cheese fat grate parmesan reduc', 'low_fat grated parmesan cheese'),
]
for old, new in t_list:
    for frame in (train_ingrs, test_ingrs):
        frame['ingridient'] = frame['ingridient'].str.replace(old, new)
print("%s %d %s %d" % ("CHEESE: Train -", len(train_ingrs.ingridient.unique()),
                       "--- Test -", len(test_ingrs.ingridient.unique())))

# Show the "cheese" names again to check the result of the clean-up.
train_hits = train_ingrs.ingridient.str.contains("cheese")
test_hits = test_ingrs.ingridient.str.contains("cheese")
gb_train = train_ingrs[train_hits].groupby('ingridient')
gb_test = test_ingrs[test_hits].groupby('ingridient')
display(gb_train.all(), gb_test.all())

CHEESE: Train - 6592 --- Test - 4404


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
aged cheddar cheese,True
aged manchego cheese,True
american cheese,True
american cheese food,True
american cheese slices,True
bertolli four cheese rosa sauce,True
blanco cheese queso,True
blue cheese,True
blue cheese dressing,True
brie cheese,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
american cheese,True
american cheese slices,True
bertolli four cheese rosa sauce,True
blue cheese,True
blue cheese dressing,True
boursin cheese with garlic and herbs,True
brie cheese,True
campbell's condensed cheddar cheese soup,True
cheddar cheese,True
cheddar cheese soup,True


### mayonnaise

In [31]:
# List the distinct ingredient names containing "mayon" in each data set;
# the short stem is used so that variant spellings are listed too.
train_hits = train_ingrs.ingridient.str.contains("mayon")
test_hits = test_ingrs.ingridient.str.contains("mayon")
gb_train = train_ingrs[train_hits].groupby('ingridient')
gb_test = test_ingrs[test_hits].groupby('ingridient')
# Grouping by name and reducing with .all() gives one row per distinct name.
display(gb_train.all(), gb_test.all())

Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
basil mayonnaise,True
best food's mayonnaise with lime juice,True
best foods real mayonnaise,True
canola mayonnaise,True
garlic mayonnaise,True
hellmann light mayonnais,True
hellmann light mayonnaise,True
hellmann real mayonnais,True
hellmann real mayonnaise,True
hellmanna or best food canola cholesterol free mayonnais,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
basil mayonnaise,True
best foods mayonnaise dressing with evoo,True
canola mayonnaise,True
hellmann light mayonnais,True
hellmann light mayonnaise,True
hellmann mayonnaise with a hint of wasabi,True
hellmann real mayonnais,True
hellmann real mayonnaise,True
japanese mayonnaise,True
kewpie mayonnaise,True


In [32]:
# Fix the mayonnaise spellings, then mark "light" mayonnaise as low_fat.
# Order matters: the first rule appends an "e" to every "mayonnais", which
# turns correct spellings into "mayonnaisee"; the second rule undoes that.
t_list = [
    ('mayonnais', 'mayonnaise'),
    ('mayonnaisee', 'mayonnaise'),
    ('mayonaise', 'mayonnaise'),
    ('light mayonnaise', 'low_fat mayonnaise'),
]
for old, new in t_list:
    for frame in (train_ingrs, test_ingrs):
        frame['ingridient'] = frame['ingridient'].str.replace(old, new)
print("%s %d %s %d" % ("MAYO: Train -", len(train_ingrs.ingridient.unique()),
                       "--- Test -", len(test_ingrs.ingridient.unique())))

# Show the "mayon" names again to check the result of the clean-up.
train_hits = train_ingrs.ingridient.str.contains("mayon")
test_hits = test_ingrs.ingridient.str.contains("mayon")
gb_train = train_ingrs[train_hits].groupby('ingridient')
gb_test = test_ingrs[test_hits].groupby('ingridient')
display(gb_train.all(), gb_test.all())

MAYO: Train - 6589 --- Test - 4401


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
basil mayonnaise,True
best food's mayonnaise with lime juice,True
best foods real mayonnaise,True
canola mayonnaise,True
garlic mayonnaise,True
hellmann low_fat mayonnaise,True
hellmann real mayonnaise,True
hellmanna or best food canola cholesterol free mayonnaise,True
japanese mayonnaise,True
kewpie mayonnaise,True


Unnamed: 0_level_0,id
ingridient,Unnamed: 1_level_1
basil mayonnaise,True
best foods mayonnaise dressing with evoo,True
canola mayonnaise,True
hellmann low_fat mayonnaise,True
hellmann mayonnaise with a hint of wasabi,True
hellmann real mayonnaise,True
japanese mayonnaise,True
kewpie mayonnaise,True
low_fat mayonnaise,True
mayonnaise,True


### So far we cleaned up from  Train - 6714 --- Test - 4484 unique combinations down to  Train - 6589 --- Test - 4401


# Look for ingredients with digits and/or punctuation

# Look for singular / plural forms

# Remove SOME verbs from ingredients

# Alphabetize and see if that will decrease the number of unique combinations

nltk  word tokenize
Naive Bayes with TF-IDF bag of words
KNN
SVM
Random Forest
Linear SVC
WordNetLemmatizer
TfidfVectorizer
CountVectorizer
SelectFromModel
LogisticRegressionCV ( cross validation)
LassoCV