# Grill and BBQ Data Cleaning

## Setup

In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
# visualization
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn')

In [3]:
# for lemmatimzation
import spacy

## Read in Data

In [4]:
# Read in the data
df = pd.read_csv('unzipped_data/grill-and-bbq.csv', sep = '|')
print("Shape of DataFrame", df.shape)
df.head()

Shape of DataFrame (288456, 5)


Unnamed: 0,PAGE,URL,COUNT,AMOUNT,INGREDIENT
0,1,https://www.bigoven.com/recipe/chicken-breasts...,8,2 tablespoons,canola oil
1,1,https://www.bigoven.com/recipe/chicken-breasts...,8,1/2 large,onion
2,1,https://www.bigoven.com/recipe/chicken-breasts...,8,6,pineapple
3,1,https://www.bigoven.com/recipe/chicken-breasts...,8,,pepper
4,1,https://www.bigoven.com/recipe/chicken-breasts...,8,1 1/2 cup,tomatillo


## Clean Data: General

### Drop metadata

'PAGE' and 'COUNT' are not needed for the analysis.

In [5]:
df.drop(['PAGE', 'COUNT'], axis = 1, inplace = True)
df.head()

Unnamed: 0,URL,AMOUNT,INGREDIENT
0,https://www.bigoven.com/recipe/chicken-breasts...,2 tablespoons,canola oil
1,https://www.bigoven.com/recipe/chicken-breasts...,1/2 large,onion
2,https://www.bigoven.com/recipe/chicken-breasts...,6,pineapple
3,https://www.bigoven.com/recipe/chicken-breasts...,,pepper
4,https://www.bigoven.com/recipe/chicken-breasts...,1 1/2 cup,tomatillo


### Fix error in header

### Remove duplicates

In [6]:
print("shape of DataFrame before de-dup:", df.shape)
print("Number of duplicates:",sum(df.duplicated()))

shape of DataFrame before de-dup: (288456, 3)
Number of duplicates: 276769


In [7]:
# Remove duplicates
df = df.drop_duplicates()

print("shape of DataFrame after de-dup:", df.shape)

shape of DataFrame after de-dup: (11687, 3)


### Make all in lower case

In [8]:
for item in ['AMOUNT', 'INGREDIENT']: 
    df[item] = df[item].apply(lambda x: x.lower().strip())

## Look at 'URL' Column

### Create a column with dish id

In [9]:
# Write a function to get the dish id from URL
def get_dish_id(url): 
    dish_id = re.findall('\S*/([0-9]+)$', url)[0]
    return dish_id

In [10]:
# apply the function to the DataFrame
df['dish_id'] = df['URL'].apply(get_dish_id)
df.head()

Unnamed: 0,URL,AMOUNT,INGREDIENT,dish_id
0,https://www.bigoven.com/recipe/chicken-breasts...,2 tablespoons,canola oil,561575
1,https://www.bigoven.com/recipe/chicken-breasts...,1/2 large,onion,561575
2,https://www.bigoven.com/recipe/chicken-breasts...,6,pineapple,561575
3,https://www.bigoven.com/recipe/chicken-breasts...,,pepper,561575
4,https://www.bigoven.com/recipe/chicken-breasts...,1 1/2 cup,tomatillo,561575


### Create a column with dish name

In [11]:
# Write a function to get the dish name from URL
def get_dish_name(url): 
    dish_name = re.findall('\S*recipe/(\S*)/.*', url)[0]
    dish_name = re.sub('-', ' ', dish_name)
    return dish_name

In [12]:
df['dish_name'] = df['URL'].apply(get_dish_name)
df.head()

Unnamed: 0,URL,AMOUNT,INGREDIENT,dish_id,dish_name
0,https://www.bigoven.com/recipe/chicken-breasts...,2 tablespoons,canola oil,561575,chicken breasts with grilled pineapple and tom...
1,https://www.bigoven.com/recipe/chicken-breasts...,1/2 large,onion,561575,chicken breasts with grilled pineapple and tom...
2,https://www.bigoven.com/recipe/chicken-breasts...,6,pineapple,561575,chicken breasts with grilled pineapple and tom...
3,https://www.bigoven.com/recipe/chicken-breasts...,,pepper,561575,chicken breasts with grilled pineapple and tom...
4,https://www.bigoven.com/recipe/chicken-breasts...,1 1/2 cup,tomatillo,561575,chicken breasts with grilled pineapple and tom...


In [13]:
# Check top 5 dishes
df['dish_name'].value_counts()[:5]

korean bulgogi bbq beef tacos                                                48
grilled pork tenderloin                                                      43
grilled steak tacos with cilantro chimichurri sauce                          38
grilled steak salad caesar style                                             34
smokehouse pulled pork with memphis style barbecue sauce and classic slaw    34
Name: dish_name, dtype: int64

## Clean Columns

In [14]:
# since we got all information, remove URL column
df.drop('URL', axis = 1, inplace = True)
df.head()

Unnamed: 0,AMOUNT,INGREDIENT,dish_id,dish_name
0,2 tablespoons,canola oil,561575,chicken breasts with grilled pineapple and tom...
1,1/2 large,onion,561575,chicken breasts with grilled pineapple and tom...
2,6,pineapple,561575,chicken breasts with grilled pineapple and tom...
3,,pepper,561575,chicken breasts with grilled pineapple and tom...
4,1 1/2 cup,tomatillo,561575,chicken breasts with grilled pineapple and tom...


### Rearrange columns

In [15]:
# check all columns
df.columns

Index(['AMOUNT', 'INGREDIENT', 'dish_id', 'dish_name'], dtype='object')

In [16]:
# rearrange columns
df = df[['dish_id', 'dish_name', 'INGREDIENT', 'AMOUNT']]
df.head()

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT
0,561575,chicken breasts with grilled pineapple and tom...,canola oil,2 tablespoons
1,561575,chicken breasts with grilled pineapple and tom...,onion,1/2 large
2,561575,chicken breasts with grilled pineapple and tom...,pineapple,6
3,561575,chicken breasts with grilled pineapple and tom...,pepper,
4,561575,chicken breasts with grilled pineapple and tom...,tomatillo,1 1/2 cup


## Inspect all records for 'grilled steak tacos with cilantro chimichurri sauce' to figure out the way to clean further

In [17]:
grilled_steak_tacos = df[df['dish_name'] == 'grilled steak tacos with cilantro chimichurri sauce']

In [18]:
grilled_steak_tacos

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT
120202,1193837,grilled steak tacos with cilantro chimichurri ...,marinade,
120203,1193837,grilled steak tacos with cilantro chimichurri ...,steak,2 lbs
120204,1193837,grilled steak tacos with cilantro chimichurri ...,onion,1 sliced
120205,1193837,grilled steak tacos with cilantro chimichurri ...,orange,
120206,1193837,grilled steak tacos with cilantro chimichurri ...,lime,
120207,1193837,grilled steak tacos with cilantro chimichurri ...,soy sauce,1/3
120208,1193837,grilled steak tacos with cilantro chimichurri ...,olive oil,1/3
120209,1193837,grilled steak tacos with cilantro chimichurri ...,sugar,1/2 tsp
120210,1193837,grilled steak tacos with cilantro chimichurri ...,garlic,"4 smashed,"
120211,1193837,grilled steak tacos with cilantro chimichurri ...,cilantro,1/2 c


## Look at 'amt' Column

### What kind of ingredients are missing amounts? 

In [19]:
# How many?
missing_amt = df[df['AMOUNT'] == '']
missing_amt.shape

(1287, 4)

In [20]:
# Check a few example
missing_amt.head()

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT
3,561575,chicken breasts with grilled pineapple and tom...,pepper,
31,860272,grilled chicken with pineapple salsa,olive oil,
32,860272,grilled chicken with pineapple salsa,pepper,
33,860272,grilled chicken with pineapple salsa,salt,
39,860272,grilled chicken with pineapple salsa,cilantro,


In [21]:
# Check the ingredients
missing_amt['INGREDIENT'].value_counts()

pepper                  105
black pepper             68
salt                     63
olive oil                45
garlic                   44
lemon                    43
kosher salt              38
cooking spray            31
onion                    26
chicken                  26
lime                     25
cilantro                 24
ground pepper            23
vegetable oil            22
chicken breast           18
tomatoes                 14
sugar                    14
brown sugar              13
spice                    13
cayenne                  13
bbq                      12
mustard                  12
paprika                  11
honey                    11
marinade                 11
sesame seeds             11
parsley                  11
pork                     11
worcestershire sauce     11
cider vinegar            10
                       ... 
eggs                      1
rotini                    1
chocolate                 1
nectarine                 1
milk                

### Divide it by amount and unit

In [22]:
# How many different values? 
df['AMOUNT'].value_counts().shape

(1289,)

In [23]:
# First strip whitespace
df['AMOUNT'] = df['AMOUNT'].apply(lambda x: x.strip())

In [24]:
# How many different values? 
df['AMOUNT'].value_counts().shape

(1289,)

In [25]:
# Check top 20 values
df['AMOUNT'].value_counts()[:20]

                 1287
1 tablespoon      636
1/4 cup           535
2 tablespoons     497
1 teaspoon        484
1/2 cup           424
1/2 teaspoon      399
1                 370
1 cup             292
2                 248
1/4 teaspoon      231
2 teaspoons       211
3 tablespoons     174
4                 159
1/3 cup           129
2 tb              121
1 tb              108
1 tsp             103
2 cups            100
1 ts               93
Name: AMOUNT, dtype: int64

In [26]:
def get_amt(amt_str):
    """This function takes numeric amount from the amount string"""
    if len(amt_str) == 0:
        return ''
    elif len(re.findall('[^a-z]+', amt_str)) == 0: 
        return ''
    else: 
        return re.findall('[^a-z]+', amt_str)[0].strip()

In [27]:
df['amt_num'] = df['AMOUNT'].apply(get_amt)
df.head()

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num
0,561575,chicken breasts with grilled pineapple and tom...,canola oil,2 tablespoons,2
1,561575,chicken breasts with grilled pineapple and tom...,onion,1/2 large,1/2
2,561575,chicken breasts with grilled pineapple and tom...,pineapple,6,6
3,561575,chicken breasts with grilled pineapple and tom...,pepper,,
4,561575,chicken breasts with grilled pineapple and tom...,tomatillo,1 1/2 cup,1 1/2


In [28]:
def get_unit(amt_str):
    """This function takes unit from the amount string"""
    if len(amt_str) == 0:
        return ''
    elif len(re.findall('[A-Za-z]+', amt_str)) == 0: 
        return ''
    else: 
        return re.findall('[A-Za-z]+', amt_str)[0].strip()

In [29]:
df['unit'] = df['AMOUNT'].apply(get_unit)
df.head()

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num,unit
0,561575,chicken breasts with grilled pineapple and tom...,canola oil,2 tablespoons,2,tablespoons
1,561575,chicken breasts with grilled pineapple and tom...,onion,1/2 large,1/2,large
2,561575,chicken breasts with grilled pineapple and tom...,pineapple,6,6,
3,561575,chicken breasts with grilled pineapple and tom...,pepper,,,
4,561575,chicken breasts with grilled pineapple and tom...,tomatillo,1 1/2 cup,1 1/2,cup


In [30]:
# unit
print("Unit: {}".format(df['unit'].value_counts().shape))
print("\nValues:\n{}".format(df['unit'].value_counts()))

Unit: (276,)

Values:
               2479
cup            1596
teaspoon       1282
tablespoons     838
tablespoon      774
teaspoons       387
tb              324
tsp             308
ts              287
c               279
tbsp            275
cups            257
pound           179
large           174
pounds          166
cloves          163
lb              145
oz              134
medium          129
small            98
clove            79
ounces           69
ounce            66
whole            61
lbs              52
t                46
boneless         41
tbs              39
g                33
inch             31
               ... 
med               1
loaf              1
rashers           1
sticks            1
bowl              1
pkgs              1
kilo              1
firm              1
fillets           1
eggplants         1
in                1
fat               1
by                1
sliced            1
loins             1
mm                1
slabs             1
smashed           

In [31]:
# Numeric amount
print("Numeric amount: {}".format(df['amt_num'].value_counts().shape))
print("\nValues:\n{}".format(df['amt_num'].value_counts()))

Numeric amount: (294,)

Values:
1              3141
2              1997
               1303
1/2            1264
1/4             979
4               553
3               549
1 1/2           304
1/3             177
3/4             175
6               171
8               131
5                85
1/8              59
12               45
16               38
2/3              31
1 1/4            30
10               29
2 1/2            24
1-1/2            22
1-2              20
1.5              13
0.5              13
2-3              13
3 1/2            12
100              12
3-4              10
7                 9
1 3/4             9
               ... 
2 (9              1
2 (6-             1
1 360             1
1 (3 1/2)         1
4 12              1
3 –4              1
350               1
1 3 1/2 –4-       1
1 &#189;          1
16 1 1/2-         1
2 20-             1
2.5-3             1
4 5-              1
3/4 3/4           1
3+                1
16 (              1
2-2 1/2           1
1/4-1/2 

## Clean 'Unit'

In [32]:
print("Number of unique 'unit' values:", df['unit'].value_counts().shape[0])
print("\n'unit' values:\n{}".format(df['unit'].value_counts()))

Number of unique 'unit' values: 276

'unit' values:
               2479
cup            1596
teaspoon       1282
tablespoons     838
tablespoon      774
teaspoons       387
tb              324
tsp             308
ts              287
c               279
tbsp            275
cups            257
pound           179
large           174
pounds          166
cloves          163
lb              145
oz              134
medium          129
small            98
clove            79
ounces           69
ounce            66
whole            61
lbs              52
t                46
boneless         41
tbs              39
g                33
inch             31
               ... 
med               1
loaf              1
rashers           1
sticks            1
bowl              1
pkgs              1
kilo              1
firm              1
fillets           1
eggplants         1
in                1
fat               1
by                1
sliced            1
loins             1
mm                1
slabs   

### Lemmatize the 'unit'

In [33]:
en_nlp = spacy.load('en')

def get_clean_unit(unit):
    list_unit = [token.lemma_ for token in en_nlp(unit)]
    if len(list_unit) == 0: 
        return ''
    elif len(list_unit) > 0:
        return list_unit[0]
    
df['clean_unit'] = df['unit'].apply(get_clean_unit)

In [34]:
df['clean_unit'].value_counts()

                   2479
cup                1853
teaspoon           1669
tablespoon         1612
tb                  363
pound               345
tsp                 310
ts                  287
c                   279
tbsp                276
clove               242
lb                  197
large               174
ounce               135
oz                  134
medium              129
small                98
whole                61
t                    46
boneles              41
g                    33
inch                 31
up                   28
lg                   28
slice                27
md                   26
can                  24
green                23
each                 23
bunch                20
                   ... 
m                     1
smash                 1
pack                  1
loin                  1
half                  1
mango                 1
a                     1
tomatillo             1
white                 1
pacific               1
negi            

In [35]:
print("Number of unique 'clean_unit' values:", df['clean_unit'].value_counts().shape[0])
print("\n'clean_unit' values:\n{}".format(df['clean_unit'].value_counts()))

Number of unique 'clean_unit' values: 227

'clean_unit' values:
                   2479
cup                1853
teaspoon           1669
tablespoon         1612
tb                  363
pound               345
tsp                 310
ts                  287
c                   279
tbsp                276
clove               242
lb                  197
large               174
ounce               135
oz                  134
medium              129
small                98
whole                61
t                    46
boneles              41
g                    33
inch                 31
up                   28
lg                   28
slice                27
md                   26
can                  24
green                23
each                 23
bunch                20
                   ... 
m                     1
smash                 1
pack                  1
loin                  1
half                  1
mango                 1
a                     1
tomatillo             1


### Make full version into abbreviated version

In [36]:
# We will also fix typo here
unit_abbr_dict = {"cup": "C", "c": "C",
                  "centimetre": "cm", 
                  "each": "ea", 
                  "inch": "in",
                  "kilo": "kg",
                  "kilo": "kilogram", "kg": "kilogram",
                  "leave": "leaf", 
                  "ounce": "oz", "once": "oz",
                  "pound": "lb", 
                  "tablespoon": "tbsp", "tb": "tbsp", "tablspoon": "tbsp", "tbl": "tbsp",
                  "teaspoon": "tsp", "t": "tsp",
                  
                  # leave the size with full version
                  "sm": "small", "md": "medium", "lg": "large"
                 }

In [37]:
def get_full_unit(unit):
    if unit in unit_abbr_dict: 
        return unit_abbr_dict[unit]
    else: 
        return unit

df['clean_unit'] = df['clean_unit'].apply(get_full_unit)

In [38]:
print("Number of unique 'clean_unit' values:", df['clean_unit'].value_counts().shape[0])
print("\n'clean_unit' values:\n{}".format(df['clean_unit'].value_counts()))

Number of unique 'clean_unit' values: 208

'clean_unit' values:
            2479
tbsp        2269
C           2132
tsp         2025
lb           542
ts           287
oz           270
clove        242
large        202
medium       155
small        109
whole         61
boneles       41
ea            41
g             33
in            32
up            28
slice         27
can           24
green         23
bunch         20
lime          19
bottle        18
to            17
red           16
pinch         16
ml            14
kilogram      14
package       14
chicken       13
            ... 
chipotle       1
cocktail       1
finely         1
scotch         1
pack           1
mango          1
serving        1
half           1
sprinkle       1
teapoon        1
big            1
hot            1
fat            1
by             1
eggplant       1
negi           1
few            1
bsp            1
anchovy        1
square         1
thyme          1
thick          1
tube           1
vanilla        1
b

### Inspect ingredients in the unit

In [39]:
# Why are there some ingredients in the unit? 
df[(df['unit'] == 'radishes') | (df['unit'] == 'eggplants') | (df['unit'] == 'romaine')]

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num,unit,clean_unit
197481,1158258,grilled lettuces with crme frache and avocado,lettuce,2 romaine,2,romaine,romaine
243884,1217740,bo ssm grilled pork and pickled slaw in lettuc...,grate,"6 radishes,",6,radishes,radish
264383,1279479,grilled vegetables with balsamic vinegar,bell pepper,"2 eggplants,",2,eggplants,eggplant


In [40]:
# some of these have ',' at the end. how many? 
def end_comma(match_str):
    if re.search('.+,$', match_str): 
        return 1
    else: 
        return 0

df['unit_end_comma'] = df['AMOUNT'].apply(end_comma)

In [41]:
df['unit_end_comma'].value_counts()

0    11585
1      102
Name: unit_end_comma, dtype: int64

In [42]:
df[df['unit_end_comma'] == 1]

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num,unit,clean_unit,unit_end_comma
7,561575,chicken breasts with grilled pineapple and tom...,chicken breast,"4 skinless,",4,skinless,skinles,1
60408,989783,jamaican spiced chicken thighs,chicken,"8 skinless,",8,skinless,skinles,1
60424,283460,grilled chicken with tarragon mustard marinade,chicken breast,"4 boneless,",4,boneless,boneles,1
63660,1163403,beer lime grilled chicken,beer,"1 lime,",1,lime,lime,1
63800,847200,grilled tofu steaks with piquillo salsa verde,mince,"1 shallot,",1,shallot,shallot,1
66744,1201584,honey mustard grilled chicken,chicken breast,"4 skinless,",4,skinless,skinles,1
109513,1156101,lime and pepper grilled chicken breasts,chicken breast,"4 boned,",4,boned,bone,1
116619,1185929,best grilled chicken recipe,brown sugar,"1 lime,",1,lime,lime,1
116620,1185929,best grilled chicken recipe,kosher salt,"1/2 lemon,",1/2,lemon,lemon,1
116629,1154780,bbq pork turnip noodle ramen for two,pork tenderloin,"1 scallion,",1,scallion,scallion,1


In [43]:
df[df['dish_id'] == '1234643']

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num,unit,clean_unit,unit_end_comma
281210,1234643,korean bulgogi bbq beef tacos,soy sauce,1 cup,1,cup,C,0
281211,1234643,korean bulgogi bbq beef tacos,mirin,1/2 up,1/2,up,up,0
281212,1234643,korean bulgogi bbq beef tacos,brown sugar,1/2 up,1/2,up,up,0
281213,1234643,korean bulgogi bbq beef tacos,sesame oil,2 tablespoons,2,tablespoons,tbsp,0
281214,1234643,korean bulgogi bbq beef tacos,garlic,4 cloves,4,cloves,clove,0
281215,1234643,korean bulgogi bbq beef tacos,ginger,1 teaspoons,1,teaspoons,tsp,0
281216,1234643,korean bulgogi bbq beef tacos,coriander,1 tablespoon,1,tablespoon,tbsp,0
281217,1234643,korean bulgogi bbq beef tacos,fillet,1 kg,1,kg,kilogram,0
281218,1234643,korean bulgogi bbq beef tacos,tortilla,12-14 mini,12-14,mini,mini,0
281219,1234643,korean bulgogi bbq beef tacos,soy sauce,2 tablespoons,2,tablespoons,tbsp,0


<font color = 'red'> There is something wrong with this recipe </font> 

## Clean 'amt_num'

In [44]:
print("Number of unique 'amt_num' values:", df['amt_num'].value_counts().shape[0])
print("\n'amt_num' values:\n{}".format(df['amt_num'].value_counts()))

Number of unique 'amt_num' values: 294

'amt_num' values:
1              3141
2              1997
               1303
1/2            1264
1/4             979
4               553
3               549
1 1/2           304
1/3             177
3/4             175
6               171
8               131
5                85
1/8              59
12               45
16               38
2/3              31
1 1/4            30
10               29
2 1/2            24
1-1/2            22
1-2              20
1.5              13
0.5              13
2-3              13
3 1/2            12
100              12
3-4              10
7                 9
1 3/4             9
               ... 
2 (9              1
2 (6-             1
1 360             1
1 (3 1/2)         1
4 12              1
3 –4              1
350               1
1 3 1/2 –4-       1
1 &#189;          1
16 1 1/2-         1
2 20-             1
2.5-3             1
4 5-              1
3/4 3/4           1
3+                1
16 (              1
2-

### Upon manual inspection, we found the numbers after "(" is not needed

In [45]:
df['clean_amt_num'] = df['amt_num'].apply(lambda x: x.split('(')[0])

In [46]:
print("Number of unique 'clean_amt_num' values:", df['clean_amt_num'].value_counts().shape[0])
print("\n'clean_amt_num' values:\n{}".format(df['clean_amt_num'].value_counts()))

Number of unique 'clean_amt_num' values: 236

'clean_amt_num' values:
1              3141
2              1997
               1303
1/2            1264
1/4             979
4               553
3               549
1 1/2           304
1/3             177
3/4             175
6               171
8               131
5                85
1/8              59
1                56
12               45
16               38
2/3              31
1 1/4            30
10               29
2 1/2            24
1-1/2            22
1-2              20
4                18
2                16
1.5              13
2-3              13
0.5              13
3 1/2            12
100              12
               ... 
------            1
20-30             1
230               1
1 14.5-           1
4 4-6             1
16                1
10-12             1
14 1/2            1
5-1/4             1
1 360             1
1/4-1/2           1
- 1/8             1
3 –4              1
350               1
1 3 1/2 –4-       1
1 &#189;  

### Inspect values ending with '-'

In [47]:
# how many ends with '-'?
def ends_with(str_val):
    if re.match('.*-$', str_val):
        return 1
    else: 
        return 0
    
df['ends_dash'] = df['clean_amt_num'].apply(ends_with)
df[df['ends_dash'] == 1]

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num,unit,clean_unit,unit_end_comma,clean_amt_num,ends_dash
8,676784,root beer pulled pork bbq crockpot,beer,1 2-liter bottle,1 2-,liter,liter,0,1 2-,1
30,860272,grilled chicken with pineapple salsa,chicken breast,4 6-ounce,4 6-,ounce,oz,0,4 6-,1
61,158721,korean barbecued beef,steak,1 1.75-lb,1 1.75-,lb,lb,0,1 1.75-,1
3959,158898,grilled bratwurst with onions braised in beer ...,bacon,4 4-oz.,4 4-,oz,oz,0,4 4-,1
16173,158759,apricot glazed pork kabobs,apricot,1 10-ounce jar,1 10-,ounce,oz,0,1 10-,1
23280,177663,grilled beef strip steak,steak,2 1-lb steaks,2 1-,lb,lb,0,2 1-,1
29914,162236,penne gorgonzola with grilled chicken,gorgonzola,1 8-oz package,1 8-,oz,oz,0,1 8-,1
37287,159014,grilled greek style pork roast with yogurt sauce,loin,1 3-lb,1 3-,lb,lb,0,1 3-,1
40545,158696,peruvian grilled chicken thighs with tomato ci...,cilantro,1 7-oz jar,1 7-,oz,oz,0,1 7-,1
46300,167936,georges barbecue chicken kabobs,pineapple,1 20-oz can,1 20-,oz,oz,0,1 20-,1


Looks like all of these show number of piece needed followed by the amount for each piece

In [48]:
def clean_ends_dash(row):
    if row['ends_dash'] == 1: 
        # if all -s, blank it out 
        if re.match('--+', row['clean_amt_num']): 
            row['clean_amt_num'] = ''
        elif len(row['clean_amt_num'].split(' ')) > 1:
            first_val = re.findall('(\S+)\s.*-', row['clean_amt_num'])[0]
            second_val = re.findall('\S+\s(.*)-', row['clean_amt_num'])[0]
            row['clean_amt_num'] = "{} x {}".format(first_val, second_val)
        else: 
            row['clean_amt_num'] = re.findall('(.*)-', row['clean_amt_num'])[0]
    return row

df = df.apply(clean_ends_dash, axis = 1)

In [49]:
df[df['ends_dash'] == 1]

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num,unit,clean_unit,unit_end_comma,clean_amt_num,ends_dash
8,676784,root beer pulled pork bbq crockpot,beer,1 2-liter bottle,1 2-,liter,liter,0,1 x 2,1
30,860272,grilled chicken with pineapple salsa,chicken breast,4 6-ounce,4 6-,ounce,oz,0,4 x 6,1
61,158721,korean barbecued beef,steak,1 1.75-lb,1 1.75-,lb,lb,0,1 x 1.75,1
3959,158898,grilled bratwurst with onions braised in beer ...,bacon,4 4-oz.,4 4-,oz,oz,0,4 x 4,1
16173,158759,apricot glazed pork kabobs,apricot,1 10-ounce jar,1 10-,ounce,oz,0,1 x 10,1
23280,177663,grilled beef strip steak,steak,2 1-lb steaks,2 1-,lb,lb,0,2 x 1,1
29914,162236,penne gorgonzola with grilled chicken,gorgonzola,1 8-oz package,1 8-,oz,oz,0,1 x 8,1
37287,159014,grilled greek style pork roast with yogurt sauce,loin,1 3-lb,1 3-,lb,lb,0,1 x 3,1
40545,158696,peruvian grilled chicken thighs with tomato ci...,cilantro,1 7-oz jar,1 7-,oz,oz,0,1 x 7,1
46300,167936,georges barbecue chicken kabobs,pineapple,1 20-oz can,1 20-,oz,oz,0,1 x 20,1


In [50]:
# drop the auxiliary column
df.drop('ends_dash', axis = 1, inplace = True)

In [51]:
print("Number of unique 'clean_amt_num' values:", df['clean_amt_num'].value_counts().shape[0])
print("\n'clean_amt_num' values:\n{}".format(df['clean_amt_num'].value_counts()))

Number of unique 'clean_amt_num' values: 231

'clean_amt_num' values:
1           3141
2           1997
            1309
1/2         1264
1/4          979
4            553
3            549
1 1/2        304
1/3          177
3/4          175
6            171
8            131
5             86
1/8           59
1             56
12            45
16            38
2/3           31
1 1/4         30
10            29
2 1/2         24
1-1/2         22
1-2           20
4             18
2             16
1.5           13
0.5           13
2-3           13
3 1/2         12
100           12
            ... 
10 - 12        1
1 / 2          1
20-30          1
230            1
4 4-6          1
16             1
10-12          1
14 1/2         1
4 4            1
1 x 15         1
1 x 13         1
450            1
7              1
3/4 3/4        1
3+             1
4 12           1
2-2 1/2        1
1 x 15.5       1
1 14.5         1
1 3            1
2  8           1
220            1
2 x 20         1
- 1/8       

What should we do with the range? --> for the purpose, we can probably just take the average

### Clean the cases with '-1/2'

In [52]:
# how many have '-1/2'?
def and_a_half(str_val):
    if re.match('.*-1/2', str_val):
        return 1
    else: 
        return 0
    
df['and_a_half'] = df['clean_amt_num'].apply(and_a_half)
df[df['and_a_half'] == 1]

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num,unit,clean_unit,unit_end_comma,clean_amt_num,and_a_half
19999,441163,barbecued pork sandwiches 5,broth,1-1/2 cups,1-1/2,cups,C,0,1-1/2,1
23164,441162,barbecued pork sandwiches 4,onion,1-1/2 teaspoons,1-1/2,teaspoons,tsp,0,1-1/2,1
23167,441162,barbecued pork sandwiches 4,ketchup,1-1/2 cups,1-1/2,cups,C,0,1-1/2,1
29984,281094,barbecued baby back ribs,liquid smoke,1-1/2 teaspoons,1-1/2,teaspoons,tsp,0,1-1/2,1
49445,441160,barbecued pork sandwiches 3,pepper,1-1/2 teaspoons,1-1/2,teaspoons,tsp,0,1-1/2,1
49446,441160,barbecued pork sandwiches 3,salt,2-1/2 teaspoons,2-1/2,teaspoons,tsp,0,2-1/2,1
49497,441176,barbecued pork sandwiches 2,ketchup,1-1/2 cups,1-1/2,cups,C,0,1-1/2,1
128350,293188,grilled chicken thighs and garden salsa,chicken,1-1/2 pounds,1-1/2,pounds,lb,0,1-1/2,1
147046,413439,asian grilled pork tenderloin with pineapple,kosher,1-1/2 teaspoons,1-1/2,teaspoons,tsp,0,1-1/2,1
181544,720777,grilled lemon chicken with satay dip,red wine vinegar,1-1/2 teaspoons,1-1/2,teaspoons,tsp,0,1-1/2,1


In [53]:
def clean_and_a_half(row):
    if row['and_a_half'] == 1: 
        # if have / before these, keep it
        if re.match('.*/.*-1/2', row['clean_amt_num']): 
            pass
        else:
            row['clean_amt_num'] = row['clean_amt_num'].replace('-', ' ')
    return row

df = df.apply(clean_and_a_half, axis = 1)

In [54]:
df[df['and_a_half'] == 1]

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num,unit,clean_unit,unit_end_comma,clean_amt_num,and_a_half
19999,441163,barbecued pork sandwiches 5,broth,1-1/2 cups,1-1/2,cups,C,0,1 1/2,1
23164,441162,barbecued pork sandwiches 4,onion,1-1/2 teaspoons,1-1/2,teaspoons,tsp,0,1 1/2,1
23167,441162,barbecued pork sandwiches 4,ketchup,1-1/2 cups,1-1/2,cups,C,0,1 1/2,1
29984,281094,barbecued baby back ribs,liquid smoke,1-1/2 teaspoons,1-1/2,teaspoons,tsp,0,1 1/2,1
49445,441160,barbecued pork sandwiches 3,pepper,1-1/2 teaspoons,1-1/2,teaspoons,tsp,0,1 1/2,1
49446,441160,barbecued pork sandwiches 3,salt,2-1/2 teaspoons,2-1/2,teaspoons,tsp,0,2 1/2,1
49497,441176,barbecued pork sandwiches 2,ketchup,1-1/2 cups,1-1/2,cups,C,0,1 1/2,1
128350,293188,grilled chicken thighs and garden salsa,chicken,1-1/2 pounds,1-1/2,pounds,lb,0,1 1/2,1
147046,413439,asian grilled pork tenderloin with pineapple,kosher,1-1/2 teaspoons,1-1/2,teaspoons,tsp,0,1 1/2,1
181544,720777,grilled lemon chicken with satay dip,red wine vinegar,1-1/2 teaspoons,1-1/2,teaspoons,tsp,0,1 1/2,1


In [55]:
# drop the auxiliary column
df.drop('and_a_half', axis = 1, inplace = True)

In [56]:
print("Number of unique 'clean_amt_num' values:", df['clean_amt_num'].value_counts().shape[0])
print("\n'clean_amt_num' values:\n{}".format(df['clean_amt_num'].value_counts()))

Number of unique 'clean_amt_num' values: 228

'clean_amt_num' values:
1           3141
2           1997
            1309
1/2         1264
1/4          979
4            553
3            549
1 1/2        326
1/3          177
3/4          175
6            171
8            131
5             86
1/8           59
1             56
12            45
16            38
2/3           31
1 1/4         30
10            29
2 1/2         28
1-2           20
4             18
2             16
3 1/2         14
0.5           13
2-3           13
1.5           13
100           12
3-4           10
            ... 
10 - 12        1
1 / 2          1
20-30          1
230            1
4 4-6          1
16             1
10-12          1
14 1/2         1
4 4            1
1 x 15         1
1 x 13         1
450            1
7              1
3/4 3/4        1
3+             1
4 12           1
2-2 1/2        1
1 x 15.5       1
1 14.5         1
1 3            1
2  8           1
220            1
2 x 20         1
- 1/8       

### Inspect some odd cases

In [57]:
df[df['clean_amt_num'] == '4 -6']

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num,unit,clean_unit,unit_end_comma,clean_amt_num
207330,1317655,carolina style pulled pork sandwich,cider vinegar,4 -6 lbs,4 -6,lbs,lb,0,4 -6


#### 3/4 3/4?

In [58]:
df[df['clean_amt_num'] == '3/4 3/4']

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num,unit,clean_unit,unit_end_comma,clean_amt_num
257100,1576996,grilled pizza with grilled fennel and parmesan,tomatoes,3/4 3/4 cup,3/4 3/4,cup,C,0,3/4 3/4


In [60]:
df['clean_amt_num'] = df['clean_amt_num'].replace('3/4 3/4', '3/4')

In [61]:
print("Number of unique 'clean_amt_num' values:", df['clean_amt_num'].value_counts().shape[0])
print("\n'clean_amt_num' values:\n{}".format(df['clean_amt_num'].value_counts()))

Number of unique 'clean_amt_num' values: 227

'clean_amt_num' values:
1           3141
2           1997
            1309
1/2         1264
1/4          979
4            553
3            549
1 1/2        326
1/3          177
3/4          176
6            171
8            131
5             86
1/8           59
1             56
12            45
16            38
2/3           31
1 1/4         30
10            29
2 1/2         28
1-2           20
4             18
2             16
3 1/2         14
1.5           13
0.5           13
2-3           13
100           12
3-4           10
            ... 
1 / 2          1
1 x 15         1
20-30          1
230            1
4 4-6          1
16             1
10-12          1
14 1/2         1
4 4            1
2 x 25         1
1 x 13         1
2.5-3          1
2  8           1
3+             1
4 12           1
2-2 1/2        1
1 x 15.5       1
1 14.5         1
1 3            1
7              1
2 x 20         1
450            1
- 1/8          1
1/4-1/2     