# Grill and BBQ Data Cleaning

## Setup

In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
# Visualization
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn')

## Read in Data

In [3]:
# Load the pipe-delimited, Latin-1 encoded ingredient file
df = pd.read_csv(
    'grill-and-bbq.csv',
    sep='|',
    encoding='latin-1',
)
frame_shape = df.shape
print("Shape of DataFrame", frame_shape)
df.head()

Shape of DataFrame (288456, 5)


Unnamed: 0,PAGE,URL,COUNT,INGREDIENT,AMOUNT
0,1,https://www.bigoven.com/recipe/chicken-breasts...,8,2 tablespoons,canola oil
1,1,https://www.bigoven.com/recipe/chicken-breasts...,8,1/2 large,onion
2,1,https://www.bigoven.com/recipe/chicken-breasts...,8,6,pineapple
3,1,https://www.bigoven.com/recipe/chicken-breasts...,8,,pepper
4,1,https://www.bigoven.com/recipe/chicken-breasts...,8,1 1/2 cup,tomatillo


## Clean Data: General

### Fix error in header

In [4]:
# The header labels in the file are swapped: the 'INGREDIENT' column holds the
# amounts and the 'AMOUNT' column holds the ingredient names. Rename both and
# rebind `df` rather than mutating the frame in place.
df = df.rename(columns={'INGREDIENT': 'amt', 'AMOUNT': 'ingredient'})
df.head(10)

Unnamed: 0,PAGE,URL,COUNT,amt,ingredient
0,1,https://www.bigoven.com/recipe/chicken-breasts...,8,2 tablespoons,canola oil
1,1,https://www.bigoven.com/recipe/chicken-breasts...,8,1/2 large,onion
2,1,https://www.bigoven.com/recipe/chicken-breasts...,8,6,pineapple
3,1,https://www.bigoven.com/recipe/chicken-breasts...,8,,pepper
4,1,https://www.bigoven.com/recipe/chicken-breasts...,8,1 1/2 cup,tomatillo
5,1,https://www.bigoven.com/recipe/chicken-breasts...,8,2 tablespoons,cilantro
6,1,https://www.bigoven.com/recipe/chicken-breasts...,8,1 tablespoon,lime juice
7,1,https://www.bigoven.com/recipe/chicken-breasts...,8,"4 skinless,",chicken breast
8,1,https://www.bigoven.com/recipe/root-beer-pulle...,5,1 2-liter bottle,Beer
9,1,https://www.bigoven.com/recipe/root-beer-pulle...,5,1,Pork


### Remove duplicates

In [5]:
# Count rows that are exact repeats of an earlier row
int(df.duplicated().sum())

274067

In [6]:
# Manual inspection showed that page 36 contains repeated recipes
page_36_urls = df.loc[df['PAGE'] == 36, 'URL']
page_36_urls.value_counts().head(5)

https://www.bigoven.com/recipe/grilled-steak-tacos-with-cilantro-chimichurri-sauce/1193837                          780
https://www.bigoven.com/recipe/bulgogi-bbq-pork-tacos-with-charred-tomatillo-sesame-sauce-spring-onion-s/1174013    460
https://www.bigoven.com/recipe/bbq-pulled-pork-sliders/1169199                                                      340
https://www.bigoven.com/recipe/beer-can-chicken/1160810                                                             240
https://www.bigoven.com/recipe/steak-and-corn-kabobs/1216418                                                        220
Name: URL, dtype: int64

In [7]:
# Rows for the ingredient 'steak' on page 36
is_page_36_steak = (df['PAGE'] == 36) & (df['ingredient'] == 'steak')
df[is_page_36_steak].head()

Unnamed: 0,PAGE,URL,COUNT,amt,ingredient
120180,36,https://www.bigoven.com/recipe/steak-and-corn-...,11,1 lb,steak
120366,36,https://www.bigoven.com/recipe/steak-and-corn-...,11,1 lb,steak
120552,36,https://www.bigoven.com/recipe/steak-and-corn-...,11,1 lb,steak
120738,36,https://www.bigoven.com/recipe/steak-and-corn-...,11,1 lb,steak
120924,36,https://www.bigoven.com/recipe/steak-and-corn-...,11,1 lb,steak


These seem like actual duplicates

In [8]:
# Keep the first occurrence of every fully identical row and drop the rest
df = df.drop_duplicates(keep='first')

In [9]:
# Check whether page 36 looks fine now
page_36_urls = df.loc[df['PAGE'] == 36, 'URL']
page_36_urls.value_counts().head(5)

https://www.bigoven.com/recipe/grilled-steak-tacos-with-cilantro-chimichurri-sauce/1193837                          38
https://www.bigoven.com/recipe/bulgogi-bbq-pork-tacos-with-charred-tomatillo-sesame-sauce-spring-onion-s/1174013    23
https://www.bigoven.com/recipe/bbq-pulled-pork-sliders/1169199                                                      17
https://www.bigoven.com/recipe/beer-can-chicken/1160810                                                             12
https://www.bigoven.com/recipe/prosciutto-wrapped-chicken-kebabs/1208275                                            11
Name: URL, dtype: int64

In [10]:
# Rows for the ingredient 'steak' on page 36 after de-duplication
is_page_36_steak = (df['PAGE'] == 36) & (df['ingredient'] == 'steak')
df[is_page_36_steak]

Unnamed: 0,PAGE,URL,COUNT,amt,ingredient
120180,36,https://www.bigoven.com/recipe/steak-and-corn-...,11,1 lb,steak


These seem more reasonable.

### Make all in lower case

In [11]:
# Normalise the free-text columns: lower-case and trim surrounding whitespace.
# The vectorised .str methods replace the per-element lambda, which raised
# AttributeError on missing values (NaN). Missing values are mapped to '' first,
# the same empty-string sentinel the later cells test for (df['amt'] == '').
for item in ['amt', 'ingredient']:
    df[item] = df[item].fillna('').str.lower().str.strip()

## Look at 'URL' Column

### Create column with dish name

In [12]:
# Write a function to get the dish name from URL
def get_dish_name(url):
    """Return the human-readable dish name embedded in a recipe URL.

    The name is the text between ``recipe/`` and the last following ``/``,
    with hyphens replaced by spaces, e.g.
    ``.../recipe/steak-and-corn-kabobs/1216418`` -> ``'steak and corn kabobs'``.

    Parameters
    ----------
    url : str
        Recipe URL.

    Returns
    -------
    str
        The dish name, or ``''`` when ``url`` has no ``recipe/<name>/`` part
        (previously this case raised ``IndexError`` and aborted the whole
        ``apply`` over the column).
    """
    # Raw string: '\S' inside a plain literal is an invalid escape sequence.
    match = re.search(r'\S*recipe/(\S*)/.*', url)
    if match is None:
        return ''
    return match.group(1).replace('-', ' ')

In [13]:
# Derive the dish name of every row from its URL
df['dish_name'] = df['URL'].map(get_dish_name)
df.head()

Unnamed: 0,PAGE,URL,COUNT,amt,ingredient,dish_name
0,1,https://www.bigoven.com/recipe/chicken-breasts...,8,2 tablespoons,canola oil,chicken breasts with grilled pineapple and tom...
1,1,https://www.bigoven.com/recipe/chicken-breasts...,8,1/2 large,onion,chicken breasts with grilled pineapple and tom...
2,1,https://www.bigoven.com/recipe/chicken-breasts...,8,6,pineapple,chicken breasts with grilled pineapple and tom...
3,1,https://www.bigoven.com/recipe/chicken-breasts...,8,,pepper,chicken breasts with grilled pineapple and tom...
4,1,https://www.bigoven.com/recipe/chicken-breasts...,8,1 1/2 cup,tomatillo,chicken breasts with grilled pineapple and tom...


In [14]:
# The five dishes with the most rows
dish_counts = df['dish_name'].value_counts()
dish_counts.head(5)

grilled steak tacos with cilantro chimichurri sauce        76
grilled pork tenderloin                                    62
cold rice noodles with grilled chicken and peanut sauce    56
baby loin back ribs                                        54
canyon ranch grilled chicken enchiladas                    54
Name: dish_name, dtype: int64

### Inspect all records for 'grilled steak tacos with cilantro chimichurri sauce' to figure out how to clean the data further

In [15]:
# Select every row of the dish with the most records
is_steak_tacos = df['dish_name'] == 'grilled steak tacos with cilantro chimichurri sauce'
grilled_steak_tacos = df[is_steak_tacos]

In [16]:
# Display the rows selected for this dish
grilled_steak_tacos

Unnamed: 0,PAGE,URL,COUNT,amt,ingredient,dish_name
120202,36,https://www.bigoven.com/recipe/grilled-steak-t...,49,,marinade,grilled steak tacos with cilantro chimichurri ...
120203,36,https://www.bigoven.com/recipe/grilled-steak-t...,49,2 lbs,steak,grilled steak tacos with cilantro chimichurri ...
120204,36,https://www.bigoven.com/recipe/grilled-steak-t...,49,1 sliced,onion,grilled steak tacos with cilantro chimichurri ...
120205,36,https://www.bigoven.com/recipe/grilled-steak-t...,49,,orange,grilled steak tacos with cilantro chimichurri ...
120206,36,https://www.bigoven.com/recipe/grilled-steak-t...,49,,lime,grilled steak tacos with cilantro chimichurri ...
120207,36,https://www.bigoven.com/recipe/grilled-steak-t...,49,1/3,soy sauce,grilled steak tacos with cilantro chimichurri ...
120208,36,https://www.bigoven.com/recipe/grilled-steak-t...,49,1/3,olive oil,grilled steak tacos with cilantro chimichurri ...
120209,36,https://www.bigoven.com/recipe/grilled-steak-t...,49,1/2 tsp,sugar,grilled steak tacos with cilantro chimichurri ...
120210,36,https://www.bigoven.com/recipe/grilled-steak-t...,49,"4 smashed,",garlic,grilled steak tacos with cilantro chimichurri ...
120211,36,https://www.bigoven.com/recipe/grilled-steak-t...,49,1/2 c,cilantro,grilled steak tacos with cilantro chimichurri ...


In [17]:
# How many rows does each URL of this dish contribute?
tacos_urls = grilled_steak_tacos['URL']
tacos_urls.value_counts()

https://www.bigoven.com/recipe/grilled-steak-tacos-with-cilantro-chimichurri-sauce/1193837    76
Name: URL, dtype: int64

In [18]:
# How often does each ingredient appear for this dish?
tacos_ingredients = grilled_steak_tacos['ingredient']
tacos_ingredients.value_counts()

cilantro           10
olive oil           6
onion               6
garlic              6
kosher salt         4
sugar               4
cumin               4
lime                4
kosher              2
soy sauce           2
tomatoes            2
peppercorn          2
italian parsley     2
lime juice          2
red onion           2
tortilla            2
orange              2
chili               2
coriander           2
steak               2
vinegar             2
pepper              2
marinade            2
radish              2
Name: ingredient, dtype: int64

In [19]:
# Rows of this dish whose ingredient is 'steak'
is_steak = grilled_steak_tacos['ingredient'] == 'steak'
grilled_steak_tacos[is_steak]

Unnamed: 0,PAGE,URL,COUNT,amt,ingredient,dish_name
120203,36,https://www.bigoven.com/recipe/grilled-steak-t...,49,2 lbs,steak,grilled steak tacos with cilantro chimichurri ...
123985,37,https://www.bigoven.com/recipe/grilled-steak-t...,49,2 lbs,steak,grilled steak tacos with cilantro chimichurri ...


- <font color='red'>Note: These are from two different pages. Do we need page information? </font>
- But even if we remove the duplicates across pages, there is still an issue: COUNT = 49, yet we have 76 records over two pages, which means only 38 records for each copy of the recipe instead of the expected 49.

## Look at 'amt' Column

### What kind of ingredients are missing amounts? 

In [20]:
# How many rows have an empty amount?
has_no_amt = df['amt'].eq('')
missing_amt = df.loc[has_no_amt]
missing_amt.shape

(1560, 6)

In [21]:
# Look at a few examples
missing_amt.head(5)

Unnamed: 0,PAGE,URL,COUNT,amt,ingredient,dish_name
3,1,https://www.bigoven.com/recipe/chicken-breasts...,8,,pepper,chicken breasts with grilled pineapple and tom...
31,1,https://www.bigoven.com/recipe/grilled-chicken...,10,,olive oil,grilled chicken with pineapple salsa
32,1,https://www.bigoven.com/recipe/grilled-chicken...,10,,pepper,grilled chicken with pineapple salsa
33,1,https://www.bigoven.com/recipe/grilled-chicken...,10,,salt,grilled chicken with pineapple salsa
39,1,https://www.bigoven.com/recipe/grilled-chicken...,10,,cilantro,grilled chicken with pineapple salsa


In [22]:
# Which ingredients most often come without an amount?
no_amt_ingredients = missing_amt['ingredient']
no_amt_ingredients.value_counts()

pepper             124
salt                82
black pepper        81
garlic              58
lemon               55
olive oil           51
kosher salt         46
chicken             35
cooking spray       35
onion               29
cilantro            29
lime                28
ground pepper       26
vegetable oil       25
chicken breast      19
brown sugar         19
spice               19
tomatoes            18
sugar               17
cayenne             16
bbq                 15
pork                15
marinade            15
cider vinegar       15
honey               14
soy sauce           14
paprika             14
mustard             13
sesame seeds        13
canola oil          13
                  ... 
broth                1
umeboshi             1
yeast                1
soup                 1
radicchio            1
almond               1
bay leaves           1
mozzarella           1
tea                  1
italian parsley      1
seasonings           1
bay leaf             1
parmesan   

### Can we divide it by amount and unit? 
This could make the cleaning easier.

In [23]:
# Trim surrounding whitespace from the amount strings first
df['amt'] = df['amt'].str.strip()

In [24]:
# Number of distinct amount strings
amt_counts = df['amt'].value_counts()
amt_counts.shape

(1289,)

In [25]:
# The 20 most frequent amount strings
df['amt'].value_counts().head(20)

                 1560
1 tablespoon      774
1/4 cup           663
2 tablespoons     597
1 teaspoon        595
1/2 cup           527
1/2 teaspoon      497
1                 443
1 cup             364
2                 302
1/4 teaspoon      290
2 teaspoons       260
3 tablespoons     218
4                 184
1/3 cup           157
2 tb              156
1 tb              137
1 ts              131
2 cups            125
1 tsp             123
Name: amt, dtype: int64

In [26]:
def get_amt(amt_str):
    """Return the first non-alphabetic part of an amount string.

    Takes the first run of characters that are not ASCII letters (digits,
    fractions, spaces, punctuation) and strips surrounding whitespace, e.g.
    ``'1 1/2 cup'`` -> ``'1 1/2'``.

    Parameters
    ----------
    amt_str : str
        Raw amount text such as ``'2 tablespoons'``.

    Returns
    -------
    str
        The extracted amount. ``''`` is returned when ``amt_str`` is empty,
        consists only of letters, or when its first non-letter run is
        whitespace only (e.g. a word followed by a space).
    """
    # Exclude upper-case letters as well, matching the [A-Za-z] class used by
    # get_unit; with [^a-z] an input like '2 Tablespoons' yielded '2 T'.
    # A single search also replaces the two identical findall() scans.
    match = re.search('[^A-Za-z]+', amt_str)
    return match.group().strip() if match else ''

In [27]:
# Extract the numeric part of every amount into its own column
df['amt_num'] = df['amt'].map(get_amt)
df.head()

Unnamed: 0,PAGE,URL,COUNT,amt,ingredient,dish_name,amt_num
0,1,https://www.bigoven.com/recipe/chicken-breasts...,8,2 tablespoons,canola oil,chicken breasts with grilled pineapple and tom...,2
1,1,https://www.bigoven.com/recipe/chicken-breasts...,8,1/2 large,onion,chicken breasts with grilled pineapple and tom...,1/2
2,1,https://www.bigoven.com/recipe/chicken-breasts...,8,6,pineapple,chicken breasts with grilled pineapple and tom...,6
3,1,https://www.bigoven.com/recipe/chicken-breasts...,8,,pepper,chicken breasts with grilled pineapple and tom...,
4,1,https://www.bigoven.com/recipe/chicken-breasts...,8,1 1/2 cup,tomatillo,chicken breasts with grilled pineapple and tom...,1 1/2


In [28]:
def get_unit(amt_str):
    """Extract the unit, i.e. the first run of ASCII letters, from an amount string.

    Returns an empty string when the input is empty or contains no letters.
    """
    if len(amt_str) > 0:
        first_word = re.search('[A-Za-z]+', amt_str)
        if first_word is not None:
            return first_word.group().strip()
    return ''

In [29]:
# Extract the unit word of every amount into its own column
df['unit'] = df['amt'].map(get_unit)
df.head()

Unnamed: 0,PAGE,URL,COUNT,amt,ingredient,dish_name,amt_num,unit
0,1,https://www.bigoven.com/recipe/chicken-breasts...,8,2 tablespoons,canola oil,chicken breasts with grilled pineapple and tom...,2,tablespoons
1,1,https://www.bigoven.com/recipe/chicken-breasts...,8,1/2 large,onion,chicken breasts with grilled pineapple and tom...,1/2,large
2,1,https://www.bigoven.com/recipe/chicken-breasts...,8,6,pineapple,chicken breasts with grilled pineapple and tom...,6,
3,1,https://www.bigoven.com/recipe/chicken-breasts...,8,,pepper,chicken breasts with grilled pineapple and tom...,,
4,1,https://www.bigoven.com/recipe/chicken-breasts...,8,1 1/2 cup,tomatillo,chicken breasts with grilled pineapple and tom...,1 1/2,cup


In [30]:
# Distinct units and how often each one occurs
unit_counts = df['unit'].value_counts()
print("Unit: {}".format(unit_counts.shape))
print("\nValues:\n{}".format(unit_counts))

Unit: (276,)

Values:
               2984
cup            1982
teaspoon       1613
tablespoons    1025
tablespoon      950
teaspoons       474
tb              418
tsp             386
ts              373
c               367
tbsp            318
cups            318
pound           217
cloves          204
large           200
pounds          200
lb              181
medium          166
oz              157
small           118
clove           100
ounce            85
ounces           82
whole            75
lbs              65
t                61
boneless         52
tbs              46
g                40
inch             37
               ... 
sesame            1
by                1
slabs             1
sticks            1
pacific           1
rashers           1
links             1
cilantro          1
peaches           1
brown             1
generous          1
oranges           1
potato            1
thick             1
leaves            1
loaf              1
bsp               1
chili             

In [31]:
# Distinct numeric amounts and how often each one occurs
amt_num_counts = df['amt_num'].value_counts()
print("Numeric amount: {}".format(amt_num_counts.shape))
print("\nValues:\n{}".format(amt_num_counts))

Numeric amount: (294,)

Values:
1            3862
2            2450
             1580
1/2          1577
1/4          1219
4             679
3             662
1 1/2         374
3/4           226
6             216
1/3           214
8             161
5             100
1/8            73
12             58
16             47
1 1/4          38
2/3            36
10             34
2 1/2          31
1-2            28
1-1/2          27
0.5            20
1.5            18
100            17
2-3            16
3-4            14
3 1/2          14
20             13
1 3/4          12
             ... 
1 1/2 - 2       1
2 1&            1
1500            1
1 1.75-         1
3 (6-           1
2 8             1
1 18-           1
6-7             1
2  8            1
2 (9            1
4 12            1
1 13-           1
10-12           1
3/4 3/4         1
5-1/4           1
1 3             1
3 5-6           1
4 (12-          1
1 / 2           1
4 4             1
0.67            1
1 (4-5          1
------        

### Rearrange columns

In [32]:
# List the current column names
df.columns

Index(['PAGE', 'URL', 'COUNT', 'amt', 'ingredient', 'dish_name', 'amt_num',
       'unit'],
      dtype='object')

In [33]:
# Put the derived fields first and the page, URL and raw amount text last
column_order = ['dish_name', 'COUNT', 'ingredient', 'amt_num', 'unit', 'PAGE', 'URL', 'amt']
df = df.loc[:, column_order]
df.head()

Unnamed: 0,dish_name,COUNT,ingredient,amt_num,unit,PAGE,URL,amt
0,chicken breasts with grilled pineapple and tom...,8,canola oil,2,tablespoons,1,https://www.bigoven.com/recipe/chicken-breasts...,2 tablespoons
1,chicken breasts with grilled pineapple and tom...,8,onion,1/2,large,1,https://www.bigoven.com/recipe/chicken-breasts...,1/2 large
2,chicken breasts with grilled pineapple and tom...,8,pineapple,6,,1,https://www.bigoven.com/recipe/chicken-breasts...,6
3,chicken breasts with grilled pineapple and tom...,8,pepper,,,1,https://www.bigoven.com/recipe/chicken-breasts...,
4,chicken breasts with grilled pineapple and tom...,8,tomatillo,1 1/2,cup,1,https://www.bigoven.com/recipe/chicken-breasts...,1 1/2 cup
