# Grill and BBQ Data Cleaning

## Setup

In [2]:
import numpy as np
import pandas as pd
import re

In [3]:
# visualization
import matplotlib.pyplot as plt
%matplotlib inline
# The bare 'seaborn' style name was renamed to 'seaborn-v0_8' in matplotlib 3.6
# and later removed; prefer the new name and fall back on older installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [4]:
# for lemmatization
#import spacy

## Read in Data

In [5]:
# Load the pipe-delimited recipe file, report its size and preview the first rows
df = pd.read_csv('data/RECIPE.csv', delimiter='|')
print("Shape of DataFrame", df.shape)
df.head()

Shape of DataFrame (31967, 5)


Unnamed: 0,PAGE,URL,COUNT,INGREDIENT,AMOUNT
0,100,https://www.bigoven.com/recipe/anyone-can-make...,10,,black pepper
1,100,https://www.bigoven.com/recipe/anyone-can-make...,10,1 large,onion
2,100,https://www.bigoven.com/recipe/anyone-can-make...,10,1 tablespoon,red pepper flakes
3,100,https://www.bigoven.com/recipe/anyone-can-make...,10,1,pork
4,100,https://www.bigoven.com/recipe/anyone-can-make...,10,1,toast


## Clean Data: General

### Drop metadata

'PAGE' and 'COUNT' are not needed for the analysis.

In [6]:
# Remove the two metadata columns from the frame (in place), then preview it
df.drop(columns=['PAGE', 'COUNT'], inplace=True)
df.head()

Unnamed: 0,URL,INGREDIENT,AMOUNT
0,https://www.bigoven.com/recipe/anyone-can-make...,,black pepper
1,https://www.bigoven.com/recipe/anyone-can-make...,1 large,onion
2,https://www.bigoven.com/recipe/anyone-can-make...,1 tablespoon,red pepper flakes
3,https://www.bigoven.com/recipe/anyone-can-make...,1,pork
4,https://www.bigoven.com/recipe/anyone-can-make...,1,toast


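### Fix error in header

The two data headers are swapped in the raw file: the column labelled `INGREDIENT` holds quantities (e.g. `1 large`) and the column labelled `AMOUNT` holds ingredient names (e.g. `onion`). The labels are only corrected when the ingredient table is exported at the end of the notebook.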

### Remove duplicates

In [7]:
# Report the frame size and the number of fully duplicated rows
print("shape of DataFrame before de-dup:", df.shape)
print("Number of duplicates:", df.duplicated().sum())

shape of DataFrame before de-dup: (31967, 3)
Number of duplicates: 9314


In [8]:
# Remove duplicates, keeping the first occurrence of each repeated row
df = df.drop_duplicates(keep='first')

print("shape of DataFrame after de-dup:", df.shape)

shape of DataFrame after de-dup: (22653, 3)


### Make all in lower case

In [9]:
# Normalise both text columns: lower-case and trim surrounding whitespace.
# The vectorised .str accessor passes missing values (NaN) through unchanged,
# whereas calling x.lower() on each element raises AttributeError on them.
for item in ['AMOUNT', 'INGREDIENT']:
    df[item] = df[item].str.lower().str.strip()

## Look at 'URL' Column

### Create a column with dish id

In [10]:
# Write a function to get the dish id from URL
def get_dish_id(url):
    """Return the numeric id that ends a recipe URL, as a string.

    The id is the run of digits that follows a '/' and runs to the end of `url`.
    Raises IndexError if `url` does not end in '/<digits>'.
    """
    # Raw string: '\S' in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    dish_id = re.findall(r'\S*/([0-9]+)$', url)[0]
    return dish_id

In [11]:
# Derive the dish id for every row from its URL
df['dish_id'] = df['URL'].map(get_dish_id)
df.head()

Unnamed: 0,URL,INGREDIENT,AMOUNT,dish_id
0,https://www.bigoven.com/recipe/anyone-can-make...,,black pepper,1591891
1,https://www.bigoven.com/recipe/anyone-can-make...,1 large,onion,1591891
2,https://www.bigoven.com/recipe/anyone-can-make...,1 tablespoon,red pepper flakes,1591891
3,https://www.bigoven.com/recipe/anyone-can-make...,1,pork,1591891
4,https://www.bigoven.com/recipe/anyone-can-make...,1,toast,1591891


### Create a column with dish name

In [12]:
# Write a function to get the dish name from URL
def get_dish_name(url):
    """Return the dish name embedded in a recipe URL, with dashes turned into spaces.

    The name is the path segment between 'recipe/' and the last following '/'.
    Raises IndexError if `url` contains no 'recipe/<name>/' part.
    """
    # Raw string: '\S' in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    slug = re.findall(r'\S*recipe/(\S*)/.*', url)[0]
    # A literal replacement does not need the regex engine.
    return slug.replace('-', ' ')

In [13]:
# Derive the dish name for every row from its URL
df['dish_name'] = df['URL'].map(get_dish_name)
df.head()

Unnamed: 0,URL,INGREDIENT,AMOUNT,dish_id,dish_name
0,https://www.bigoven.com/recipe/anyone-can-make...,,black pepper,1591891,anyone can make bbq pulled pork
1,https://www.bigoven.com/recipe/anyone-can-make...,1 large,onion,1591891,anyone can make bbq pulled pork
2,https://www.bigoven.com/recipe/anyone-can-make...,1 tablespoon,red pepper flakes,1591891,anyone can make bbq pulled pork
3,https://www.bigoven.com/recipe/anyone-can-make...,1,pork,1591891,anyone can make bbq pulled pork
4,https://www.bigoven.com/recipe/anyone-can-make...,1,toast,1591891,anyone can make bbq pulled pork


In [14]:
# The five dish names with the most rows
df['dish_name'].value_counts().head(5)

bbq pulled pork                             51
grilled lemon chicken                       49
korean bulgogi bbq beef tacos               48
grilled pork tenderloin                     47
hoisin grilled chicken with soba noodles    46
Name: dish_name, dtype: int64

## Clean Columns

In [15]:
# dish_id and dish_name have been extracted, so the URL column can go
df.drop(columns=['URL'], inplace=True)
df.head()

Unnamed: 0,INGREDIENT,AMOUNT,dish_id,dish_name
0,,black pepper,1591891,anyone can make bbq pulled pork
1,1 large,onion,1591891,anyone can make bbq pulled pork
2,1 tablespoon,red pepper flakes,1591891,anyone can make bbq pulled pork
3,1,pork,1591891,anyone can make bbq pulled pork
4,1,toast,1591891,anyone can make bbq pulled pork


### Rearrange columns

In [16]:
# check all columns
df.columns

Index(['INGREDIENT', 'AMOUNT', 'dish_id', 'dish_name'], dtype='object')

In [17]:
# Put the dish identifiers first, followed by the two data columns
df = df.loc[:, ['dish_id', 'dish_name', 'INGREDIENT', 'AMOUNT']]
df.head()

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT
0,1591891,anyone can make bbq pulled pork,,black pepper
1,1591891,anyone can make bbq pulled pork,1 large,onion
2,1591891,anyone can make bbq pulled pork,1 tablespoon,red pepper flakes
3,1591891,anyone can make bbq pulled pork,1,pork
4,1591891,anyone can make bbq pulled pork,1,toast


## Inspect all records for 'grilled steak tacos with cilantro chimichurri sauce' to figure out the way to clean further

In [18]:
# All rows that belong to this one dish
grilled_steak_tacos = df.loc[df['dish_name'].eq('grilled steak tacos with cilantro chimichurri sauce')]

In [19]:
grilled_steak_tacos

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT
18689,1193837,grilled steak tacos with cilantro chimichurri ...,,cilantro
18690,1193837,grilled steak tacos with cilantro chimichurri ...,,marinade
18691,1193837,grilled steak tacos with cilantro chimichurri ...,,cilantro
18692,1193837,grilled steak tacos with cilantro chimichurri ...,,coriander
18693,1193837,grilled steak tacos with cilantro chimichurri ...,,cumin
18694,1193837,grilled steak tacos with cilantro chimichurri ...,,kosher
18695,1193837,grilled steak tacos with cilantro chimichurri ...,,lime
18696,1193837,grilled steak tacos with cilantro chimichurri ...,,onion
18697,1193837,grilled steak tacos with cilantro chimichurri ...,,orange
18698,1193837,grilled steak tacos with cilantro chimichurri ...,,pepper


## Look at 'AMOUNT' Column

### What kind of ingredients are missing amounts? 

In [20]:
# How many?
# NOTE(review): in the previews above the quantities ('1 large', '1 tablespoon')
# sit under 'INGREDIENT' and the ingredient names under 'AMOUNT', which would
# explain why this filter finds no rows -- confirm which column should be tested.
missing_amt = df[df['AMOUNT'] == '']
missing_amt.shape

(0, 4)

In [21]:
# Check a few example
missing_amt.head()

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT


In [22]:
# Check the ingredients
missing_amt['INGREDIENT'].value_counts()

Series([], Name: INGREDIENT, dtype: int64)

### Split it into amount and unit

In [23]:
# How many different values? 
df['AMOUNT'].value_counts().shape

(495,)

In [24]:
# Trim leading/trailing whitespace from every 'AMOUNT' value
df['AMOUNT'] = df['AMOUNT'].map(lambda text: text.strip())

In [25]:
# How many different values? 
df['AMOUNT'].value_counts().shape

(495,)

In [26]:
# Check top 20 values
df['AMOUNT'].value_counts()[:20]

garlic            1465
salt               939
black pepper       761
pepper             718
olive oil          709
onion              638
lemon              556
soy sauce          493
chicken breast     492
brown sugar        457
chicken            399
ginger             310
sugar              306
cumin              302
kosher salt        299
pork               288
cilantro           287
mustard            283
honey              281
steak              280
Name: AMOUNT, dtype: int64

In [27]:
def get_amt(amt_str):
    """Return the first run of characters that are not lower-case a-z, stripped.

    Gives '' when `amt_str` is empty or consists only of lower-case letters.
    """
    non_letter_runs = re.findall('[^a-z]+', amt_str)
    return non_letter_runs[0].strip() if non_letter_runs else ''

In [28]:
# NOTE(review): 'AMOUNT' appears to hold ingredient names in this file (the
# quantities are under 'INGREDIENT'), so amt_num comes out blank for the rows
# previewed below -- confirm which column should feed get_amt.
df['amt_num'] = df['AMOUNT'].apply(get_amt)
df.head()

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num
0,1591891,anyone can make bbq pulled pork,,black pepper,
1,1591891,anyone can make bbq pulled pork,1 large,onion,
2,1591891,anyone can make bbq pulled pork,1 tablespoon,red pepper flakes,
3,1591891,anyone can make bbq pulled pork,1,pork,
4,1591891,anyone can make bbq pulled pork,1,toast,


In [29]:
def get_unit(amt_str):
    """Return the first run of ASCII letters found in `amt_str`.

    Gives '' when `amt_str` is empty or contains no letters.
    """
    first_word = re.search('[A-Za-z]+', amt_str)
    if first_word is None:
        return ''
    return first_word.group(0)

In [30]:
# NOTE(review): 'AMOUNT' appears to hold ingredient names in this file, so
# 'unit' ends up as the first word of the ingredient ('black', 'onion', ...)
# rather than a measurement unit -- confirm which column should feed get_unit.
df['unit'] = df['AMOUNT'].apply(get_unit)
df.head()

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num,unit
0,1591891,anyone can make bbq pulled pork,,black pepper,,black
1,1591891,anyone can make bbq pulled pork,1 large,onion,,onion
2,1591891,anyone can make bbq pulled pork,1 tablespoon,red pepper flakes,,red
3,1591891,anyone can make bbq pulled pork,1,pork,,pork
4,1591891,anyone can make bbq pulled pork,1,toast,,toast


In [31]:
# unit: count each distinct value once and reuse the result for both prints
unit_counts = df['unit'].value_counts()
print("Unit: {}".format(unit_counts.shape))
print("\nValues:\n{}".format(unit_counts))

Unit: (426,)

Values:
garlic            1465
chicken            972
salt               939
black              784
onion              757
olive              735
pepper             718
red                590
lemon              590
soy                493
brown              475
pork               434
lime               345
ginger             341
kosher             307
sugar              306
cumin              302
cilantro           287
chili              284
mustard            283
honey              281
steak              280
paprika            274
vegetable          263
sesame             247
worcestershire     233
cayenne            231
butter             224
oregano            204
parsley            202
                  ... 
prune                1
tilapia              1
half                 1
calamari             1
turnip               1
somen                1
frenched             1
protein              1
nitrite              1
fall                 1
grand                1
saute       

In [32]:
# Numeric amount
print("Numeric amount: {}".format(df['amt_num'].value_counts().shape))
print("\nValues:\n{}".format(df['amt_num'].value_counts()))

Numeric amount: (3,)

Values:
     22599
-       53
è        1
Name: amt_num, dtype: int64


## Clean 'Unit'

In [33]:
print("Number of unique 'unit' values:", df['unit'].value_counts().shape[0])
print("\n'unit' values:\n{}".format(df['unit'].value_counts()))

Number of unique 'unit' values: 426

'unit' values:
garlic            1465
chicken            972
salt               939
black              784
onion              757
olive              735
pepper             718
red                590
lemon              590
soy                493
brown              475
pork               434
lime               345
ginger             341
kosher             307
sugar              306
cumin              302
cilantro           287
chili              284
mustard            283
honey              281
steak              280
paprika            274
vegetable          263
sesame             247
worcestershire     233
cayenne            231
butter             224
oregano            204
parsley            202
                  ... 
prune                1
tilapia              1
half                 1
calamari             1
turnip               1
somen                1
frenched             1
protein              1
nitrite              1
fall                 1
grand

### Lemmatize the 'unit'

In [34]:
#en_nlp = spacy.load('en')

def get_clean_unit(unit):
    """Return the lemma of the first token of `unit`, or '' when there are no tokens.

    Uses the module-level `en_nlp` callable (presumably the spaCy pipeline whose
    loading is commented out in this cell -- it must be defined before calling).
    """
    lemmas = [tok.lemma_ for tok in en_nlp(unit)]
    return lemmas[0] if lemmas else ''
    
#df['clean_unit'] = df['unit'].apply(get_clean_unit)
df['clean_unit'] = df['unit']

In [35]:
df['clean_unit'].value_counts()

garlic            1465
chicken            972
salt               939
black              784
onion              757
olive              735
pepper             718
red                590
lemon              590
soy                493
brown              475
pork               434
lime               345
ginger             341
kosher             307
sugar              306
cumin              302
cilantro           287
chili              284
mustard            283
honey              281
steak              280
paprika            274
vegetable          263
sesame             247
worcestershire     233
cayenne            231
butter             224
oregano            204
parsley            202
                  ... 
prune                1
tilapia              1
half                 1
calamari             1
turnip               1
somen                1
frenched             1
protein              1
nitrite              1
fall                 1
grand                1
saute                1
bouquet    

In [36]:
print("Number of unique 'clean_unit' values:", df['clean_unit'].value_counts().shape[0])
print("\n'clean_unit' values:\n{}".format(df['clean_unit'].value_counts()))

Number of unique 'clean_unit' values: 426

'clean_unit' values:
garlic            1465
chicken            972
salt               939
black              784
onion              757
olive              735
pepper             718
red                590
lemon              590
soy                493
brown              475
pork               434
lime               345
ginger             341
kosher             307
sugar              306
cumin              302
cilantro           287
chili              284
mustard            283
honey              281
steak              280
paprika            274
vegetable          263
sesame             247
worcestershire     233
cayenne            231
butter             224
oregano            204
parsley            202
                  ... 
prune                1
tilapia              1
half                 1
calamari             1
turnip               1
somen                1
frenched             1
protein              1
nitrite              1
fall            

### Convert full unit names to abbreviated versions

In [37]:
# Lookup table that maps unit spellings (including a few typos) to one
# canonical form each. We will also fix typo here.
unit_abbr_dict = {"cup": "C", "c": "C",
                  "centimetre": "cm",
                  "each": "ea",
                  "inch": "in",
                  # "kilo" used to be listed twice ("kg" and "kilogram"); in a dict
                  # literal the last entry wins, so only the effective one is kept.
                  "kilo": "kilogram", "kg": "kilogram",
                  "leave": "leaf",
                  "ounce": "oz", "once": "oz",
                  "pound": "lb",
                  "tablespoon": "tbsp", "tb": "tbsp", "tablspoon": "tbsp", "tbl": "tbsp",
                  "teaspoon": "tsp", "t": "tsp",

                  # leave the size with full version
                  "sm": "small", "md": "medium", "lg": "large"
                 }

In [38]:
def get_full_unit(unit):
    """Return the canonical form of `unit` from `unit_abbr_dict`, or `unit` itself if unlisted."""
    return unit_abbr_dict.get(unit, unit)

df['clean_unit'] = df['clean_unit'].apply(get_full_unit)

In [39]:
print("Number of unique 'clean_unit' values:", df['clean_unit'].value_counts().shape[0])
print("\n'clean_unit' values:\n{}".format(df['clean_unit'].value_counts()))

Number of unique 'clean_unit' values: 426

'clean_unit' values:
garlic            1465
chicken            972
salt               939
black              784
onion              757
olive              735
pepper             718
red                590
lemon              590
soy                493
brown              475
pork               434
lime               345
ginger             341
kosher             307
sugar              306
cumin              302
cilantro           287
chili              284
mustard            283
honey              281
steak              280
paprika            274
vegetable          263
sesame             247
worcestershire     233
cayenne            231
butter             224
oregano            204
parsley            202
                  ... 
prune                1
tilapia              1
half                 1
calamari             1
turnip               1
somen                1
frenched             1
protein              1
nitrite              1
fall            

### Inspect ingredients in the unit

In [40]:
# Why are there some ingredients in the unit?
df[df['unit'].isin(['radishes', 'eggplants', 'romaine'])]

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num,unit,clean_unit
3509,1297109,grilled chicken caesar mac,1 head,romaine,,romaine,romaine
9099,1742990,chicken souvlaki with grilled pita,,romaine,,romaine,romaine


In [41]:
# some of these have ',' at the end. how many?
def end_comma(match_str):
    """Return 1 if `match_str` ends with a comma preceded by at least one character, else 0."""
    return int(bool(re.search('.+,$', match_str)))

df['unit_end_comma'] = df['AMOUNT'].apply(end_comma)

In [42]:
df['unit_end_comma'].value_counts()

0    22653
Name: unit_end_comma, dtype: int64

In [43]:
df[df['unit_end_comma'] == 1]

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num,unit,clean_unit,unit_end_comma


In [44]:
df[df['dish_id'] == '1234643']

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num,unit,clean_unit,unit_end_comma
1355,1234643,korean bulgogi bbq beef tacos,,lime,,lime,lime,0
1356,1234643,korean bulgogi bbq beef tacos,,lime,,lime,lime,0
1357,1234643,korean bulgogi bbq beef tacos,"1 carrot,",grate,,grate,grate,0
1358,1234643,korean bulgogi bbq beef tacos,1 cup,kale,,kale,kale,0
1359,1234643,korean bulgogi bbq beef tacos,1 cup,cabbage,,cabbage,cabbage,0
1360,1234643,korean bulgogi bbq beef tacos,1 cup,soy sauce,,soy,soy,0
1361,1234643,korean bulgogi bbq beef tacos,1 kg,fillet,,fillet,fillet,0
1362,1234643,korean bulgogi bbq beef tacos,1 tablespoon,mirin,,mirin,mirin,0
1363,1234643,korean bulgogi bbq beef tacos,1 tablespoon,sour cream,,sour,sour,0
1364,1234643,korean bulgogi bbq beef tacos,1 tablespoon,coriander,,coriander,coriander,0


<font color = 'red'> There is something wrong with this recipe </font> 

## Clean 'amt_num'

In [45]:
print("Number of unique 'amt_num' values:", df['amt_num'].value_counts().shape[0])
print("\n'amt_num' values:\n{}".format(df['amt_num'].value_counts()))

Number of unique 'amt_num' values: 3

'amt_num' values:
     22599
-       53
è        1
Name: amt_num, dtype: int64


### Upon manual inspection, we found that the numbers after "(" are not needed

In [46]:
# Keep only the text that precedes the first '(' of each amount
df['clean_amt_num'] = df['amt_num'].map(lambda amt: amt.partition('(')[0])

In [47]:
print("Number of unique 'clean_amt_num' values:", df['clean_amt_num'].value_counts().shape[0])
print("\n'clean_amt_num' values:\n{}".format(df['clean_amt_num'].value_counts()))

Number of unique 'clean_amt_num' values: 3

'clean_amt_num' values:
     22599
-       53
è        1
Name: clean_amt_num, dtype: int64


### Inspect values ending with '-'

In [48]:
# how many end with '-'?
def ends_with(str_val):
    """Return 1 if `str_val` matches '.*-$' (i.e. ends with a dash), else 0."""
    matched = re.match('.*-$', str_val)
    return 1 if matched is not None else 0
    
df['ends_dash'] = df['clean_amt_num'].apply(ends_with)
df[df['ends_dash'] == 1]

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num,unit,clean_unit,unit_end_comma,clean_amt_num,ends_dash
173,1229072,pulled pork barbecue nachos,2 tablespoons,all-purpose flour,-,all,all,0,-,1
245,1148724,grilled five spice chicken thighs with soy vin...,2 tbs.,five-spice,-,five,five,0,-,1
331,1505338,korean pulled pork sandwich with asian slaw,2 tablespoons,five-spice,-,five,five,0,-,1
759,1255239,grilled jerk chicken,1,five-spice,-,five,five,0,-,1
1320,1809396,haley d williams,3 cups,all-purpose flour,-,all,all,0,-,1
1555,1785251,romanos macaroni grill chicken scaloppine,1/2 cup,all-purpose flour,-,all,all,0,-,1
1694,1621460,grilled chicken and roasted red pepper panini,8 whole,sun-dried tomato,-,sun,sun,0,-,1
1767,1496959,pressure cooker chinese bbq char siu pulled pork,1/2,five-spice,-,five,five,0,-,1
1948,1771327,hoisin grilled chicken with soba noodles,1/2 teaspoon,five-spice,-,five,five,0,-,1
2427,1669544,grilled naan with garlic scape chutney,,all-purpose flour,-,all,all,0,-,1


It looks like all of these show the number of pieces needed, followed by the amount for each piece.

In [49]:
def clean_ends_dash(row):
    """Tidy `row['clean_amt_num']` for rows flagged with `ends_dash == 1`.

    For a flagged row the value is rewritten as follows:
      * starts with two or more dashes  -> '' (blanked out)
      * contains a space, e.g. '1 14-'  -> '<first> x <rest>', e.g. '1 x 14'
      * otherwise, e.g. '2-'            -> the text before the last dash, e.g. '2'
    Unflagged rows are returned untouched. `row` is modified in place and
    returned, which is what `DataFrame.apply(..., axis=1)` expects.
    """
    if row['ends_dash'] != 1:
        return row

    value = row['clean_amt_num']
    # Raw strings below: '\S' and '\s' in plain literals are invalid escape
    # sequences and trigger a warning on current Python versions.
    if re.match(r'--+', value):
        row['clean_amt_num'] = ''
    elif len(value.split(' ')) > 1:
        # One scan captures both parts: the first token and what follows it,
        # up to the last dash.
        first_val, second_val = re.findall(r'(\S+)\s(.*)-', value)[0]
        row['clean_amt_num'] = "{} x {}".format(first_val, second_val)
    else:
        row['clean_amt_num'] = re.findall(r'(.*)-', value)[0]
    return row

df = df.apply(clean_ends_dash, axis = 1)

In [50]:
df[df['ends_dash'] == 1]

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num,unit,clean_unit,unit_end_comma,clean_amt_num,ends_dash
173,1229072,pulled pork barbecue nachos,2 tablespoons,all-purpose flour,-,all,all,0,,1
245,1148724,grilled five spice chicken thighs with soy vin...,2 tbs.,five-spice,-,five,five,0,,1
331,1505338,korean pulled pork sandwich with asian slaw,2 tablespoons,five-spice,-,five,five,0,,1
759,1255239,grilled jerk chicken,1,five-spice,-,five,five,0,,1
1320,1809396,haley d williams,3 cups,all-purpose flour,-,all,all,0,,1
1555,1785251,romanos macaroni grill chicken scaloppine,1/2 cup,all-purpose flour,-,all,all,0,,1
1694,1621460,grilled chicken and roasted red pepper panini,8 whole,sun-dried tomato,-,sun,sun,0,,1
1767,1496959,pressure cooker chinese bbq char siu pulled pork,1/2,five-spice,-,five,five,0,,1
1948,1771327,hoisin grilled chicken with soba noodles,1/2 teaspoon,five-spice,-,five,five,0,,1
2427,1669544,grilled naan with garlic scape chutney,,all-purpose flour,-,all,all,0,,1


In [51]:
# The helper flag has served its purpose; remove it from the frame
df.drop(columns=['ends_dash'], inplace=True)

In [52]:
print("Number of unique 'clean_amt_num' values:", df['clean_amt_num'].value_counts().shape[0])
print("\n'clean_amt_num' values:\n{}".format(df['clean_amt_num'].value_counts()))

Number of unique 'clean_amt_num' values: 2

'clean_amt_num' values:
     22652
è        1
Name: clean_amt_num, dtype: int64


What should we do with ranges? For our purposes, we can probably just take the average.

### Clean the cases with '-1/2'

In [53]:
# how many have '-1/2'?
def and_a_half(str_val):
    """Return 1 if `str_val` matches '.*-1/2' (contains '-1/2' on its first line), else 0."""
    return int(re.match('.*-1/2', str_val) is not None)
    
df['and_a_half'] = df['clean_amt_num'].apply(and_a_half)
df[df['and_a_half'] == 1]

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num,unit,clean_unit,unit_end_comma,clean_amt_num,and_a_half


In [54]:
def clean_and_a_half(row):
    """Turn dashes into spaces in `row['clean_amt_num']` for rows flagged `and_a_half == 1`.

    Flagged values that have a '/' somewhere before the '-1/2' are kept as they are.
    `row` is modified in place and returned.
    """
    flagged = row['and_a_half'] == 1
    if flagged and not re.match('.*/.*-1/2', row['clean_amt_num']):
        row['clean_amt_num'] = row['clean_amt_num'].replace('-', ' ')
    return row

df = df.apply(clean_and_a_half, axis = 1)

In [55]:
df[df['and_a_half'] == 1]

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num,unit,clean_unit,unit_end_comma,clean_amt_num,and_a_half


In [56]:
# The helper flag has served its purpose; remove it from the frame
df.drop(columns=['and_a_half'], inplace=True)

In [57]:
print("Number of unique 'clean_amt_num' values:", df['clean_amt_num'].value_counts().shape[0])
print("\n'clean_amt_num' values:\n{}".format(df['clean_amt_num'].value_counts()))

Number of unique 'clean_amt_num' values: 2

'clean_amt_num' values:
     22652
è        1
Name: clean_amt_num, dtype: int64


### Inspect some odd cases

In [58]:
df[df['clean_amt_num'] == '4 -6']

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num,unit,clean_unit,unit_end_comma,clean_amt_num


#### 3/4 3/4?

In [59]:
df[df['clean_amt_num'] == '3/4 3/4']

Unnamed: 0,dish_id,dish_name,INGREDIENT,AMOUNT,amt_num,unit,clean_unit,unit_end_comma,clean_amt_num


In [60]:
df['clean_amt_num'] = df['clean_amt_num'].replace('3/4 3/4', '3/4')

In [61]:
print("Number of unique 'clean_amt_num' values:", df['clean_amt_num'].value_counts().shape[0])
print("\n'clean_amt_num' values:\n{}".format(df['clean_amt_num'].value_counts()))

Number of unique 'clean_amt_num' values: 2

'clean_amt_num' values:
     22652
è        1
Name: clean_amt_num, dtype: int64


In [62]:
# Recipe table: one row per distinct (dish_id, dish_name) pair.
dfRecipe = df.loc[:, ['dish_id', 'dish_name']].drop_duplicates()
dfRecipe.to_csv('data/recipeset.csv', sep='|', encoding='utf-8', index=False)

# Ingredient table. The labels are crossed over in the rename: the column
# labelled 'INGREDIENT' is exported as 'amount' and the one labelled 'AMOUNT'
# as 'ingredient'. Unlike the recipe table, the row index is written as well.
dfIngre = (
    df.loc[:, ['dish_id', 'INGREDIENT', 'AMOUNT']]
    .drop_duplicates()
    .rename(columns={'INGREDIENT': 'amount', 'AMOUNT': 'ingredient'})
)
dfIngre.to_csv('data/ingredients.csv', sep='|', encoding='utf-8')