In [13]:
import pandas as pd
import numpy as np

# Lift pandas' display truncation so wide/long frames render in full (None = no limit).
pd.options.display.max_columns = None
pd.options.display.max_rows = None


In [14]:
# Load the raw per-item menu export; the first CSV column is used as the row index.
menu_data_df = pd.read_csv(
    filepath_or_buffer="../data/uncleaned_indv_items.csv",
    index_col=0,
)
# Report (rows, columns) first, then the per-column dtype / non-null summary.
print(menu_data_df.shape)

menu_data_df.info()


(106, 18)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 106 entries, 0 to 105
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   item_name                106 non-null    object 
 1   price                    106 non-null    float64
 2   menu_section             106 non-null    object 
 3   Calories                 106 non-null    float64
 4   Total Fat (g)            106 non-null    float64
 5   Saturated Fat (g)        106 non-null    float64
 6   Trans Fat (g)            106 non-null    float64
 7   Cholesterol (mg)         106 non-null    float64
 8   Sodium (mg)              106 non-null    float64
 9   Total Carbohydrates (g)  106 non-null    float64
 10  Dietary Fiber (g)        106 non-null    float64
 11  Sugars (g)               106 non-null    float64
 12  Includes (g)             106 non-null    float64
 13  Protein (g)              106 non-null    float64
 14  Vitamin D (mcg) 

- As we can see, all the NaN values are only for allergen info, so I will be replacing all NaN values with zero
- Next, we will combine the `Sugars (g)` and `Includes (g)` columns into one column named `Total Sugars (g)`

In [15]:
# Zero-fill missing values, add the two sugar columns together as
# 'Total Sugars (g)', then drop the two source columns that the total replaces.
menu_data_df = (
    menu_data_df
    .fillna(0)
    .assign(**{'Total Sugars (g)': lambda frame: frame['Sugars (g)'] + frame['Includes (g)']})
    .drop(columns=['Sugars (g)', 'Includes (g)'])
)

# Re-inspect the column list and dtypes after the reshaping.
menu_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106 entries, 0 to 105
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   item_name                106 non-null    object 
 1   price                    106 non-null    float64
 2   menu_section             106 non-null    object 
 3   Calories                 106 non-null    float64
 4   Total Fat (g)            106 non-null    float64
 5   Saturated Fat (g)        106 non-null    float64
 6   Trans Fat (g)            106 non-null    float64
 7   Cholesterol (mg)         106 non-null    float64
 8   Sodium (mg)              106 non-null    float64
 9   Total Carbohydrates (g)  106 non-null    float64
 10  Dietary Fiber (g)        106 non-null    float64
 11  Protein (g)              106 non-null    float64
 12  Vitamin D (mcg)          106 non-null    float64
 13  Calcium (mg)             106 non-null    float64
 14  Iron (mg)                1

- If an item's price is less than $1.00, it is a sauce packet and will be dropped from the data frame

In [16]:
# Keep only rows priced at 1.00 or more, then show the remaining (rows, columns).
priced_as_item = menu_data_df['price'].ge(1.00)
menu_data_df = menu_data_df.loc[priced_as_item]
menu_data_df.shape

(93, 17)

In [17]:
# Preview the first five rows after the price filter.
menu_data_df.head(n=5)

Unnamed: 0,item_name,price,menu_section,Calories,Total Fat (g),Saturated Fat (g),Trans Fat (g),Cholesterol (mg),Sodium (mg),Total Carbohydrates (g),Dietary Fiber (g),Protein (g),Vitamin D (mcg),Calcium (mg),Iron (mg),Potassium (mg),Total Sugars (g)
0,Soft Taco,1.89,Tacos,180.0,8.0,4.0,0.0,25.0,500.0,18.0,3.0,9.0,0.0,110.0,1.7,130.0,2.0
1,Soft Taco Supreme®,2.89,Tacos,210.0,10.0,5.0,0.0,25.0,510.0,20.0,3.0,10.0,0.0,130.0,1.7,200.0,3.0
2,Spicy Potato Soft Taco,1.1,Tacos,240.0,12.0,3.0,0.0,10.0,480.0,28.0,2.0,5.0,0.0,110.0,1.3,270.0,2.0
3,Crunchy Taco,1.89,Tacos,170.0,10.0,3.5,0.0,25.0,300.0,13.0,3.0,8.0,0.0,70.0,0.9,140.0,1.0
4,Crunchy Taco Supreme®,2.89,Tacos,190.0,11.0,4.5,0.0,25.0,320.0,15.0,3.0,8.0,0.0,80.0,0.9,200.0,2.0


# Removing duplicates and limited time items

## Checking and removing duplicates

In [18]:
# Count rows whose item_name repeats an earlier row (first occurrences are not counted).
duplicate_count = int(menu_data_df['item_name'].duplicated().sum())
if duplicate_count == 0:
    print("No duplicated entries found")
else:
    print("No. of duplicated entries: ", duplicate_count)

No. of duplicated entries:  20


In [19]:
# Show the rows flagged as repeats (second and later occurrences of an item_name).
repeat_mask = menu_data_df['item_name'].duplicated()
duplated_rows = menu_data_df[repeat_mask]
duplated_rows


Unnamed: 0,item_name,price,menu_section,Calories,Total Fat (g),Saturated Fat (g),Trans Fat (g),Cholesterol (mg),Sodium (mg),Total Carbohydrates (g),Dietary Fiber (g),Protein (g),Vitamin D (mcg),Calcium (mg),Iron (mg),Potassium (mg),Total Sugars (g)
67,Chips and Nacho Cheese Sauce,2.59,Nachos,220.0,13.0,1.5,0.0,0.0,280.0,24.0,2.0,3.0,0.0,60.0,0.2,290.0,1.0
68,Bean Burrito,1.99,Veggie Cravings,350.0,9.0,4.0,0.0,5.0,1040.0,55.0,10.0,13.0,0.0,210.0,4.0,400.0,5.0
69,Cheesy Bean and Rice Burrito,1.0,Veggie Cravings,420.0,16.0,4.0,0.0,5.0,920.0,55.0,7.0,9.0,0.0,160.0,3.4,360.0,5.0
70,Fiesta Veggie Burrito,2.0,Veggie Cravings,570.0,28.0,8.0,0.0,25.0,1020.0,65.0,9.0,14.0,0.0,270.0,3.8,470.0,6.0
71,Spicy Potato Soft Taco,1.1,Veggie Cravings,240.0,12.0,3.0,0.0,10.0,480.0,28.0,2.0,5.0,0.0,110.0,1.3,270.0,2.0
72,Black Bean Chalupa Supreme®,4.19,Veggie Cravings,340.0,18.0,4.0,0.0,10.0,460.0,36.0,6.0,10.0,0.0,140.0,2.5,300.0,5.0
73,Black Bean Crunchwrap Supreme®,5.39,Veggie Cravings,520.0,18.0,5.0,0.0,5.0,1100.0,77.0,8.0,13.0,0.0,260.0,4.8,610.0,8.0
74,Veggie Mexican Pizza,5.19,Veggie Cravings,470.0,25.0,7.0,0.0,20.0,730.0,46.0,7.0,14.0,0.0,310.0,2.6,360.0,4.0
76,Cheesy Roll Up,1.0,Veggie Cravings,180.0,9.0,6.0,0.0,20.0,430.0,17.0,1.0,8.0,0.0,230.0,1.0,60.0,2.0
77,Cheesy Fiesta Potatoes,2.59,Veggie Cravings,240.0,13.0,2.0,0.0,5.0,520.0,28.0,3.0,3.0,0.0,40.0,0.3,560.0,1.0


In [20]:
# Keep the first row seen for each item_name and discard later repeats.
is_repeat = menu_data_df['item_name'].duplicated(keep='first')
menu_data_df = menu_data_df.loc[~is_repeat]
menu_data_df.shape

(73, 17)

In [21]:
# Re-count repeated item_name values to confirm the de-duplication worked.
remaining_duplicates = int(menu_data_df['item_name'].duplicated().sum())
if remaining_duplicates == 0:
    print("No duplicated entries found")
else:
    print("No. of duplicated entries: ", remaining_duplicates)

No duplicated entries found


## Dropping limited time items 

In [22]:
# Dropping limited time items
# The names to exclude are listed once; matching is exact on item_name.
limited_time_items = [
    'Steak and Bacon Grilled Cheese Burrito',
    'Strawberry Twists',
    'Wild Strawberry Creme Delight Freeze',
    'Blue Raspberry Freeze',
    'Breakfast Taco Sausage',
    'Breakfast Taco Bacon',
    'Breakfast Taco Potato',
]

# A name that matches no row is silently skipped by the filter below, so report
# any such names: they may be misspelled or already removed by an earlier step.
unmatched_items = sorted(set(limited_time_items) - set(menu_data_df['item_name']))
if unmatched_items:
    print("Limited time items not found in data: ", unmatched_items)

# One membership test removes every listed item in a single pass.
menu_data_df = menu_data_df[~menu_data_df['item_name'].isin(limited_time_items)]

In [23]:
# Confirm the (rows, columns) count after the removals, then preview the first five rows.
print(menu_data_df.shape)
menu_data_df.head(n=5)


(67, 17)


Unnamed: 0,item_name,price,menu_section,Calories,Total Fat (g),Saturated Fat (g),Trans Fat (g),Cholesterol (mg),Sodium (mg),Total Carbohydrates (g),Dietary Fiber (g),Protein (g),Vitamin D (mcg),Calcium (mg),Iron (mg),Potassium (mg),Total Sugars (g)
0,Soft Taco,1.89,Tacos,180.0,8.0,4.0,0.0,25.0,500.0,18.0,3.0,9.0,0.0,110.0,1.7,130.0,2.0
1,Soft Taco Supreme®,2.89,Tacos,210.0,10.0,5.0,0.0,25.0,510.0,20.0,3.0,10.0,0.0,130.0,1.7,200.0,3.0
2,Spicy Potato Soft Taco,1.1,Tacos,240.0,12.0,3.0,0.0,10.0,480.0,28.0,2.0,5.0,0.0,110.0,1.3,270.0,2.0
3,Crunchy Taco,1.89,Tacos,170.0,10.0,3.5,0.0,25.0,300.0,13.0,3.0,8.0,0.0,70.0,0.9,140.0,1.0
4,Crunchy Taco Supreme®,2.89,Tacos,190.0,11.0,4.5,0.0,25.0,320.0,15.0,3.0,8.0,0.0,80.0,0.9,200.0,2.0


In [24]:
# Ready to use dataset: write the cleaned frame (row index included) next to the raw file.
# NOTE(review): this path used to end in a doubled ".csv.csv" extension; any reader
# that still points at "cleaned_indv_items.csv.csv" must be updated to the new name.
menu_data_df.to_csv("../data/cleaned_indv_items.csv")