In [13]:
import pandas as pd
import numpy as np

from datetime import date

# Show every column and every row when a DataFrame is displayed (no truncation).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Date of this run; interpolated into the cleaned CSV's file name when it is written.
today = date.today()

In [14]:
# Load the uncleaned menu export; the CSV's first column becomes the DataFrame index.
raw_menu_path = "../data/uncleaned_taco_bell_menu_items2024-08-21.csv"
menu_data_df = pd.read_csv(raw_menu_path, index_col=0)

# Quick look at the size, column names, dtypes and non-null counts.
print(menu_data_df.shape)

menu_data_df.info()


(98, 18)
<class 'pandas.core.frame.DataFrame'>
Index: 98 entries, 0 to 97
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   item_name                98 non-null     object 
 1   price                    98 non-null     float64
 2   menu_section             98 non-null     object 
 3   Calories                 98 non-null     float64
 4   Total Fat (g)            98 non-null     float64
 5   Saturated Fat (g)        98 non-null     float64
 6   Trans Fat (g)            98 non-null     float64
 7   Cholesterol (mg)         98 non-null     float64
 8   Sodium (mg)              98 non-null     float64
 9   Total Carbohydrates (g)  98 non-null     float64
 10  Dietary Fiber (g)        98 non-null     float64
 11  Sugars (g)               98 non-null     float64
 12  Includes (g)             98 non-null     float64
 13  Protein (g)              98 non-null     float64
 14  Vitamin D (mcg)         

- As we can see, the only NaN values are in the allergen info, so I will replace every NaN with zero.
- Next, we will combine the `Sugars (g)` and `Includes (g)` columns into a single column named `Total Sugars (g)`.

In [15]:
# Replace every NaN in the frame with 0.
menu_data_df = menu_data_df.fillna(0)

# Add `Total Sugars (g)` as the last column (the sum of the two sugar columns),
# then remove the two source columns.
# NOTE(review): if `Includes (g)` comes from the label's "Includes Xg Added Sugars"
# line, it may already be counted inside `Sugars (g)` — confirm this sum does not
# double count.
menu_data_df = (
    menu_data_df
    .assign(**{'Total Sugars (g)': menu_data_df['Sugars (g)'] + menu_data_df['Includes (g)']})
    .drop(columns=['Sugars (g)', 'Includes (g)'])
)

menu_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 98 entries, 0 to 97
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   item_name                98 non-null     object 
 1   price                    98 non-null     float64
 2   menu_section             98 non-null     object 
 3   Calories                 98 non-null     float64
 4   Total Fat (g)            98 non-null     float64
 5   Saturated Fat (g)        98 non-null     float64
 6   Trans Fat (g)            98 non-null     float64
 7   Cholesterol (mg)         98 non-null     float64
 8   Sodium (mg)              98 non-null     float64
 9   Total Carbohydrates (g)  98 non-null     float64
 10  Dietary Fiber (g)        98 non-null     float64
 11  Protein (g)              98 non-null     float64
 12  Vitamin D (mcg)          98 non-null     float64
 13  Calcium (mg)             98 non-null     float64
 14  Iron (mg)                98 non-n

- If an item's price is less than $1.00, it is a sauce packet and will be dropped from the data frame.

In [16]:
# Keep only the rows priced at $1.00 or more; cheaper rows are discarded.
is_priced_item = menu_data_df['price'].ge(1.00)
menu_data_df = menu_data_df.loc[is_priced_item]
menu_data_df.shape

(84, 17)

In [17]:
# Preview the first five rows of the filtered frame.
menu_data_df.head(5)

Unnamed: 0,item_name,price,menu_section,Calories,Total Fat (g),Saturated Fat (g),Trans Fat (g),Cholesterol (mg),Sodium (mg),Total Carbohydrates (g),Dietary Fiber (g),Protein (g),Vitamin D (mcg),Calcium (mg),Iron (mg),Potassium (mg),Total Sugars (g)
0,Steak Cheesy Street Chalupas,5.49,Tacos,410.0,25.0,6.0,0.0,35.0,750.0,27.0,3.0,19.0,0.0,150.0,3.6,210.0,4.0
1,Cantina Chicken Cheesy Street Chalupas,5.49,Tacos,410.0,25.0,6.0,0.0,40.0,670.0,28.0,4.0,19.0,0.0,140.0,3.7,270.0,3.0
2,Double Stacked Taco,1.99,Tacos,320.0,16.0,5.0,0.0,25.0,600.0,34.0,4.0,11.0,0.0,130.0,1.9,300.0,3.0
3,Soft Taco,1.89,Tacos,180.0,8.0,4.0,0.0,25.0,490.0,18.0,3.0,9.0,0.0,100.0,1.7,130.0,2.0
4,Soft Taco Supreme®,2.89,Tacos,210.0,10.0,5.0,0.0,30.0,510.0,20.0,3.0,9.0,0.0,120.0,1.7,200.0,3.0


# Removing duplicates and limited time items

## Dropping limited time items 

In [18]:
# Dropping limited time items
# Each name is compared for exact equality against `item_name`; a name that is
# not present in the frame is simply ignored.
LIMITED_TIME_ITEMS = [
    'Steak and Bacon Grilled Cheese Burrito',  # discontinued
    'Strawberry Twists',                       # discontinued
    'Wild Strawberry Creme Delight Freeze',    # discontinued
    'Blue Raspberry Freeze',
    'Breakfast Taco Sausage',                  # discontinued
    'Breakfast Taco Bacon',                    # discontinued
    'Breakfast Taco Potato',                   # discontinued
    'Double Berry Freeze',
    'Bell Breakfast Box',
]

# A single boolean mask replaces one drop-by-index call per item. Filtering by
# mask removes exactly the matching rows and does not rely on index labels
# being unique.
is_limited_time = menu_data_df['item_name'].isin(LIMITED_TIME_ITEMS)
menu_data_df = menu_data_df[~is_limited_time]

# NOTE(review): the recorded output of this cell shows the same shape as before
# the drop, so verify these names still match the data exactly (spelling,
# case, whitespace, trademark symbols).
menu_data_df.shape

(84, 17)

## Checking and removing duplicates

In [19]:
# Count the rows whose item_name repeats an earlier row (the first occurrence
# of each name is not counted) and report the result.
duplicate_count = int(menu_data_df['item_name'].duplicated().sum())
if duplicate_count > 0:
    print("No. of duplicated entries: ", duplicate_count)
else:
    print("No duplicated entries found")

No. of duplicated entries:  21


In [20]:
# Viewing the duplicated rows: every row whose item_name already appeared
# earlier in the frame (first occurrences are excluded).
is_repeat = menu_data_df['item_name'].duplicated()
duplated_rows = menu_data_df[is_repeat]
duplated_rows


Unnamed: 0,item_name,price,menu_section,Calories,Total Fat (g),Saturated Fat (g),Trans Fat (g),Cholesterol (mg),Sodium (mg),Total Carbohydrates (g),Dietary Fiber (g),Protein (g),Vitamin D (mcg),Calcium (mg),Iron (mg),Potassium (mg),Total Sugars (g)
62,Chips and Guacamole,3.29,Nachos,250.0,16.0,2.0,0.0,0.0,240.0,22.0,4.0,3.0,0.1,50.0,0.2,280.0,1.0
63,Chips and Nacho Cheese Sauce,2.89,Nachos,220.0,13.0,1.5,0.0,0.0,280.0,24.0,2.0,3.0,0.0,60.0,0.2,290.0,1.0
65,Bean Burrito,1.99,Veggie Cravings,360.0,10.0,4.5,0.0,5.0,1080.0,54.0,8.0,13.0,3.1,200.0,3.8,420.0,5.0
66,Cheesy Bean and Rice Burrito,1.49,Veggie Cravings,420.0,17.0,4.5,0.0,5.0,940.0,55.0,6.0,9.0,1.5,160.0,3.3,370.0,5.0
67,Spicy Potato Soft Taco,1.29,Veggie Cravings,240.0,12.0,3.0,0.0,10.0,470.0,28.0,2.0,5.0,0.0,100.0,1.3,270.0,2.0
68,Black Bean Chalupa Supreme®,4.69,Veggie Cravings,340.0,18.0,4.0,0.0,10.0,460.0,36.0,6.0,10.0,0.0,140.0,2.5,300.0,5.0
69,Black Bean Crunchwrap Supreme®,5.89,Veggie Cravings,520.0,18.0,5.0,0.0,5.0,1100.0,77.0,8.0,13.0,0.0,260.0,4.8,610.0,8.0
70,Veggie Mexican Pizza,5.59,Veggie Cravings,470.0,25.0,7.0,0.0,20.0,760.0,46.0,6.0,14.0,1.5,310.0,2.5,370.0,4.0
71,Cheese Quesadilla,4.79,Veggie Cravings,470.0,24.0,13.0,0.0,50.0,1000.0,41.0,3.0,18.0,0.0,500.0,2.5,160.0,5.0
72,Cheesy Roll Up,1.19,Veggie Cravings,180.0,9.0,6.0,0.0,20.0,430.0,17.0,1.0,8.0,0.0,230.0,1.0,60.0,2.0


In [21]:
# Removing duplicates from the item_name column: for each repeated name only
# the last occurrence is kept, all earlier occurrences are discarded.
is_superseded = menu_data_df['item_name'].duplicated(keep='last')
menu_data_df = menu_data_df[~is_superseded]
menu_data_df.shape

(63, 17)

In [22]:
# Re-run the duplicate count after de-duplication; rows repeating an earlier
# item_name are counted, first occurrences are not.
remaining_duplicates = int(menu_data_df['item_name'].duplicated().sum())
if remaining_duplicates > 0:
    print("No. of duplicated entries: ", remaining_duplicates)
else:
    print("No duplicated entries found")

No duplicated entries found


In [23]:
# Print the final (rows, columns) shape, then display the cleaned frame itself.
final_shape = menu_data_df.shape
print(final_shape)
menu_data_df


(63, 17)


Unnamed: 0,item_name,price,menu_section,Calories,Total Fat (g),Saturated Fat (g),Trans Fat (g),Cholesterol (mg),Sodium (mg),Total Carbohydrates (g),Dietary Fiber (g),Protein (g),Vitamin D (mcg),Calcium (mg),Iron (mg),Potassium (mg),Total Sugars (g)
0,Steak Cheesy Street Chalupas,5.49,Tacos,410.0,25.0,6.0,0.0,35.0,750.0,27.0,3.0,19.0,0.0,150.0,3.6,210.0,4.0
1,Cantina Chicken Cheesy Street Chalupas,5.49,Tacos,410.0,25.0,6.0,0.0,40.0,670.0,28.0,4.0,19.0,0.0,140.0,3.7,270.0,3.0
2,Double Stacked Taco,1.99,Tacos,320.0,16.0,5.0,0.0,25.0,600.0,34.0,4.0,11.0,0.0,130.0,1.9,300.0,3.0
3,Soft Taco,1.89,Tacos,180.0,8.0,4.0,0.0,25.0,490.0,18.0,3.0,9.0,0.0,100.0,1.7,130.0,2.0
4,Soft Taco Supreme®,2.89,Tacos,210.0,10.0,5.0,0.0,30.0,510.0,20.0,3.0,9.0,0.0,120.0,1.7,200.0,3.0
6,Crunchy Taco,1.89,Tacos,170.0,10.0,4.0,0.0,25.0,300.0,13.0,3.0,8.0,0.0,60.0,0.9,140.0,1.0
7,Crunchy Taco Supreme®,2.89,Tacos,190.0,11.0,5.0,0.0,30.0,320.0,15.0,3.0,8.0,0.0,80.0,0.9,200.0,2.0
8,Nacho Cheese Doritos® Locos Tacos,2.69,Tacos,170.0,10.0,4.0,0.0,25.0,360.0,12.0,3.0,8.0,0.0,70.0,0.9,150.0,1.0
9,Nacho Cheese Doritos® Locos Tacos Supreme®,3.69,Tacos,190.0,11.0,5.0,0.0,30.0,380.0,14.0,3.0,8.0,0.0,90.0,0.9,210.0,2.0
10,Chalupa Supreme®,4.99,Tacos,360.0,20.0,6.0,0.0,25.0,570.0,31.0,4.0,12.0,0.0,120.0,2.3,230.0,5.0


In [25]:
# Ready to use dataset: write the cleaned frame (index included) to a CSV whose
# file name carries the value of `today`.
cleaned_menu_path = f"../data/cleaned_taco_bell_menu_items_{today}.csv"
menu_data_df.to_csv(cleaned_menu_path)