In [1]:
# Developled on Python version 3.11.4

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from time import sleep
from bs4 import BeautifulSoup
import pandas as pd

# Initialization of Functions

In [2]:
# Function that parses through the nutritonal data, and returns a dictionary 

def nutrition_info_parsing(text):
    nutrition_lines = [line.strip() for line in text.split("\n") if line.strip()]

    nutrient_dict = {}

    i = 0
    while i < len(nutrition_lines):
        line = nutrition_lines[i]

        if line == 'Calories':
            nutrient_dict['Calories'] = float(nutrition_lines[i+1])
            i += 2  # Increment by 2 to jump to the next component

        elif any(word in line for word in ['Fat' ,'Cholesterol','Includes', 'Sugars', 'Sodium', 'Carbohydrates', 'Fiber', 'Protein', 'Vitamin D', 'Calcium', 'Iron', 'Potassium']):
            nutrient = line.split()
            
            # Check if the next line contains a value (like '8g', '25mg', etc.)
            if i+1 < len(nutrition_lines) and any(val in nutrition_lines[i+1] for val in ['g', 'mg', 'mcg']):

                # Use the nutrient as key and the next line as value
                nutrient_name = ' '.join(nutrient[:-1])  # Exclude the value (like '8g') from the nutrient name
                unit = ''.join([char for char in nutrient[-1] if not char.isdigit() and char != '.'])
                nutrient_key = f"{nutrient_name} ({unit})"
                nutrient_value = float(''.join([char for char in nutrient[-1] if char.isdigit() or char == '.']))

                if '<' in nutrient_name:
                    nutrient_name = nutrient_name.replace('<', '').strip()
                    nutrient_key = f"{nutrient_name} ({unit})"

                nutrient_dict[nutrient_key] = nutrient_value
                
                i += 2  # Increment by 2 to jump to the next component

            else:
                i += 1  # No expected value on the next line, move on

        else:
            i += 1


    return nutrient_dict



## The next series of function uses Selenium to automate the webscraping processes 

In [3]:

def get_menu_section_links(driver, base_url="https://www.tacobell.com", menu_endpoint="/food", store_location = "?store=038911#"):
    driver.get(base_url+menu_endpoint)

    cites_allowed_WS = [
    "/food/tacos",
    "/food/burritos",
    "/food/quesadillas",
    "/food/nachos",
    "/food/sides-sweets",
    "/food/drinks",
    "/food/power-menu",
    "/food/vegetarian",
    "/food/breakfast",
    "/food/specialties"
    ]

    # Look into https://www.tacobell.com/sitemap.xml a little bit more.

    # Using XPath to locate the main parent div that contains all the links
    element = driver.find_element(By.XPATH, '//div[contains(@class, "styles_menu-tiles__1JTJ3")]')

    # Parse the content using BeautifulSoup
    soup = BeautifulSoup(element.get_attribute('outerHTML'), 'html.parser')

    # Extract all the links and their href values
    links = [a['href'] for a in soup.find_all('a') if a.has_attr('href')]

    allowed_links = [link for link in links if link in cites_allowed_WS]
    
    section_links = [base_url + link + store_location for link in allowed_links]


    return section_links



In [4]:


def pulling_data(driver, store_location="?store=038911#", base_url="https://www.tacobell.com"):
    menu_section_links = get_menu_section_links(driver)

    menu_data = []
    for f in menu_section_links:
        driver.get(f)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        heading = soup.find('h1')
        
        links = [a['href'] for a in soup.find_all('a', class_='styles_product-title__6KCyw')]
        
        full_links = [base_url + link + store_location+"#" if not link.endswith('1') else base_url+link+"#" for link in links]

        for item in full_links:
            driver.get(item)

            subpage_soup = BeautifulSoup(driver.page_source, 'html.parser')
            header = subpage_soup.find_all('h1')
            item_name = [i.text for i in header if len(i.text) !=0]

            price = subpage_soup.find('span', class_='styles_price__3-xtw')

            try:
                # Check if the "Nutrition Info" link exists on the webpage
                nutrition_link = driver.find_element(By.LINK_TEXT, "Nutrition Info")
                nutrition_link.click()
                sleep(2)
                
                driver.switch_to.frame(driver.find_element(By.CLASS_NAME, "styles_frame__1rZvs"))
                
                # Grabbing nutri info
                nutrition_info = driver.find_element(By.CLASS_NAME, 'nf')
                nutrition_info_txt = nutrition_info.text

                # Grabbing allergen info
                allergen_info = driver.find_element(By.CLASS_NAME, "allergenInfo")
                allergen_info_text = allergen_info.text

                # Append the data to the taco_data list
                menu_data.append({
                    'item_name': item_name[0],
                    'price': float(price.text[1:]),
                    'menu_section': heading.text,
                    **nutrition_info_parsing(nutrition_info_txt)
                })
                
            except NoSuchElementException:  # Element not found
                continue  # Go to the next item in the loop

        print(f"The {heading.text} section has been sucessfully pulled")


    print("\nAll individual items from Taco Bell's menu have been acquired")


    return menu_data

In [5]:
driver = webdriver.Chrome()
menu_data = pulling_data(driver)
driver.quit()


The Tacos section has been sucessfully pulled
The Burritos section has been sucessfully pulled
The Specialties section has been sucessfully pulled
The Sides & Sweets section has been sucessfully pulled
The Quesadillas section has been sucessfully pulled
The Drinks section has been sucessfully pulled
The Nachos section has been sucessfully pulled
The Veggie Cravings section has been sucessfully pulled
The Bowls section has been sucessfully pulled
The Breakfast section has been sucessfully pulled

All individual items from Taco Bell's menu have been acquired


In [6]:
driver.quit()

In [7]:
menu_data_df = pd.DataFrame(menu_data)
menu_data_df.info
print(menu_data_df.shape)

(107, 18)


In [8]:
menu_data_df.head()

Unnamed: 0,item_name,price,menu_section,Calories,Total Fat (g),Saturated Fat (g),Trans Fat (g),Cholesterol (mg),Sodium (mg),Total Carbohydrates (g),Dietary Fiber (g),Sugars (g),Includes (g),Protein (g),Vitamin D (mcg),Calcium (mg),Iron (mg),Potassium (mg)
0,Soft Taco,1.89,Tacos,180.0,8.0,4.0,0.0,25.0,500.0,18.0,3.0,1.0,1.0,9.0,0.0,110.0,1.7,130.0
1,Soft Taco Supreme速,2.89,Tacos,210.0,10.0,5.0,0.0,25.0,510.0,20.0,3.0,2.0,1.0,10.0,0.0,130.0,1.7,200.0
2,Spicy Potato Soft Taco,1.1,Tacos,240.0,12.0,3.0,0.0,10.0,480.0,28.0,2.0,1.0,1.0,5.0,0.0,110.0,1.3,270.0
3,Crunchy Taco,1.89,Tacos,170.0,10.0,3.5,0.0,25.0,300.0,13.0,3.0,1.0,0.0,8.0,0.0,70.0,0.9,140.0
4,Crunchy Taco Supreme速,2.89,Tacos,190.0,11.0,4.5,0.0,25.0,320.0,15.0,3.0,2.0,0.0,8.0,0.0,80.0,0.9,200.0


In [9]:
menu_data_df.to_csv("../data/uncleaned_taco_bell_menu_items.csv")