# Scraping Products on Sale From Big Y

### Importing required modules and packages
* selenium - Primary package used to load webpages
    * `selenium.webdriver.common.by.By` - used to locate objects on the page
    * `selenium.webdriver.firefox.options.Options` - allows you to specify certain settings for the webdriver
* pandas - handling the data after scraping off page
* `time.sleep` - used to wait for the page to fully load before doing anything
    * Selenium also has a built-in implicit wait (`driver.implicitly_wait(seconds)`), but I couldn't figure out how to use it, so fixed `sleep` calls are used instead

In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
import pandas as pd
from time import sleep

Adding the "--headless" argument to the `FirefoxOptions()` object so the pages can be loaded without a browser window appearing. This only takes effect when the webdriver is created with the options variable as an argument, e.g. `webdriver.Firefox(options=options)`.

In [2]:
# Firefox launch options. "--headless" is meant to let pages load without a
# browser window appearing.
# These options do nothing on their own: they only apply when this object is
# passed to the webdriver constructor.
options = FirefoxOptions()
options.add_argument("--headless")


In [4]:
def getDeals(driver):
    """Scrape the products shown on an already-loaded deals page.

    Must be called while the deals page is loaded in ``driver``: Selenium
    web elements can only be queried while the driver session is active.
    The driver has to be passed in explicitly; it is not available to nested
    or separately called functions otherwise.

    Args:
        driver: Selenium WebDriver currently showing the deals listing.

    Returns:
        pandas.DataFrame with the columns Name, Price, Category, Image and
        UPC and a default integer index. Rows are keyed by product name, so
        if two tiles share a name only the last one is kept (at the position
        of the first). When the page contains no product tiles an empty
        DataFrame with the same five columns is returned.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if the product
            container, or the <strong> price tag of a tile, is missing.
    """
    columns = ['Name', 'Price', 'Category', 'Image', 'UPC']

    # Number of trailing characters removed from the text of each tile's
    # <strong> element to leave only the price.
    # NOTE(review): presumably a fixed-length suffix that follows the price
    # on the site - confirm against the live page.
    priceSuffixLength = 22

    # Get Elements Containing Discounts/Products
    #   * find_element returns the first web element that matches the search
    #   * find_elements returns a list of every web element that matches
    #   * web elements can themselves be searched with find_element(s)
    # parentElement is the container holding all product tiles;
    # childElements are the individual product tiles inside it.
    parentElement = driver.find_element(By.CLASS_NAME, "col-md-12.col-sm-12.col-xs-12.list-items-content")
    childElements = parentElement.find_elements(By.CLASS_NAME, "col-md-3")

    # Collect one row per product name. A dict keyed by name keeps the
    # original behaviour of collapsing tiles that share the same name.
    produceDict = {}
    for tile in childElements:
        name = tile.get_attribute("data-name")
        category = tile.get_attribute("data-categoryname")
        image = tile.get_attribute("data-image")
        upc = tile.get_attribute("data-upc")
        price = tile.find_element(By.TAG_NAME, "strong").text[:-priceSuffixLength]

        produceDict[name] = [name, price, category, image, upc]

    # Passing the column names explicitly means an empty page still yields
    # a well-formed (empty) frame instead of failing on a missing column.
    return pd.DataFrame(list(produceDict.values()), columns=columns)

In [3]:
def bigYDeals(firefox_options=None, zip_code="06076"):
    """Open the Big Y weekly-ad page, load every deal and return them.

    Starts a Firefox session, selects the first store returned for
    ``zip_code``, navigates to the weekly ad's "view all" listing, clicks the
    "load more" button repeatedly, and then hands the driver to getDeals().
    The browser is always shut down, even if a step fails.

    Args:
        firefox_options: optional selenium Firefox ``Options`` object passed
            to ``webdriver.Firefox`` (for example one with "--headless"
            added). ``None`` starts Firefox with its default settings, which
            matches the previous behaviour.
        zip_code: keyword used in the store-locator URL. Defaults to the
            value that was previously hard-coded.

    Returns:
        The pandas.DataFrame produced by getDeals().

    Raises:
        selenium.common.exceptions.WebDriverException: if one of the
            navigation elements cannot be found or clicked.
    """
    # Imported locally so this function does not rely on a new file-level import.
    from selenium.common.exceptions import WebDriverException

    driver = webdriver.Firefox(options=firefox_options)
    try:
        driver.get(f"https://www.bigy.com/rs/store-locator?keyword={zip_code}")

        # Fixed pause to give the store-locator page time to render.
        sleep(1)

        # Navigates from the select store page to the discounts page
        driver.find_element(By.CLASS_NAME, "by-btn.make-my-store-btn").click()
        driver.find_element(By.CLASS_NAME, "weekly-ad-header").click()
        driver.find_element(By.CLASS_NAME, "weekly-ad-thumbnail").click()
        driver.find_element(By.CLASS_NAME, 'btn-view-all').click()
        driver.find_element(By.ID, "closeCookie").click()

        # Click the "load more" button up to 30 times, pausing before each
        # attempt so newly loaded items have time to appear.
        for _ in range(30):
            sleep(1)
            try:
                driver.find_element(By.CLASS_NAME, "col-md-12.col-sm-12.col-xs-12.text-center.ss-btn-load-more").click()
            except WebDriverException:
                # Best effort: the button may be missing or not clickable
                # (e.g. everything is already loaded). Only Selenium errors
                # are ignored, so Ctrl+C and programming errors still surface.
                continue

        sleep(1)
        return getDeals(driver)
    finally:
        # quit() ends the whole browser session and its driver process,
        # and running it in finally prevents leaking a browser on errors.
        driver.quit()
    

In [12]:

# Run the full scrape; bigYDeals() returns a DataFrame of the discounted products.
products = bigYDeals()

# Bare expression: as the last line of a notebook cell this displays the DataFrame.
products

Unnamed: 0,Name,Price,Category,Image,UPC
0,Olivia's Organics Herb Salad Blend,$3.29,Produce,https://bigycdn.blob.core.windows.net/marketin...,78970772001
1,Food Club Solid White Albacore Tuna,4 FOR $5.00,Soup and Canned Goods,https://bigycdn.blob.core.windows.net/marketin...,3680028127
2,Wish-Bone Salad Dressing,2 FOR $4.00,Condiments and Sauces,https://bigycdn.blob.core.windows.net/marketin...,4132100570
3,Driscoll's Organic Blueberries,$4.99,Produce,https://bigycdn.blob.core.windows.net/marketin...,71575630009
4,Phillips Gourmet Sliced Baby Bella Mushrooms,$1.99,Produce,https://bigycdn.blob.core.windows.net/marketin...,2170620010
...,...,...,...,...,...
110,Modelo Especial,$16.99,"Beer, Wine and Spirits",https://bigycdn.blob.core.windows.net/marketin...,3354400162
111,Arnold Palmer Spiked,$16.99,"Beer, Wine and Spirits",https://bigycdn.blob.core.windows.net/marketin...,61300875220
112,Harpoon Or Ufo Beer Co.,$16.99,"Beer, Wine and Spirits",https://bigycdn.blob.core.windows.net/marketin...,4182700012
113,Narragansett Lager,$11.99,"Beer, Wine and Spirits",https://bigycdn.blob.core.windows.net/marketin...,68907661325


Saving the dataframe to a csv

In [17]:
# Write the scraped deals to a CSV file (relative path, so it lands in the
# current working directory). header=True writes the column names as the
# first row; the DataFrame index is also written because to_csv's `index`
# parameter is left at its default.
products.to_csv(r"Big_Y_Sales_July_14_2024.csv", header=True)