- https://stackoverflow.com/questions/49565042/way-to-change-google-chrome-user-agent-in-selenium
- https://www.scrapingbee.com/blog/selenium-python/#using-a-proxy-with-selenium-wire

In [1]:
import time
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

In [2]:
# Landing page of the site being scraped; the browser session is pointed here first.
base_url = "https://voila.ca/"

In [3]:
# Configure and launch the Chrome session used for the scrape.
# The flags are applied in the order listed: a fixed window size, then an
# explicit desktop user-agent string.
chrome_flags = [
    "--window-size=1600,960",
    "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
]

options = webdriver.ChromeOptions()
for flag in chrome_flags:
    options.add_argument(flag)

driver = webdriver.Chrome(options=options)

In [4]:
# Load the landing page in the Chrome session.
driver.get(base_url)

In [5]:
# Navigate the site menu: Browse > Meat & Seafood > Chicken > Whole Chicken.
#
# Each step waits for the target to be *clickable* (visible and enabled), not
# merely present in the DOM. An element can be present while still hidden or
# disabled (for example a menu that has not finished opening), and clicking it
# in that state fails.


def _click_when_clickable(driver, locator, timeout=10):
    """Wait up to `timeout` seconds for `locator` to be clickable, click it, and return the element.

    Raises selenium's TimeoutException if the element never becomes clickable.
    """
    element = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable(locator)
    )
    element.click()
    return element


# Level 1: Access Browse menu
browse_menu = _click_when_clickable(driver, (By.ID, "nav-menu-button"))

# Level 2: Access meat and seafood
meat_and_seafood = _click_when_clickable(driver, (By.LINK_TEXT, "Meat & Seafood"))

# Level 3: Access chicken
chicken = _click_when_clickable(driver, (By.LINK_TEXT, "Chicken"))

# Level 4: Access whole chicken
whole_chicken = _click_when_clickable(driver, (By.LINK_TEXT, "Whole Chicken"))

In [6]:
# Extract HTML of page
# The previous step ends with a click, so the product listing may still be
# loading. Wait until at least one product link (the same `data-test` hook the
# parsing step looks for) exists before snapshotting the DOM.
# NOTE(review): if the page shown *before* the click already contains product
# links, this wait can return early; confirm, and add a URL/heading check if so.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'a[data-test="fop-product-link"]')
    )
)
html = driver.page_source

In [15]:
# End session: shut down the WebDriver. The cells that follow work from the
# `html` string captured earlier, not from the live browser.
driver.quit()

In [7]:
# Parse the captured page source (the `html` string taken from the browser)
# with Beautiful Soup, using Python's built-in 'html.parser' backend.
soup = BeautifulSoup(html, 'html.parser')

In [12]:
# Build one list per output column from the parsed listing page.
#
# NOTE(review): the long class strings below look build-generated and may
# change when the site is redeployed; if a column comes back empty, re-inspect
# the page and update the selector.

# Item name and URL both come from the same product-link anchors, so query once.
product_links = soup.find_all("a", {"data-test": "fop-product-link"})

price_dict = {
    "item"      : [tag.text for tag in product_links],
    "weight"    : [tag.text for tag in soup.find_all("span", {"class": "text__Text-sc-6l1yjp-0 base__SizeText-sc-1mnb0pd-38 fop__SizeText-sc-sgv9y1-4 bhymDA iImbUZ jrvLpU"})],
    "unit_cost" : [tag.text for tag in soup.find_all("span", {"class": "text__Text-sc-6l1yjp-0 standard-promotion__PromotionIntentText-sc-1vpsrpe-2 fop__PricePerText-sc-sgv9y1-5 dLNLFE eMCyTR eNYENy"})],
    "cost"      : [tag.text for tag in soup.find_all("strong", {"class": "base__Price-sc-1mnb0pd-29 sc-ksdxAp ftyPQ kWOBdd"})],
    "url"       : [tag.get("href") for tag in product_links],
}

# The columns are scraped independently of each other, so a selector that
# matches nothing (or a product missing one field) leaves the lists with
# different lengths. Fail here with the per-column counts instead of letting
# DataFrame construction raise a less specific error later.
column_lengths = {column: len(values) for column, values in price_dict.items()}
if len(set(column_lengths.values())) > 1:
    raise ValueError(f"Scraped columns have mismatched lengths: {column_lengths}")

In [13]:
# Assemble the scraped columns into a table and stamp every row with the same
# timestamp, taken when this cell runs.
# (`datetime` comes from the notebook's top-level import cell.)
df = pd.DataFrame(price_dict).assign(time_of_scrape=datetime.now())
df

Unnamed: 0,item,weight,unit_cost,cost,url,time_of_scrape
0,Compliments Whole Chicken Grade A,1.75kgtypically,($11.00 per kilogram),$19.25typically,/products/218890KG/details,2023-11-09 17:18:42.240746
1,Longo's Rotisserie Chicken BBQ 1 kg,1kg,($15.99 per kilogram),$15.99,/products/18859EA/details,2023-11-09 17:18:42.240746
2,Prime Organic Whole Chicken 1.3 kg,1.5kgtypically,($13.21 per kilogram),$19.82typically,/products/587643KG/details,2023-11-09 17:18:42.240746
3,Mina Halal Whole Chicken 1 pieces,1.7kg,($0.94 per 100g),$16.00,/products/4089EA/details,2023-11-09 17:18:42.240746
4,Yorkshire Valley Farms Organic Whole Chicken,1.6kgtypically,($15.99 per kilogram),$25.58typically,/products/20400KG/details,2023-11-09 17:18:42.240746
5,Longo's Rotisserie Chicken Raised Without Anti...,1kg,($17.99 per kilogram),$17.99,/products/18860EA/details,2023-11-09 17:18:42.240746


In [14]:
# Write the table to the raw-data folder; the file name is prefixed with the
# local time at save (YYYYMMDD-HHMM). The DataFrame index is not written.
timestamp = time.strftime('%Y%m%d-%H%M')
output_path = f"../11_raw_data/{timestamp}_voila_price.csv"
df.to_csv(output_path, index=False)