In [43]:
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [6]:
# Scrape every listing page of the wet-cat-food category and collect the
# name, displayed price and relative URL of each product card.

# Listing URL and the query parameters sent with every page request
base_url = "https://www.petsmart.ca/cat/food-and-treats/wet-food"
params = {
    "sort": "best-sellers",
    "format": "new"
}

# Define the page range
start_page = 1
end_page = 18  # adjust this to the total number of pages

# One DataFrame per successfully scraped page; concatenated in a later cell
dataframes = []

# Class strings used to locate the pieces of a product card
GRID_ITEM_CLASS = 'sparky-l-grid__item'
PRODUCT_LINK_CLASS = 'sparky-c-text-link sparky-c-product-card__text-link'
PRICE_CLASS = 'sparky-c-price sparky-c-product-card__price-group sparky-c-price--lg'
REQUEST_TIMEOUT = 30  # seconds; without it a stalled connection blocks the cell forever

for page in range(start_page, end_page + 1):
    # Let requests build the query string: ?page=N&sort=...&format=...
    try:
        response = requests.get(
            base_url,
            params={"page": page, **params},
            timeout=REQUEST_TIMEOUT,
        )
    except requests.RequestException as exc:
        # Keep going with the remaining pages, same as for a non-200 answer
        print(f'Failed to retrieve page {page}: {exc}')
        continue

    if response.status_code != 200:
        print(f'Failed to retrieve the webpage. Status code: {response.status_code}')
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    page_rows = []
    for product in soup.find_all('div', class_=GRID_ITEM_CLASS):
        # The same anchor carries both the product name and its href
        link = product.find('a', class_=PRODUCT_LINK_CLASS)
        if link is None or not link.has_attr('href'):
            # Grid item without a product link: nothing to record
            continue

        price_tag = product.find('div', class_=PRICE_CLASS)
        # 'placeholder' marks cards on which no price element was found
        price = price_tag.get_text(strip=True) if price_tag else 'placeholder'

        page_rows.append({
            'name': link.get_text(strip=True),
            'price': price,
            'url': link['href'],
        })

    # Build one frame per page instead of one single-row frame per product
    if page_rows:
        dataframes.append(pd.DataFrame(page_rows, columns=['name', 'price', 'url']))


Unnamed: 0,name,price,url
0,Purina Pro Plan Complete Essentials Adult Wet ...,$44.99,/cat/food-and-treats/wet-food/purina-pro-plan-...
1,"Tiki Cat® After Dark Wet Cat Food - Non-GMO, G...",$34.99,/cat/food-and-treats/wet-food/tiki-cat-after-d...
2,Purina Pro Plan Complete Essentials Adult Wet ...,$2.19,/cat/food-and-treats/wet-food/purina-pro-plan-...
3,Purina Pro Plan Complete Essentials Adult Wet ...,$2.19,/cat/food-and-treats/wet-food/purina-pro-plan-...
4,Hill's® Science Diet® Urinary Hairball Control...,$3.49,/cat/food-and-treats/wet-food/hills-science-di...
...,...,...,...
278,Kit Cat Gravy Series Cat Food - Chicken & Quai...,$1.99,/cat/food-and-treats/wet-food/kit-cat-gravy-se...
279,Kit Cat Gravy Series Cat Food - Chicken,$1.99,/cat/food-and-treats/wet-food/kit-cat-gravy-se...
280,Whiskas Meaty Selections Adult Wet Cat Food Ch...,$18.99,/cat/food-and-treats/wet-food/whiskas-meaty-se...
281,"HALO® Adult Cat Food - Natural, Grain Free, Ch...",$3.79,/cat/food-and-treats/wet-food/halo-adult-cat-f...


In [8]:
# Stack the scraped frames into a single table with a fresh 0..n-1 index,
# then save it to disk without writing that index as a column.
wet_food_df = pd.concat(dataframes, ignore_index=True)
wet_food_df.to_csv('wet_food.csv', index=False)

In [67]:
# Split the catalogue into variety packs and single items based on whether
# the product name mentions "variety" (case-insensitive).
# The mask is computed once; na=False sends rows with a missing name to the
# singles side instead of producing an invalid boolean mask.
is_variety = wet_food_df['name'].str.contains('variety', case=False, na=False)

variety_pack_df = wet_food_df[is_variety]
singles_df = wet_food_df[~is_variety]

# Display the single items whose name does not contain "oz"
singles_df[~singles_df['name'].str.contains('oz', case=False, na=False)]



In [98]:
def scrape_url(url):
    """Fetch a product page and return its size-variant title elements.

    Args:
        url: Product link as scraped from the listing page. May be a
            site-absolute path (``/cat/food-and-treats/wet-food/...``), a
            path relative to ``/cat/food-and-treats/``, or a full URL.

    Returns:
        The list-like result of ``soup.select`` for every element carrying
        ``data-testid="variant-title-size"``; empty if the page has none.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    # urljoin resolves a site-absolute href against the host instead of
    # gluing it onto the base path (which would duplicate the path prefix);
    # relative hrefs still resolve under /cat/food-and-treats/ as before.
    page_url = urljoin('https://www.petsmart.ca/cat/food-and-treats/', url)

    response = requests.get(page_url, timeout=30)
    # Fail loudly rather than parsing an error page as if it were a product
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.select('[data-testid="variant-title-size"]')

# Try the scraper on one product: the row labelled 10 in singles_df.
# Markup of the element being selected, for reference:
# <div class="sparky-c-definition-list__item size-variant__radio-fields-title" data-testid="variant-title-size"><dt class="sparky-c-definition-list__term">Size:</dt><dd class="sparky-c-definition-list__description">3 Oz</dd></div>
sample_path = singles_df.loc[10, 'url']
results = scrape_url(sample_path)

# The size text sits in the <dd> of the first matched element
first_match = results[0]
first_match.find('dd').get_text(strip=True)

'3 Oz'