In [1]:
import html
import re
from datetime import datetime
from typing import Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def get_candle_data(block) -> dict:
  """Extract one candle's listing data from a single product block.

  Args:
    block: Parsed HTML element of one product. It is used like a BeautifulSoup
      tag: ``find_all('a')`` must yield at least two anchors (the first one
      carries the product link, the second one the product name) and
      ``find(class_='range-price')`` must yield the price element.

  Returns:
    dict with the keys ``link``, ``name``, ``price`` (float), ``id`` (the last
    path segment of the link) and ``date`` (today's local date, ``YYYY-MM-DD``).

  Raises:
    ValueError: if the anchors, the link, the price element or a parsable
      price are missing from the block.
  """
  anchor_tags = block.find_all('a')
  if len(anchor_tags) < 2:
    raise ValueError('product block does not contain the expected link and name anchors')

  link = anchor_tags[0].get('href')
  if not link:
    raise ValueError('first anchor of the product block has no href')

  name = anchor_tags[1].get_text().strip()

  price_tag = block.find(class_='range-price')
  if price_tag is None:
    raise ValueError(f'no price element found for {link}')
  price_str_raw = price_tag.get_text()

  # Decimal-comma number, optionally with dot-grouped thousands ("1.234,56").
  # Without the grouping alternative only "234,56" would be captured.
  price_match = re.search(r'(?:\d{1,3}(?:\.\d{3})+|\d+),\d*', price_str_raw)
  if price_match is None:
    raise ValueError(f'cannot parse a price from {price_str_raw!r}')
  price = float(price_match.group().replace('.', '').replace(',', '.'))

  # The id is whatever follows the last slash of the link.
  id_match = re.search(r'\/([^\/]+)$', link)
  if id_match is None:
    raise ValueError(f'cannot derive an id from link {link!r}')
  link_id = id_match.group(1)

  date = datetime.now().strftime('%Y-%m-%d')

  return {
      'link': link,
      'name': name,
      'price': price,
      'id': link_id,
      'date': date
  }

In [3]:
def handle_page(page):
  """Parse one catalogue page and return the data of every candle on it.

  Args:
    page: HTTP response whose ``content`` holds the page HTML.

  Returns:
    list with one ``get_candle_data`` result per element whose ``id``
    attribute starts with ``result-wrapper_buy_form_<digits>``.
  """
  soup = BeautifulSoup(page.content, 'html.parser')
  # Raw string: '\d' inside a normal string literal is an invalid escape
  # sequence and triggers a warning on recent Python versions.
  block_id = re.compile(r'result-wrapper_buy_form_\d+')
  blocks = soup.find_all(lambda tag: tag.has_attr('id') and block_id.match(tag['id']))
  return [get_candle_data(block) for block in blocks]

In [4]:
def get_candles_data(timeout: float = 10.0):
  """Scrape every page of the 3-wick candle catalogue.

  Pages are requested as ``<base_url>1``, ``<base_url>2``, ... The loop stops
  as soon as a request after the first one ends on a URL without a trailing
  ``_s<number>`` (presumably the shop redirects an out-of-range page number
  back to the first page; confirm against the site).

  Args:
    timeout: Seconds to wait for each HTTP response. Without a timeout a
      stalled connection would block the notebook indefinitely.

  Returns:
    list of the records returned by ``handle_page`` for each page, in page order.

  Raises:
    requests.RequestException: on connection errors, timeouts or HTTP error
      status codes.
  """
  def is_first_page(url):
    return not re.search(r'_s\d+$', url)

  candles_data = []

  base_url = 'https://www.goosecreekcandle.de/3-Wick-Candles_s'
  index = 1

  while True:
    url = f'{base_url}{index}'
    print(f'new iter {index} {url}')
    page = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a catalogue page.
    page.raise_for_status()

    # The very first request is exempt from the check: its final URL may
    # itself lack the page suffix.
    if index > 1 and is_first_page(page.url):
      return candles_data

    candles_data.extend(handle_page(page))
    index += 1

In [5]:
# Scrape all catalogue pages; one progress line is printed per requested page.
candles = get_candles_data()

new iter 1 https://www.goosecreekcandle.de/3-Wick-Candles_s1
new iter 2 https://www.goosecreekcandle.de/3-Wick-Candles_s2
new iter 3 https://www.goosecreekcandle.de/3-Wick-Candles_s3
new iter 4 https://www.goosecreekcandle.de/3-Wick-Candles_s4
new iter 5 https://www.goosecreekcandle.de/3-Wick-Candles_s5
new iter 6 https://www.goosecreekcandle.de/3-Wick-Candles_s6
new iter 7 https://www.goosecreekcandle.de/3-Wick-Candles_s7
new iter 8 https://www.goosecreekcandle.de/3-Wick-Candles_s8
new iter 9 https://www.goosecreekcandle.de/3-Wick-Candles_s9
new iter 10 https://www.goosecreekcandle.de/3-Wick-Candles_s10
new iter 11 https://www.goosecreekcandle.de/3-Wick-Candles_s11
new iter 12 https://www.goosecreekcandle.de/3-Wick-Candles_s12
new iter 13 https://www.goosecreekcandle.de/3-Wick-Candles_s13
new iter 14 https://www.goosecreekcandle.de/3-Wick-Candles_s14
new iter 15 https://www.goosecreekcandle.de/3-Wick-Candles_s15
new iter 16 https://www.goosecreekcandle.de/3-Wick-Candles_s16
new iter 1

In [6]:
# Tabulate the scraped records, keeping only the listed columns
# (the 'date' key that get_candle_data also returns is left out).
df = pd.DataFrame(candles, columns=['name', 'price', 'link', 'id'])

In [7]:
def make_clickable(link):
    """Wrap a URL in an HTML anchor that opens in a new tab.

    The value is HTML-escaped before it is placed in the ``href`` attribute
    and in the link text, so characters such as ``&`` or ``"`` in a scraped
    URL cannot produce invalid markup or break out of the attribute.

    Args:
        link: URL to wrap; non-string values are converted with ``str``.

    Returns:
        str: ``<a href="..." target="_blank">...</a>`` markup.
    """
    safe_link = html.escape(str(link), quote=True)
    return f'<a href="{safe_link}" target="_blank">{safe_link}</a>'

# Replace the raw URLs with HTML anchors for display. This overwrites df['link']
# in place, so re-running the cell wraps the already-wrapped values again.
df['link'] = df['link'].apply(make_clickable)

In [None]:
from IPython.display import HTML

# Only candles priced strictly below this value are shown.
MAX_PRICE = 15

cheapies = df[df['price'] < MAX_PRICE]
# escape=False keeps the <a> markup stored in the `link` column clickable.
HTML(cheapies.to_html(escape=False))

Unnamed: 0,name,price,link,id
3,Amazing Grace - GRACE 3-Wick-Candle 411g,13.95,https://www.goosecreekcandle.de/Amazing-Grace-GRACE-3-Wick-Candle-411g,Amazing-Grace-GRACE-3-Wick-Candle-411g
10,Apple Cider Donut 3-Docht-Kerze 411g,13.95,https://www.goosecreekcandle.de/?a=10940&lang=eng,?a=10940&lang=eng
11,Apple Cider Ice Cream 3-Docht-Kerze 411g,13.95,https://www.goosecreekcandle.de/?a=12345&lang=eng,?a=12345&lang=eng
12,Apple Gathering 3-Docht-Kerze 411g,13.95,https://www.goosecreekcandle.de/?a=12346&lang=eng,?a=12346&lang=eng
14,Apple Pom Twister 3-Docht-Kerze 411g,13.95,https://www.goosecreekcandle.de/?a=9366&lang=eng,?a=9366&lang=eng
19,Autumn Outdoors 3-Wick-Candle 411g,15.95,https://www.goosecreekcandle.de/Autumn-Outdoors-3-Wick-Candle-411g,Autumn-Outdoors-3-Wick-Candle-411g
23,Autumn Sunset 3-Docht-Kerze 411g,15.95,https://www.goosecreekcandle.de/?a=12347&lang=eng,?a=12347&lang=eng
26,Bake - BAKE 3-Wick-Candle 411g,15.95,https://www.goosecreekcandle.de/Bake-BAKE-3-Wick-Candle-411g,Bake-BAKE-3-Wick-Candle-411g
29,Baking a Cake 3-Docht-Kerze 411g,13.95,https://www.goosecreekcandle.de/?a=12017&lang=eng,?a=12017&lang=eng
32,Banana Cabana Beach 3-Wick-Candle 411g,13.95,https://www.goosecreekcandle.de/Banana-Cabana-Beach-3-Wick-Candle-411g,Banana-Cabana-Beach-3-Wick-Candle-411g


## Code for the candle price-tracking Telegram bot

Needed fields:
* id = sku

* name

* url

* pic url

* ingredients

* old_price

* price

* discount

In [None]:
def get_ingredients(url: str, soup) -> Optional[list[str]]:
    """Collect the ingredient names listed on a product page.

    The first ``<tbody>`` of the page is scanned; every row with exactly two
    cells contributes the comma-separated entries of its second cell.
    NOTE(review): this assumes the first ``<tbody>`` is the ingredients
    table; confirm against the page layout.

    Args:
        url: Product page URL. URLs of the form ``...?a=<digits>&lang=...``
            are treated as the old page formatting and are not parsed.
        soup: Parsed product page, used like a BeautifulSoup document.

    Returns:
        List of stripped, non-empty ingredient names in page order, or
        ``None`` when the URL has the old formatting or the page has no
        ``<tbody>``.
    """
    url_regex = r'\S*\?a=[0-9]+&lang=\S*'
    if re.fullmatch(url_regex, url):
        return None

    ingr_table = soup.find('tbody')
    if ingr_table is None:
        # No table on the page: report "unknown" rather than failing with
        # AttributeError on None.
        return None

    ingredients: list[str] = []
    for row in ingr_table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) != 2:
            continue
        stripped = (part.strip() for part in cells[1].text.split(','))
        # Drop empty tokens left by trailing or doubled commas.
        ingredients.extend(name for name in stripped if name)
    return ingredients

In [None]:
def get_candle_details(url: str, timeout: float = 10.0) -> Optional[dict]:
    """Download one product page and extract the candle's details.

    Args:
        url: Product page URL.
        timeout: Seconds to wait for the HTTP response, so a stalled
            connection cannot block the whole crawl.

    Returns:
        dict with the keys ``candle_id`` (SKU text), ``name``,
        ``picture_url`` (URL of the last ``srcset`` candidate), ``ingredients``
        (see ``get_ingredients``) and ``price`` (float), or ``None`` when the
        page cannot be fetched or parsed. The error is printed in that case.
    """
    try:
        # NOTE(review): hard-coded cookies, including what looks like a session
        # id. Presumably they select the shop's country; confirm, and consider
        # loading them from configuration instead of the notebook source.
        cookies = {'oss_country': 'GR', 'JTLSHOP': 'f5a6b3ed9cdca67475b65081793f61e9'}
        page = requests.get(url, cookies=cookies, timeout=timeout)
        # Report HTTP errors as such instead of as a parse failure further down.
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')

        sku = soup.select_one('.product-sku span').text.strip()

        name = soup.find(class_='product-title has-border-bottom').text.strip()

        pic_element = soup.find(class_='square square-image js-gallery-images').find('source')
        # srcset is a comma-separated list of "<url> <descriptor>"; take the
        # URL of the last entry.
        pic_url = pic_element.attrs['srcset'].split(',')[-1].split()[0]

        ingredients = get_ingredients(url, soup)

        # split() without an argument also handles non-breaking spaces and
        # newlines between the amount and the currency sign.
        price_text = soup.find(class_='range-price').text
        price = float(price_text.split()[0].replace(',', '.'))

        return {
            'candle_id': sku,
            'name': name,
            'picture_url': pic_url,
            'ingredients': ingredients,
            'price': price
        }
    except Exception as e:
        # Deliberate best effort: one broken page must not abort the crawl.
        print(f'Error processing candle {url}: {e}')
        return None

In [None]:
def get_urls_from_page(page) -> list[str]:
  """Return the ``href`` of every matching element on one catalogue page.

  Args:
    page: HTTP response whose ``content`` holds the page HTML.

  Returns:
    The ``href`` values, in document order, of all elements whose class
    attribute is ``d-block position-relative``.
  """
  parsed = BeautifulSoup(page.content, 'html.parser')
  return [element['href'] for element in parsed.find_all(class_='d-block position-relative')]

In [None]:
def get_candle_urls(timeout: float = 10.0) -> list[str]:
  """Collect the product URLs from every page of the 3-wick candle catalogue.

  Pages are requested as ``<base_url>1``, ``<base_url>2``, ... The loop stops
  as soon as a request after the first one ends on a URL without a trailing
  ``_s<number>`` (presumably the shop redirects an out-of-range page number
  back to the first page; confirm against the site).

  Args:
    timeout: Seconds to wait for each HTTP response. Without a timeout a
      stalled connection would block the notebook indefinitely.

  Returns:
    The URLs returned by ``get_urls_from_page`` for each page, in page order.

  Raises:
    requests.RequestException: on connection errors, timeouts or HTTP error
      status codes.
  """
  def is_first_page(url):
    return not re.search(r'_s\d+$', url)

  candle_urls: list[str] = []

  base_url = 'https://www.goosecreekcandle.de/3-Wick-Candles_s'
  index = 1

  while True:
    url = f'{base_url}{index}'
    print(f'new iter {index} {url}')
    page = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a catalogue page.
    page.raise_for_status()

    # The very first request is exempt from the check: its final URL may
    # itself lack the page suffix.
    if index > 1 and is_first_page(page.url):
      return candle_urls

    # extend() appends in place instead of copying the whole list per page.
    candle_urls.extend(get_urls_from_page(page))
    index += 1

In [None]:
# Collect the product URLs from every catalogue page and show how many were found.
urls = get_candle_urls()
len(urls)

new iter 1 https://www.goosecreekcandle.de/3-Wick-Candles_s1
new iter 2 https://www.goosecreekcandle.de/3-Wick-Candles_s2
new iter 3 https://www.goosecreekcandle.de/3-Wick-Candles_s3
new iter 4 https://www.goosecreekcandle.de/3-Wick-Candles_s4
new iter 5 https://www.goosecreekcandle.de/3-Wick-Candles_s5
new iter 6 https://www.goosecreekcandle.de/3-Wick-Candles_s6
new iter 7 https://www.goosecreekcandle.de/3-Wick-Candles_s7
new iter 8 https://www.goosecreekcandle.de/3-Wick-Candles_s8
new iter 9 https://www.goosecreekcandle.de/3-Wick-Candles_s9
new iter 10 https://www.goosecreekcandle.de/3-Wick-Candles_s10
new iter 11 https://www.goosecreekcandle.de/3-Wick-Candles_s11
new iter 12 https://www.goosecreekcandle.de/3-Wick-Candles_s12
new iter 13 https://www.goosecreekcandle.de/3-Wick-Candles_s13
new iter 14 https://www.goosecreekcandle.de/3-Wick-Candles_s14
new iter 15 https://www.goosecreekcandle.de/3-Wick-Candles_s15
new iter 16 https://www.goosecreekcandle.de/3-Wick-Candles_s16
new iter 1

651

In [None]:
# Fetch the details of every product URL, one request per URL. This rebinds
# `candles`, which earlier held the listing records from get_candles_data().
# get_candle_details returns None when a page cannot be fetched or parsed,
# so the list may contain None entries.
candles = []
for url in urls:
  candles.append(get_candle_details(url))

In [None]:
# Counts every entry, including any None placeholders for pages that failed.
len(candles)

651