In [None]:
# Web scraping
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.service import Service  # Import the Service class

# Data Exploration and Manipulation
import pandas as pd
import numpy as np

# Additional Functionality
from datetime import datetime
import time
import os

In [None]:
# Set up Selenium Firefox WebDriver
#
# Creates the module-level `driver` used by the scraping code below. When
# Firefox/geckodriver cannot be started, `driver` is set to None instead of
# raising, so later code can check for it.
from selenium.common.exceptions import WebDriverException

options = Options()
# Run Firefox in headless mode. The flag is passed as a browser argument
# because the `Options.headless` setter was deprecated and later removed from
# Selenium 4; on releases without it, `options.headless = True` only creates
# an unused attribute and Firefox starts with a visible window.
options.add_argument("-headless")

try:
    # Initialize WebDriver without specifying file paths (works in GitHub Actions)
    driver = webdriver.Firefox(options=options)
except WebDriverException as e:
    print("⚠️ Firefox or Geckodriver not found in environment. Skipping Selenium section.")
    # WebDriverException covers more than a missing binary, so show the cause.
    print(f"   Reason: {e}")
    driver = None

# Category slugs to scrape; each one is appended to the site's base URL and
# is also stored as the product's subcategory.
links = ['food-cupboard-supplies', 'drinks', 'household-supplies']

# Define Loop
#
# For every category in `links`, download the listing page with requests,
# parse the product cards with BeautifulSoup and, when a Selenium driver is
# available, open each product page to read the fields that only appear
# there. All rows are collected first and written to jumia.csv once, so one
# category no longer overwrites the previous one.
from urllib.parse import urljoin

base_url = 'https://www.jumia.co.ke/'
REQUEST_TIMEOUT = 30  # seconds; requests.get waits indefinitely without one
WAIT_TIMEOUT = 5      # seconds to wait for each element on a product page

# CSS selectors for the product detail page. Each field maps to a list of
# selectors that are tried in order.
_INFO = "#jm > main > div:nth-child(1) > section > div > div.col10 > div.-phs"
_VERIFIED_RATINGS = [_INFO + " > div.-df.-i-ctr.-pbs > a"]
_AVAILABILITY = [_INFO + " > p"]
_DELIVERY_FEE = [_INFO + " > div.markup.-fs12.-pbs"]


def _delivery_selectors(option_index):
    """Return the primary and fallback selectors for one delivery option.

    Option 1 is stored as the pickup-station estimate and option 2 as the
    door-delivery estimate. The two selectors differ only in whether the
    delivery box is matched as the third child of its section or as any child.
    """
    head = "#jm > main > div:nth-child(1) > div.col4 > section > "
    tail = (
        " > article.-pvxs.-hr._bet > section > div"
        f" > article:nth-child({option_index})"
        " > div > div:nth-child(2) > div:nth-child(2)"
    )
    return [head + "div:nth-child(3)" + tail, head + "div" + tail]


def _card_text(card, tag, css_class):
    """Return the text of the first matching child of *card*, or NaN if absent."""
    node = card.find(tag, class_=css_class)
    return np.nan if node is None else node.text


def _click_if_present(drv, selector):
    """Click the element matching *selector* if it shows up; otherwise do nothing."""
    try:
        WebDriverWait(drv, WAIT_TIMEOUT).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, selector))
        ).click()
    except WebDriverException:
        # Best effort: the pop-up is optional, and a timeout or a
        # non-clickable element must not stop the scrape.
        pass


def _wait_text(drv, selectors):
    """Return the text of the first selector that appears, or NaN if none does."""
    for selector in selectors:
        try:
            return WebDriverWait(drv, WAIT_TIMEOUT).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, selector))
            ).text
        except WebDriverException:
            continue
    return np.nan


def _scrape_product_details(drv, product_url):
    """Read the detail-page fields for one product.

    Every field defaults to NaN. A missing element leaves only that field as
    NaN; it no longer discards the whole product. When *drv* is None or the
    page cannot be loaded, all fields stay NaN.
    """
    details = {
        'verified_ratings': np.nan,
        'product_availability': np.nan,
        'delivery_fee': np.nan,
        'delivery_day_station': np.nan,
        'delivery_day_home': np.nan,
    }
    if drv is None or product_url is None:
        return details

    try:
        drv.get(product_url)
    except WebDriverException as exc:
        print(f"Could not open {product_url}: {exc}")
        return details

    _click_if_present(drv, ".cls")        # pop-up close button
    _click_if_present(drv, "button.-df")  # cookie banner

    details['verified_ratings'] = _wait_text(drv, _VERIFIED_RATINGS)
    details['product_availability'] = _wait_text(drv, _AVAILABILITY)
    details['delivery_fee'] = _wait_text(drv, _DELIVERY_FEE)
    details['delivery_day_station'] = _wait_text(drv, _delivery_selectors(1))
    details['delivery_day_home'] = _wait_text(drv, _delivery_selectors(2))
    return details


rows = []
try:
    for link in links:
        for i in range(1, 2, 1):
            url = f"{base_url}{link}/?page={i}#catalog-listing"

            try:
                content = requests.get(url, timeout=REQUEST_TIMEOUT).text
            except requests.RequestException as exc:
                # Keep the rows gathered so far and move on to the next category.
                print(f"Request for {url} failed ({exc}). Skipping '{link}'.")
                break
            soup = BeautifulSoup(content, 'lxml')

            pages = soup.find_all('div', class_='info')
            if not pages:
                print(f"No products found on page {i}. Breaking loop early.")
                break  # no point requesting further pages of this category

            for page in pages:
                discount_node = page.find('div', class_='bdg _dsct _sm')
                discount = np.nan if discount_node is None else f"-{discount_node.text}"

                # Extract individual product link; urljoin avoids the double
                # slash that plain concatenation produced.
                product_link_tag = page.find_parent('a', class_='core')
                href = product_link_tag.get('href') if product_link_tag is not None else None
                product_link = urljoin(base_url, href) if href else None

                details = _scrape_product_details(driver, product_link)

                rows.append({
                    'product_name': _card_text(page, 'h3', 'name'),
                    'product_category': 'Grocery',
                    'product_subcategory': link,
                    'product_availability': details['product_availability'],
                    'product_ratings': _card_text(page, 'div', 'stars _s'),
                    # NOTE(review): 'verifies_ratings' looks like a typo for
                    # 'verified_ratings'; kept so the CSV schema is unchanged.
                    'verifies_ratings': details['verified_ratings'],
                    'initial_price': _card_text(page, 'div', 'old'),
                    'discount': discount,
                    'final_price': _card_text(page, 'div', 'prc'),
                    'delivery_fee': details['delivery_fee'],
                    'last_scraped': datetime.now(),
                    'pickup_station': details['delivery_day_station'],
                    'door_delivery': details['delivery_day_home'],
                })
finally:
    # Close the driver even if scraping raised; it is None when Firefox or
    # geckodriver could not be started.
    if driver is not None:
        driver.quit()

# Build the frame once instead of concatenating per row.
bigdata = pd.DataFrame(rows)

# Only write when something was scraped, so an empty run does not replace an
# existing jumia.csv with an empty file.
if not bigdata.empty:
    bigdata.to_csv('jumia.csv', index=False)

print(f'Shape {bigdata.shape}')