In [3]:
pip install webdriver_manager

Collecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting python-dotenv (from webdriver_manager)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, webdriver_manager
Successfully installed python-dotenv-1.0.1 webdriver_manager-4.0.2
Note: you may need to restart the kernel to use updated packages.


In [5]:
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Headless Chrome session; the driver path comes from ChromeDriverManager().install()
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # no visible browser window
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

# Listing page where the crawl begins
base_url = 'https://www.jumia.co.ke/mlp-led-tvs/'

# Parallel accumulators, one entry per scraped product; they become the
# DataFrame columns once scraping has finished
product_names, prices, old_prices = [], [], []
discounts, ratings, reviews = [], [], []

# Function to extract product data from a single page
def extract_data(soup):
    """Append one row per product found in ``soup`` to the module-level lists.

    Every ``<div class="info">`` is treated as a candidate product container.
    For each one the name, price, old price, discount badge, rating and review
    text are read and appended to ``product_names``, ``prices``, ``old_prices``,
    ``discounts``, ``ratings`` and ``reviews`` respectively.

    All six fields are resolved before anything is appended, so the lists always
    grow together and stay the same length (required to build the DataFrame).

    Args:
        soup: Parsed page exposing ``find_all(name, class_=...)`` whose results
            expose ``find(name, class_=...)`` (e.g. a ``BeautifulSoup`` object).

    Returns:
        None. Results are accumulated in the module-level lists.
    """
    def text_or_na(tag):
        # Stripped text of the tag, or the 'N/A' placeholder when it is absent
        return tag.get_text(strip=True) if tag is not None else 'N/A'

    # Loop through each product container and extract relevant data
    for product in soup.find_all('div', class_='info'):
        name_tag = product.find('h3', class_='name')
        if name_tag is None:
            # No product name: treated as a non-product container and skipped
            # instead of raising AttributeError on None
            continue

        name = name_tag.get_text(strip=True)
        price = text_or_na(product.find('div', class_='prc'))
        old_price = text_or_na(product.find('div', class_='old'))
        discount = text_or_na(product.find('div', class_='bdg _dsct _sm'))

        # Rating text is split on ' out of ' and the leading part kept
        rating_tag = product.find('div', class_='stars _s')
        if rating_tag is not None:
            stars = rating_tag.get_text(strip=True).split(' out of ')[0]
            # The review tag may be missing even when a rating is present
            reviews_count = text_or_na(product.find('div', class_='rev'))
        else:
            stars = 'N/A'
            reviews_count = 'N/A'

        # Append the complete row in one go to keep all lists aligned
        product_names.append(name)
        prices.append(price)
        old_prices.append(old_price)
        discounts.append(discount)
        ratings.append(stars)
        reviews.append(reviews_count)

# Start scraping process. The whole crawl runs inside try/finally so the
# browser is shut down even when loading or parsing a page raises.
try:
    driver.get(base_url)
    time.sleep(3)  # Allow page to load

    while True:
        # Get page source and parse it with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Extract data from the current page
        extract_data(soup)

        try:
            # Find the 'Next' link that leads to the following page
            next_button = driver.find_element(By.XPATH, '//a[@aria-label="Next Page"]')
        except NoSuchElementException:
            # If 'Next' button is not found, we've reached the last page
            print("No more pages to scrape.")
            break

        try:
            next_button.click()
        except WebDriverException as exc:
            # The link exists but could not be clicked (e.g. disabled or covered).
            # Stop paginating and keep what was collected, but say why instead
            # of reporting a normal end of the listing.
            print(f"Stopping pagination: 'Next Page' could not be clicked ({type(exc).__name__}).")
            break

        time.sleep(3)  # Wait for the next page to load
finally:
    # Close the browser whether scraping finished or failed
    driver.quit()

# Create a pandas DataFrame to store the scraped data
df = pd.DataFrame({
    'Product Name': product_names,
    'Price': prices,
    'Old Price': old_prices,
    'Discount': discounts,
    'Rating': ratings,
    'Reviews': reviews
})

# Save the data to a CSV file
df.to_csv('jumia_led_tvs_all_pages.csv', index=False)

print("Scraping completed and data saved to jumia_led_tvs_all_pages.csv")

No more pages to scrape.
Scraping completed and data saved to jumia_led_tvs_all_pages.csv
