## Jumia Online Webscraper

Import libraries

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
from urllib.parse import urljoin


Set url and use requests to fetch response

In [2]:
url = 'https://www.jumia.co.ke/'

In [3]:
response = requests.get(url)

In [4]:
#Check if the request was successful
requests.get(url)

<Response [200]>

Use BeautifulSoup to parse the HTML content

In [5]:
soup = BeautifulSoup(response.text, 'html.parser')

In [14]:
%%capture
print(soup.prettify())

Scrape the deals of the week section and scrape product reviews.

In [None]:
%%capture #Hide the output
# Create an empty list to store the product data
product_data = []

# Function to scrape product reviews (optional, can be extended for review data)
def scrape_reviews(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }
    
    # Send GET request to the product page
    response = requests.get(url, headers=headers)
    
    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Extract the rating (e.g., 4.7 out of 5)
    rating_tag = soup.find('div', class_='stars _m _al')
    if rating_tag:
        rating = rating_tag.text.strip()  # Extracts "4.7 out of 5"
    else:
        rating = "No rating available"
    
    print(f"Product rating: {rating}")
    return rating

# URL of the page you are scraping from
base_url = 'https://www.jumia.co.ke'

# Assuming soup is the BeautifulSoup object of the page you're scraping
products = soup.find_all('article', class_='prd')

# Extract product details
for product in products:
    # Product link
    product_link_tag = product.find('a', class_='core')
    product_link = product_link_tag['href'] if product_link_tag else 'No link'
    
    # Ensure the full product URL is correctly formed
    full_product_link = urljoin(base_url, product_link)  # This ensures no malformed URL

    # Product name
    product_name_tag = product.find('div', class_='name')
    product_name = product_name_tag.text.strip() if product_name_tag else 'No name'
    
    # Current price
    price_tag = product.find('div', class_='prc')
    price = price_tag.text.strip() if price_tag else 'No price'
    
    # Original price (if available)
    original_price = price_tag.get('data-oprc', 'N/A') if price_tag else 'N/A'
    
    # Discount (if available)
    discount_tag = product.find('div', class_='bdg _dsct')
    discount = discount_tag.text.strip() if discount_tag else 'No Discount'
    
    # Stock availability
    stock_info = product.find('div', class_='stk')
    stock = stock_info.text.strip() if stock_info else 'Stock info unavailable'
    
    # Image URL
    img_tag = product.find('img', class_='img')
    img_url = img_tag['src'] if img_tag else 'No image'
    
    # Extract rating (e.g., 4.7 out of 5)
    rating_tag = product.find('div', class_='stars _m _al')
    if rating_tag:
        rating = rating_tag.text.strip()  # Extracts "4.7 out of 5"
    else:
        rating = "No rating available"

    # Collect data into a dictionary
    product_info = {
        'Product Name': product_name,
        'Price': price,
        'Original Price': original_price,
        'Discount': discount,
        'Stock Availability': stock,
        'Image URL': img_url,
        'Rating': rating,
        'Product Link': full_product_link
    }
    # Loop through the product_info dictionary and print each key and its corresponding value
    for key, value in product_info.items():
        print(f"{key}: {value}\n")

    # Append product data to the list
    product_data.append(product_info)
    
    # Call the function to scrape reviews for this product
    scrape_reviews(full_product_link)
    
    print("-" * 40)

Product Name: Jameson Jameson Irish Whiskey - 750 Ml

Price: KSh 2,099

Original Price: KSh 2,930

Discount: 28%

Stock Availability: 671 items left

Image URL: data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7

Rating: No rating available

Product Link: https://www.jumia.co.ke/jameson-jameson-irish-whiskey-750-ml-58538273.html

Product rating: 4.7 out of 5
----------------------------------------
Product Name: NIVEA Perfect & Radiant Even Tone Day And Night Cream For Women - 50ml

Price: KSh 1,260

Original Price: KSh 1,800

Discount: 30%

Stock Availability: 660 items left

Image URL: data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7

Rating: No rating available

Product Link: https://www.jumia.co.ke/nivea-perfect-radiant-even-tone-day-and-night-cream-for-women-50ml-68528030.html

Product rating: 4.3 out of 5
----------------------------------------
Product Name: Sony Bluetooth Headphone Wh-ch520 - Beige (1YR WRTY)

Price: KSh

In [None]:
# Convert the list of product data into a Pandas DataFrame
df = pd.DataFrame(product_data)

# Save the DataFrame to a CSV file
df.to_csv('jumia_products.csv', index=False)
print("Data has been saved to 'jumia_products.csv'")

Data has been saved to 'jumia_products.csv'
