### Web Scraping for `jumia` site :-


In [1]:
import requests
import base64
import re
from PIL import Image
from io import BytesIO
import os


#parses the unwanted data and helps to organize 
#and format the messy web data by fixing bad HTML 
#and present to us in an easily-traversible XML structures.
from bs4 import BeautifulSoup as bs  

import csv
import pandas as pd
import numpy as np

In [2]:
# Access the website
url = 'https://www.jumia.com.eg/ar/catalog/?q=mobile'
request = requests.get(url)
request

<Response [200]>

In [3]:
request.content[:500]


b'<!DOCTYPE html><html lang="ar" dir="rtl"><head><meta charset="utf-8"/><title>\xd8\xb9\xd8\xb1\xd9\x88\xd8\xb6 \xd8\xb9\xd9\x84\xd9\x8a \xd8\xac\xd9\x85\xd9\x8a\xd8\xb9 \xd8\xa7\xd9\x84\xd9\x85\xd9\x86\xd8\xaa\xd8\xac\xd8\xa7\xd8\xaa - \xd8\xaa\xd8\xb3\xd9\x88\xd9\x82 \xd8\xa3\xd9\x81\xd8\xb6\xd9\x84 \xd8\xa7\xd9\x84\xd9\x85\xd9\x86\xd8\xaa\xd8\xac\xd8\xa7\xd8\xaa \xd8\xa8\xd8\xa3\xd9\x81\xd8\xb6\xd9\x84 \xd8\xa7\xd9\x84\xd8\xa3\xd8\xb3\xd8\xb9\xd8\xa7\xd8\xb1 - \xd8\xac\xd9\x88\xd9\x85\xd9\x8a\xd8\xa7 \xd9\x85\xd8\xb5\xd8\xb1</title><meta property="og:type" content="product"/><meta property="og:site_name" content="Jumia \xd9\x85\xd8\xb5\xd8\xb1"/><meta property="og:title" content="\xd8\xb9\xd8\xb1\xd9\x88\xd8\xb6 \xd8\xb9\xd9\x84\xd9\x8a \xd8\xac\xd9\x85\xd9\x8a\xd8\xb9 \xd8\xa7\xd9\x84\xd9\x85\xd9\x86\xd8\xaa\xd8\xac\xd8\xa7\xd8\xaa - \xd8\xaa\xd8\xb3\xd9\x88\xd9\x82 \xd8\xa3\xd9\x81\xd8\xb6\xd9\x84 \xd8\xa7\xd9\x84\xd9\x85\xd9\x86\xd8\xaa\xd8\xac\xd8\xa7\xd8\xaa \xd8\xa8\xd8\xa3\xd

In [4]:
soup = bs(request.content,'html.parser')

In [5]:
containers = soup.find_all('a', {'class':'core'})

In [6]:
containers[0]

<a class="core" data-brand="Samsung" data-category="Phones &amp; Tablets/Mobile Phones/Smartphones/Android Phones" data-dimension23="" data-dimension26="12" data-dimension27="4.3" data-dimension28="1" data-dimension37="0" data-dimension43="CFS|JMALL|TBOOST|TW_27|XMAS22_51|xmas_10" data-dimension44="0" data-id="SA024MP1AH2PENAFAMZ" data-list="" data-name="Galaxy A24 - 6.5 inches 128GB/6GB RAM - Dual SIM 4G Mobile Phone – Silver" data-position="1" data-price="232.30" data-track-onclick="eecProduct" data-track-onview="eecProduct" href="/ar/samsung-galaxy-a24-6.5-inches-128gb6gb-ram-dual-sim-4g-mobile-phone-silver-49985087.html"><div class="img-c"></div><div class="info"><div class="bdg _mall _xs">المتجر الرسمي</div><h3 class="name">Samsung

In [7]:
len(containers)

40

In [8]:
# Scrapping Name
name = containers[10].find_all('h3', {'class':'name'})
name[0].text.strip()

'Samsung Galaxy A14 - 6.6-inch 4GB/64GB Dual Sim 4G - Mobile Phone - Light Green'

In [9]:
# Scrapping Price
price = containers[35].find_all('div', {'class':'prc'})
price[0].text.strip()

'جنيه 8,999.00'

In [10]:
# Scrapping Rating
rating = containers[13].find_all('div', {'class':'stars _s'})
rating[0].text.strip()

'4.7 out of 5'

### Now we will Scrape only one page :-

In [17]:
def clean_filename(name):
    cleaned_name = re.sub(r'[\\/*?:"<>|]', '_', name)  
    cleaned_name = re.sub(r'[^a-zA-Z0-9_.\s-]', '', cleaned_name)  # Remove other unwanted characters
    cleaned_name = re.sub(r'\s+', ' ', cleaned_name).strip()  # Replace multiple spaces with single space
    return cleaned_name[:100]  # Limit the length for file...


In [12]:
if not os.path.exists('images'):
    os.makedirs('images')

with open('JUMIA_DATA.csv', 'w', encoding='utf-8', newline='') as file:
    writer_obj = csv.writer(file)
    writer_obj.writerow(['Name', 'Price', 'Rating', 'Image_URL'])
    
    for container in containers:
        # Scraping Name
        name = container.find_all('h3', {'class': 'name'})
        name = name[0].text.strip() if name else np.nan
        
        # Scraping Price
        price = container.find_all('div', {'class': 'prc'})
        price = price[0].text.strip() if price else np.nan
        
        # Scraping Rating
        rating_element = container.find_all('div', {'class': 'stars _s'})
        rating = rating_element[0].text.strip() if rating_element else np.nan
        
         # Scraping Image URL 
        image_element = container.find('div', {'class': 'img-c'}).find('img')
        
        if image_element:
            image_src = image_element.get('data-src')  
            
            if image_src:
                # Clean the name that will help us to creat a valid name
                cleaned_name = clean_filename(name)
                
                # For CSV ...
                image_url = f'image_{cleaned_name}.jpg'
                
                # Write image URL to CSV
                writer_obj.writerow([name, price, rating, image_url])
                
                # We can Download the image using the obtained URL (using requests library)
                try:
                    response = requests.get(image_src)
                    with open(f'images/image_{cleaned_name}.jpg', 'wb') as img_file:
                        img_file.write(response.content)
                except Exception as e:
                    print(f"Error downloading image: {e}")
        else:
            image_url = np.nan
        
        writer_obj.writerow([name, price, rating, image_url])
    
print('Done writing to CSV file')

Done writing to CSV file.


In [13]:
df=pd.read_csv('JUMIA_DATA.csv')

In [14]:
df.isna().sum()

Name         0
Price        0
Rating       8
Image_URL    0
dtype: int64

In [15]:
df

Unnamed: 0,Name,Price,Rating,Image_URL
0,Samsung Galaxy A24 - 6.5 inches 128GB/6GB RAM ...,"جنيه 7,899.00",4.3 out of 5,image_Samsung Galaxy A24 - 6.5 inches 128GB_6G...
1,Samsung Galaxy A24 - 6.5 inches 128GB/6GB RAM ...,"جنيه 7,899.00",4.3 out of 5,image_Samsung Galaxy A24 - 6.5 inches 128GB_6G...
2,Samsung Galaxy A24 - 6.5 inches 128GB/6GB RAM ...,"جنيه 7,899.00",4.4 out of 5,image_Samsung Galaxy A24 - 6.5 inches 128GB_6G...
3,Samsung Galaxy A24 - 6.5 inches 128GB/6GB RAM ...,"جنيه 7,899.00",4.4 out of 5,image_Samsung Galaxy A24 - 6.5 inches 128GB_6G...
4,Samsung A24 6.5 Inch 128GB/8GB Dual SIM Mobile...,"جنيه 8,499.00",4.9 out of 5,image_Samsung A24 6.5 Inch 128GB_8GB Dual SIM ...
...,...,...,...,...
75,Nokia نوكيا C1 الاصدار الثاني بشريحتين اتصال، ...,"جنيه 1,999.00",4 out of 5,image_Nokia C1 16 1.jpg
76,Nokia نوكيا C1 الاصدار الثاني بشريحتين اتصال، ...,"جنيه 1,999.00",5 out of 5,image_Nokia C1 16 1.jpg
77,Nokia نوكيا C1 الاصدار الثاني بشريحتين اتصال، ...,"جنيه 1,999.00",5 out of 5,image_Nokia C1 16 1.jpg
78,Nokia Nokia نوكيا 210 - موبايل 2.4 بوصة ثنائي ...,"جنيه 1,199.00",3.3 out of 5,image_Nokia Nokia 210 - 2.4 -.jpg


In [16]:
df['Image_URL'][0]

'image_Samsung Galaxy A24 - 6.5 inches 128GB_6GB RAM - Dual SIM 4G Mobile Phone Silver.jpg'

### Now we will Scrape all pages Together :-
- We have only `Three Pages ` in jumia for mobile data .

In [17]:
if not os.path.exists('images_for_all'):
    os.makedirs('images_for_all')
    
with open('JUMIA_DATA_all.csv', 'w', encoding='utf-8', newline='') as file:
    writer_obj = csv.writer(file)
    writer_obj.writerow(['Name', 'Price', 'Rating', 'Image_URL'])
    
    for i in range(1,4):
        url = f'https://www.jumia.com.eg/ar/catalog/?q=mobile&page={i}#catalog-listing'
        request = requests.get(url)
        soup = bs(request.content,'html.parser')
        containers = soup.find_all('a', {'class':'core'})
        
        for container in containers:
            # Scrapping Name
            name = container.find_all('h3', {'class': 'name'})
            name = name[0].text.strip() if name else np.nan

            # Scrapping Price
            price = container.find_all('div', {'class': 'prc'})
            price = price[0].text.strip() if price else np.nan

            # Scrapping Rating
            rating_element = container.find_all('div', {'class': 'stars _s'})
            rating = rating_element[0].text.strip() if rating_element else np.nan
            
             # Scraping Image URL
            image_element = container.find('div', {'class': 'img-c'}).find('img')
        
            if image_element:
                image_src = image_element.get('data-src')
            
                if image_src:
                    cleaned_name = clean_filename(name)
                
                    image_url = f'image_{cleaned_name}.jpg'
                
                    # Write image URL to CSV
                    writer_obj.writerow([name, price, rating, image_url])
                
                    try:
                        response = requests.get(image_src)
                        with open(f'images_for_all/image_{cleaned_name}.jpg', 'wb') as img_file:
                            img_file.write(response.content)
                    except Exception as e:
                        print(f"Error downloading image: {e}")
            else:
                image_url = np.nan
        
            writer_obj.writerow([name, price, rating, image_url])
    
print('Done writing to CSV file')

Done writing to CSV file.


In [18]:
df=pd.read_csv('JUMIA_DATA_all.csv')

df.isna().sum()

Name          0
Price         0
Rating       84
Image_URL     0
dtype: int64

In [19]:
df

Unnamed: 0,Name,Price,Rating,Image_URL
0,Samsung Galaxy A24 - 6.5 inches 128GB/6GB RAM ...,"جنيه 7,899.00",4.3 out of 5,image_Samsung Galaxy A24 - 6.5 inches 128GB_6G...
1,Samsung Galaxy A24 - 6.5 inches 128GB/6GB RAM ...,"جنيه 7,899.00",4.3 out of 5,image_Samsung Galaxy A24 - 6.5 inches 128GB_6G...
2,Samsung Galaxy A24 - 6.5 inches 128GB/6GB RAM ...,"جنيه 7,899.00",4.4 out of 5,image_Samsung Galaxy A24 - 6.5 inches 128GB_6G...
3,Samsung Galaxy A24 - 6.5 inches 128GB/6GB RAM ...,"جنيه 7,899.00",4.4 out of 5,image_Samsung Galaxy A24 - 6.5 inches 128GB_6G...
4,Samsung A24 6.5 Inch 128GB/8GB Dual SIM Mobile...,"جنيه 8,499.00",4.9 out of 5,image_Samsung A24 6.5 Inch 128GB_8GB Dual SIM ...
...,...,...,...,...
235,Huawei Nova Y90 - 6.7-inch 8GB/128GB Dual Sim ...,"جنيه 8,088.99",4 out of 5,image_Huawei Nova Y90 - 6.7-inch 8GB_128GB Dua...
236,Samsung A54 - 6.4 Inch - 8GB/256GB RAM - 5G - ...,"جنيه 16,900.00",,image_Samsung A54 - 6.4 Inch - 8GB_256GB RAM -...
237,Samsung A54 - 6.4 Inch - 8GB/256GB RAM - 5G - ...,"جنيه 16,900.00",,image_Samsung A54 - 6.4 Inch - 8GB_256GB RAM -...
238,Samsung سامسونج جالاكسي A23 رام 4 جيجا، 128 جي...,"جنيه 7,999.00",5 out of 5,image_Samsung A23 4 128 -.jpg
