### 1. Import needed libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import csv
import lxml
from os.path import exists
import os
from pandas import *

### 2. File-creation functions

In [2]:
def create_products_file():
    """Create (or truncate) ``Products.csv`` and write its header row."""
    header = [
        "Product", "Price", "General category", "Sub category", "Category",
        "Brand", "Specifications", "Rating", "Reviews", "Link",
        "Seller", "Seller Score", "Seller link",
    ]
    with open("Products.csv", "w", newline='', encoding="utf-8") as products_file:
        csv.writer(products_file).writerow(header)

In [3]:
def create_done_file():
    """Create (or truncate) ``Done.csv`` with its single-column header."""
    with open("Done.csv", "w", newline='', encoding="utf-8") as done_file:
        csv.writer(done_file).writerow(['Sub category'])
        


In [4]:
def create_sellers_file():
    """Create (or truncate) ``Sellers.csv`` and write its header row."""
    header = ['Seller', 'Score', 'Profile link']
    with open("Sellers.csv", "w", newline='', encoding="utf-8") as sellers_file:
        csv.writer(sellers_file).writerow(header)

### 3. Check that the files exist

In [5]:
def check_files_exist():
    """Create any of the three bookkeeping CSV files that are missing.

    ``Products.csv``, ``Done.csv`` and ``Sellers.csv`` are each checked
    independently in the current working directory; an existing file is
    never touched, so data from a previous run is preserved.
    """
    if not os.path.exists('Products.csv'):
        create_products_file()

    if not os.path.exists('Done.csv'):
        create_done_file()

    # Each file is tested on its own: keying Sellers.csv on the Done.csv check
    # would truncate an existing Sellers.csv, or leave a missing one uncreated.
    if not os.path.exists('Sellers.csv'):
        create_sellers_file()

### 4. Write to files

In [6]:
def append_products_file(category_name):
    """Append every row of a sub-category CSV to ``Products.csv``, then delete it.

    Args:
        category_name: Path of the sub-category CSV file (including the
            ``.csv`` extension) whose rows should be merged.

    Raises:
        OSError: If the sub-category file cannot be read or removed, or
            ``Products.csv`` cannot be opened for appending.
    """
    # The sub-category files are written as UTF-8, so they must be read back
    # as UTF-8 rather than with the platform's default encoding.
    with open(category_name, newline='', encoding="utf-8") as category_file:
        category_rows = list(csv.reader(category_file))

    with open('Products.csv', 'a', newline='', encoding="utf-8") as products_file:
        csv.writer(products_file).writerows(category_rows)

    # Both files are closed here; the source is removed only after a
    # successful append.
    os.remove(category_name)

In [7]:
def create_sub_category_file(sub_category):
    """Create (or truncate) an empty ``<sub_category>.csv`` file.

    No header row is written; opening the file in write mode only leaves it
    empty.
    """
    with open(sub_category + ".csv", "w", newline='', encoding="utf-8"):
        pass

In [8]:
def write_to_done_file(sub_category):
    """Record a finished sub-category and merge its rows into ``Products.csv``.

    Args:
        sub_category: File name of the sub-category CSV (with ``.csv``); it is
            appended to ``Done.csv`` verbatim and then handed to
            ``append_products_file``.
    """
    with open("Done.csv", 'a', newline='', encoding="utf-8") as done_file:
        csv.writer(done_file).writerow([sub_category])

    display_name = sub_category.replace(".csv", "")
    print("------", display_name, "crawling is done! -----")

    append_products_file(sub_category)

In [9]:
def write_to_sellers_file(seller_info):
    """Append one seller row (an iterable of fields) to ``Sellers.csv``."""
    with open("Sellers.csv", 'a', newline='', encoding="utf-8") as sellers_file:
        csv.writer(sellers_file).writerow(seller_info)

In [10]:
def write_to_sub_category_file(file_name, new_row):
    """Append ``new_row`` to ``<file_name>.csv``.

    Args:
        file_name: Sub-category name without the ``.csv`` extension.
        new_row: Iterable of field values forming one CSV row.
    """
    target_path = file_name + ".csv"
    with open(target_path, 'a', newline='', encoding="utf-8") as category_file:
        csv.writer(category_file).writerow(new_row)

### 5. Get page

In [11]:
def get_this_page(url):
    """Download ``url`` and return its body parsed by BeautifulSoup with lxml."""
    response = requests.get(url)
    return BeautifulSoup(response.content, "lxml")

In [12]:
def page_exists(urlLink):
    """Return True unless the fetched page's text contains "No results found!"."""
    page_text = get_this_page(urlLink).text
    return "No results found!" not in page_text

In [13]:
def max_pages_exceeded(firstLinkPart, idx = 50):
    """Report whether a category listing still has results at page ``idx``.

    Args:
        firstLinkPart: Category URL without the ``?page=`` query part.
        idx: Page number to probe (default 50).

    Returns:
        The result of ``page_exists`` for the probed page URL.
    """
    print("Checking:", firstLinkPart)

    fullUrlLink = firstLinkPart + "?page=" + str(idx) + "#catalog-listing"

    # Probe once and reuse the answer: a second call would double the HTTP
    # traffic and could return a different value from the one just printed.
    exceeded = page_exists(fullUrlLink)
    print("Max exceeded:", exceeded)
    return exceeded


In [14]:
def get_next_page(current_url, current_index):
    """Build the URL of the page after ``current_index`` and check it exists.

    The ``=<index>#`` fragment of ``current_url`` is rewritten to the next
    index. Returns the new URL when ``page_exists`` accepts it, else ``False``.
    """
    old_marker = '={}#'.format(current_index)
    new_marker = '={}#'.format(current_index + 1)

    candidate_url = current_url.replace(old_marker, new_marker)
    return candidate_url if page_exists(candidate_url) else False
    

### 6. Crawling functions

In [15]:
def crawl_product_page(url_link, crawled_sellers):
    """Scrape seller details and specifications from a single product page.

    Args:
        url_link: URL of the product page.
        crawled_sellers: List of seller names already recorded. A newly seen
            seller is appended to it and written to ``Sellers.csv``.

    Returns:
        A tuple ``(seller_name, seller_score, seller_link,
        product_specifications)``. Any value that cannot be found on the page
        is the string ``"NaN"``.
    """
    seller_name = seller_score = seller_link = product_specifications = "NaN"

    soup = get_this_page(url_link)

    cont = soup.find("main", {"class": "-pvs"})
    if cont is None:
        return seller_name, seller_score, seller_link, product_specifications

    specs = cont.find("div", {"class": "row -pas"})
    if specs is not None:
        product_specifications = specs.get_text()

    # Test the lookup result itself: a "col4" substring in the markup does not
    # guarantee that a matching <div> exists.
    sellerBox = cont.find("div", {"class": "col4"})
    if sellerBox is None:
        return seller_name, seller_score, seller_link, product_specifications

    name = sellerBox.find("p", {"class": "-m -pbs"})
    if name is None:
        # No seller name on the page: return the specs and leave the rest "NaN".
        return seller_name, seller_score, seller_link, product_specifications
    seller_name = name.get_text()

    # Jumia itself gets a fixed score and is not written to Sellers.csv.
    if seller_name == 'Jumia':
        seller_score = '100%'
        return seller_name, seller_score, seller_link, product_specifications

    score = sellerBox.find("bdo", {"class": "-m -prxs"})
    if score is not None:
        seller_score = score.get_text()

    link = sellerBox.find("a", {"class": "-pas -df -i-ctr -upp"})
    if link is not None:
        seller_link = "https://www.jumia.com.eg" + str(link.get('href'))

    if seller_name not in crawled_sellers:
        crawled_sellers.append(seller_name)
        write_to_sellers_file([seller_name, seller_score, seller_link])

    return seller_name, seller_score, seller_link, product_specifications

In [16]:
def crawl_page(category_name, url_link, general_category_name, crawled_sellers):

    print("Now crawling:", url_link)
    
    soup = get_this_page(url_link)
    cont = soup.find("div", {"class": "-paxs row _no-g _4cl-3cm-shs"})

    if "a class=\"core\"" not in str(cont):
        return

    products = cont.find_all("a", {"class": "core"})

    for product in products:
        name = product.find("h3", {"class": "name"}).get_text()

        if name:
            price = product.find("div", {"class": "prc"}).get_text()
            link = "https://www.jumia.com.eg" + str(product.get("href"))
            
            seller_name, seller_score, seller_link, specifications = crawl_product_page(link, crawled_sellers)
            
            if "Brand:" in str(product):
                brand = str(product.get("data-brand"))
            else:
                brand = "NaN"
                
            if "stars _s" in str(product):
                rating = product.find("div", {"class": "stars _s"}).get_text()
                reviews = product.find("div", {"class": "rev"}).get_text()
                rating = rating.replace(" out of 5", "")
                reviews = reviews.replace(rating, "").replace("(", "").replace(")", "").replace(" out of ", "")
            else:
                rating = "NaN"
                reviews = "NaN"

            full_category_names = str(product.get("data-category"))
            full_category_names = full_category_names.split('/')
            
            if len(full_category_names) >= 2:
                sub_category_name = full_category_names[-2]
                final_category = full_category_names[-1]
            else:
                sub_category_name = category_name
                final_category = category_name
                
            new_product_row = [name, price, general_category_name, sub_category_name, final_category, brand, specifications, rating, reviews, link,  seller_name, seller_score, seller_link]
            write_to_sub_category_file(category_name, new_product_row)

In [17]:
def crawl_category(category_name, url_link, general_category_name, crawled_categories, crawled_sellers):
    """Crawl every listing page of one sub-category and mark it as done.

    Args:
        category_name: Sub-category name (no ``.csv`` extension).
        url_link: Base URL of the sub-category listing.
        general_category_name: Top-level category stored with each product.
        crawled_categories: List of finished sub-categories. Entries written
            by ``write_to_done_file`` carry a ``.csv`` suffix; the finished
            category is appended here so it is not crawled again in this run.
        crawled_sellers: List of already-recorded seller names.
    """
    file_name = category_name + ".csv"

    # Done.csv stores "<name>.csv", so the bare name alone would never match.
    # Both spellings are accepted to stay compatible with lists holding either.
    if category_name in crawled_categories or file_name in crawled_categories:
        print("-----------------------------", category_name, "is already crawled! -----------------------------")
        return

    create_sub_category_file(category_name)

    url_link = url_link + "?page=1#catalog-listing"
    current_index = 1

    # get_next_page returns False once the following page has no results.
    while url_link:
        crawl_page(category_name, url_link, general_category_name, crawled_sellers)
        url_link = get_next_page(url_link, current_index)
        current_index += 1

    write_to_done_file(file_name)
    # Mirror the on-disk record in memory (as crawl_product_page does for
    # sellers) so a repeated link is skipped later in the same run.
    crawled_categories.append(file_name)

### 7. Break category function

In [18]:
def break_category(crawled_categories, big_category, general_category_name, crawled_sellers):
    """Split a category page into sub-categories and crawl each of them.

    A sub-category for which ``max_pages_exceeded`` is true is broken down
    recursively; any other sub-category is crawled directly with
    ``crawl_category``. Sub-categories already listed in
    ``crawled_categories`` (as ``<name>.csv``) are skipped.

    Args:
        crawled_categories: List of finished sub-category file names.
        big_category: URL of the category page to split.
        general_category_name: Top-level category stored with each product.
        crawled_sellers: List of already-recorded seller names.
    """
    soup = get_this_page(big_category)

    sub_div = soup.find("div", {"class": "col4 -me-start -pvs"})
    sub_div_html = str(sub_div)

    # Two link stylings are handled; a missing container (str(None)) matches
    # neither and yields no sub-categories.
    if "-db -pvs -phxl -hov-bg-gy05" in sub_div_html:
        temp_sub_categories = sub_div.find_all("a", {"class": "-db -pvs -phxl -hov-bg-gy05"})
    elif "-db -pvs -phm -hov-bg-gy05" in sub_div_html:
        temp_sub_categories = sub_div.find_all("a", {"class": "-db -pvs -phm -hov-bg-gy05"})
    else:
        temp_sub_categories = []

    for temp_sub_category in temp_sub_categories:
        href = temp_sub_category.get("href")
        if href is None:
            # An anchor without a target cannot be crawled; skip it instead of
            # failing on None.replace(...).
            continue

        sub_category_link = "https://www.jumia.com.eg" + str(href)
        category_name = href.replace("/seller/", "").replace("/", "")
        print(sub_category_link)

        if category_name + '.csv' in crawled_categories:
            print("-----------------------------", category_name, "is already crawled! -----------------------------")

        # NOTE(review): category_name has every "/" removed above, so the
        # "I/O Port Cards" comparison can never be true -- confirm the intent.
        elif max_pages_exceeded(sub_category_link) or (category_name == "I/O Port Cards"):
            print("------- Breaking category:", category_name)
            break_category(crawled_categories, sub_category_link, general_category_name, crawled_sellers)

        else:
            print("--------------- Category:", category_name)
            crawl_category(category_name, sub_category_link, general_category_name, crawled_categories, crawled_sellers)
            

### 8. Run program

In [19]:
# Starting point of the crawl: fetch and parse the page at the site's
# /seller/ path. `soup` is read by later cells to find the general categories.
url = "https://www.jumia.com.eg/seller/"
soup = get_this_page(url)

In [20]:
# Make sure the bookkeeping CSV files exist before loading them.
check_files_exist()

# Load the progress recorded so far.
categories_data = read_csv("Done.csv")
sellers_data = read_csv("Sellers.csv")

# Plain Python lists are what the crawling functions take as arguments.
crawled_categories = categories_data['Sub category'].tolist()
crawled_sellers = sellers_data['Seller'].tolist()

In [21]:
# Parallel lists: link and display name of each general category.
general_categories_links = []
general_categories_names = []

cont = soup.find("div", {"class": "card -fh"})
general_categories = cont.find_all("a", {"class": "-db -pvs -phxl -hov-bg-gy05"})

for category in general_categories:
    category_link = str(category.get("href"))

    # str(None) == "None": anchors without an href are ignored.
    if category_link == "None":
        continue

    general_categories_links.append("https://www.jumia.com.eg" + category_link)
    general_categories_names.append(category.get_text())

categories_count = len(general_categories_links)

In [23]:
# Sanity check: report whether the category-link class string occurs in the
# fetched page markup.
print('yes' if '-db -pvs -phxl -hov-bg-gy05' in str(soup) else 'no')

yes


In [None]:
# Crawl each general category once. A finished category's name is overwritten
# with the 'Crawled!' sentinel so that re-running this cell skips it.
for i in range(categories_count):
    if general_categories_names[i] == 'Crawled!':
        continue

    break_category(crawled_categories, general_categories_links[i], general_categories_names[i], crawled_sellers)
    general_categories_names[i] = 'Crawled!'