"""## Static websites
1. Get robot.txt and create exclusion protocol for other calls [wrapper for requests.get()]
2. Data model with Pydantic
4. Get the website structure for directories
5. Follow pagination
6. Catalogue VS. Product page
7. Save data to CSV
"""

In [2]:
import requests
from bs4 import BeautifulSoup

# Setup session and user-agent
headers = {  # Need to be replaced with your details
    'User-Agent': 'Webscraping Capacity Building 1.0 / ANSD',
    'From': 'demba.diack@ansd.sn'  
}

s = requests.Session()
s.headers.update(headers)

soumari_root = "https://www.soumari.com/"

In [3]:
######################################## 02 Getting robots.txt file
robots_soumari = s.get(soumari_root + "robots.txt")
print(robots_soumari.text)

User-agent: *



In [4]:
######################################## 03### Data Model
"""1. Define what data needs to be captured and its type
2. Try to keep consistency across different websites to facilitate post-processing

We use Pydantic [https://docs.pydantic.dev/](https://docs.pydantic.dev/)
"""
from datetime import date
from pydantic import BaseModel
#Basic product class, to be reused over different websites
class Product(BaseModel):
    link: str
    source: str
    category: str = None
    subcategory: str = None
    subsubcategory: str = None
    name: str = None
    brand: str = None
    uid: str = None
    price: float
    regular_price: float = None
    currency: str
    in_stock: str = None
    description: str = None
    date: str = date.today().strftime("%Y-%m-%d")


#Specific product class for a source website
class Soumari(Product):
    source: str = "Soumari"
    currency: str = "CFA"

In [5]:
######################################## 04 getting the list of menus
"""You could do more validations for data fields in Pydantic, but it may impact the speed and stability of webscraping.
 We suggest to perform most of the cleaning afterwards"""
### Website structure
homepage = s.get(soumari_root)
page = BeautifulSoup(homepage.text, 'html.parser')
ma_var = page.find(id="menu-mega-menu").find_all("a")
my_list = [] # et non my_list = {}
for item in ma_var:
    txt = "link:"+item.get("href")
    names ="name:"+item.get_text()
    #print (txt+"..."+names)
    my_list.append({"link":item.get("href"),"name":item.get_text()})

#len(my_list)
#my_list


links = [{"link": item.get("href"), "name": item.get_text()} for item in page.find(id="menu-mega-menu").find_all("a")]
# Have a deeper look into list comprehension if the above sounds not intuitive. It is simple than it looks!


links

[{'link': 'https://www.soumari.com/categorie-produit/hello-ramadan/',
  'name': ' HELLO RAMADAN'},
 {'link': 'https://www.soumari.com/shop/', 'name': ' TENDANCES'},
 {'link': 'https://www.soumari.com/categorie-produit/cosmetique-bio/',
  'name': ' COSMETIQUE & BIO'},
 {'link': 'https://www.soumari.com/categorie-produit/cosmetique-bio/bio/',
  'name': 'BIO'},
 {'link': 'https://www.soumari.com/categorie-produit/cosmetique-bio/cosmetique/',
  'name': 'COSMETIQUE'},
 {'link': 'https://www.soumari.com/categorie-produit/meuble-deco/',
  'name': ' MEUBLE & DECO'},
 {'link': 'https://www.soumari.com/categorie-produit/smartphones-haut-de-gamme/',
  'name': ' TÉLÉPHONES'},
 {'link': 'https://www.soumari.com/categorie-produit/smartphones-haut-de-gamme/tecno/',
  'name': 'TECNO'},
 {'link': 'https://www.soumari.com/categorie-produit/smartphones-haut-de-gamme/samsung/',
  'name': 'SAMSUNG'},
 {'link': 'https://www.soumari.com/categorie-produit/smartphones-haut-de-gamme/huawei/',
  'name': 'HUAWEI'

In [15]:
######################################## 05  We can perform some cleaning, removing invalid links and the general "shop" one with all products.  
#It is an opinionated choice, you could also just use the "shop" page and get each individual product.
clean_links = [
    {"link": item.get("href"), "category": item.get_text()}
    for item in page.find(id="menu-mega-menu").find_all("a")
    if (
        item.get("href").startswith("https://") and  # this is specific for this case, you may also find relative links
        not item.get("href").endswith("/shop/")  # also this one is specific for this case
        )
    ]

In [16]:
######################################## 06
# Order the links putting the deepest categories first. This is an opinionated choice
clean_links.sort(key=lambda x: x.get("link", "").count('/'), reverse=True)

#clean_links

In [17]:
######################################## 07  Check how the information is presented in some catalogue page... no_need not_required
#unfortunately it seems not consistent (and it changed during last week!)
electric_insect = s.get("https://www.soumari.com/categorie-produit/electronique/electric-insect/")
electric_insect = BeautifulSoup(electric_insect.text, 'html.parser')
#electric_insect

In [18]:
######################################## 08 no_need not_required
# I had to change category, as the website changed
category_list = [item.get_text() for item in electric_insect.find("ul", {"class": "breadcrumbs"}).find_all("span", {"itemprop": "name"})]
category_list  # need to drop the first two items


AttributeError: 'NoneType' object has no attribute 'find_all'

In [19]:
######################################## 09 no_need not_required
tecno = s.get("https://www.soumari.com/categorie-produit/smartphones-haut-de-gamme/tecno/")
tecno = BeautifulSoup(tecno.text, 'html.parser')

not_working     = [item.get_text() for item in tecno.find("ul", {"class": "breadcrumbs"}).find_all("span", {"itemprop": "name"})]



AttributeError: 'NoneType' object has no attribute 'find_all'

In [21]:
######################################## 10### Pagination no_need not_required
#Probably we are better off using only one category, the deepest one we can find. Let's see how to implement this.
# This is the second page...uncomment to run and show that there is no "Next" there
#tecno = requests.get("https://www.soumari.com/categorie-produit/smartphones-haut-de-gamme/tecno/page/2/")
tecno = requests.get("https://www.soumari.com/categorie-produit/smartphones-haut-de-gamme/samsung/page/2/")
tecno = BeautifulSoup(tecno.text, 'html.parser')

#tecno

In [22]:
######################################## 11 no_need not_required
next_page = tecno.find("nav", {"class": "woocommerce-pagination"}).find("a", {"class": "next"})

# If there is no next page, the result from above is None

if next_page is not None:
    next_page = next_page.get("href")

In [23]:
######################################## 12### Catalog and product pages
tv_catalogue_link = "https://www.soumari.com/categorie-produit/television/"

tv_catalogue = s.get(tv_catalogue_link)
tv_catalogue = BeautifulSoup(tv_catalogue.text, 'html.parser')

trouveons les categs

In [26]:
######################################## 13  Let's find the data in the catalogue page
#
tv_catalogue.find("ul", {"class": "products"}).find_all("div", {"class": "mf-product-details"})[1].find("bdi").get_text()

products = [
    {"name": item.find("h2").get_text(), "price": item.find("bdi").get_text()}
    for item in
    tv_catalogue.find("ul", {"class": "products"}).find_all("div", {"class": "mf-product-details"})
    ]

#len(products)
#products

In [None]:
######################################## 14
#autre version plus comprehensible on fait click droit sur un produit
'''on navigue jusqu'au niveau le plus profind'''

products_v2 = []
all_dat_in_catalogue= tv_catalogue.find("ul", {"class": "products"}).find_all("div", {"class": "mf-product-details"})
for item in all_dat_in_catalogue:
    one_line = {'name':item.find("h2").get_text(),'price':item.find("bdi").get_text(),  }
    products_v2.append(one_line)



In [28]:

######################################## 15 Let's make it a function
# In this case is easier because it seems there is no decimal in our prices
# It may be very different in other countries
# We may want to add some chek in production to avoid errors
# (for instance, if there are empty strings)

def filter_digits(rawdata:str) -> int:
    """Function to only extract digits from a string
    Parameters:
        rawdata (str): String from which extract digits
    
    Returns:
        String with only digits
    """
    return int("".join(filter(str.isdigit, rawdata)))
    
#    This product is discounted, we may want to get the regular price too
products[15]


{'name': 'Televiseur STAR X 22LJ52', 'price': '35.000\xa0CFA'}

In [34]:
#type(products[15])
val_price = filter_digits(products[15]['price'])
val_price

35000

make fct

In [35]:
######################################## 16 Let's make also this parsing a function too, including also the product link
def parse_catalog_items(page: str) -> list:
    """Function to parse an item from the catalogue page
    Parameters:
        page (str): HTML catalogue page with product information

    Returns:
        parsed_products (list): list of dicts with parsed product information
    """
    parsed_products = []
    products = page.find("ul", {"class": "products"}).find_all("div", {"class": "mf-product-details"})
    for product in products:
        parsed_product = {}
        parsed_product["name"] = product.find("h2").get_text()
        parsed_product["link"] = product.find("h2").find("a").get("href")
        if product.find("ins") is not None:
            parsed_product["price"] = filter_digits(product.find("ins").find("bdi").get_text())
            parsed_product["regular_price"] = filter_digits(product.find("del").find("bdi").get_text())
        else:
            parsed_product["price"] = filter_digits(product.find("bdi").get_text())
        parsed_products.append(parsed_product)

    return parsed_products

In [37]:
######################################## 17
clean_products = parse_catalog_items(tv_catalogue)
#clean_products

In [38]:
######################################## 18Now we define a function that brings together parsing and follow the pagination, using recursion.
import time


def scrape_category(link: str, category: str, Item: BaseModel, s: requests.session(), delay: float = 1) -> list:
    """Function to scrape a category following pagination.
    Parameters:
        link (str): starting link for a category
        category (str): category name
        Item (BaseModel): class of the data object for the specific source
        s (requests.Session()): Requests session with User-Agent properly set
        delay (float): delay in seconds between calls to prevent overloading the source

    Returns:
        list of product with all information
    """
    time.sleep(delay)
    page = s.get(link)
    page = BeautifulSoup(page.text, 'html.parser')
    results = parse_catalog_items(page)
    results = [Item(**res, category=category) for res in results]
    
    # Follow pagination if exists
    try:
        next_page = page.find("nav", {"class": "woocommerce-pagination"}).find("a", {"class": "next"})
        if next_page is not None:
            next_page = next_page.get("href")
            next_results = scrape_category(link=next_page, category=category, Item=Item, s=s, delay=delay)
            results.extend(next_results)
    except AttributeError:
        pass
            
    return results

In [39]:

######################################## 19  A litte example of how scraping from catalogue may look like, just for a couple of categories
#on scrape toutes les données ici
import random  # used to get a random sample of categories

data_list = []

for cat in random.sample(clean_links, 3):
    print(cat)
    data_list.extend(scrape_category(cat["link"], cat["category"], Soumari, s, 1))
    

{'link': 'https://www.soumari.com/categorie-produit/smartphones-haut-de-gamme/infinix/', 'category': 'INFINIX'}
{'link': 'https://www.soumari.com/categorie-produit/smartphones-haut-de-gamme/itel/', 'category': 'ITEL'}
{'link': 'https://www.soumari.com/categorie-produit/electromenager/cuisinieres/', 'category': 'CUISINIÈRES'}


In [41]:
######################################## 20 no_need not_required
# Double check the workings for a category with pagination
results = scrape_category("https://www.soumari.com/categorie-produit/cosmetique-bio/cosmetique/", "Cosmetique", Soumari, s, 1)

len(results)

73

In [42]:
######################################## 21 no_need not_required
example_page = s.get("https://www.soumari.com/produit/televiseur-samsung-85-pouces-qa85qn800atxzt-8k/")
example_page = BeautifulSoup(example_page.text, 'html.parser')

example_page.find("h1", {"class": "product_title"}).get_text()

example_page.find("p", {"class": "price"}).find("bdi").get_text()

filter_digits(example_page.find("p", {"class": "price"}).find("bdi").get_text())


example_page.find("ul", {"class": "entry-meta"}).find("li", {"class": "meta-brand"}).find("a").get_text()


example_page.find("p", {"class": "stock"}).get_text().split(":")[-1].strip() 

example_page.find("span", {"class": "posted_in"}).find_all("a")

example_page.find("span", {"class": "posted_in"}).find_all("a")

[<a href="https://www.soumari.com/categorie-produit/television/samsung-television/" rel="tag">Samsung</a>,
 <a href="https://www.soumari.com/categorie-produit/television/" rel="tag">TÉLÉVISEURS</a>]

In [45]:

######################################## 22  We put all those information together in a function
def scrape_individual_pages(link: str, Item: BaseModel, s: requests.session(), delay: float = 1) -> list:
    """Function to scrape each individual page in a directory following pagination.
    Parameters:
        link (str): starting link for the overall directory catalogue
        Item (BaseModel): class of the data object for the specific source
        s (requests.Session()): Requests session with User-Agent properly set
        delay (float): delay in seconds between calls to prevent overloading the source

    Returns:
        list of product with all information
    """
    time.sleep(delay)
    page = s.get(link)
    page = BeautifulSoup(page.text, 'html.parser')
    links = [item.get("link") for item in parse_catalog_items(page)]  # We can reuse the previous function, but we only keep the link
    results = []
    
    for l in links:  # for testing purposes, only get the first two products in each page. Remove the list selection in production
    #for l in links[:2]:  #getting only the first 2 links(products)
        time.sleep(delay)
        product = s.get(l)
        product = BeautifulSoup(product.text, 'html.parser')
        parsed_product = {}
        parsed_product["name"] = product.find("h1", {"class": "product_title"}).get_text()
        parsed_product["link"] = l
        if product.find("p", {"class": "price"}).find("ins") is not None:
            parsed_product["price"] = filter_digits(product.find("p", {"class": "price"}).find("ins").find("bdi").get_text())
            parsed_product["regular_price"] = filter_digits(product.find("p", {"class": "price"}).find("del").find("bdi").get_text())
        else:
            parsed_product["price"] = filter_digits(product.find("p", {"class": "price"}).find("bdi").get_text())
        if product.find("ul", {"class": "entry-meta"}).find("li", {"class": "meta-brand"}) is not None:
            parsed_product["brand"] = product.find("ul", {"class": "entry-meta"}).find("li", {"class": "meta-brand"}).find("a").get_text()
        if product.find("p", {"class": "stock"}) is not None:
            parsed_product["in_stock"] = product.find("p", {"class": "stock"}).get_text().split(":")[-1].strip()  # This may be quite fragile...
        if product.find("span", {"class": "posted_in"}) is not None:
            category_tags = product.find("span", {"class": "posted_in"}).find_all("a")
            parsed_product["category"] = category_tags.pop().get_text()
            # The number of category tag may be variable...this function account up to 3 tags
            if len(category_tags) > 0:
                parsed_product["subcategory"] = category_tags.pop().get_text()
            if len(category_tags) > 0:
                parsed_product["subsubcategory"] = category_tags.pop().get_text()
        results.append(Item(**parsed_product))

    # Follow pagination if exists
    try:
        next_page = page.find("nav", {"class": "woocommerce-pagination"}).find("a", {"class": "next"})
        if next_page is not None:
            next_page = next_page.get("href")
            next_results = scrape_individual_pages(link=next_page, Item=Item, s=s, delay=delay)
            results.extend(next_results)
    except AttributeError:
        pass
            
    return results
    
individual_results = scrape_individual_pages("https://www.soumari.com/categorie-produit/electromenager/", Soumari, s, 1)

len(individual_results)   

Maintenant on visualise les données sous Pandas

In [46]:
######################################## 23### Save data as CSV
#It is a good practice to automatically set today's date in the file name, to avoid overwriting past data by mistake

import pandas as pd

soumari_catalog_df = pd.DataFrame([prod.dict(exclude_none=True) for prod in data_list])
soumari_catalog_df.to_csv(f"soumari_catalog_{date.today().strftime('%Y-%m-%d')}.csv", index=False)

soumari_individual_df = pd.DataFrame([prod.dict(exclude_none=True) for prod in individual_results])
soumari_individual_df.to_csv("soumari_individual_{}.csv".format(date.today().strftime("%Y-%m-%d")), index=False,encoding='utf-8')

# F-strings and .format() can be used for creating dynamic names, it's a matter of personal preferences.

soumari_individual_df.head()

Unnamed: 0,link,source,category,subcategory,subsubcategory,name,brand,price,currency,in_stock,date,regular_price
0,https://www.soumari.com/produit/congelateur-sm...,Soumari,Réfrigérateurs et Congélateurs,ELECTROMÉNAGER,CONGÉLATEUR,Congelateur SMART TECHNOLOGY Horizontal Vitree...,Smart Technology,181500.0,CFA,En stock,2023-03-28,
1,https://www.soumari.com/produit/congelateur-sm...,Soumari,Réfrigérateurs et Congélateurs,ELECTROMÉNAGER,CONGÉLATEUR,Congelateur SMART TECHNOLOGY Horizontal 197 Li...,Smart Technology,132000.0,CFA,En stock,2023-03-28,
2,https://www.soumari.com/produit/ventilateur-in...,Soumari,VENTILATEUR,ELECTROMÉNAGER,AIR & CLIMATISATION,Ventilateur Industriel SMART TECHNOLOGY 30 Sur...,Smart Technology,50200.0,CFA,En stock,2023-03-28,
3,https://www.soumari.com/produit/tecno-spark-10...,Soumari,TÉLÉPHONES,TECNO,,Tecno SPARK 10 Pro – Mémoire 128 Go – RAM 8 Go...,Tecno,90000.0,CFA,En stock,2023-03-28,
4,https://www.soumari.com/produit/boitier-wifi-t...,Soumari,ROUTEUR & MODEM,INFORMATIQUE,,Boitier WiFi TP LINK DECO X60 (Pack de 3 Route...,tp-link,324000.0,CFA,En stock,2023-03-28,


In [6]:
soumari_individual_df.to_csv('all_data_soumari.csv',encoding='utf-8') #relative position
#soumari_individual_df.to_csv('C:/Users/abc/Desktop/file_name.csv')

NameError: name 'soumari_individual_df' is not defined