## Import

In [11]:
import json
import requests
from bs4 import BeautifulSoup
import os
from selenium import webdriver
from time import sleep
from PIL import Image
from glob import glob
import shutil

## Scraper

In [12]:
def load_html(url:str, static=True) :
    """
    Load a BeautifulSoup instance over an HTML page.
    If static=False, we use Selenium to load the page.
    If static=True, we use requests to load the page.
    Input :
    url: str, url of the page
    static: bool
    """

    if static :
        response = requests.get(url)
        return BeautifulSoup(response.content, "html.parser")
    else :
        driver = webdriver.Firefox()
        driver.get(url)
        sleep(5)
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")
        driver.quit()
        return soup

In [13]:
def extract_urls_from_search(search_url) :
    urls=[]
    soup = load_html(search_url, False)
    for tag in soup.find_all("a", href=True) :
        if "from_search" in tag["href"] : urls.append(tag["href"])
    return urls

In [14]:
def extract_most_read(slideshow: dict) -> list :
    """
    Construct a one-hot vector of length nb_slides. Put a 1 at the index of slides most read.
    Inputs :
    slideshow: dict, metadata of the slideshow (extracted with BeautifulSoup)
    Ouputs :
    isMostRead: list, one-hot vector of nb_slides 
    """
    nb_slides = slideshow["totalSlides"]
    indexMostRead = [list(slide.items())[0][1] for slide in slideshow["topReadSlides"]]
    isMostRead = [1 if k in indexMostRead else 0 for k in range(nb_slides)]
    return isMostRead

In [15]:
def scrap(url:str) :
    """
    The scrap function slides from SlideShare.net and then creates a folder which contains all slides in jpg format.
    Input :
    url : a str-like, contains the url of the download page of the slides
    """
    #html request
    soup = load_html(url)

    #The script tag with id "__NETX_DATA__" contains with a json format some meta informations.
    data = json.loads(soup.select_one("#__NEXT_DATA__").text)
    slideshow =  data["props"]["pageProps"]["slideshow"]
    meta_data = { "id": slideshow["id"],
                  "presentation_url": slideshow["canonicalUrl"],
                  "title": slideshow["title"],
                  "author": slideshow["username"],
                  "date": slideshow["createdAt"][:10], #AAAA-MM-JJ
                  "len": slideshow["totalSlides"],
                  "description": slideshow["description"],
                  "lang": slideshow["language"],
                  "dim": slideshow["slideDimensions"],
                  "like": slideshow["likes"],
                  "view": slideshow["views"],
                  "transcript": slideshow["transcript"],
                  "mostRead": extract_most_read(slideshow),
                  }
    #img_urls
    slides = slideshow["slides"]
    sizes = slides["imageSizes"][1]
    img_url = f"{slides['host']}/{slides['imageLocation']}/{sizes['quality']}/{slideshow['title']}-" + "{}" + f"-{sizes['width']}.{sizes['format']}"

    #folder creation
    folder_name = str(slideshow["id"])
                                      
    if os.path.exists("data/"+folder_name) : shutil.rmtree("data/"+folder_name)
    os.mkdir("data/"+folder_name)
    
    #For each image
    for i in range(slideshow["totalSlides"]):
        response = requests.get(img_url.format(i+1))
        sleep(5)
        with open(f"data/{folder_name}/slide_{i}.jpg", "wb") as f :
            f.write(response.content)
            
    with open(f"data/{folder_name}/{folder_name.lower()}.json", "w") as f :
        json.dump(meta_data,f)

## Test Scraper

In [17]:
url = "https://fr.slideshare.net/slideshow/repas-franais-46412448/46412448?from_search=0"
soup = load_html(url)
json.loads(soup.select_one("#__NEXT_DATA__").text)

{'props': {'pageProps': {'name': 'slideshow',
   'edgeTestAssignments': [{'name': 'example', 'variant': 'B'},
    {'name': 'gallery_view', 'variant': 'A'},
    {'name': 'mobile_cleanup', 'variant': 'B'},
    {'name': 'nextjs_profile', 'variant': 'A'},
    {'name': 'reading_modes', 'variant': 'A'},
    {'name': 'recs_placement', 'variant': 'A'},
    {'name': 'recs_placement_v2', 'variant': 'B'}],
   'layout': {'currentUser': None,
    'fullPath': 'https://fr.slideshare.net/slideshow/repas-franais-46412448/46412448?from_search=0',
    'osanoId': '079b27eb-bb3f-48dd-9bd9-3feb8aec3c38',
    'featureFlags': [{'name': 'disable_facebook', 'enabled': True},
     {'name': 'document_interstitials_flag', 'enabled': True},
     {'name': 'recommendation_impression_tracking', 'enabled': True},
     {'name': 'search_results_tracking', 'enabled': True},
     {'name': 'view_restriction_without_subscription_after_five',
      'enabled': True},
     {'name': 'disable_lazy_hydration', 'enabled': False}]},

## Scraping

In [16]:
#page_i is the url format of the content search page in French on slideshare
#the first page is page_i.format(1), the second page is page_2.format(2) etc...
#nb pages is the number of the last content search page that we scrap.
page_i = "https://fr.slideshare.net/search?searchfrom=header&q=fran%C3%A7ais&page={}"
nb_pages = 2

In [17]:
#Execution of the scraper
for i in range(1,nb_pages+1) :
    urls = extract_urls_from_search(page_i.format(i))
    for j,url in enumerate(urls) :
            sleep(2)
            scrap(url)

ConnectTimeout: HTTPSConnectionPool(host='image.slidesharecdn.com', port=443): Max retries exceeded with url: /27092023elabebfmtvlesfranaislessenceetlepouvoirdachatlesfranaisetleharclementscolaire-230927145320-defc4989/85/%20Les%20Fran%C3%A7ais,%20l%E2%80%99essence%20et%20le%20pouvoir%20d%E2%80%99achat%20/%20Les%20Fran%C3%A7ais%20et%20le%20harc%C3%A8lement%20scolaire-37-638.jpg (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001CE26974450>, 'Connection to image.slidesharecdn.com timed out. (connect timeout=None)'))