## Import

In [1]:
import json
import csv
import requests
from bs4 import BeautifulSoup
import os
from selenium import webdriver
from time import sleep
import shutil
from tqdm import tqdm
# Root folder of the scraped data: scrap() creates one sub-folder per slideshow id inside it
DATA_PATH = "data"

## Scraper

In [2]:
def load_html(url:str, static=True) :
    """
    Load a BeautifulSoup instance over an HTML page.
    If static=False, we use Selenium (Firefox) to load the page.
    If static=True, we use requests to load the page.
    Input :
    url: str, url of the page
    static: bool
    Output :
    BeautifulSoup object built with the "html.parser" parser
    Raises :
    requests.exceptions.HTTPError when static=True and the server answers with a 4xx/5xx status
    """

    if static :
        response = requests.get(url)
        # Fail loudly on 4xx/5xx answers instead of parsing an error page as if it were content
        response.raise_for_status()
        return BeautifulSoup(response.content, "html.parser")

    driver = webdriver.Firefox()
    try :
        driver.get(url)
        # Fixed pause before reading the page source (gives scripts time to fill the page)
        sleep(5)
        html = driver.page_source
    finally :
        # Always close the browser, even when the navigation fails
        driver.quit()
    return BeautifulSoup(html, "html.parser")

In [3]:
def extract_urls_from_search(search_url) :
    """
    Collect the links of a search-results page whose href contains "from_search".
    The page is loaded through load_html with static=False.
    Input :
    search_url: url of the search-results page
    Output :
    list of href values, in the order they are found in the page
    """
    page = load_html(search_url, False)
    anchors = page.find_all("a", href=True)
    return [anchor["href"] for anchor in anchors if "from_search" in anchor["href"]]

In [4]:
def extract_most_read(slideshow: dict) -> list :
    """
    Build a 0/1 vector of length nb_slides with a 1 at every index listed as most read.
    Several positions can be set, so the vector is multi-hot rather than strictly one-hot.
    Inputs :
    slideshow: dict, metadata of the slideshow; the keys "totalSlides" and "topReadSlides" are read
    Outputs :
    list of nb_slides integers, each 0 or 1
    """
    slide_count = slideshow["totalSlides"]

    # Each entry of "topReadSlides" is a dict; only its first value is used
    top_read = []
    for entry in slideshow["topReadSlides"] :
        top_read.append(list(entry.values())[0])

    flags = []
    for position in range(slide_count) :
        if position in top_read :
            flags.append(1)
        else :
            flags.append(0)
    return flags

In [5]:
def scrap(url:str) :
    """
    Download the slides of a SlideShare presentation together with its metadata.
    A folder named after the slideshow id is (re)created under DATA_PATH. It receives one
    slide_<k>.jpg file per slide (k starts at 0) and a JSON file holding the metadata.
    Input :
    url : a str-like, contains the url of the presentation page
    Raises :
    KeyError if the "__NEXT_DATA__" script tag or one of the expected fields is missing
    requests.exceptions.HTTPError if a slide image is answered with a 4xx/5xx status
    """
    #html request
    soup = load_html(url)

    #The script tag with id "__NEXT_DATA__" contains the metadata in JSON format.
    next_data = soup.select_one("#__NEXT_DATA__")
    if next_data is None :
        # Raised as KeyError so it is handled like any other missing metadata field
        raise KeyError("__NEXT_DATA__")
    data = json.loads(next_data.text)
    slideshow =  data["props"]["pageProps"]["slideshow"]

    # The transcript is always stored as a list, even when the page gives a single string
    transcript = slideshow["transcript"]
    if isinstance(transcript, str) : transcript = [transcript]

    meta_data = { "id": slideshow["id"],
                  "presentation_url": slideshow["canonicalUrl"],
                  "title": slideshow["title"],
                  "author": slideshow["username"],
                  "date": slideshow["createdAt"][:10], #first 10 characters, expected YYYY-MM-DD
                  "len": slideshow["totalSlides"],
                  "description": slideshow["description"],
                  "lang": slideshow["language"],
                  "dim": slideshow["slideDimensions"],
                  "like": slideshow["likes"],
                  "view": slideshow["views"],
                  "transcript": transcript,
                  "mostRead": extract_most_read(slideshow),
                  }

    #img_urls : the second entry of "imageSizes" is the one downloaded
    slides = slideshow["slides"]
    sizes = slides["imageSizes"][1]
    # Prefix and suffix are concatenated around the slide number instead of using str.format,
    # which would fail on a title containing "{" or "}"
    img_prefix = f"{slides['host']}/{slides['imageLocation']}/{sizes['quality']}/{slideshow['title']}-"
    img_suffix = f"-{sizes['width']}.{sizes['format']}"

    #folder creation (an existing folder for the same id is replaced)
    folder_name = str(slideshow["id"])
    folder_path = os.path.join(DATA_PATH, folder_name)
    if os.path.exists(folder_path) : shutil.rmtree(folder_path)
    os.makedirs(folder_path)

    #For each image : urls are numbered from 1, files from 0
    for i in range(slideshow["totalSlides"]):
        response = requests.get(f"{img_prefix}{i+1}{img_suffix}")
        # Do not save an error page under a .jpg name
        response.raise_for_status()
        sleep(5)
        with open(os.path.join(folder_path, f"slide_{i}.jpg"), "wb") as f :
            f.write(response.content)

    # The metadata is written last, after all the slides have been saved
    with open(os.path.join(folder_path, f"{folder_name.lower()}.json"), "w") as f :
        json.dump(meta_data,f)

## Scraping

### Get urls of slideshows

Set the number of pages (1 page = 18 slideshows)

In [8]:
#page_i is the url template of the SlideShare search-results pages
#(fr.slideshare.net domain, query "español", language filter "es").
#The first page is page_i.format(1), the second page is page_i.format(2), etc.
#nb_pages is the number of the last search-results page that is scraped.
page_i = "https://fr.slideshare.net/search?searchfrom=header&q=espa%C3%B1ol&language=es&page={}"
nb_pages = 30
#urls collects the slideshow links found on those pages
urls = []

Execute this cell to scrape the urls of the slideshows

In [9]:
#Run the search scraper on every page, from page 1 to page nb_pages included
for page_number in range(1, nb_pages + 1) :
    search_url = page_i.format(page_number)
    found_urls = extract_urls_from_search(search_url)
    urls.extend(found_urls)

In [10]:
#Save the collected urls, one per line
with open("urls.csv", "w", newline="") as urls_file :
    urls_file.writelines(f"{url}\n" for url in urls)

If you have already scraped urls you can get them with :

In [11]:
#Reload the urls saved in urls.csv (one url per line)
urls = []
with open("urls.csv", "r") as f :
    for url in f :
        # Remove the line terminator, otherwise it stays at the end of the url that is requested
        url = url.strip()
        # Blank lines are ignored
        if url : urls.append(url)

If you are starting the scraping from the beginning, execute this cell and then the scraping cell:

In [11]:
#Index of the last slideshow already handled; -1 makes the scraping loop start at urls[0]
last_slideshow = -1

If you want to resume after the loop was interrupted, execute this cell and relaunch the scraping cell

In [12]:
#Read back the index saved by the scraping cell
with open("last_slideshow.txt") as progress_file :
    saved_index = progress_file.read()
last_slideshow = int(saved_index)

Scraping cell :

In [12]:
#Scraping cell
#Slideshows are handled from the one after last_slideshow up to the end of urls.
#A slideshow that fails with one of the errors below is reported and skipped.
for j in tqdm(range(last_slideshow+1, len(urls))) :
    try :
        # strip() removes any whitespace or line terminator left around the stored url
        scrap(urls[j].strip())
        sleep(3)
    except KeyError as e : print(f"KeyError catched : {e}")
    except requests.exceptions.ConnectionError as e : print(f"ConnectionError catched : {e}")
    except requests.exceptions.HTTPError as e : print(f"HTTPError catched : {e}")
    # Any other requests failure is reported the same way instead of stopping the whole run
    except requests.exceptions.RequestException as e : print(f"{type(e).__name__} catched : {e}")
    # j is the index that has just been handled; it is saved so the run can be resumed after it
    last_slideshow = j
    with open("last_slideshow.txt", "w") as f : f.write(str(last_slideshow))

 17%|█▋        | 90/540 [2:32:26<12:42:13, 101.63s/it]


KeyboardInterrupt: 

## Post Processing

Before the Q/A generation, please run this cell :

In [6]:
#Verif integrity
#Every folder of DATA_PATH must contain a metadata file <folder>/<folder>.json.
#Folders without metadata are deleted; wrong or missing fields are only reported.

#(field, expected type, text printed after the folder name when the type is wrong)
expected_fields = [
    ("id", str, " : id type error"),
    ("presentation_url", str, ": presentation_url type error"),
    ("title", str, " : title type error"),
    ("author", str, " : author type error"),
    ("date", str, " : date type error"),
    ("len", int, " : len type error"),
    ("description", str, " : description type error"),
    ("lang", str, " :lang  type error"),
    ("dim", dict, " : dim type error"),
    ("like", int, " : like type error"),
    ("view", int, " : view type error"),
    ("transcript", list, " : transcript type error"),
    ("mostRead", list, " : mostRead type error"),
]

listdir = os.listdir(DATA_PATH)
for folder in listdir :
    folder_path = f"{DATA_PATH}/{folder}"
    # Plain files sitting in DATA_PATH are not slideshow folders (and rmtree would fail on them)
    if not os.path.isdir(folder_path) : continue

    meta_data_path = f"{folder_path}/{folder}.json"
    if not os.path.exists(meta_data_path) :
        print(f"Can't find meta-data for {folder}")
        shutil.rmtree(folder_path)
        continue

    with open(meta_data_path, "r") as f:
        meta_data = json.load(f)
    for field, expected_type, message in expected_fields :
        # A missing field is reported instead of stopping the check with a KeyError
        if field not in meta_data : print(f"{folder} : {field} missing")
        # Exact type comparison, as a bool must not be accepted where an int is expected
        elif type(meta_data[field]) != expected_type : print(f"{folder}{message}")

If the previous cell prints nothing, you can move on to generate_qa.ipynb