In [18]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import re
from tqdm import tqdm_notebook
from yaml import load, dump

In [2]:
# Select the interactive (ipympl) matplotlib backend for this kernel.
# NOTE(review): no plotting call is visible in the cells shown here — confirm this magic is still needed.
%matplotlib widget

In [3]:
def simple_get(url, timeout=30):
    """
    Fetch `url` with an HTTP GET request and return its body if it is HTML.

    Parameters
    ----------
    url : str
        Address to download.
    timeout : float or (float, float), optional
        Forwarded to `requests.get`. Without a timeout a single stalled
        connection would block the whole scrape indefinitely; pass `None`
        to restore the previous wait-forever behaviour.

    Returns
    -------
    bytes or None
        The raw response body (`resp.content`) when `is_good_response`
        accepts the response, otherwise None. None is also returned, after
        logging through `log_error`, when the request raises a
        `RequestException` (connection failure, timeout, ...).
    """
    try:
        # `closing` guarantees the streamed connection is released even if
        # reading the body fails half-way.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            return resp.content if is_good_response(resp) else None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Return True if the response seems to be HTML, False otherwise.

    A response is accepted when its status code is 200 and its
    `Content-Type` header contains "html" (case-insensitive).
    A response without a `Content-Type` header is rejected instead of
    raising `KeyError`.

    Parameters
    ----------
    resp : object
        Anything exposing `status_code` and a mapping-like `headers`
        attribute with a `.get` method.

    Returns
    -------
    bool
    """
    # Look the header up with .get(): servers are not required to send it,
    # and calling .lower() on a missing value must not happen.
    content_type = resp.headers.get('Content-Type')
    if resp.status_code != 200 or content_type is None:
        return False
    return 'html' in content_type.lower()


def log_error(e):
    """
    Report an error message.

    The message is written to standard output via `print`; swap the body
    for a real logger if something more durable is needed.
    """
    message = str(e)
    print(message)


# URLs of the 20 listing pages of the audiobook library (pg=1 .. pg=20).
pages = [
    f'http://www.litteratureaudio.com/notre-bibliotheque-de-livres-audio-gratuits?pg={i}'
    for i in range(1, 21)
]

# Download and parse the first listing page so its structure can be
# inspected interactively before running the full scrape.
first_page_url = pages[0]
raw_html = simple_get(first_page_url)
html = BeautifulSoup(raw_html, 'html.parser')

In [27]:
# Splits an "Author - Title" string (plain hyphen or en dash separator).
# The scrape below stores the raw link title and does not use this matcher.
title_matcher = re.compile("(.+) [-–] (.+)")

# Result of the scrape: {category name: [{book link title: [mp3 urls]}, ...]}.
# Created once, BEFORE the page loop, so that books from every listing page
# accumulate instead of each page wiping out the previous ones.
bibli = {}

for page in tqdm_notebook(pages):
    raw_html = simple_get(page)
    if raw_html is None:
        # simple_get only logs request exceptions; a non-200 or non-HTML
        # answer comes back as a silent None, so report it here.
        log_error('Skipping listing page (no HTML returned): {0}'.format(page))
        continue
    html = BeautifulSoup(raw_html, 'html.parser')

    # The category tree is the first <ul> whose first <li> mentions 'Catégorie'.
    category_tree = None
    for p in html.select('ul'):
        first_item = p.select_one('li')
        if first_item is not None and 'Catégorie' in first_item.text:
            category_tree = p
            break
    if category_tree is None:
        log_error('Skipping listing page (no category list found): {0}'.format(page))
        continue

    for li in tqdm_notebook(category_tree.findChildren("li", recursive=False)):
        booklist = li.find('ul')
        if booklist is None:
            # A category entry without a nested book list has nothing to scrape.
            continue

        # The node just before the nested <ul> carries the category label.
        label = booklist.previousSibling
        if label is None:
            log_error('Skipping unlabeled category on page: {0}'.format(page))
            continue
        if 'Catégorie' in label:
            # Text of the form "Catégorie : Arts" -> keep what follows the colon;
            # fall back to the whole text if there is no colon.
            parts = str(label).split(':')
            category = parts[1].strip() if len(parts) > 1 else parts[0].strip()
        else:
            category = label.text
        print(category)

        # setdefault: a category seen on an earlier page keeps its books.
        books = bibli.setdefault(category, [])

        for b in tqdm_notebook(booklist.findChildren("li", recursive=False)):
            link = b.select_one('a')
            if link is None or not link.get('href'):
                continue

            book_html = simple_get(link['href'])
            if book_html is None:
                log_error('Skipping book page (no HTML returned): {0}'.format(link['href']))
                continue
            bookpage = BeautifulSoup(book_html, 'html.parser')

            # Audio links use one of two CSS classes depending on the book page.
            files = bookpage.find_all(
                "a", class_="link-mp3-file") + bookpage.find_all("a", class_="link-roman-mp3-file")
            files = [f['href'] for f in files if f.has_attr('href')]

            # Prefer the link's title attribute; fall back to its visible text.
            book_title = link.get('title') or link.get_text(strip=True)
            books.append({book_title: files})


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

Arts


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))

Aventure


HBox(children=(IntProgress(value=0, max=117), HTML(value='')))

KeyboardInterrupt: 

In [9]:
import json

In [15]:
# Save the scraped library as JSON. The `with` block makes sure the file is
# flushed and closed as soon as the dump finishes.
with open('bibli.json', 'w', encoding='utf-8') as json_file:
    json.dump(bibli, json_file)

In [31]:
# Save the application menu tree as YAML, with the scraped library nested
# under 'Bibliothèques' -> 'Catégories'.
# allow_unicode=True writes accented characters as-is, so the file is opened
# explicitly as UTF-8 instead of relying on the platform default encoding;
# the `with` block closes the file once the dump is done.
# NOTE(review): the menu labels are kept byte-for-byte because other code may
# look them up; "quel heure est-t-il?" looks misspelled — confirm before changing.
with open('bibli.yaml', 'w', encoding='utf-8') as yaml_file:
    dump({'Bibliothèques': {'Catégories': bibli},
          "Reprendre la lecture en cours": None,
          "quel heure est-t-il?": None,
          "Quitter l'application": None},
         yaml_file, allow_unicode=True)