<a href="https://colab.research.google.com/github/astrologos/libri-scraper/blob/main/Libri_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Public Audiobook Scraper
The Public Audiobook Scraper downloads full audiobook MP3s from LibriVox.org. It is meant to assist in building a labeled speech dataset for training neural Text-To-Speech systems in conjunction with Automatic Speech Recognition models.

## Mount Google Drive

In [17]:
import os
from google.colab import drive
# Working folder on Google Drive; everything the notebook writes with a
# relative path ends up under this directory.
nb_dir = '/content/drive/MyDrive/COLAB/Pub-Audiobook-Scraper'

# Mount (or re-mount) Drive before touching any path below it.
drive.mount('/content/drive/',force_remount=True)

# Create the working folder if this Drive does not have it yet.
folder_missing = not os.path.exists(nb_dir)
if folder_missing:
    os.makedirs(nb_dir)

# Switch into the working folder and report where we ended up.
os.chdir(nb_dir)
current_path = os.getcwd()
print('Current path: ' + current_path)

Mounted at /content/drive/
Current path: /content/drive/MyDrive/COLAB/Pub-Audiobook-Scraper


## Install Chromium and Selenium

In [None]:
%%shell
# Install Chromium
# NOTE: the %%shell cell magic has to be the very first line of the cell;
# with a comment above it the cell is not run as a shell script at all.

# Add debian buster
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# Export each key into the keyring file named by the matching signed-by= entry above
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF


# Install the browser, its matching driver, and the Selenium Python bindings
apt-get update
apt-get install -qq chromium chromium-driver
pip install -q selenium

## Imports

In [18]:
# System
import sys
import numpy.random as rand
import time

# IO
import pickle
from tqdm.notebook import tqdm
import zipfile

# Web
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
from urllib.parse import parse_qs, urlparse, urlencode, urlunparse
import requests

## Utilities

In [14]:
# Util to modify search page number in URL
def update_search_page(url, new_search_page):
    """Return `url` with its `search_page` query parameter set to `new_search_page`.

    The query string is parsed, the `search_page` entry is replaced (or added
    when absent), and the URL is reassembled with every other component
    left as parsed.

    Args:
        url: URL whose query string should be rewritten.
        new_search_page: Page number to write; it is converted with `str()`.

    Returns:
        The rebuilt URL as a string.
    """
    parts = urlparse(url)
    params = parse_qs(parts.query)
    # parse_qs maps each key to a list of values, so the replacement is a list too.
    params.update(search_page=[str(new_search_page)])
    # doseq=True expands the value lists back into repeated key=value pairs.
    return urlunparse(parts._replace(query=urlencode(params, doseq=True)))

## Scrape links from LibriVox.org

In [None]:
# Set up Selenium
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')                                       # Run Chromium without a visible window
chrome_options.add_argument('--no-sandbox')

# Instantiate webdriver
links = []
# The driver path is no longer accepted as a positional argument by current
# Selenium 4 releases; with no path given, Selenium locates chromedriver itself.
wd = webdriver.Chrome(options=chrome_options)
url = 'https://librivox.org/search?primary_key=1&search_category=language&search_page=1&search_form=get_results'
max_pages = 500

# The try/finally guarantees the browser is shut down even when a page load or
# the wait below raises. Links collected before a failure stay in `links`.
try:
    # Iterate through search pages
    for i in tqdm(range(max_pages)):
        time.sleep(rand.randint(5, 5000) / 1000.0)                              # Random pause (5 ms up to ~5 s) to avoid exposure / DOSing the website
        wd.get(url)                                                             # GET request
        WebDriverWait(wd, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.download-btn'))) # Wait up to 10 s for JavaScript to render the results
        html = wd.page_source                                                   # Get the fully rendered HTML
        soup = BeautifulSoup(html, 'html.parser')                               # Parse the HTML with BeautifulSoup

        # Pull links. href=True skips anchors that have no href attribute,
        # for which link.get('href') would return None and break .endswith().
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            if href.endswith('.zip'):
                links.append(href)

        # Update URL: iteration i fetched page i+1, so the next page is i+2
        url = update_search_page(url, i+2)
finally:
    # Close the Selenium WebDriver
    wd.quit()

# Print list of links when done
for link in links:
    print(link)

## Save list of links

In [19]:
# Persist the scraped download links next to the notebook so they can be
# inspected or reused without repeating the scrape.
file_path = 'audiolinks.pickle'
with open(file_path, mode='wb') as pickle_out:
    pickle.dump(links, pickle_out)

## Download, unzip and save mp3s

In [None]:
# Create save folder
output_directory = './mp3'
os.makedirs(output_directory, exist_ok=True)

# Index of the first link to process; links before it are skipped.
# NOTE(review): 65 looks like a manual resume point from an earlier,
# interrupted session -- set it to 0 to download everything from scratch.
start_index = 65

# Links whose download or extraction failed, kept for inspection / retry
failed_links = []

# Iterate through links
for link in tqdm(links[start_index:]):
    file_path = os.path.join(output_directory, os.path.basename(link))
    try:
        # Download the ZIP file. Streaming keeps memory use flat instead of
        # holding a whole archive in RAM; the timeout stops a stalled
        # connection from hanging the loop forever.
        with requests.get(link, stream=True, timeout=60) as response:
            # Fail here on HTTP errors rather than saving an error page as a .zip
            response.raise_for_status()

            # Write the ZIP file to disk
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)

        # Extract the ZIP file
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(output_directory)
    except (requests.RequestException, zipfile.BadZipFile) as error:
        # One bad link should not abort the remaining downloads; record it and move on.
        failed_links.append(link)
        print(f'Skipping {link}: {error}')
    finally:
        # Delete the downloaded ZIP file (also after a failure, so no partial archive is left behind)
        if os.path.exists(file_path):
            os.remove(file_path)

# Summarise failures so they are not lost in the progress output
if failed_links:
    print(f'{len(failed_links)} link(s) failed; they are listed in failed_links')

  0%|          | 0/12241 [00:00<?, ?it/s]