# LN-LazyReader: From Online Light Novels to Audiobooks

A simple Python notebook that scrapes light novel websites and converts the scraped chapters into audiobooks (.mp3 by default, or .wav).

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import subprocess
from tqdm import tqdm_notebook as tqdm
from lxml.html.clean import Cleaner
from pathlib import Path

In [2]:
# Replace the value of root_url with the URL of one of the light novel chapters.
# The program will attempt to download all chapters starting from the input URL.
# To download the whole novel, use the URL of the first chapter.

# Root url
root_url = 'https://m.wuxiaworld.co/library-of-heaven-is-path/1108180.html'

# Chapter number of the root url (used to number the output text files)
current_chapter = 501

# Number of chapters per file
file_range = 5

# Maximum chapters to download starting from root url. Set to None to download all the following chapters.
max_chapters = None

# Output directory names (created inside a folder named after the novel).
output_text_dir = 'ln_text'
output_audio_dir = 'ln_audio'

In [3]:
# Functions used to infer the light novel website being scraped

def get_website(url, websites):
    """Return the first entry of ``websites`` that occurs as a substring of ``url``.

    Entries are tested in iteration order; ``None`` is returned when no
    entry is contained in ``url``.
    """
    return next((candidate for candidate in websites if candidate in url), None)

def get_base_url(url, pattern):
    """Return the first match of the regex ``pattern`` found in ``url``.

    Raises ``IndexError`` when the pattern does not match at all.
    """
    matches = re.findall(pattern, url)
    return matches[0]

In [4]:
# Extract the novel title from the URL, optionally reduced to an acronym

def get_novel_title(url, regex, acronym=False):
    """Extract the novel title from ``url`` and normalise it for use in file names.

    Parameters
    ----------
    url : str
        URL of a chapter of the novel.
    regex : str
        Pattern whose first match in ``url`` is the title part of the URL.
    acronym : bool
        When true, return the upper-cased initials of the title's words
        instead of the full title.

    Returns
    -------
    str
        The title with hyphens and spaces replaced by underscores, or its
        acronym.

    Raises
    ------
    IndexError
        If ``regex`` does not match ``url``.
    """
    title = re.findall(regex, url)[0]
    title = title.replace('-', '_').replace(' ', '_')
    if acronym:
        # Words are underscore-separated at this point, so split on '_'
        # (splitting on whitespace would always yield a single word).
        # Empty pieces from doubled separators are skipped.
        return ''.join(word[0] for word in title.split('_') if word).upper()
    return title

In [5]:
# Function that extracts the title, content and next chapter url of a light novel chapter given a url

def crawl_chapter(url, base_url, content_marker, title_marker, next_marker, replace_filters, cleaner):
    """Download one chapter page and extract its title, text and next-chapter URL.

    Parameters
    ----------
    url : str
        URL of the chapter page to download.
    base_url : str
        Prefix prepended to the next-chapter link when that link does not
        contain 'http'.
    content_marker, title_marker, next_marker : tuple
        Arguments unpacked into ``BeautifulSoup.find(...)`` to locate the
        chapter content, the chapter title and the next-chapter link.
    replace_filters : dict
        Mapping of substrings to their replacements, applied to the content.
    cleaner : object
        Object exposing ``clean_html(str) -> str`` (an lxml ``Cleaner`` in
        this notebook).

    Returns
    -------
    dict
        ``{'title': bytes, 'content': bytes, 'next': str or None}`` where
        title and content are UTF-8 encoded.
    """
    response = requests.get(url)
    data = BeautifulSoup(response.text, 'html.parser')

    # Get light novel title
    title = data.find(*title_marker).get_text()
    title = title.encode('utf-8')

    # Get light novel content.
    # When the content element is missing, find() returns None and str() turns
    # it into the text 'None'; crawl_novel relies on the resulting b'None'
    # content to detect the end of the novel.
    content = str(data.find(*content_marker))
    content = cleaner.clean_html(content)
    # Assumes the cleaner output is wrapped in a single <div>...</div> pair.
    content = content[len('<div>'):-len('</div>')]
    content = content.strip()
    for old, new in replace_filters.items():
        content = content.replace(old, new)
    content = content.encode('utf-8')

    # Get next chapter url. A missing link element or a link without an href
    # means there is no next chapter; any other error is allowed to surface
    # instead of being silently swallowed.
    next_link = data.find(*next_marker)
    next_url = next_link.get('href') if next_link is not None else None
    if next_url is not None and 'http' not in next_url:
        next_url = base_url + next_url

    return {'title': title, 'content': content, 'next': next_url}

In [6]:
def crawl_novel(url, base_url, content_marker, title_marker, next_marker, cleaner, replace_filters,
                max_chapters=None, verbose=True):
    """Crawl consecutive chapters starting at ``url`` by following next-chapter links.

    Parameters
    ----------
    url : str
        URL of the first chapter to download.
    base_url, content_marker, title_marker, next_marker, cleaner, replace_filters
        Forwarded unchanged to ``crawl_chapter``.
    max_chapters : int or None
        Stop after this many chapters. A falsy value (None or 0) means no limit.
    verbose : bool
        When false, the progress bar is disabled.

    Returns
    -------
    list
        The chapter dictionaries returned by ``crawl_chapter``, in crawl order.
    """
    novel = []
    next_url = url
    pbar = tqdm(disable=not verbose)

    try:
        while next_url:
            # Every iteration either appends a chapter or leaves the loop, so
            # len(novel) is the number of chapters downloaded so far.
            if max_chapters and len(novel) >= max_chapters:
                break

            chapter = crawl_chapter(next_url, base_url=base_url,
                                    content_marker=content_marker,
                                    title_marker=title_marker,
                                    next_marker=next_marker,
                                    replace_filters=replace_filters,
                                    cleaner=cleaner)

            # Content of b'None' means the page had no content element.
            if chapter['content'] == b'None':
                break

            novel.append(chapter)
            next_url = chapter['next']
            pbar.set_description(chapter['title'].decode('utf-8'), refresh=True)
            pbar.update()
    finally:
        # Always finalise the progress bar, even if a download fails.
        pbar.close()

    return novel

In [48]:
def create_text_files(novel, text_dir,
                      current_chapter = 1,
                      file_range = 1,
                      chapter_append = '',
                      novel_acronym = '',
                      chapter_splitter = '\n\n\n**********\n\n\n'):
    '''
    Saves the crawled chapters into text files.

    novel            : a list of chapters crawled using the crawl_novel method
    text_dir         : a path object that points to the output directory
    current_chapter  : chapter number of the first entry of novel; used to number the output files
    file_range       : number of chapters per text file
    chapter_append   : text written before each chapter title
    novel_acronym    : prefix of every output file name
    chapter_splitter : text written after each chapter to separate chapters within a file

    Files are named
    "<novel_acronym>_<5-digit file index>_Chapter_<first>_to_<last>.txt",
    where <last> is the last chapter actually written to that file (the final
    file may hold fewer than file_range chapters).
    '''

    chapter_append = chapter_append.encode('utf-8')
    chapter_splitter = chapter_splitter.encode('utf-8')
    final_chapter = current_chapter + len(novel) - 1

    for idx, start in tqdm(enumerate(range(current_chapter, final_chapter + 1, file_range)),
                           desc='Creating text files'):
        # Clamp the end of the range so a partial last file is named correctly.
        end = min(start + file_range - 1, final_chapter)
        output_file = f'{novel_acronym}_{str(idx + 1).zfill(5)}_Chapter_{start}_to_{end}.txt'

        offset = start - current_chapter
        with open(text_dir / output_file, 'ab') as my_file:
            for chapter in novel[offset : offset + file_range]:
                my_file.write(chapter_append + chapter['title'] + b'\n\n')
                my_file.write(chapter['content'])
                my_file.write(chapter_splitter)

In [8]:
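# Convert text files into audio files
# Requires the balabolka command line utility, balcon (http://www.cross-plus-a.com/bconsole.htm)
# MP3 output additionally pipes the audio through the `lame` encoder, which must be on the PATH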

def create_audio_files(text_dir, audio_dir, voice_name='Zira', audio_format='mp3'):
    """Convert every ``*.txt`` file in ``text_dir`` into an audio file in ``audio_dir``.

    Parameters
    ----------
    text_dir : pathlib.Path
        Directory containing the chapter text files.
    audio_dir : pathlib.Path
        Directory receiving the audio files; each keeps the base name of its
        text file.
    voice_name : str
        Voice name passed to ``balcon -n``.
    audio_format : str
        ``'wav'`` lets balcon write a .wav file directly; any other value
        pipes balcon's output through ``lame`` to produce an .mp3 file.

    The commands run through the shell (the mp3 variant needs a pipe). Their
    return codes are not checked.
    """
    for chapter in tqdm(text_dir.glob('*.txt'), desc='Creating audio files'):
        audio_file = audio_dir / chapter.stem
        if audio_format == 'wav':
            my_cmd = f'balcon -f "{chapter}" -w "{audio_file}.wav" -n "{voice_name}"'
        else:
            # Quote paths and voice name like the wav branch does, so that
            # values containing spaces survive shell word-splitting.
            my_cmd = (f'balcon -f "{chapter}" -n "{voice_name}" -o --raw '
                      f'| lame -r -s 16 -m m -h - "{audio_file}.mp3"')
        subprocess.call(my_cmd, shell=True)

In [9]:
'''
Meta Filters

A dictionary containing the filters used to scrape the different light novel websites.
Each dictionary key is the name of a target light novel website for web scraping.
Each value is another dictionary containing the specific filters for that website:
    base_url : regex to capture the URL of the specific novel
    acronym  : regex to capture the novel title from a chapter URL
    title    : tuple required to capture the chapter title
    content  : tuple required to capture the light novel content
    next     : tuple required to capture the URL of the next chapter of the novel
Values for title, content and next require some basic understanding of the Beautiful Soup module,
as the tuples provided will be passed to the find(...) method of a BeautifulSoup object.

'''

# Per-website scraping filters; the key is the substring used to recognise
# the website in the root URL.
meta_filters = {
    'm.wuxiaworld': {
        'base_url': r'https://m.wuxiaworld.co/.*/',
        # Regex matching the novel's title part of a chapter URL
        'acronym' : r'(?<=https://m.wuxiaworld.co/).*(?=/)',
        'title': ('span', {'class': 'title'}),
        'content': ('div', {'id': 'chaptercontent'}),
        'next': ('a', {'id': 'pt_next'})
    },
    'royalroad': {
        'base_url': r'^https://www.royalroad.com/',
        # NOTE: the look-behind only matches five-digit fiction ids
        'acronym' : r'(?<=https://www.royalroad.com/fiction/\d{5}/).*(?=/chapter/\d)',
        'title': ('title',),
        'content': ('div', {'class': 'chapter-inner chapter-content'}),
        'next': ('link', {'rel': 'next'})
    },
}

# Substring replacements applied to the chapter content (key -> value)
replace_filters = {
    '<br>': '\n',
    '“': '"',
    '”': '"',
    '’': "'",
    '‘': "'",
    '《': '',
    '》': '',
    '…': '...',
}

# List of html tags to remove including contents
kill_tags = ['ins']

# List of html tags to remove excluding contents
remove_tags = ['div', 'p']

In [10]:
# Pick the scraping filters that match the root URL
website = get_website(root_url, meta_filters.keys())
if website is None:
    # Fail with a readable message instead of an opaque `KeyError: None`
    raise ValueError(f'Unsupported website for URL {root_url!r}; '
                     f'supported sites: {", ".join(meta_filters)}')

site_filters = meta_filters[website]
base_url = get_base_url(root_url, site_filters['base_url'])
content_marker = site_filters['content']
title_marker = site_filters['title']
next_marker = site_filters['next']

novel_title = get_novel_title(root_url, site_filters['acronym'], acronym=False)
novel_acronym = get_novel_title(root_url, site_filters['acronym'], acronym=True)

# All output goes into a folder named after the novel
output_dir = Path(novel_title)
text_dir = output_dir / output_text_dir
audio_dir = output_dir / output_audio_dir

# If either directory already exists, switch to a fresh numbered pair
cnt = 0
while text_dir.exists() or audio_dir.exists():
    cnt += 1
    text_dir = output_dir / f'{output_text_dir}_{cnt}'
    audio_dir = output_dir / f'{output_audio_dir}_{cnt}'

output_dir.mkdir(exist_ok=True)
text_dir.mkdir(exist_ok=True)
audio_dir.mkdir(exist_ok=True)

In [11]:
# Build the HTML cleaner, configured with the tag lists defined above
cleaner = Cleaner(page_structure=False, kill_tags=kill_tags, remove_tags=remove_tags)

In [13]:
# Crawl the novel; the result is a list with one dictionary per chapter
# (keys: 'title', 'content', 'next')
novel = crawl_novel(root_url, base_url=base_url, 
                    content_marker=content_marker, 
                    title_marker=title_marker, 
                    next_marker=next_marker,
                    replace_filters=replace_filters,
                    cleaner=cleaner,
                    max_chapters=max_chapters)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [49]:
# Write the crawled chapters to text files, file_range chapters per file
create_text_files(
    novel,
    text_dir=text_dir,
    current_chapter=current_chapter,
    file_range=file_range,
    chapter_append='Chapter ',
    novel_acronym=novel_acronym,
)

HBox(children=(IntProgress(value=1, bar_style='info', description='Creating text files', max=1, style=Progress…

In [50]:
# Convert text files into audio files
# (relies on the function defaults: voice 'Zira', mp3 output)
create_audio_files(text_dir=text_dir, audio_dir=audio_dir)

HBox(children=(IntProgress(value=1, bar_style='info', description='Creating audio files', max=1, style=Progres…