In [1]:
import json
import os
from time import sleep

import bs4
import requests

# Source of the text; the novel is in the public domain.
# '{i:0>2}' zero-pads the page index to two digits (svejk01.htm ... svejk27.htm).
BASE_URL = 'https://klempera.tripod.com/svejk{i:0>2}.htm'

# Inclusive (first, last) page-index bounds for each part (dil) 1-4.
RANGES_DIL_KAPITOLA = [(1, 15), (16, 20), (21, 24), (25, 27)]

# Page index 'i' -> (part, chapter within that part).
# Chapter (kapitola) numbering restarts at 1 in every part.
MAP_DIL_KAPITOLA = {
    page: (dil, page - start + 1)
    for dil, (start, stop) in enumerate(RANGES_DIL_KAPITOLA, start=1)
    for page in range(start, stop + 1)
}

# Every page index that will be requested.
POOL_i = range(1, 28)

# Important: the webpage is not in UTF-8, so the response encoding is forced.
ENCODING = 'windows-1250'
# Browser-like User-Agent sent with every request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}

# Retry settings for the occasional 502s.
MAX_RETRIES = 5
# Pause length in seconds.
WAIT_TIME = 0.5

def extract_plain_text(url):
    """Download ``url`` and return the text content of the page.

    The request is attempted up to MAX_RETRIES times, pausing WAIT_TIME
    seconds between attempts. Both HTTP error statuses (such as the
    occasional 502) and transport-level failures (connection errors,
    timeouts) trigger a retry instead of aborting the whole scrape.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text with fragments stripped and joined by single spaces,
        or None when every attempt failed.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = requests.get(url, headers=HEADERS, timeout=50)
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            print (f'Server error {e.response.status_code} for {url}. Retrying...')
        except requests.exceptions.RequestException as e:
            # Timeouts and connection errors are as transient as a 502;
            # retry them as well rather than letting them propagate.
            print (f'Request failed ({e}) for {url}. Retrying...')
        else:
            # Force the declared encoding before decoding the body.
            response.encoding = ENCODING
            html_data = bs4.BeautifulSoup(response.text, 'html.parser')
            return html_data.get_text(separator=' ', strip=True)

        # No point in waiting once the last attempt has failed.
        if attempt < MAX_RETRIES:
            sleep(WAIT_TIME)

    return None

# Metadata attached to every chapter record.
BOOK_TITLE = 'Osudy dobrého vojáka Švejka za světové války'
AUTHOR = 'Jaroslav Hašek'

# One dict per successfully scraped chapter, in page order.
all_chapters = []
for i in POOL_i:
    # Resolve (part, chapter) before fetching so that both the success and the
    # failure message report the chapter that is actually being processed.
    part_num, chapter_num_in_part = MAP_DIL_KAPITOLA.get(i, (None, None))

    raw_text = extract_plain_text(BASE_URL.format(i=i))
    if raw_text:
        # NOTE(review): get_text() normally yields decoded entities rather than
        # the literal '&nbsp' — confirm this replace still matches anything.
        clean_text = raw_text.replace('&nbsp', ' ')

        chapter_data = {'book_title': BOOK_TITLE, 'author': AUTHOR,
                        'part_number': part_num, 'chapter_number': chapter_num_in_part,
                        'chapter_text': clean_text}
        all_chapters.append(chapter_data)
        print (f'Dil {part_num}, kapitola {chapter_num_in_part} scraped successfully.')
    else:
        print (f'Could not extract the text. Skipping dil {part_num}, kapitola  {chapter_num_in_part}.')

    sleep(WAIT_TIME) # Being polite.

# Write the scraped chapters as JSON Lines: one JSON object per line.
output_jsonl_filename = './data/clean/svejk_chapters.jsonl'
try:
    # Create the target directory first; without it open() fails on a fresh
    # checkout and everything scraped above is lost.
    os.makedirs(os.path.dirname(output_jsonl_filename), exist_ok=True)
    with open(output_jsonl_filename, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps the Czech diacritics readable in the file.
        f.writelines(json.dumps(chapter, ensure_ascii=False) + '\n'
                     for chapter in all_chapters)
except OSError as e:
    print (f'Error saving JSONL: {e}')
else:
    print (f'All chapters saved to "{output_jsonl_filename}" as JSONL.')



Dil 1, kapitola 1 scraped successfully.
Dil 1, kapitola 2 scraped successfully.
Dil 1, kapitola 3 scraped successfully.
Dil 1, kapitola 4 scraped successfully.
Dil 1, kapitola 5 scraped successfully.
Dil 1, kapitola 6 scraped successfully.
Server error 502 for https://klempera.tripod.com/svejk07.htm. Retrying...
Server error 502 for https://klempera.tripod.com/svejk07.htm. Retrying...
Dil 1, kapitola 7 scraped successfully.
Dil 1, kapitola 8 scraped successfully.
Dil 1, kapitola 9 scraped successfully.
Dil 1, kapitola 10 scraped successfully.
Dil 1, kapitola 11 scraped successfully.
Dil 1, kapitola 12 scraped successfully.
Dil 1, kapitola 13 scraped successfully.
Dil 1, kapitola 14 scraped successfully.
Dil 1, kapitola 15 scraped successfully.
Dil 2, kapitola 1 scraped successfully.
Dil 2, kapitola 2 scraped successfully.
Dil 2, kapitola 3 scraped successfully.
Dil 2, kapitola 4 scraped successfully.
Dil 2, kapitola 5 scraped successfully.
Dil 3, kapitola 1 scraped successfully.
Dil 3,