In [37]:
from bs4 import BeautifulSoup
import requests
from pprint import pprint
import json, itertools

def get_soup(url):
    return BeautifulSoup(requests.get(url).text, 'lxml')

def get_library(url):
    pages = max([
        int(li.find("a")["href"].split("page=")[1])
        for li in get_soup(url).find_all("li", {"class": "page"})
        if li.find("a") is not None and li.find("a")["href"] is not None
    ])
    
    books = {}
    for page in range(pages):
        books.update({
            div.find("a").text: div.find("a")["href"]
            for div in (
                get_soup((url + "?page={page}").format(page=page+1))
                  .find_all("div", {"class": "vcWrapA titleBox"})
            )
        })
    return books

def get_book(url):
    pages = max([1] + [
        int(li.find("a")["href"].split("page=")[1])
        for li in get_soup(url).find_all("li", {"class": "page"})
        if li.find("a") is not None and li.find("a")["href"] is not None
    ])
    text = ""
    for page in range(pages):
        text += get_soup((url + "?page={page}").format(page=page+1)).find("div", {"id": "pageText"}).text
    return text

In [27]:
library = get_library("http://www.pasakas.net/pasakas/latviesu-pasakas/")
pprint(library)

{' Tirgotāja meita': 'http://www.pasakas.net/pasakas/latviesu-pasakas/dzukstes_pasakas/t/tirgotaja_meita/',
 'Aitu gans': 'http://www.pasakas.net/pasakas/latviesu-pasakas/latviesu_tautas_pasakas/a/aitu-gans/',
 'Aizej tur, nezin kur, atnes to - nezin ko': 'http://www.pasakas.net/pasakas/latviesu-pasakas/latviesu_tautas_pasakas/a/aizej-tur-nezin-kur-atnes-to-nezin-ko/',
 'Apķērīgam visur laimējas.': 'http://www.pasakas.net/pasakas/latviesu-pasakas/latviesu_tautas_pasakas/a/apkerigam-visur-laimejas/',
 'Ar vienu burvju lietu iekaro otru': 'http://www.pasakas.net/pasakas/latviesu-pasakas/latviesu_tautas_pasakas/a/ar-vienu-burvju-lietu-iekaro-otru/',
 'Atrastais Ansītis': 'http://www.pasakas.net/pasakas/latviesu-pasakas/latviesu_tautas_pasakas/a/atrastais-ansitis/',
 'Attapīgais puisis': 'http://www.pasakas.net/pasakas/latviesu-pasakas/latviesu_tautas_pasakas/a/attapigais-puisis/',
 'Attapīgā meita': 'http://www.pasakas.net/pasakas/latviesu-pasakas/latviesu_tautas_pasakas/a/attapiga-meita/

In [79]:
from collections import Counter
import re
import numpy as np

class FrequencyDictionary:
    def __init__(self, samples):
        self.freqs = dict(Counter([
            self.extract_stem(w)
            for w in self.split_text(" ".join(samples))
            if len(self.extract_stem(w)) > 0
        ]))
        total = sum(self.freqs.values())
        self.freqs = {k:v/total for k, v in self.freqs.items()}
        
    def split_text(self, text):
        return re.split(r"\s+", re.sub(r"[\"\',.?!\u00bb\u00ab\-\(\)\—]", "", text))
        
    def __getitem__(self, word):
        return self.freqs.get(self.extract_stem(word), 0)

    def extract_stem(self, word):
        raise NotImplementedError("Need to subclass FrequencyDictionary per language")
        
    def text_difficulty(self, text):
        return np.mean([self.__getitem__(w) for w in self.split_text(text)])

class LatvianFD(FrequencyDictionary):
    def extract_stem(self, word):
        word = word.lower()
        prefixes = [
            'a(.*)', 'aiz(.*)', 'ap(.*)', 'at(.*)',
            'homo(.*)', 'ie(.*)', 'iz(.*)', 'jota(.*)',
            'mikro(.*)', 'ne(.*)', 'no(.*)', 'pa(.*)',
            'pie(.*)', 'prie(.*)', 'priekš(.*)', 'proto(.*)',
            'pār(.*)', 'sa(.*)', 'uz(.*)'
        ]
        for p in prefixes:
            m = re.match(p, word)
            if m:
                word = m.group(1)
        return word

In [86]:
try:
    with open("library.json") as f:
        books = json.loads(f.read())
except:
    books = {
        title: {
            "source": link,
            "text": get_book(link)
        }
        for title, link in itertools.islice(library.items(), 10)
    }
    with open("library.json", "w") as f:
        f.write(json.dumps(books, indent=4))

In [87]:
fd = LatvianFD([book["text"] for title, book in books.items()])

books = {
    title: {
        **book,
        "difficulty": fd.text_difficulty(book["text"])
    }
    for title, book in books.items()
}
pprint(books)

{'Dažāda atmaksa': {'difficulty': 0.005716698196153199,
                    'source': 'http://www.pasakas.net/pasakas/latviesu-pasakas/latviesu_tautas_pasakas/d/dazada-atmaksa/',
                    'text': '\n'
                            'Reiz viena meita gājusi pa ceļu. Pa priekšu gājis '
                            'vecs vecis, kas tikko varējis pavazāt kājas.\n'
                            '\n'
                            '\n'
                            '  "Vecais tēv\', sēdi man kukurī!" teikusi meita, '
                            '"es tevi panesīšu."\n'
                            '\n'
                            '\n'
                            '  "Ko nu, meit," teicis vecis, "gan jau es tāpat '
                            'aizvilkšos."\n'
                            '\n'
                            '\n'
                            '  "Nāc vien, nāc!" teikusi meita, "es esmu jauna '
                            'un stipra, tevi panest varu."\n'
                            '\n'

In [85]:
with open("library.json", "w") as f:
    f.write(json.dumps(books, indent=4))

In [97]:
markdown_format = """
# {title}
*Difficulty: {difficulty:0.04f}*

{text}
"""

with open("short-stories.md", "w") as f:
    for title, book in sorted(list(books.items()), key=lambda b: b[1]['difficulty']):
        f.write(markdown_format.format(
            title=title,
            difficulty=book["difficulty"],
            text=book["text"]
        ))