## CREATE CZECH LANGUAGE DATASET FROM WIKI ARTICLES

In [9]:
import wikipedia
import pandas as pd
from nltk.tokenize import sent_tokenize
import re

In [10]:
# Notebook parameters: number of random articles to fetch and output file.
npages = 10
path = 'data.json'
# Point the wikipedia client at the Czech-language edition.
wikipedia.set_lang('cs')

In [11]:
def parse(raw_content: str,
          skip_sections: tuple = ("references", "further reading")) -> str:
    """
    Remove section headings and unwanted sections from raw wiki page text.

    Every line is stripped of surrounding whitespace. Heading lines of the
    form ``== Title ==`` are dropped. Once a heading named in
    ``skip_sections`` is seen, all following lines are dropped until the next
    heading that is not in ``skip_sections``.

    :param raw_content: text from wiki page
    :param skip_sections: section titles whose bodies are discarded; a line
        starts such a section when it contains ``= <title> =``
        (case-insensitive)
    :return: cleaned text, kept lines joined by newlines and terminated by a
        newline (``"\\n"`` for empty input)
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    section_title_re = re.compile(r"^=+\s+.*\s+=+$")
    skip_markers = tuple("= " + name.lower() + " =" for name in skip_sections)
    content = []
    skip = False
    for raw_line in raw_content.splitlines():
        line = raw_line.strip()
        lowered = line.lower()
        if any(marker in lowered for marker in skip_markers):
            # Heading of an unwanted section: drop it and what follows.
            skip = True
        elif section_title_re.match(line):
            # Any other heading ends the skipped region; the heading itself
            # is never kept.
            skip = False
        elif not skip:
            content.append(line)
    return '\n'.join(content) + '\n'

In [12]:
def avoidexceptions(title: str, max_attempts: int = 10) -> "str | None":
    """
    Fetch a wiki page and return its cleaned content, retrying on failure.

    When loading or parsing the page raises, a random page title is requested
    and tried instead, so the returned text may belong to a different page
    than the ``title`` that was passed in.

    :param title: page title
    :param max_attempts: maximum number of page fetches to try
    :return: cleaned page content as plain text, or None when every attempt
        failed
    """
    for _ in range(max_attempts):
        try:
            return parse(wikipedia.page(title).content)
        except Exception:
            # Not a bare ``except``: KeyboardInterrupt and SystemExit must
            # still be able to stop the run.
            title = wikipedia.random(pages=1)
    return None

In [13]:
def getrandomczechtexts(npages: int = 10) -> pd.DataFrame:
    """
    Download random wiki pages and collect their cleaned texts.

    The language is whatever was configured through ``wikipedia.set_lang``.
    A row's ``text`` is None when no page could be fetched for it.

    :param npages: number of random page titles to request
        (NOTE(review): the wikipedia package documents a maximum of 10 per
        call - confirm before raising this)
    :return: DataFrame with columns ``title`` and ``text``
    """
    titles = wikipedia.random(pages=npages)
    if isinstance(titles, str):
        # wikipedia.random returns a bare string instead of a list when only
        # one title comes back; iterating it would yield single characters.
        titles = [titles]
    summaries = [[title, avoidexceptions(title=title)] for title in titles]
    return pd.DataFrame(data=summaries, columns=["title", "text"])

In [15]:
# Fetch the random pages, discard rows with missing values, and write the
# result to the JSON file at `path`.
df = getrandomczechtexts(npages=npages).dropna()
df.to_json(path)