In [1]:
import sys
from pathlib import Path

import nltk
import regex as re
import spacy
from nltk.corpus import stopwords
from spacy import Language
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/itsv.org.sv-
[nltk_data]     services.at/tibor.cus@itsv.at/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def get_raw_files_per_language(data_base: Path) -> dict[str, list[Path]]:
    """Collect the raw document paths of every language folder under ``data_base``.

    Every sub-directory of ``data_base`` is treated as one language and is
    expected to contain a ``raw-documents`` folder.

    Args:
        data_base: Directory holding one sub-directory per language.

    Returns:
        Mapping from the language directory name to the sorted list of entries
        found in that language's ``raw-documents`` folder.

    Raises:
        FileNotFoundError: If ``data_base`` or a language's ``raw-documents``
            folder does not exist.
    """
    res = {}
    for language_dir in sorted(data_base.iterdir()):
        # Stray files next to the language folders (README, .DS_Store, ...)
        # are not languages; descending into them would raise.
        if not language_dir.is_dir():
            continue
        raw_dir = language_dir / "raw-documents"
        # iterdir() yields entries in arbitrary order; sort for reproducible runs.
        res[language_dir.name] = sorted(raw_dir.iterdir())
    return res

def get_stopwords_for_language(language: str, stopwords_path: Path) -> set[str]:
    """Return the stop-word set for ``language``.

    English, Russian and Portuguese are taken from the NLTK ``stopwords``
    corpus. Bulgarian and Hindi are read from ``bulgarian.txt`` / ``hindi.txt``
    inside ``stopwords_path``; every line, stripped of surrounding whitespace,
    becomes one entry.

    Args:
        language: Language code or name, matched case-insensitively
            (e.g. ``"EN"``, ``"english"``, ``"bg"``).
        stopwords_path: Directory containing the custom stop-word files.

    Returns:
        The stop words as a set of strings.

    Raises:
        ValueError: If ``language`` is not one of the supported languages.
            Previously this case fell through and returned ``None``.
        FileNotFoundError: If the required custom stop-word file is missing.
    """
    key = language.lower()

    # Accepted aliases -> NLTK corpus name.
    nltk_corpora = {
        "eng": "english",
        "en": "english",
        "english": "english",
        "ru": "russian",
        "russian": "russian",
        "pt": "portuguese",
        "portuguese": "portuguese",
    }
    # Accepted aliases -> file name inside ``stopwords_path``.
    custom_files = {
        "bg": "bulgarian.txt",
        "bulgarian": "bulgarian.txt",
        "hi": "hindi.txt",
        "hindi": "hindi.txt",
    }

    if key in nltk_corpora:
        return set(stopwords.words(nltk_corpora[key]))

    if key in custom_files:
        # Explicit encoding: the lists are non-ASCII text and must not depend
        # on the platform's default locale (files assumed to be UTF-8).
        with open(stopwords_path / custom_files[key], encoding="utf-8") as f:
            return {line.strip() for line in f}

    raise ValueError(f"No stop words available for language {language!r}")

def process_file(filepath: Path, stop_words: set[str]) -> list[str]:
    """Read a text file and return its cleaned, stop-word-filtered tokens.

    Cleaning steps, in order: strip URLs, lowercase, drop every character that
    is neither a word character nor whitespace, drop digits, collapse runs of
    whitespace into a single space, split on spaces.

    Args:
        filepath: Text file to read (decoded as UTF-8).
        stop_words: Tokens to discard.

    Returns:
        The remaining tokens in document order. Empty strings are never
        returned.
    """
    url_pattern = re.compile(r'https?://\S+')  # lots of articles contain urls

    # Explicit encoding keeps the result independent of the platform locale;
    # the context manager closes the handle deterministically.
    with open(filepath, encoding="utf-8") as f:
        content = f.read()

    content = url_pattern.sub("", content)
    content = content.lower()
    content = re.sub(r'[^\w\s]', "", content)  # non-word and non-whitespace characters
    content = re.sub(r'\d', "", content)  # remove digits
    content = re.sub(r"\s+", " ", content)  # remove newlines, duplicate whitespaces, tabs, ...

    # Leading/trailing whitespace leaves "" at the ends of split(" ");
    # those are not tokens and must not reach the tagger or the output file.
    return [token for token in content.split(" ") if token and token not in stop_words]

def add_tags(tokens: list[str], nlp: "Language | None" = None) -> list[tuple[str, str, str]]:
    """Attach a part-of-speech tag and a lemma to each token.

    Args:
        tokens: Tokens to annotate.
        nlp: Pipeline called on the space-joined tokens. When ``None``, no
            tagging is performed and ``'_'`` is used as placeholder for both
            the tag and the lemma.

    Returns:
        A list of ``(text, pos, lemma)`` tuples. Without ``nlp`` there is
        exactly one tuple per input token. With ``nlp`` the tuples come from
        iterating the object the pipeline returns for the joined string, so
        they are not guaranteed to align one-to-one with ``tokens``.
    """
    if nlp is None:
        return [(token, '_', '_') for token in tokens]

    # The pipeline receives plain text, not the pre-split tokens.
    doc = nlp(" ".join(tokens))

    return [(token.text, token.pos_, token.lemma_) for token in doc]

def save_to_conllu(filepath: Path, content: list[tuple[str, str, str]], language: str):
    """Write tagged tokens to ``filepath`` as a tab-separated table.

    The file starts with a ``# id token pos lemma`` header line, followed by
    one row per entry of ``content`` numbered from 1, and ends with an empty
    line. Any existing file is overwritten; the text is encoded as UTF-8.

    Args:
        filepath: Destination file.
        content: ``(token, pos, lemma)`` triples in output order.
        language: Accepted for interface compatibility; not used here.
    """
    with open(filepath, "w", encoding='utf-8') as out:
        out.write("# id\ttoken\tpos\tlemma\n")
        out.writelines(
            f"{row_id}\t{token}\t{pos}\t{lemma}\n"
            for row_id, (token, pos, lemma) in enumerate(content, start=1)
        )
        out.write("\n")

In [3]:
!python -m spacy download en_core_web_sm
!python -m spacy download ru_core_news_sm
!python -m spacy download pt_core_news_sm

spacy_models = {
    'EN' : 'en_core_web_sm',
    'PT' : 'pt_core_news_sm',
    'RU' : 'ru_core_news_sm',
    'BG' : None,
    'HI' : None,
}

base_path = Path.cwd().parent / "data"
filepaths = get_raw_files_per_language(base_path  / "training_data_16_October_release")

for language in filepaths.keys():
    spacy_model = spacy_models[language]        
    nlp = spacy.load(spacy_model) if spacy_model else None

    stop_words = get_stopwords_for_language(language, base_path /  "stopwords")
    Path(base_path / "tmp" / language ).mkdir(parents=True, exist_ok=True)
    for filepath in filepaths[language]:
        output_file = base_path / "tmp" / language / (filepath.stem + ".conllu")
        if output_file.exists():
            output_file.unlink() #delete if file exists
        tokens = process_file(filepath, stop_words)
        tagged_tokens = add_tags(tokens, nlp)
        save_to_conllu(output_file, tagged_tokens, language)

