In [74]:
import regex as re
from pathlib import Path
from nltk.corpus import stopwords

In [146]:
def get_raw_files_per_language(data_base: Path) -> dict[str, list[Path]]:
    """Collect the raw document paths for every language under ``data_base``.

    Every entry of ``data_base`` that contains a ``raw-documents`` directory
    is treated as one language. Entries without such a directory (stray files,
    ``.ipynb_checkpoints`` and the like) are skipped instead of raising.

    Args:
        data_base: Directory holding one sub-directory per language.

    Returns:
        Mapping from the language directory name to the sorted list of regular
        files found in ``<language>/raw-documents``.
    """
    res: dict[str, list[Path]] = {}
    # Sorting makes the processing order reproducible; iterdir() yields
    # entries in arbitrary, filesystem-dependent order.
    for language_dir in sorted(data_base.iterdir()):
        raw_dir = language_dir / "raw-documents"
        if not raw_dir.is_dir():
            continue
        # Keep regular files only: nested directories cannot be opened as
        # documents by the downstream processing step.
        res[language_dir.name] = sorted(
            entry for entry in raw_dir.iterdir() if entry.is_file()
        )
    return res

def get_stopwords_for_language(language: str, stopwords_path: Path) -> set[str]:
    """Return the stop-word set for ``language``.

    English, Russian and Portuguese come from the NLTK stop-word corpus.
    Bulgarian and Hindi are read from ``bulgarian.txt`` / ``hindi.txt`` inside
    ``stopwords_path``, one word per line.

    Args:
        language: Language name or code, case-insensitive. Accepted values:
            eng/en/english, ru/russian, pt/portuguese, bg/bulgarian, hi/hindi.
        stopwords_path: Directory containing the file-based stop-word lists.

    Returns:
        The set of stop words for the requested language.

    Raises:
        ValueError: If ``language`` is not one of the accepted values.
    """
    nltk_corpus_names = {
        "eng": "english", "en": "english", "english": "english",
        "ru": "russian", "russian": "russian",
        "pt": "portuguese", "portuguese": "portuguese",
    }
    stopword_files = {
        "bg": "bulgarian.txt", "bulgarian": "bulgarian.txt",
        "hi": "hindi.txt", "hindi": "hindi.txt",
    }

    key = language.lower()
    if key in nltk_corpus_names:
        return set(stopwords.words(nltk_corpus_names[key]))
    if key in stopword_files:
        # Explicit UTF-8: the lists are Cyrillic / Devanagari text and must
        # not depend on the platform's default encoding. The context manager
        # closes the handle that the bare open() call used to leak.
        with open(stopwords_path / stopword_files[key], encoding="utf-8") as handle:
            return {line.strip() for line in handle}
    # Falling through used to return None, which only surfaced later as an
    # opaque TypeError when the result was used in a membership test.
    raise ValueError(f"No stop-word list available for language {language!r}")

def process_file(filepath: Path, stop_words: set[str]) -> list[str]:
    """Read a document and return its cleaned, stop-word-filtered tokens.

    Cleaning steps, in order: remove http(s) URLs, lowercase, remove every
    character that is neither a word character nor whitespace, remove digits,
    collapse whitespace runs to a single space, split on spaces.

    Args:
        filepath: Text file to process; read as UTF-8.
        stop_words: Tokens to discard (compared against lowercased tokens).

    Returns:
        The remaining tokens in document order. Empty-string tokens are never
        returned, so a blank file yields an empty list.
    """
    url_pattern = re.compile(r'https?://\S+')  # lots of articles contain URLs

    # Context manager closes the handle; explicit UTF-8 keeps multilingual
    # text independent of the platform's default encoding.
    with open(filepath, encoding="utf-8") as handle:
        content = handle.read()

    content = url_pattern.sub("", content)
    content = content.lower()
    content = re.sub(r'[^\w\s]', "", content)  # non-word and non-whitespace characters
    content = re.sub(r'\d', "", content)  # remove digits
    content = re.sub(r"\s+", " ", content)  # newlines, tabs, duplicate whitespace -> one space

    # Leading/trailing whitespace leaves empty strings after split(" ");
    # they are not real tokens, so drop them along with the stop words.
    return [
        token
        for token in content.split(" ")
        if token and token not in stop_words
    ]

In [147]:
# Corpus and stop-word lists are expected in ../data relative to the
# notebook's working directory; cleaned documents go to ../data/tmp/<language>/.
base_path = Path.cwd().parent / "data"
filepaths = get_raw_files_per_language(base_path  / "training_data_16_October_release")

for language, language_files in filepaths.items():
    stop_words = get_stopwords_for_language(language, base_path / "stopwords")
    output_dir = base_path / "tmp" / language
    output_dir.mkdir(parents=True, exist_ok=True)
    for filepath in language_files:
        output_file = output_dir / filepath.name
        # Remove stale output from a previous run. missing_ok avoids the
        # FileNotFoundError a bare unlink() raises on the very first run.
        output_file.unlink(missing_ok=True)
        tokens = process_file(filepath, stop_words)
        # Explicit UTF-8 so Cyrillic / Devanagari tokens are written the same
        # way regardless of the platform's default encoding.
        with open(output_file, "w", encoding="utf-8") as file:
            file.write(" ".join(tokens))