In [1]:
import regex as re
from pathlib import Path
from nltk.corpus import stopwords
import spacy
from spacy import Language
import nltk
from typing import Set, List, Tuple
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def get_raw_files_per_language(data_base: Path) -> dict[str, List[Path]]:
    """Collect the raw document paths for every language directory.

    Every entry of ``data_base`` that contains a ``raw-documents`` folder is
    treated as one language.

    Args:
        data_base: Directory holding one sub-directory per language.

    Returns:
        Mapping from the language directory name to the sorted list of entries
        found inside its ``raw-documents`` folder.
    """
    res = {}
    for language_dir in sorted(data_base.iterdir()):
        raw_dir = language_dir / "raw-documents"
        # Stray files next to the language folders, or folders without raw
        # documents, are skipped instead of making iterdir() raise.
        if not raw_dir.is_dir():
            continue
        # iterdir() yields entries in arbitrary order; sort for reproducible runs.
        res[language_dir.name] = sorted(raw_dir.iterdir())
    return res

def get_stopwords_for_language(language: str, stopwords_path: Path) -> Set[str]:
    """Return the stop-word set for ``language``.

    English, Russian and Portuguese come from the NLTK stop-word corpus;
    Bulgarian and Hindi are read from ``bulgarian.txt`` / ``hindi.txt`` inside
    ``stopwords_path`` (one entry per line).

    Args:
        language: Language code or name, matched case-insensitively
            (e.g. ``"EN"``, ``"english"``, ``"bg"``).
        stopwords_path: Directory containing the file-based stop-word lists.

    Returns:
        The set of stop words for the requested language.

    Raises:
        ValueError: If ``language`` is not one of the supported codes/names.
    """
    lang = language.lower()

    # Languages served by the NLTK corpus: corpus name -> accepted aliases.
    nltk_backed = {
        "english": ("eng", "en", "english"),
        "russian": ("ru", "russian"),
        "portuguese": ("pt", "portuguese"),
    }
    for corpus_name, aliases in nltk_backed.items():
        if lang in aliases:
            return set(stopwords.words(corpus_name))

    # Languages served by a local text file: file name -> accepted aliases.
    file_backed = {
        "bulgarian.txt": ("bg", "bulgarian"),
        "hindi.txt": ("hi", "hindi"),
    }
    for filename, aliases in file_backed.items():
        if lang in aliases:
            # The context manager closes the handle; blank lines are ignored
            # so the empty string never ends up in the set.
            with open(stopwords_path / filename, encoding='utf-8') as sw_file:
                return {line.strip() for line in sw_file if line.strip()}

    # Fail loudly here rather than returning None and crashing later on a
    # membership test against the result.
    raise ValueError(f"No stop-word list available for language {language!r}")

# Cleaning patterns, compiled once instead of on every call.
_URL_PATTERN = re.compile(r'https?://\S+')  # remove urls- lots of articles have them
_NON_WORD_PATTERN = re.compile(r'[^\w\s]')  # non-word and non-whitespace characters
_DIGIT_PATTERN = re.compile(r'\d')
_WHITESPACE_PATTERN = re.compile(r"\s+")  # newlines, duplicate whitespaces, tabs, ...


def process_file(filepath: Path, stop_words: Set[str], nlp: Language) -> List[str]:
    """Read a text file, normalise it and return its non-stop-word tokens.

    The text is stripped of URLs, lowercased, stripped of punctuation and
    digits, and has every whitespace run collapsed to a single space before
    it is tokenized.

    Args:
        filepath: UTF-8 encoded text file to process.
        stop_words: Tokens to drop from the result (compared against the
            lowercased token text).
        nlp: spaCy pipeline used for tokenization. ``None`` is accepted, in
            which case the cleaned text is split on whitespace instead.

    Returns:
        The remaining tokens, in document order.
    """
    with open(filepath, encoding='utf-8') as file:
        content = file.read()

    content = _URL_PATTERN.sub("", content)
    content = content.lower()
    content = _NON_WORD_PATTERN.sub("", content)
    content = _DIGIT_PATTERN.sub("", content)
    # strip() so leading/trailing whitespace cannot turn into a token.
    content = _WHITESPACE_PATTERN.sub(" ", content).strip()

    if nlp is None:
        # No pipeline configured for this language: plain whitespace split.
        candidates = content.split()
    else:
        # Use spaCy's tokenizer
        candidates = [token.text for token in nlp(content)]

    # Whitespace-only tokens carry no content and are dropped with the stop words.
    return [text for text in candidates if text.strip() and text not in stop_words]

def add_tags(tokens: List[str], nlp: Language = None) -> List[Tuple[str, str, str]]:
    """Attach a part-of-speech tag and a lemma to each token.

    Args:
        tokens: Token strings to annotate.
        nlp: Optional spaCy pipeline. When given, the tokens are joined with
            single spaces and run through it, so the result holds one triple
            per token of the resulting document.

    Returns:
        ``(text, pos, lemma)`` triples. Without a pipeline, ``pos`` and
        ``lemma`` are the placeholder ``'_'`` for every input token.
    """
    if nlp is not None:
        tagged = []
        for tok in nlp(" ".join(tokens)):
            tagged.append((tok.text, tok.pos_, tok.lemma_))
        return tagged

    placeholder = '_'
    return [(word, placeholder, placeholder) for word in tokens]

def save_to_conllu(filepath: Path, content: List[Tuple[str, str, str]], language: str):
    """Write tagged tokens to ``filepath`` as a tab-separated table.

    The file starts with a ``# id / token / pos / lemma`` header line, holds
    one row per token with ids counting from 1, and ends with an empty line.

    Args:
        filepath: Destination file; overwritten if it already exists.
        content: ``(token, pos, lemma)`` triples to write.
        language: Not used by this function.
    """
    with open(filepath, "w", encoding='utf-8') as out:
        out.write("# id\ttoken\tpos\tlemma\n")
        out.writelines(
            f"{row_id}\t{token}\t{pos}\t{lemma}\n"
            for row_id, (token, pos, lemma) in enumerate(content, start=1)
        )
        out.write("\n")

In [3]:
# spaCy pipeline per language directory; None means no pipeline is configured
# and add_tags() falls back to '_' placeholders for POS and lemma.
spacy_models = {
    'EN' : 'en_core_web_sm',
    'PT' : 'pt_core_news_sm',
    'RU' : 'ru_core_news_sm',
    'BG' : None,
    'HI' : None,
}

# Download necessary spaCy models - only the ones that are missing, and through
# spaCy's own API so they are installed for the interpreter running this kernel
# (a bare `!python` uses whichever interpreter is first on PATH) and a re-run
# of the notebook does not download everything again.
for model_name in spacy_models.values():
    if model_name and not spacy.util.is_package(model_name):
        spacy.cli.download(model_name)

base_path = Path.cwd().parent / "data"
filepaths = get_raw_files_per_language(base_path  / "training_data_16_October_release")

for language in filepaths.keys():
    spacy_model = spacy_models[language]
    # nlp stays None for languages without a configured pipeline.
    nlp = spacy.load(spacy_model) if spacy_model else None

    stop_words = get_stopwords_for_language(language, base_path /  "stopwords")
    output_dir = base_path / "tmp" / language
    output_dir.mkdir(parents=True, exist_ok=True)
    for filepath in filepaths[language]:
        output_file = output_dir / (filepath.stem + ".conllu")
        if output_file.exists():
            output_file.unlink() #delete if file exists
        tokens = process_file(filepath, stop_words, nlp)
        tagged_tokens = add_tags(tokens, nlp)
        save_to_conllu(output_file, tagged_tokens, language)

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---- ----------------------------------- 1.6/12.8 MB 8.4 MB/s eta 0:00:02
     ---------- ----------------------------- 3.4/12.8 MB 9.2 MB/s eta 0:00:02
     ---------------- ----------------------- 5.2/12.8 MB 8.9 MB/s eta 0:00:01
     ---------------------- ----------------- 7.3/12.8 MB 9.1 MB/s eta 0:00:01
     ---------------------------- ----------- 9.2/12.8 MB 9.1 MB/s eta 0:00:01
     ----------------------------------- ---- 11.3/12.8 MB 9.2 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 8.9 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting ru-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.7.0/ru_core_news_sm-3.7.0-py3-none-any.whl (15.3 MB)
     ---------------------------------------- 0.0/15.3 MB ? eta -:--:--
     ---- ----------------------------------- 1.6/15.3 MB 7.6 MB/s eta 0:00:02
     -------- ------------------------------- 3.1/15.3 MB 8.0 MB/s eta 0:00:02
     ------------- -------------------------- 5.0/15.3 MB 8.2 MB/s eta 0:00:02
     ----------------- ---------------------- 6.6/15.3 MB 8.4 MB/s eta 0:00:02
     --------------------- ------------------ 8.1/15.3 MB 8.1 MB/s eta 0:00:01
     -------------------------- ------------- 10.2/15.3 MB 8.4 MB/s eta 0:00:01
     ------------------------------ --------- 11.8/15.3 MB 8.2 MB/s eta 0:00:01
     ----------------------------------- ---- 13.4/15.3 MB 8.1 MB/s eta 0:00:01
     ---------------------------------------  15.2/15.3 MB 8.2 MB/s eta 0:00:01
     -----------------------------


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting pt-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.7.0/pt_core_news_sm-3.7.0-py3-none-any.whl (13.0 MB)
     ---------------------------------------- 0.0/13.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/13.0 MB ? eta -:--:--
      --------------------------------------- 0.3/13.0 MB ? eta -:--:--
     -- ------------------------------------- 0.8/13.0 MB 1.4 MB/s eta 0:00:09
     --- ------------------------------------ 1.0/13.0 MB 1.5 MB/s eta 0:00:09
     ---- ----------------------------------- 1.3/13.0 MB 1.5 MB/s eta 0:00:08
     ---- ----------------------------------- 1.6/13.0 MB 1.5 MB/s eta 0:00:08
     ----- ---------------------------------- 1.8/13.0 MB 1.5 MB/s eta 0:00:08
     ------- -------------------------------- 2.4/13.0 MB 1.5 MB/s eta 0:00:08
     -------- ------------------------------- 2.6/13.0 MB 1.5 MB/s eta 0:00:08
     -------- ------------------------------- 2.9/13

In [9]:
# Spot-check one generated file: show the entry at index 50 of its lines
# (index 0 is the header line).
sample_path = base_path / "tmp" / "EN" / "EN_UA_000543.conllu"
with sample_path.open("r", encoding='utf-8') as file:
    lines = list(file)
print(lines[50])  # Line 50 in 0-indexed list

# POS Tagging and Stopword Issues
#
# Issues Identified
#
# POS Tagging
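# - Confusion between proper and common nouns: because the text is lowercased before tagging, proper nouns were frequently tagged as common nouns or verbs, and occasionally a common noun was tagged as a proper noun. For example: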
#   - "Unity Foods" was tagged as NOUN instead of PROPN.
#   - "Edvard Munch’s" was tagged as NOUN instead of PROPN.
#   - "residents" was tagged as PROPN instead of NOUN.
#   - "Jane" was tagged as NOUN instead of PROPN.
#   - "Fondas" was tagged as VERB instead of PROPN.
#
# Stopwords
# - Some stopwords (e.g., "that", "do", "we", "d", "s") were inadvertently left in the text.
#
# Improvements Implemented
#
# Enhanced Tokenization
# - Integrated spaCy's tokenizer to address complex tokenization cases more effectively. This change also resolved the issue of leftover stopwords.
#
# ---
#
# Overall, some errors still persist (see, for example, the sample token printed from the EN file in this notebook), but they occur significantly less often than before. These improvements have led to better analysis and more meaningful insights from the text data.



50	worry	NOUN	worry

