## Library imports

In [2]:
import regex as re
from pathlib import Path
from nltk.corpus import stopwords
import spacy
from spacy import Language
import nltk
from typing import Set, List, Tuple
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\IvanB\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Define functions

In [3]:
def get_raw_files_per_language(data_base: Path) -> dict[str, List[Path]]:
    """Collect the raw document paths of every language folder under ``data_base``.

    The expected layout is ``data_base/<LANGUAGE>/raw-documents/<document>``.
    Entries of ``data_base`` that do not contain a ``raw-documents`` directory
    (stray files such as ``.DS_Store``, unrelated folders) are skipped instead
    of raising, and only regular files are returned.

    Args:
        data_base: Directory holding one sub-directory per language.

    Returns:
        Mapping from the language directory name to the list of files found in
        its ``raw-documents`` folder. Both the languages and the files are
        sorted so repeated runs process documents in the same order.
    """
    files_per_language = {}
    for language_dir in sorted(data_base.iterdir()):
        raw_dir = language_dir / "raw-documents"
        # is_dir() is False both for a missing folder and for a plain file parent.
        if not raw_dir.is_dir():
            continue
        files_per_language[language_dir.name] = sorted(
            entry for entry in raw_dir.iterdir() if entry.is_file()
        )
    return files_per_language

def get_stopwords_for_language(language: str, stopwords_path: Path) -> Set[str]:
    """Return the stop-word set for ``language``.

    English, Russian and Portuguese lists come from the NLTK ``stopwords``
    corpus. Bulgarian and Hindi lists are read from ``bulgarian.txt`` /
    ``hindi.txt`` (one word per line, UTF-8) inside ``stopwords_path``.

    Args:
        language: Language code or name, matched case-insensitively
            (``eng``/``en``/``english``, ``ru``/``russian``,
            ``pt``/``portuguese``, ``bg``/``bulgarian``, ``hi``/``hindi``).
        stopwords_path: Directory containing the file-based stop-word lists.

    Returns:
        The stop words of the requested language.

    Raises:
        ValueError: If ``language`` is not one of the supported codes. Returning
            ``None`` here would only surface later as a confusing ``TypeError``
            in the membership test done while filtering tokens.
    """
    nltk_corpora = {
        "eng": "english", "en": "english", "english": "english",
        "ru": "russian", "russian": "russian",
        "pt": "portuguese", "portuguese": "portuguese",
    }
    stopword_files = {
        "bg": "bulgarian.txt", "bulgarian": "bulgarian.txt",
        "hi": "hindi.txt", "hindi": "hindi.txt",
    }

    key = language.lower()
    if key in nltk_corpora:
        return set(stopwords.words(nltk_corpora[key]))
    if key in stopword_files:
        # Context manager closes the handle; blank lines must not add "" to the set.
        with open(stopwords_path / stopword_files[key], encoding='utf-8') as handle:
            return {line.strip() for line in handle if line.strip()}

    supported = sorted(set(nltk_corpora) | set(stopword_files))
    raise ValueError(
        f"Unsupported language {language!r}; expected one of: {', '.join(supported)}"
    )

def process_file(filepath: Path, stop_words: Set[str], nlp: Language = None) -> List[str]:
    """Read a raw document, normalise it and return its non-stop-word tokens.

    Normalisation steps, in order: strip ``http(s)://`` URLs, lowercase, drop
    every character that is neither a word character nor whitespace, drop
    digits, collapse whitespace runs to single spaces and trim the ends.

    Args:
        filepath: UTF-8 encoded text file to process.
        stop_words: Tokens to discard (compared against the lowercased text).
        nlp: spaCy pipeline whose tokenizer splits the text. When ``None``
            (languages without a configured spaCy model) the normalised text
            is split on whitespace instead, mirroring how ``add_tags`` treats
            a missing pipeline.

    Returns:
        The remaining tokens in document order.
    """
    with open(filepath, encoding='utf-8') as file:
        content = file.read()

    # NOTE: in this notebook `re` is bound to the third-party `regex` package
    # by the import cell.
    content = re.sub(r'https?://\S+', "", content)  # remove urls- lots of articles have them
    content = content.lower()
    content = re.sub(r'[^\w\s]', "", content)  # non-word and non-whitespace characters
    content = re.sub(r'\d', "", content)  # remove digits
    # Collapse newlines/tabs/duplicate spaces; trim so no whitespace-only token
    # is produced for leading or trailing blanks.
    content = re.sub(r"\s+", " ", content).strip()

    if nlp is None:
        candidates = content.split()
    else:
        # Only token texts are needed, so run the tokenizer alone instead of
        # the whole tagging/parsing pipeline.
        candidates = [token.text for token in nlp.make_doc(content) if not token.is_space]

    return [text for text in candidates if text not in stop_words]

def add_tags(tokens: List[str], nlp: Language = None) -> List[Tuple[str, str, str]]:
    """Attach a part-of-speech tag and a lemma to every token.

    Args:
        tokens: Token strings to annotate.
        nlp: spaCy pipeline used for tagging. When omitted, no analysis is
            done and ``'_'`` placeholders are used for both tag and lemma.

    Returns:
        ``(text, pos, lemma)`` triples. With a pipeline, the tokens are joined
        with single spaces and re-analysed, so the triples describe the tokens
        of that re-analysed text.
    """
    if nlp is not None:
        analysed = nlp(" ".join(tokens))
        tagged = []
        for item in analysed:
            tagged.append((item.text, item.pos_, item.lemma_))
        return tagged

    # No pipeline available: keep the tokens and fill in placeholders.
    return [(word, '_', '_') for word in tokens]

def save_to_conllu(filepath: Path, content: List[Tuple[str, str, str]], language: str):
    """Write tagged tokens to ``filepath`` as a tab-separated table.

    The file starts with a ``# id  token  pos  lemma`` header line, followed by
    one row per token numbered from 1, and ends with an empty line. An
    existing file is overwritten.

    Args:
        filepath: Destination file.
        content: ``(token, pos, lemma)`` triples in output order.
        language: Language code of the document; currently not used when
            writing.
    """
    with open(filepath, "w", encoding='utf-8') as out:
        out.write("# id\ttoken\tpos\tlemma\n")
        # Generator keeps the rows streaming after the header, one per token.
        out.writelines(
            f"{row_id}\t{token}\t{pos}\t{lemma}\n"
            for row_id, (token, pos, lemma) in enumerate(content, start=1)
        )
        out.write("\n")

## Run preprocessing

In [6]:
# spaCy pipeline used for each language folder; None means no spaCy model is
# configured for that language (tokens are then written with '_' placeholders).
spacy_models = {
    'EN' : 'en_core_web_sm',
    'PT' : 'pt_core_news_sm',
    'RU' : 'ru_core_news_sm',
    'BG' : None,
    'HI' : None,
}

# Download only the models that are missing. Going through spaCy's Python API
# installs into the kernel's own environment and avoids re-downloading every
# model on each "Run All".
for model_name in filter(None, spacy_models.values()):
    if not spacy.util.is_package(model_name):
        spacy.cli.download(model_name)

base_path = Path.cwd().parent / "data"
filepaths = get_raw_files_per_language(base_path / "training_data_16_October_release")

for language, language_files in filepaths.items():
    spacy_model = spacy_models[language]
    nlp = spacy.load(spacy_model) if spacy_model else None

    stop_words = get_stopwords_for_language(language, base_path / "stopwords")
    output_dir = base_path / "tmp" / language
    output_dir.mkdir(parents=True, exist_ok=True)
    for filepath in language_files:
        output_file = output_dir / (filepath.stem + ".conllu")
        # Remove a stale result first so a failure below cannot leave an
        # outdated file behind; missing_ok avoids the exists()/unlink() race.
        output_file.unlink(missing_ok=True)
        tokens = process_file(filepath, stop_words, nlp)
        tagged_tokens = add_tags(tokens, nlp)
        save_to_conllu(output_file, tagged_tokens, language)

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ----- ---------------------------------- 1.8/12.8 MB 11.2 MB/s eta 0:00:01
     ------------ --------------------------- 3.9/12.8 MB 10.2 MB/s eta 0:00:01
     -------------------- ------------------- 6.6/12.8 MB 10.9 MB/s eta 0:00:01
     ---------------------------- ----------- 9.2/12.8 MB 11.4 MB/s eta 0:00:01
     ----------------------------------- --- 11.8/12.8 MB 11.5 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 11.0 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting ru-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.7.0/ru_core_news_sm-3.7.0-py3-none-any.whl (15.3 MB)
     ---------------------------------------- 0.0/15.3 MB ? eta -:--:--
     -- ------------------------------------- 0.8/15.3 MB 5.6 MB/s eta 0:00:03
     ------ --------------------------------- 2.4/15.3 MB 6.7 MB/s eta 0:00:02
     ---------- ----------------------------- 4.2/15.3 MB 7.4 MB/s eta 0:00:02
     ---------------- ----------------------- 6.3/15.3 MB 8.0 MB/s eta 0:00:02
     ---------------------- ----------------- 8.7/15.3 MB 8.7 MB/s eta 0:00:01
     ---------------------------- ----------- 10.7/15.3 MB 8.9 MB/s eta 0:00:01
     ---------------------------------- ----- 13.1/15.3 MB 9.3 MB/s eta 0:00:01
     ---------------------------------------  15.2/15.3 MB 9.7 MB/s eta 0:00:01
     ---------------------------------------- 15.3/15.3 MB 9.3 MB/s eta 0:00:00
[38;5;2m[+] Download and installa


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting pt-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.7.0/pt_core_news_sm-3.7.0-py3-none-any.whl (13.0 MB)
     ---------------------------------------- 0.0/13.0 MB ? eta -:--:--
     -- ------------------------------------- 0.8/13.0 MB 8.3 MB/s eta 0:00:02
     -------- ------------------------------- 2.6/13.0 MB 8.9 MB/s eta 0:00:02
     --------------- ------------------------ 5.0/13.0 MB 9.7 MB/s eta 0:00:01
     ----------------------- ---------------- 7.6/13.0 MB 10.4 MB/s eta 0:00:01
     ------------------------------ -------- 10.2/13.0 MB 10.8 MB/s eta 0:00:01
     ------------------------------------- - 12.6/13.0 MB 11.0 MB/s eta 0:00:01
     --------------------------------------- 13.0/13.0 MB 10.4 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## English (EN) output analysis

In [None]:
# Inspect a single row of one EN output file.
# Index 0 of `lines` is the "# id ..." header, so index 51 is the row with token id 51.
with open(base_path / "tmp" / "EN" / "EN_UA_000543.conllu", "r", encoding='utf-8') as file:
    lines = file.readlines()
    print(lines[51], end="")  # the row already ends with a newline

### Conclusion for EN

POS Tagging and Stopword Issues

Issues Identified

POS Tagging
- Misclassification of Proper Nouns: Proper nouns were frequently misclassified due to lowercasing. For example:
  - "Unity Foods" was tagged as NOUN instead of PROPN.
  - "Edvard Munch’s" was tagged as NOUN instead of PROPN.
  - "residents" was tagged as PROPN instead of NOUN.
  - "Jane" was tagged as NOUN instead of PROPN.
  - "Fondas" was tagged as VERB instead of PROPN.

Stopwords
- Some stopwords (e.g., "that", "do", "we", "d", "s") were inadvertently left in the text.

Improvements Implemented

Enhanced Tokenization
- Integrated spaCy's tokenizer to address complex tokenization cases more effectively. This change also resolved the issue of leftover stopwords.

---

Overall, while some errors still persist (like the printed word), there has been a significant reduction in their occurrence. These improvements have led to better analysis and more meaningful insights from the text data.

## Portuguese (PT) output analysis

In [7]:
# Sample output file in PT
output_file_path = base_path / "tmp" / "PT" / "PT_21.conllu"
with open(output_file_path, "r", encoding='utf-8') as file:
    lines = file.readlines()

# Index 0 is the header line, so this shows the first nine token rows (ids 1-9).
# Slicing does not raise on files with fewer rows, and end="" avoids printing a
# blank line after every row (each row already ends with a newline).
for line in lines[1:10]:
    print(line, end="")

1	fundo	PROPN	Fundo

2	brasil	PROPN	Brasil

3	apoia	VERB	apoiar

4	projetos	NOUN	projeto

5	comunidades	NOUN	comunidade

6	tradicionais	ADJ	tradicional

7	aviso	NOUN	aviso

8	comunidades	NOUN	comunidade

9	tradicionais	ADJ	tradicional



In [17]:
# Raw PT_21 document, shown next to the processed output for comparison.
original_file_path = (
    base_path
    / "training_data_16_October_release"
    / "PT"
    / "raw-documents"
    / "PT_21.txt"
)
with original_file_path.open("r", encoding='utf-8') as file:
    original_content = file.read()

# Only the first 375 characters are shown to keep the output short.
preview = original_content[:375]
print(preview)

Não queremos que o conflito na Ucrânia se transforme numa guerra entre a Rússia e a NATO, diz Alemanha

A insinuação feita na segunda-feira (26) pelo presidente francês, Emmanuel Macron, de que os aliados da Organização do Tratado do Atlântico Norte (NATO) poderiam enviar tropas para a Ucrânia ainda está a ter repercussões entre os líderes europeus e norte-americanos.

Est


### Conclusion for PT

During a detailed manual inspection of six randomly selected documents: PT_11, PT_21, PT_31, PT_171, PT_181, PT_191 (the first 3 documents on the topic of Ukraine and the latter 3 documents on the topic of climate change), several types of errors were identified in the preprocessing and annotation.

These mistakes can be broadly categorized into three main types: POS Tagging Errors, Named Entity Recognition (NER) Failures, and Lemmatization Issues.

**1. POS Tagging Errors**

This type of error occurs when the algorithm incorrectly classifies the part of speech (POS) of a token.

- PT_11: "armadas" was tagged as a VERB, but in the context, it should be an ADJ.
- PT_21: "parte" was classified as a VERB, but it should be a NOUN.
- PT_181: "recursos" was marked as an ADJ, but it should be a NOUN.

Possible solutions include using contextualized models built on a more robust language model, such as BERT or another transformer-based model fine-tuned specifically for Portuguese, or adding linguistic rules that capture the nuances of Portuguese, such as words that can function as multiple parts of speech depending on the context (e.g., "caso").

**2. Named Entity Recognition (NER) Failures**

NER failures occur when named entities, such as proper nouns (PROPN), are misclassified as common nouns (NOUN) or other POS categories.

- PT_11: "reuters" was tagged as a NOUN but should be PROPN.
- PT_21: "ucrânia" (Ukraine) was classified as a VERB, but it should be a PROPN.
- PT_191: "catarina" was labeled as a NOUN, but it should be a PROPN.

Possible Solutions could involve fine-tuning a NER model for Portuguese using a corpus rich in proper nouns and named entities, as well as ensuring the preprocessing step does not affect the capitalization of words, as this is often a strong indicator of proper nouns in Portuguese.

**3. Lemmatization Issues**

Lemmatization errors are when the lemma assigned to a token is incorrect.

- PT_171: "fizesse" was correctly identified as a VERB, but the lemma should be "fazer," not "fizesse."
- PT_31: "terraar" is a lemmatization error, where "terra-ar" should have been split into "terra" and "ar" and processed separately.

Possible solutions include using a comprehensive lemmatization dictionary tailored for Portuguese, which can handle verb conjugations and compound words, as well as improving tokenization to handle hyphenated ("-") words correctly.
