In [None]:
import re
from unidecode import unidecode
from nltk.tokenize import word_tokenize
import pandas as pd
import spacy
from nltk.corpus import stopwords
import docx
import stanza
stanza.download('es')
def clean_text(texto):
    text_ascii = unidecode(texto)
    result = re.sub(r'[^A-Za-z0-9 ]', '', text_ascii)
    result = result.lower()
    return result

def get_tokens(
    df : pd.DataFrame,
    text_col : str
    ) -> pd.DataFrame:
    
    df = df.copy()
    df[text_col] = df[text_col].apply(lambda x: word_tokenize(x))
    return df

def remove_stopwords(
    df : pd.DataFrame,
    stopwords : list,
    tokens_col : str
    ) -> pd.DataFrame:
    df[tokens_col] = df[tokens_col].apply(lambda x: [word for word in x if word not in stopwords])
    return df

def lemmatize_stanza(text_list):
    nlp = stanza.Pipeline('es')
    final = []
    for text in text_list:
        doc = nlp(text)
        for sentence in doc.sentences:
            for word in sentence.words:
                final.append(word.lemma)
    return final

def lemmatize_tokens(
    df : pd.DataFrame,
    tokens_col : str
    ) -> pd.DataFrame:
    def lemmatize(text):
        nlp = spacy.load("es_core_news_sm")
        return nlp(text)[0].lemma_
    df[tokens_col] = df[tokens_col].apply(lambda x: lemmatize_stanza(x))
    return df

def get_text(filename):
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    return '\n'.join(fullText)

def preprocess_text(
        df : pd.DataFrame,
        text_col : str,
        stopwords : list
    ) -> pd.DataFrame:

    df = df.copy()
    df[text_col] = df[text_col].apply(clean_text)
    df = get_tokens(df, text_col)
    df = remove_stopwords(df, stopwords, text_col)
    df = lemmatize_tokens(df, text_col)
    return df

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 424kB [00:00, 53.0MB/s]                    
2025-03-01 15:25:18 INFO: Downloaded file to /Users/aladelca/stanza_resources/resources.json
2025-03-01 15:25:18 INFO: Downloading default packages for language: es (Spanish) ...
2025-03-01 15:25:19 INFO: File exists: /Users/aladelca/stanza_resources/es/default.zip
2025-03-01 15:25:21 INFO: Finished downloading models and saved to /Users/aladelca/stanza_resources


In [22]:
texto = get_text("/Users/aladelca/Downloads/Carlos Adrián Alarcón Delgado alarcon.docx")
df = pd.DataFrame({"texto": [texto]})
preprocess_text(
    df,
    "texto",
    list(set(stopwords.words('spanish')))
)

2025-03-01 15:22:56 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 424kB [00:00, 49.9MB/s]                    
2025-03-01 15:22:57 INFO: Downloaded file to /Users/aladelca/stanza_resources/resources.json
2025-03-01 15:22:58 INFO: Loading these models for language: es (Spanish):
| Processor    | Package           |
------------------------------------
| tokenize     | combined          |
| mwt          | combined          |
| pos          | combined_charlm   |
| lemma        | combined_nocharlm |
| constituency | combined_charlm   |
| depparse     | combined_charlm   |
| sentiment    | tass2020_charlm   |
| ner          | conll02           |

2025-03-01 15:22:58 INFO: Using device: cpu
2025-03-01 15:22:58 INFO: Loading: tokenize
2025-03-01 1

KeyboardInterrupt: 