In [None]:
!pip install -U spacy



In [None]:
!python -m spacy download es_core_news_sm

Collecting es-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy
# Load the small Spanish pipeline installed by the download cell above.
# `nlp` is the module-level pipeline object that the spaCy helpers below call.
nlp = spacy.load('es_core_news_sm')



---

# spaCy

---



In [None]:
import re
import unicodedata

def filters(value):
  """Replace separator characters with spaces and strip diacritics.

  Args:
    value: Any object; it is converted with ``str()`` before processing.

  Returns:
    The string with ``(``, ``)``, ``-`` and ``_`` replaced by a space and
    every combining mark removed.

  Note:
    All characters of Unicode category ``Mn`` are dropped after NFD
    decomposition, so ``ñ`` is reduced to ``n`` as well (``año`` -> ``ano``).
  """
  text = str(value)
  text = re.sub(r'[()\-_]', ' ', text)
  # Decompose `text` (not the raw `value`): otherwise the substitution above
  # is thrown away and non-string inputs make normalize() raise TypeError.
  decomposed = unicodedata.normalize('NFD', text)
  return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')

def is_token_allowed(token: "spacy.tokens.Token") -> bool:
  """Decide whether a spaCy token should be kept.

  Args:
    token: A token produced by the spaCy pipeline (not a plain string: the
      ``is_stop``, ``is_alpha`` and ``is_punct`` attributes are read).

  Returns:
    True only when ``token`` is truthy, is not a stop word, is alphabetic
    and is not punctuation; False otherwise.
  """
  return bool(
      token
      and not token.is_stop
      and token.is_alpha
      and not token.is_punct
  )

def preprocess_token(token: "spacy.tokens.Token") -> str:
  """Return the normalized lemma of a spaCy token.

  Args:
    token: A token produced by the spaCy pipeline; its ``lemma_`` is read.

  Returns:
    The lemma with surrounding whitespace stripped, in lower case.
  """
  return token.lemma_.strip().lower()

def process_text(df, columns):
    """Lemmatize and filter the text cells of the selected columns with spaCy.

    Each cell is run through the module-level ``nlp`` pipeline; tokens rejected
    by ``is_token_allowed`` are dropped and the remaining ones are replaced by
    ``preprocess_token`` output, joined with single spaces.

    Args:
        df: DataFrame to process. It is modified in place.
        columns: Column label(s) selecting the cells to process.

    Returns:
        The same ``df`` object, with the selected columns overwritten.
    """
    def _normalize(value):
        # One cell -> space-joined lemmas of the tokens that pass the filter.
        return ' '.join(
            preprocess_token(token)
            for token in nlp(value)
            if is_token_allowed(token)
        )

    selected = df[columns]
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. Look the method up on the class (not the instance) so a
    # column that happens to be named "map" cannot shadow it on older pandas.
    if hasattr(type(selected), 'map'):
        elementwise = selected.map
    else:
        elementwise = selected.applymap
    df[columns] = elementwise(_normalize)
    return df



---

# NLTK

---



In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import re
import string

# Download the required NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Module-level Spanish stemmer and stop-word set, shared by clean() below.
stemmer = nltk.SnowballStemmer("spanish")
stopword = set(stopwords.words('spanish'))

def clean(text):
    """Normalize raw text into a space-joined string of Spanish stems.

    Steps, in order: lower-case; remove ``[...]`` spans, URLs and ``<...>``
    tags; remove punctuation; delete newlines; remove word-character runs that
    contain a digit; drop words found in the module-level ``stopword`` set;
    stem the remaining words with the module-level ``stemmer``.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        The stems joined by single spaces. Empty items produced by consecutive
        spaces are kept, so the result may contain repeated spaces.
    """
    text = str(text).lower()
    # Raw strings: '\[', '\S', '\w' and '\d' are invalid escape sequences in a
    # normal string literal and trigger a SyntaxWarning on recent Python.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # string.punctuation is ASCII-only; add the Spanish inverted marks so they
    # do not stay glued to the following word (e.g. "¿has", "¡hasta").
    text = re.sub('[%s]' % re.escape(string.punctuation + '¿¡'), '', text)
    # Newlines are deleted (not replaced by a space), so words on either side
    # of a line break end up joined.
    text = text.replace('\n', '')
    text = re.sub(r'\w*\d\w*', '', text)
    stems = [
        stemmer.stem(word)
        for word in text.split(' ')
        if word not in stopword
    ]
    return ' '.join(stems)

# Define tus funciones
def is_token_allowed_nltk(token: str) -> bool:
  """Decide whether a plain-string token should be kept.

  Args:
    token: The token text.

  Returns:
    True when ``token`` is non-empty, purely alphabetic and its lower-cased
    form is not in the module-level Spanish ``stopword`` set.
  """
  # Reuse the module-level stop-word set instead of re-reading the corpus on
  # every call, and compare in lower case so capitalized stop words
  # (e.g. "El") are rejected too.
  return bool(
      token
      and token.lower() not in stopword
      and token.isalpha()
  )

def preprocess_token_nltk(token: str) -> str:
  """Return the Spanish Snowball stem of a token.

  Args:
    token: The token text.

  Returns:
    The stem of the lower-cased token.
  """
  # Reuse the module-level Spanish stemmer rather than constructing a new
  # SnowballStemmer on every call.
  return stemmer.stem(token.lower())

def process_text_nltk(df, columns):
    """Clean, stem and re-tokenize the text cells of the selected columns with NLTK.

    Each cell is passed through ``clean`` and then ``word_tokenize``; the
    resulting tokens are joined with single spaces.

    Args:
        df: DataFrame to process. It is modified in place.
        columns: Column label(s) selecting the cells to process.

    Returns:
        The same ``df`` object, with the selected columns overwritten.
    """
    def _normalize(value):
        # Tokenizing and re-joining collapses the repeated spaces clean() can leave.
        return ' '.join(word_tokenize(clean(value)))

    selected = df[columns]
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. Look the method up on the class (not the instance) so a
    # column that happens to be named "map" cannot shadow it on older pandas.
    if hasattr(type(selected), 'map'):
        elementwise = selected.map
    else:
        elementwise = selected.applymap
    df[columns] = elementwise(_normalize)
    return df


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
text = '𝙲𝚊𝚕𝚕 𝚘𝚏 𝙳𝚞𝚝𝚢 𝙱𝚕𝚊𝚌𝚔 𝙾𝚙𝚜 𝙲𝚘𝚕𝚍 𝚆𝚊𝚛 [Hóla] {Mi_sitio-web} es https://www.misitio.com.  5464546 Me {encanta} (programar) en <Python>! ¿Has visto el/ /último\ video en www.youtube.com? Es increíble. Además, mi dirección de correo electrónico es usuario123@ejemplo.com. Tengo 20 años y vivo en la calle 123. ¡Hasta luego!'

In [None]:
# spaCy demo: tokens of the sample text after pre-cleaning it with filters().
tokens = [
  preprocess_token(token)
  for token in nlp(filters(text))
  if is_token_allowed(token)
]
tokens

['𝙲𝚊𝚕𝚕',
 '𝚘𝚏',
 '𝙳𝚞𝚝𝚢',
 '𝙱𝚕𝚊𝚌𝚔',
 '𝙾𝚙𝚜',
 '𝙲𝚘𝚕𝚍',
 '𝚆𝚊𝚛',
 'hola',
 'encantar',
 'programar',
 'python',
 'has',
 'ver',
 'video',
 'increible',
 'direccion',
 'correo',
 'electronico',
 'ano',
 'vivo',
 'calle']

In [None]:
# spaCy demo: same pipeline on the raw sample text, without filters().
tokens = [
  preprocess_token(token)
  for token in nlp(text)
  if is_token_allowed(token)
]
tokens

['𝙲𝚊𝚕𝚕',
 '𝚘𝚏',
 '𝙳𝚞𝚝𝚢',
 '𝙱𝚕𝚊𝚌𝚔',
 '𝙾𝚙𝚜',
 '𝙲𝚘𝚕𝚍',
 '𝚆𝚊𝚛',
 'hóla',
 'encantar',
 'programar',
 'python',
 'has',
 'ver',
 'video',
 'increíble',
 'dirección',
 'correo',
 'electrónico',
 'año',
 'vivo',
 'calle']

In [None]:
# NLTK demo: tokens of the sample text after clean() (regex cleanup, stop-word removal, stemming).
tokens = [
  token
  for token in word_tokenize(clean(text))
]
tokens

['misitioweb',
 'encant',
 'program',
 '¿has',
 'vist',
 'ultim',
 'vide',
 'increibl',
 'ademas',
 'direccion',
 'corre',
 'electron',
 'años',
 'viv',
 'call',
 '¡hast',
 'lueg']