# ✅ Lematización Simple (sin POS tagging) para Tweets en Inglés

In [25]:
# 📦 IMPORTACIÓN DE LIBRERÍAS Y RECURSOS
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import string

# Download the NLTK resources used by the cells below.
for resource in ('wordnet', 'stopwords'):
    nltk.download(resource)
# Left as the cell's trailing expression so its return value is still displayed.
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rodrigovillacinda/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rodrigovillacinda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/rodrigovillacinda/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [27]:
# 🧹 CLEANING AND SIMPLE LEMMATIZATION: SHARED RESOURCES
# WordNetLemmatizer looks words up in the English WordNet, so it only
# lemmatizes English tokens.
lemmatizer = WordNetLemmatizer()
# The stop-word list must match the lemmatizer's language. The previous
# 'spanish' list kept English function words ("the", "is") while dropping
# English tokens that are also Spanish stop words (e.g. "no", "me", "he").
# NOTE(review): the tweets previewed in this notebook are English — confirm
# this holds for the whole dataset.
stop_words = set(stopwords.words('english'))
# Tokens are maximal runs of word characters; everything else is a separator.
tokenizer = RegexpTokenizer(r'\w+')

def limpiar_y_lematizar(text_limpio):
    """Normalize a text and return its lemmatized, stop-word-free form.

    The text is lower-cased, ASCII punctuation (``string.punctuation``) is
    deleted, the remainder is split with the module-level ``tokenizer``,
    tokens found in ``stop_words`` are discarded, and each surviving token is
    passed through ``lemmatizer.lemmatize``.

    Args:
        text_limpio: Text to process. Any value that is not a ``str`` yields
            an empty string.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    if not isinstance(text_limpio, str):
        return ""
    # Punctuation is deleted (not replaced by a space), so "don't" -> "dont".
    punctuation_table = str.maketrans('', '', string.punctuation)
    normalized = text_limpio.lower().translate(punctuation_table)
    lemmas = []
    for token in tokenizer.tokenize(normalized):
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)


In [29]:
# 📂 LOAD DATA AND APPLY
df = pd.read_csv('02tweetsTokenizados.csv')
print("Columnas del dataset:", df.columns)

# Lemmatize every value of 'text_limpio' and store the result in a new column.
# NOTE(review): in the recorded output the 'text_limpio' values contain no
# whitespace, so each row is a single token and comes back unchanged —
# confirm the upstream cleaning step preserves word boundaries.
df['text_limpio_limpio'] = df['text_limpio'].map(limpiar_y_lematizar)

# Persist the result (row index omitted) and preview the first rows.
df.to_csv('03lemmatized.csv', index=False)
print("✅ Archivo '03lemmatized.csv' generado con éxito.")
df.head(10)


Columnas del dataset: Index(['textID', 'text', 'selected_text', 'sentiment', 'text_limpio',
       'tokens'],
      dtype='object')
✅ Archivo '03lemmatized.csv' generado con éxito.


Unnamed: 0,textID,text,selected_text,sentiment,text_limpio,tokens,text_limpio_limpio
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,idhaverespondedifiweregoing,['idhaverespondedifiweregoing'],idhaverespondedifiweregoing
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooosadiwillmissyouhereinsandiego,['sooosadiwillmissyouhereinsandiego'],sooosadiwillmissyouhereinsandiego
2,088c60f138,my boss is bullying me...,bullying me,negative,mybossisbullyingme,['mybossisbullyingme'],mybossisbullyingme
3,9642c003ef,what interview! leave me alone,leave me alone,negative,whatinterviewleavemealone,['whatinterviewleavemealone'],whatinterviewleavemealone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,sonsofwhycouldnttheyputthemonthereleaseswealre...,['sonsofwhycouldnttheyputthemonthereleasesweal...,sonsofwhycouldnttheyputthemonthereleaseswealre...
5,28b57f3990,http://www.dothebouncy.com/smf - some shameles...,http://www.dothebouncy.com/smf - some shameles...,neutral,httpwwwdothebouncycomsmfsomeshamelesspluggingf...,['httpwwwdothebouncycomsmfsomeshamelesspluggin...,httpwwwdothebouncycomsmfsomeshamelesspluggingf...
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive,2amfeedingsforthebabyarefunwhenheisallsmilesan...,['2amfeedingsforthebabyarefunwhenheisallsmiles...,2amfeedingsforthebabyarefunwhenheisallsmilesan...
7,50e14c0bb8,Soooo high,Soooo high,neutral,soooohigh,['soooohigh'],soooohigh
8,e050245fbd,Both of you,Both of you,neutral,bothofyou,['bothofyou'],bothofyou
9,fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,Wow... u just became cooler.,positive,journeywowujustbecamecoolerheheisthatpossible,['journeywowujustbecamecoolerheheisthatpossible'],journeywowujustbecamecoolerheheisthatpossible
