# Lematización real usando NLTK
Este notebook aplica lematización real sobre los tokens usando `WordNetLemmatizer` de NLTK.

In [28]:
import pandas as pd
import re
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, download
import nltk
nltk.download('averaged_perceptron_tagger_eng')


# Descargar recursos necesarios de NLTK (si no se tienen)
download('stopwords')
download('wordnet')
download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/rodrigovillacinda/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rodrigovillacinda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rodrigovillacinda/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/rodrigovillacinda/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [30]:
# Función auxiliar para mapear POS tags de NLTK a los de WordNet
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [32]:
# Tokenización, limpieza, stopwords y lematización
stop_words = set(stopwords.words('spanish'))
lemmatizer = WordNetLemmatizer()

def lematizar_texto(texto):
    tokens = re.findall(r'[a-zA-Z]+', str(texto).lower())
    tokens_filtrados = [token for token in tokens if token not in stop_words]
    tagged = pos_tag(tokens_filtrados)
    lemas = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged]
    return lemas

In [34]:
# Cargar el dataset limpio
df = pd.read_csv("01tweetsLimpios.csv")
df["tokens"] = df["text"].apply(lematizar_texto)
df["text_limpio_limpio"] = df["tokens"].apply(lambda tokens: " ".join(tokens))
df[["text", "tokens", "text_limpio_limpio"]].head()

Unnamed: 0,text,tokens,text_limpio_limpio
0,"I`d have responded, if I were going","[i, d, have, respond, if, i, be, go]",i d have respond if i be go
1,Sooo SAD I will miss you here in San Diego!!!,"[sooo, sad, i, will, miss, you, here, in, san,...",sooo sad i will miss you here in san diego
2,my boss is bullying me...,"[my, bos, be, bully]",my bos be bully
3,what interview! leave me alone,"[what, interview, leave, alone]",what interview leave alone
4,"Sons of ****, why couldn`t they put them on t...","[son, of, why, couldn, t, they, put, them, on,...",son of why couldn t they put them on the relea...


In [36]:
# Guardar el archivo final con lemas
df.to_csv("03lemmatized.csv", index=False)
print("✅ Lematización real exportada como 03lemmatized.csv")
df.head(10)

✅ Lematización real exportada como 03lemmatized.csv


Unnamed: 0,textID,text,selected_text,sentiment,text_limpio,tokens,text_limpio_limpio
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,id have responded if i were going,"[i, d, have, respond, if, i, be, go]",i d have respond if i be go
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad i will miss you here in san diego,"[sooo, sad, i, will, miss, you, here, in, san,...",sooo sad i will miss you here in san diego
2,088c60f138,my boss is bullying me...,bullying me,negative,my boss is bullying me,"[my, bos, be, bully]",my bos be bully
3,9642c003ef,what interview! leave me alone,leave me alone,negative,what interview leave me alone,"[what, interview, leave, alone]",what interview leave alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,sons of why couldnt they put them on the relea...,"[son, of, why, couldn, t, they, put, them, on,...",son of why couldn t they put them on the relea...
5,28b57f3990,http://www.dothebouncy.com/smf - some shameles...,http://www.dothebouncy.com/smf - some shameles...,neutral,some shameless plugging for the best rangers f...,"[http, www, dothebouncy, com, smf, some, shame...",http www dothebouncy com smf some shameless pl...
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive,2am feedings for the baby are fun when he is a...,"[be, feeding, for, the, baby, be, fun, when, b...",be feeding for the baby be fun when be all smi...
7,50e14c0bb8,Soooo high,Soooo high,neutral,soooo high,"[soooo, high]",soooo high
8,e050245fbd,Both of you,Both of you,neutral,both of you,"[both, of, you]",both of you
9,fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,Wow... u just became cooler.,positive,journey wow u just became cooler hehe is that ...,"[journey, wow, u, just, become, cool, hehe, be...",journey wow u just become cool hehe be that po...
