# Tokenización y generación de texto limpio espaciado
Este notebook tokeniza el texto limpio, elimina las stopwords en inglés y los tokens de menos de tres letras, y genera una nueva columna con los tokens resultantes separados por espacios, lista para la vectorización.

In [3]:
import pandas as pd
import re
from nltk.corpus import stopwords

# Load the cleaned-tweets CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer="01tweetsLimpios.csv")
df.head(n=5)

Unnamed: 0,textID,text,selected_text,sentiment,text_limpio
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,id have responded if i were going
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,soo sad i will miss you here in san diego
2,088c60f138,my boss is bullying me...,bullying me,negative,my boss is bullying me
3,9642c003ef,what interview! leave me alone,leave me alone,negative,what interview leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,sons of why couldnt they put them on the relea...


In [5]:
# Load NLTK's *English* stopword list (the sample tweets shown above are in English)
# and keep it as a set so membership checks during tokenization are fast.
# NOTE: this needs the NLTK "stopwords" corpus to be installed locally
# (e.g. via nltk.download("stopwords")); otherwise NLTK raises a LookupError.
stop_words = set(stopwords.words('english'))

# Tokenize a text and filter out stopwords and very short tokens
def tokenizar_sin_stopwords(texto, palabras_vacias=None, longitud_minima=3):
    """Split ``texto`` into alphabetic tokens, dropping stopwords and short tokens.

    Parameters
    ----------
    texto : str or any scalar
        Text to tokenize. Non-string values are converted with ``str()``.
        Missing values (``None``, NaN, ``pd.NA``) yield an empty list.
    palabras_vacias : collection of str, optional
        Lowercase stopwords to remove. Defaults to the notebook-level
        ``stop_words`` set when omitted.
    longitud_minima : int, optional
        Minimum number of letters a token needs to be kept (default 3,
        i.e. tokens of one or two letters are discarded).

    Returns
    -------
    list of str
        The surviving tokens, in their original order and casing.
    """
    # Missing values would otherwise be stringified by str() into the bogus
    # tokens "None" / "nan", which pass both filters below.
    if texto is None or (pd.api.types.is_scalar(texto) and pd.isna(texto)):
        return []

    if palabras_vacias is None:
        palabras_vacias = stop_words

    # Only runs of ASCII letters count as tokens; digits and punctuation split them.
    tokens = re.findall(r'[a-zA-Z]+', str(texto))

    # Compare in lowercase because the stopword list is lowercase; the token
    # itself is returned unchanged.
    return [
        t for t in tokens
        if len(t) >= longitud_minima and t.lower() not in palabras_vacias
    ]


In [7]:
# Tokenize every cleaned tweet; each row ends up with a list of the kept tokens
tokens_por_fila = df["text_limpio"].apply(tokenizar_sin_stopwords)
df["text_tokenizado"] = tokens_por_fila
# Re-join each token list with single spaces to get a plain string for vectorization
df["text_limpio_limpio"] = tokens_por_fila.apply(" ".join)
# Preview the source column next to the two derived columns
df[["text_limpio", "text_tokenizado", "text_limpio_limpio"]].head()

Unnamed: 0,text_limpio,text_tokenizado,text_limpio_limpio
0,id have responded if i were going,"[responded, going]",responded going
1,soo sad i will miss you here in san diego,"[soo, sad, miss, san, diego]",soo sad miss san diego
2,my boss is bullying me,"[boss, bullying]",boss bullying
3,what interview leave me alone,"[interview, leave, alone]",interview leave alone
4,sons of why couldnt they put them on the relea...,"[sons, couldnt, put, releases, already, bought]",sons couldnt put releases already bought


In [9]:
# Save the final file, ready for vectorization.
# The output path is defined once so the confirmation message cannot drift from
# the file that is actually written (the message previously named a different file).
# NOTE: "text_tokenizado" holds Python lists; CSV stores them as their string
# representation, so they come back as strings when the file is re-read.
# NOTE: rows whose tokens were all filtered out have an empty "text_limpio_limpio";
# pandas reads empty CSV fields back as NaN by default, so downstream steps
# should handle missing values in that column.
ARCHIVO_SALIDA = "02tweetsTokenizados.csv"
df.to_csv(ARCHIVO_SALIDA, index=False)
print(f"✅ Tokenización completa y texto limpio generado. Guardado como {ARCHIVO_SALIDA}")
df.head(10)

✅ Tokenización completa y texto limpio generado. Guardado como 03lemmatized.csv


Unnamed: 0,textID,text,selected_text,sentiment,text_limpio,text_tokenizado,text_limpio_limpio
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,id have responded if i were going,"[responded, going]",responded going
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,soo sad i will miss you here in san diego,"[soo, sad, miss, san, diego]",soo sad miss san diego
2,088c60f138,my boss is bullying me...,bullying me,negative,my boss is bullying me,"[boss, bullying]",boss bullying
3,9642c003ef,what interview! leave me alone,leave me alone,negative,what interview leave me alone,"[interview, leave, alone]",interview leave alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,sons of why couldnt they put them on the relea...,"[sons, couldnt, put, releases, already, bought]",sons couldnt put releases already bought
5,28b57f3990,http://www.dothebouncy.com/smf - some shameles...,http://www.dothebouncy.com/smf - some shameles...,neutral,some shameless plugging for the best rangers f...,"[shameless, plugging, best, rangers, forum, ea...",shameless plugging best rangers forum earth
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive,am feedings for the baby are fun when he is al...,"[feedings, baby, fun, smiles, coos]",feedings baby fun smiles coos
7,50e14c0bb8,Soooo high,Soooo high,neutral,soo high,"[soo, high]",soo high
8,e050245fbd,Both of you,Both of you,neutral,both of you,[],
9,fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,Wow... u just became cooler.,positive,journey wow you just became cooler hehe is tha...,"[journey, wow, became, cooler, hehe, possible]",journey wow became cooler hehe possible
