# Vectorización del texto limpio
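Este notebook carga las primeras 5000 filas del archivo `03lemmatized.csv`, vectoriza la columna de texto limpio (`text_limpio_limpio`) con `CountVectorizer` (bolsa de palabras) y guarda la matriz de conteos, junto con la etiqueta de sentimiento, en un nuevo CSV: `04tweets_vectorizados_bow.csv`.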

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load only part of the dataset (here, the first 5000 rows).
INPUT_CSV = "03lemmatized.csv"
N_ROWS = 5000
df = pd.read_csv(INPUT_CSV, nrows=N_ROWS)

# Preview the two columns used below: the cleaned text and its label.
preview_columns = ['text_limpio_limpio', 'sentiment']
df[preview_columns].head()

Unnamed: 0,text_limpio_limpio,sentiment
0,i d have respond if i be go,neutral
1,sooo sad i will miss you here in san diego,negative
2,my bos be bully,negative
3,what interview leave alone,negative
4,son of why couldn t they put them on the relea...,negative


In [2]:
# Drop rows whose 'text_limpio_limpio' is NaN before vectorizing
# (CountVectorizer expects string documents).
df = df.dropna(subset=["text_limpio_limpio"])

# Vectorize the cleaned text into a bag-of-words count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df["text_limpio_limpio"])

# Convert to a DataFrame with one column per vocabulary term.
df_bow = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Attach the labels POSITIONALLY via .to_numpy(). After dropna() `df` keeps its
# original index (with gaps), while `df_bow` has a fresh 0..n-1 index; assigning
# the Series directly would align on index labels, so rows after a dropped row
# would receive another tweet's label and rows at dropped labels would get NaN.
# NOTE(review): the label column is named "tweet". If "tweet" is also a
# vocabulary term, this assignment overwrites that term's count column.
# Consider renaming it (e.g. "sentiment") together with any downstream readers
# of the CSV.
df_bow["tweet"] = df["sentiment"].to_numpy()

# Save the result.
df_bow.to_csv("04tweets_vectorizados_bow.csv", index=False)
print("✅ Vectorización completa. Archivo guardado como 04tweets_vectorizados_bow.csv")

✅ Vectorización completa. Archivo guardado como 04tweets_vectorizados_bow.csv


In [3]:
# Preview the first 10 rows of the bag-of-words frame.
df_bow.head(n=10)

Unnamed: 0,aaaaaaaahhhhhhhh,aaaaaah,aaaaaw,aaaawww,aaargh,aaarrrgh,aaaw,aaawww,aah,aasman,...,zit,zls,znl,zo,zombie,zorz,zu,zulu,zune,zwitschert
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
