# Vectorización del texto limpio
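Este notebook carga las primeras 5000 filas de `03lemmatized.csv`, vectoriza el texto lematizado (columna `text_lema`) con `CountVectorizer` (bolsa de palabras) y guarda el resultado, junto con la etiqueta de sentimiento, en `04tweets_vectorizados_bow.csv`.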

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load only a sample of the dataset (first 5000 rows) to keep the notebook fast.
df = pd.read_csv("03lemmatized.csv", nrows=5000)

# Fail early, with a clear message, if a column needed by the vectorization
# step is missing. (Previously a bare `df[[...]].head()` expression sat here;
# because it was not the last statement of the cell its result was discarded.)
columnas_requeridas = ["text_limpio_limpio", "text_lema", "sentiment"]
columnas_faltantes = [c for c in columnas_requeridas if c not in df.columns]
if columnas_faltantes:
    raise KeyError(f"Missing required columns in 03lemmatized.csv: {columnas_faltantes}")

# Preview the first two rows (last expression of the cell, so it is displayed).
df.head(2)

Unnamed: 0,textID,text,selected_text,sentiment,text_limpio,text_tokenizado,text_limpio_limpio,tokens_lema,text_lema
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,id have responded if i were going,"['responded', 'going']",responded going,"['respond', 'go']",respond go
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,soo sad i will miss you here in san diego,"['soo', 'sad', 'miss', 'san', 'diego']",soo sad miss san diego,"['soo', 'sad', 'miss', 'san', 'diego']",soo sad miss san diego


In [17]:
# Configure the bag-of-words vectorizer.
vectorizer = CountVectorizer(
    stop_words='english',  # drop common English stop words
    min_df=5,              # ignore terms that appear in fewer than 5 tweets
    max_df=0.8,            # ignore terms that appear in more than 80% of tweets
    max_features=3000      # optional: cap the vocabulary size
)

# Drop rows with a missing value in either text column. `text_lema` is the
# column that is actually vectorized, and CountVectorizer raises on NaN
# documents, so it must be part of the subset too.
df = df.dropna(subset=["text_limpio_limpio", "text_lema"])
X = vectorizer.fit_transform(df["text_lema"])

# Convert the sparse count matrix to a dense DataFrame, one column per term.
df_bow = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Attach the labels positionally. `df_bow` has a fresh 0..n-1 index while `df`
# keeps the gapped index left by dropna(); assigning the Series directly would
# align on index and misplace labels (and leave NaN in the trailing rows).
# NOTE(review): the label column is called "tweet"; if the token "tweet" made
# it into the vocabulary, this assignment overwrites that feature column.
# Renaming the label would change the output CSV schema, so it is only flagged.
df_bow["tweet"] = df["sentiment"].to_numpy()

# Save the result.
df_bow.to_csv("04tweets_vectorizados_bow.csv", index=False)
print("✅ Vectorización completa. Archivo guardado como 04tweets_vectorizados_bow.csv")



✅ Vectorización completa. Archivo guardado como 04tweets_vectorizados_bow.csv


In [18]:
# Total count of every vocabulary term over all rows (label column excluded),
# then keep the names of the 20 terms with the highest totals.
conteo_por_palabra = df_bow.drop("tweet", axis=1).sum(axis=0)
frecuentes = conteo_por_palabra.sort_values(ascending=False).index[:20]

# Show the counts of those 20 terms for the first 10 rows.
df_bow.loc[:, frecuentes].head(10)


Unnamed: 0,day,good,work,like,love,want,know,time,lol,today,think,happy,miss,make,look,dont,really,mother,thanks,home
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
