# 04 - Vectorización TF-IDF desde cero
En este notebook vectorizamos los tweets usando el esquema TF-IDF. Nota: a pesar del título, la implementación actual utiliza `TfidfVectorizer` de scikit-learn en lugar de una implementación propia sin bibliotecas externas.

In [3]:
# Step 1: load the cleaned and lemmatized dataset
import pandas as pd

# Read the lemmatized tweets from disk and preview the first five rows
lemmatized_csv = '03lemmatized.csv'
df = pd.read_csv(lemmatized_csv)
df.head(5)

Unnamed: 0,textID,text,selected_text,sentiment,text_limpio,tokens,text_limpio_limpio
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,id have responded if i were going,"['i', 'd', 'have', 'respond', 'if', 'i', 'be',...",i d have respond if i be go
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad i will miss you here in san diego,"['sooo', 'sad', 'i', 'will', 'miss', 'you', 'h...",sooo sad i will miss you here in san diego
2,088c60f138,my boss is bullying me...,bullying me,negative,my boss is bullying me,"['my', 'bos', 'be', 'bully']",my bos be bully
3,9642c003ef,what interview! leave me alone,leave me alone,negative,what interview leave me alone,"['what', 'interview', 'leave', 'alone']",what interview leave alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,sons of why couldnt they put them on the relea...,"['son', 'of', 'why', 'couldn', 't', 'they', 'p...",son of why couldn t they put them on the relea...


In [9]:
# TF-IDF vectorization using scikit-learn's TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Drop rows with a missing 'text' value and make sure every document is a string.
# NOTE(review): this vectorizes the raw 'text' column rather than the lemmatized
# 'text_limpio_limpio' column that the loaded frame also contains -- confirm
# that this is intended.
df = df.dropna(subset=['text'])
corpus = df['text'].astype(str).tolist()

vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(corpus)

# Convert to a DataFrame with one column per vocabulary term.
# NOTE(review): toarray() materializes the full dense matrix; this may be
# memory-hungry for a large vocabulary -- verify it fits in RAM.
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

# Attach the sentiment labels by POSITION, not by index label.
# After dropna() `df` keeps its original index labels (with gaps), while
# `tfidf_df` has a fresh 0..n-1 index. Assigning the Series directly would
# align on index labels, shifting labels onto the wrong rows and leaving NaN
# where a label is missing. to_numpy() drops the index so row i of the
# matrix gets the label of the i-th kept tweet.
# NOTE(review): if 'tweet' is also a vocabulary term, this assignment
# overwrites that feature column -- consider a distinct label column name
# once downstream consumers can be updated.
tfidf_df['tweet'] = df['sentiment'].to_numpy()
tfidf_df.head()

Unnamed: 0,00,000,000th,00am,00pm,01,02,024,02mxjj,03,...,½tearï,½timo,½ureo,½ve,½why,½whyyy,½y,½you,½z,½ï
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Step 6: save the vectorized dataset as CSV for use in NaiveBayes
output_csv = '04tweets_vectorizados_tfidf.csv'
tfidf_df.to_csv(output_csv, index=False)
print('✅ Dataset vectorizado con TF-IDF guardado.')