# 04 - Vectorización TF-IDF desde cero
En este notebook implementamos la vectorización de tweets usando el esquema TF-IDF sin usar bibliotecas externas como `TfidfVectorizer`.

In [3]:
# Paso 1: Cargar dataset limpio y lematizado
import pandas as pd

# Cargar tweets lematizados
df = pd.read_csv('03lemmatized.csv')
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,text_limpio,tokens,text_limpio_limpio
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,id have responded if i were going,"['i', 'd', 'have', 'respond', 'if', 'i', 'be',...",i d have respond if i be go
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad i will miss you here in san diego,"['sooo', 'sad', 'i', 'will', 'miss', 'you', 'h...",sooo sad i will miss you here in san diego
2,088c60f138,my boss is bullying me...,bullying me,negative,my boss is bullying me,"['my', 'bos', 'be', 'bully']",my bos be bully
3,9642c003ef,what interview! leave me alone,leave me alone,negative,what interview leave me alone,"['what', 'interview', 'leave', 'alone']",what interview leave alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,sons of why couldnt they put them on the relea...,"['son', 'of', 'why', 'couldn', 't', 'they', 'p...",son of why couldn t they put them on the relea...


In [5]:
# Paso 2: Tokenizar (si no está tokenizado aún)
df = df.dropna(subset=['text'])  # Asegura que no hay NaN
df['tokens'] = df['text'].astype(str).apply(lambda x: x.split())


df.head()

Unnamed: 0,textID,text,selected_text,sentiment,text_limpio,tokens,text_limpio_limpio
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,id have responded if i were going,"[I`d, have, responded,, if, I, were, going]",i d have respond if i be go
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad i will miss you here in san diego,"[Sooo, SAD, I, will, miss, you, here, in, San,...",sooo sad i will miss you here in san diego
2,088c60f138,my boss is bullying me...,bullying me,negative,my boss is bullying me,"[my, boss, is, bullying, me...]",my bos be bully
3,9642c003ef,what interview! leave me alone,leave me alone,negative,what interview leave me alone,"[what, interview!, leave, me, alone]",what interview leave alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,sons of why couldnt they put them on the relea...,"[Sons, of, ****,, why, couldn`t, they, put, th...",son of why couldn t they put them on the relea...


In [7]:
# Paso 3: Calcular TF (frecuencia de término por documento)
from collections import Counter
import numpy as np

vocabulario = set()
tf_list = []

for tokens in df['tokens']:
    tf = Counter(tokens)
    vocabulario.update(tf.keys())
    tf_list.append(tf)

vocabulario = sorted(vocabulario)
print(f"Vocabulario total: {len(vocabulario)} palabras")

Vocabulario total: 52270 palabras


In [9]:
# Paso 4: Calcular DF (document frequency) y luego IDF
N = len(df)
df_counts = {term: 0 for term in vocabulario}

for tf in tf_list:
    for term in tf:
        df_counts[term] += 1

idf = {term: np.log(N / (df_counts[term])) for term in vocabulario}

In [None]:
# Paso 5: Calcular matriz TF-IDF
tfidf_matrix = []
for tf in tf_list:
    row = [tf.get(term, 0) * idf[term] for term in vocabulario]
    tfidf_matrix.append(row)

tfidf_df = pd.DataFrame(tfidf_matrix, columns=vocabulario)
tfidf_df['tweet'] = df['tweet']  # Agregar etiquetas
tfidf_df.head()

In [None]:
# Paso 6: Guardar en CSV para usar en NaiveBayes
tfidf_df.to_csv('04tweets_vectorizados_tfidf.csv', index=False)
print('✅ Dataset vectorizado con TF-IDF guardado.')