In [2]:
# Entrenamiento con text_lema usando CountVectorizer y Naive Bayes

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Paso 1: Cargar el dataset preprocesado
df = pd.read_csv("03lemmatized.csv")

# Paso 2: Eliminar valores vacíos y limpiar
df = df.dropna(subset=["text_lema", "sentiment"])
df = df[df["text_lema"].str.strip() != ""]

# Paso 3: Vectorización con CountVectorizer
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=5,
    max_df=0.8,
    max_features=3000
)
X = vectorizer.fit_transform(df["text_lema"])
y = df["sentiment"].astype(str)

# Paso 4: Separar entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Paso 5: Implementación Naive Bayes desde cero (corregido para sparse y predicción)
class NaiveBayes:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.class_probs = {}
        self.word_probs = {}

        for c in self.classes:
            X_c = X[y == c]
            self.class_probs[c] = X_c.shape[0] / X.shape[0]
            total_wc = X_c.sum()
            self.word_probs[c] = (X_c.sum(axis=0) + 1) / (total_wc + X.shape[1])

    def predict(self, X):
        predictions = []
        for i in range(X.shape[0]):
            posteriors = {}
            row = X[i].toarray()[0]
            for c in self.classes:
                log_prob = np.log(self.class_probs[c]) + np.sum(row * np.log(self.word_probs[c].A1))
                posteriors[c] = log_prob
            predictions.append(max(posteriors, key=posteriors.get))
        return np.array(predictions)

# Paso 6: Entrenamiento y evaluación
model = NaiveBayes()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Paso 7: Métricas
accuracy = (y_pred == y_test.values).mean()
print(f'\n✅ Accuracy: {accuracy:.4f}')
print("\n📊 Matriz de Confusión:")
print(confusion_matrix(y_test, y_pred))
print("\n📈 Reporte de Clasificación:")
print(classification_report(y_test, y_pred))


✅ Accuracy: 0.6531

📊 Matriz de Confusión:
[[ 922  526  107]
 [ 344 1486  376]
 [  91  456 1169]]

📈 Reporte de Clasificación:
              precision    recall  f1-score   support

    negative       0.68      0.59      0.63      1555
     neutral       0.60      0.67      0.64      2206
    positive       0.71      0.68      0.69      1716

    accuracy                           0.65      5477
   macro avg       0.66      0.65      0.65      5477
weighted avg       0.66      0.65      0.65      5477

