In [4]:
# gnb_simple_test.py

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report

In [19]:
df = pd.read_csv("../../../data/processed/train_preprocess_v1.csv")
df_test = pd.read_csv("../../../data/processed/test_preprocess_v1.csv")

In [10]:


# 1) Carga tu DataFrame
# df = pd.read_csv("train.csv")   # descoméntalo/adáptalo
# -------------------------------
X = df.drop(columns=['id', 'statement'])  # quitamos 'id' y 'statement'
y = df['label']

# 2) Identifica columnas numéricas y categóricas
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# 3) Preprocesador: escala numéricas, one-hot en categóricas
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
], remainder='drop')
# 4) Pipeline con GaussianNB
pipe = Pipeline([
    ('pre', preprocessor),
    ('clf', GaussianNB())
])

# 5) Train/Test split (33% test, estratificado)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    random_state=42,
    stratify=y
)

# 6) Entrena y predice
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# 7) Resultados: matriz de confusión + report
print("Matriz de confusión (fila=verdadero, columna=predicho):")
print(confusion_matrix(y_test, y_pred))
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))


Matriz de confusión (fila=verdadero, columna=predicho):
[[1028   13]
 [   4 1909]]

Classification report:
              precision    recall  f1-score   support

           0     0.9961    0.9875    0.9918      1041
           1     0.9932    0.9979    0.9956      1913

    accuracy                         0.9942      2954
   macro avg     0.9947    0.9927    0.9937      2954
weighted avg     0.9943    0.9942    0.9942      2954



In [22]:
X_test = df_test.drop(columns=['id', 'statement'])

# ————————————————————————————————
# 4) Predicción y creación del CSV
# ————————————————————————————————
y_pred = pipe.predict(X_test)

submission = pd.DataFrame({
    'id':    df_test['id'],
    'label': y_pred
})

submission.to_csv("submission.csv", index=False)
print("✅ submission.csv creada correctamente")
print(submission.head())

ValueError: columns are missing: {'label'}