In [7]:
# gnb_simple_test.py

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report


import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB

In [2]:
# Load the preprocessed train and test CSVs (paths are relative to this notebook).
train_csv = "../../../data/processed/train_preprocess_v1.csv"
test_csv = "../../../data/processed/test_preprocess_v1.csv"

df = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [5]:


# Baseline experiment: GaussianNB on the preprocessed features as they come.

# Features are every column except the identifier, the raw text and the target.
X = df.drop(['id', 'statement', 'label'], axis=1)
y = df['label']

# Hold out 33% of the rows for evaluation, stratified on the label so both
# partitions keep the same class balance; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

# Route columns by dtype: int64/float64 columns are standardised and object
# columns are one-hot encoded. Columns of any other dtype are dropped by the
# transformer (remainder='drop').
num_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)
cat_cols = list(X.select_dtypes(include=['object']).columns)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        # Dense output because GaussianNB does not take sparse matrices;
        # categories unseen during fit are encoded as all zeros.
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
    ],
    remainder='drop',
)

# Preprocessing and classifier are chained so both are fitted on the training
# partition only.
pipe = Pipeline(steps=[('pre', preprocessor), ('clf', GaussianNB())])

# Fit on the training partition and predict the held-out partition.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Report: confusion matrix (rows = true class, columns = predicted class)
# followed by the per-class precision / recall / F1 table.
print("Matriz de confusión (fila=verdadero, columna=predicho):")
print(confusion_matrix(y_test, y_pred))
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))


Matriz de confusión (fila=verdadero, columna=predicho):
[[ 681  360]
 [1161  752]]

Classification report:
              precision    recall  f1-score   support

           0     0.3697    0.6542    0.4724      1041
           1     0.6763    0.3931    0.4972      1913

    accuracy                         0.4851      2954
   macro avg     0.5230    0.5236    0.4848      2954
weighted avg     0.5682    0.4851    0.4885      2954



In [10]:
# Score the unlabeled test file with the fitted pipeline. Only the identifier
# and the raw text are removed, mirroring the columns removed from the training
# frame (which additionally dropped the label).
X_test = df_test.drop(['id', 'statement'], axis=1)
y_pred = pipe.predict(X_test)

# One row per test example: its id next to the predicted label.
submission = df_test[['id']].assign(label=y_pred)

# The file name carries today's date (YYYYMMDD), so re-running on the same day
# overwrites the previous file.
today_stamp = datetime.datetime.now().strftime('%Y%m%d')
filename = f"gaussiannb_submission_{today_stamp}.csv"
submission.to_csv(filename, columns=['id', 'label'], index=False)

print(f"✅ Submission generada correctamente: '{filename}'")
print(submission.head())


✅ Submission generada correctamente: 'gaussiannb_submission_20250513.csv'
            id  label
0  dc32e5ffa8b      0
1  aa49bb41cab      1
2  dddc8d12ac1      0
3  bcfe8f51667      1
4  eedbbaff5ab      0


In [13]:
# gnb_with_pos_ratios.py

import ast
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report

# 1) Función para extraer ratios de POS de forma robusta
def add_pos_ratios(df):
    """Return a copy of ``df`` with four part-of-speech ratio columns appended.

    Every value of ``df['pos_freq']`` is interpreted as a mapping from POS tag
    to count. A value may be a dict, a string holding a Python literal
    (parsed with ``ast.literal_eval``), or anything else ``dict()`` accepts,
    such as a list of ``(tag, count)`` pairs. For each row the NOUN, VERB, ADJ
    and ADV counts are divided by the sum of all counts in that row.

    Rows that cannot be turned into a tag -> count mapping (missing values,
    malformed strings, non-numeric counts) get 0 for all four ratios, and a
    single warning reports how many rows were affected, so that a column in an
    unexpected format does not silently produce all-zero features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``pos_freq`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``noun_ratio``, ``verb_ratio``, ``adj_ratio`` and
        ``adv_ratio`` columns added (in that order).

    Warns
    -----
    UserWarning
        If at least one ``pos_freq`` value could not be parsed.
    """
    import warnings  # local import so the function does not rely on another cell

    # (output column, POS tag) pairs, in the order the columns are appended.
    tag_columns = (
        ('noun_ratio', 'NOUN'),
        ('verb_ratio', 'VERB'),
        ('adj_ratio', 'ADJ'),
        ('adv_ratio', 'ADV'),
    )

    df = df.copy()
    ratios = {column: [] for column, _ in tag_columns}
    n_unparsed = 0

    for row in df['pos_freq']:
        try:
            parsed = ast.literal_eval(row) if isinstance(row, str) else row
            # dict() normalises literals that are not dicts themselves (e.g. a
            # list of pairs) and rejects anything that is not mapping-like.
            counts = dict(parsed)
            # Summing inside the try also catches non-numeric counts.
            total = sum(counts.values())
        except Exception:
            # Best effort: an unusable row contributes zeros instead of
            # aborting the whole transformation, but it is counted.
            counts, total = {}, 0
            n_unparsed += 1

        # An empty mapping sums to 0; dividing by 1 keeps its ratios at 0.
        total = total or 1
        for column, tag in tag_columns:
            ratios[column].append(counts.get(tag, 0) / total)

    for column, _ in tag_columns:
        df[column] = ratios[column]

    if n_unparsed:
        warnings.warn(
            f"add_pos_ratios: {n_unparsed} of {len(df)} 'pos_freq' values could "
            "not be parsed; their ratios were set to 0.",
            stacklevel=2,
        )
    return df

# Second experiment: same GaussianNB pipeline, with POS ratio features added.

# Build the feature frame from the already-loaded training data: drop the
# identifier, raw text and target, then append the four POS ratio columns.
X = add_pos_ratios(df.drop(['id', 'statement', 'label'], axis=1))
y = df['label']

# Same 33% stratified hold-out and seed as the baseline cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

# Numeric features are listed explicitly; every other column of X is treated
# as categorical.
# NOTE(review): that means cat_cols still contains the raw 'pos_freq' column
# (add_pos_ratios does not remove it), so it is one-hot encoded alongside the
# ratios derived from it. The saved output of this cell matches the baseline
# cell exactly, which suggests the ratio features may not be contributing.
# Confirm whether this is intended.
num_cols = [
    'num_tokens',
    'num_sentences',
    'num_tokens_without_stopwords',
    'noun_ratio',
    'verb_ratio',
    'adj_ratio',
    'adv_ratio',
]
cat_cols = [name for name in X.columns if name not in num_cols]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        # Dense output because GaussianNB does not take sparse matrices;
        # categories unseen during fit are encoded as all zeros.
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
    ],
    remainder='drop',
)

# Chain preprocessing and classifier, then fit on the training partition.
pipe = Pipeline(steps=[('pre', preprocessor), ('clf', GaussianNB())])
pipe.fit(X_train, y_train)

# Evaluate on the held-out partition: confusion matrix (rows = true class,
# columns = predicted class) followed by the classification report.
y_pred = pipe.predict(X_test)
print("Matriz de confusión (fila=verdadero ⇒ columna=predicho):")
print(confusion_matrix(y_test, y_pred))
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))


Matriz de confusión (fila=verdadero ⇒ columna=predicho):
[[ 681  360]
 [1161  752]]

Classification report:
              precision    recall  f1-score   support

           0     0.3697    0.6542    0.4724      1041
           1     0.6763    0.3931    0.4972      1913

    accuracy                         0.4851      2954
   macro avg     0.5230    0.5236    0.4848      2954
weighted avg     0.5682    0.4851    0.4885      2954



In [15]:
def add_pos_ratios(df):
    """Return a copy of ``df`` with four part-of-speech ratio columns appended.

    Every value of ``df['pos_freq']`` is interpreted as a mapping from POS tag
    to count. A value may be a dict, a string holding a Python literal
    (parsed with ``ast.literal_eval``), or anything else ``dict()`` accepts,
    such as a list of ``(tag, count)`` pairs. For each row the NOUN, VERB, ADJ
    and ADV counts are divided by the sum of all counts in that row.

    Rows that cannot be turned into a tag -> count mapping (missing values,
    malformed strings, non-numeric counts) get 0 for all four ratios, and a
    single warning reports how many rows were affected, so that a column in an
    unexpected format does not silently produce all-zero features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``pos_freq`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``noun_ratio``, ``verb_ratio``, ``adj_ratio`` and
        ``adv_ratio`` columns added (in that order).

    Warns
    -----
    UserWarning
        If at least one ``pos_freq`` value could not be parsed.
    """
    import warnings  # local import so the function does not rely on another cell

    # (output column, POS tag) pairs, in the order the columns are appended.
    tag_columns = (
        ('noun_ratio', 'NOUN'),
        ('verb_ratio', 'VERB'),
        ('adj_ratio', 'ADJ'),
        ('adv_ratio', 'ADV'),
    )

    df = df.copy()
    ratios = {column: [] for column, _ in tag_columns}
    n_unparsed = 0

    for row in df['pos_freq']:
        try:
            parsed = ast.literal_eval(row) if isinstance(row, str) else row
            # dict() normalises literals that are not dicts themselves (e.g. a
            # list of pairs) and rejects anything that is not mapping-like.
            counts = dict(parsed)
            # Summing inside the try also catches non-numeric counts.
            total = sum(counts.values())
        except Exception:
            # Best effort: an unusable row contributes zeros instead of
            # aborting the whole transformation, but it is counted.
            counts, total = {}, 0
            n_unparsed += 1

        # An empty mapping sums to 0; dividing by 1 keeps its ratios at 0.
        total = total or 1
        for column, tag in tag_columns:
            ratios[column].append(counts.get(tag, 0) / total)

    for column, _ in tag_columns:
        df[column] = ratios[column]

    if n_unparsed:
        warnings.warn(
            f"add_pos_ratios: {n_unparsed} of {len(df)} 'pos_freq' values could "
            "not be parsed; their ratios were set to 0.",
            stacklevel=2,
        )
    return df

# Prepare the unlabeled test file the same way as the training features: drop
# the identifier and raw text, then append the POS ratio columns.
X_test = add_pos_ratios(df_test.drop(['id', 'statement'], axis=1))

# Predict with the pipeline fitted in the POS-ratio training cell.
y_pred = pipe.predict(X_test)

# One row per test example: its id next to the predicted label.
submission = df_test[['id']].assign(label=y_pred)

# The file name carries today's date (YYYYMMDD), so re-running on the same day
# overwrites the previous file.
today_stamp = datetime.datetime.now().strftime('%Y%m%d')
filename = f"gaussiannb_proporciones_de_POS_submission_{today_stamp}.csv"
submission.to_csv(filename, columns=['id', 'label'], index=False)

print(f"✅ Submission generada correctamente: '{filename}'")
print(submission.head())

✅ Submission generada correctamente: 'gaussiannb_proporciones_de_POS_submission_20250513.csv'
            id  label
0  dc32e5ffa8b      0
1  aa49bb41cab      1
2  dddc8d12ac1      0
3  bcfe8f51667      1
4  eedbbaff5ab      0
