# Gaussian NB

## 1.- Explicación

Gaussian Naive Bayes es un clasificador rápido y muy sencillo de entrenar que asume que, dadas las clases (“fake” - 1 o “real” - 0), cada característica numérica sigue una distribución normal e independiente de las demás. Esto lo convierte en una excelente línea base: en segundos calcula las medias y varianzas por clase, te da un benchmark inicial y te permite identificar qué variables (longitud de texto, frecuencia de POS, TF-IDF de palabras clave…) discriminan mejor entre noticias falsas y verdaderas.

## 2.- Implementación

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report


import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB

: 

In [None]:
df = pd.read_csv("../../../../data/processed/train_preprocess_v1.csv")
df_test = pd.read_csv("../../../../data/processed/test_preprocess_v1.csv")

In [None]:
df.info()

### 2.1.- Intento 1: GaussianNB sobre rasgos básicos

In [None]:
X = df.drop(columns=['id', 'statement', 'label']) 
y = df['label']

num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
], remainder='drop')

pipe = Pipeline([
    ('pre', preprocessor),
    ('clf', GaussianNB())
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    random_state=42,
    stratify=y
)

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print("Matriz de confusión (fila=verdadero, columna=predicho):")
print(confusion_matrix(y_test, y_pred))
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))


Matriz de confusión (fila=verdadero, columna=predicho):
[[ 844  197]
 [1506  407]]

Classification report:
              precision    recall  f1-score   support

         0.0     0.3591    0.8108    0.4978      1041
         1.0     0.6738    0.2128    0.3234      1913

    accuracy                         0.4235      2954
   macro avg     0.5165    0.5118    0.4106      2954
weighted avg     0.5629    0.4235    0.3849      2954



In [None]:
X_test = df_test.drop(columns=['id', 'statement'])

y_pred = pipe.predict(X_test)

submission = pd.DataFrame({
    'id':    df_test['id'],
    'label': y_pred
})


filename = f"gaussiannb_submission_{datetime.datetime.now().strftime('%Y%m%d_%H%M')}.csv"
submission.to_csv(filename, columns=['id', 'label'], index=False)

print(f" Submission generada correctamente: '{filename}'")
print(submission.head())


ValueError: columns are missing: {'party_affiliation'}

### 2.2.- Intento 2: Incorporando ratios POS

In [None]:

import ast
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report

def add_pos_ratios(df):
    df = df.copy()
    noun, verb, adj, adv = [], [], [], []

    for row in df['pos_freq']:
        try:
            if isinstance(row, str):
                d = ast.literal_eval(row)
            elif isinstance(row, dict):
                d = row
            else:
                d = dict(row)
        except Exception:
            d = {}

        total = sum(d.values()) or 1
        noun.append(d.get('NOUN', 0) / total)
        verb.append(d.get('VERB', 0) / total)
        adj .append(d.get('ADJ',  0) / total)
        adv .append(d.get('ADV',  0) / total)

    df['noun_ratio'] = noun
    df['verb_ratio'] = verb
    df['adj_ratio']  = adj
    df['adv_ratio']  = adv
    return df

X = add_pos_ratios(df.drop(columns=['id', 'statement', 'label']))
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    random_state=42,
    stratify=y
)


num_cols = ['num_tokens', 'num_sentences', 'num_tokens_without_stopwords',
            'noun_ratio', 'verb_ratio', 'adj_ratio', 'adv_ratio']
cat_cols = [c for c in X.columns if c not in num_cols]

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
], remainder='drop')

pipe = Pipeline([
    ('pre', preprocessor),
    ('clf', GaussianNB())
])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
print("Matriz de confusión (fila=verdadero ⇒ columna=predicho):")
print(confusion_matrix(y_test, y_pred))
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))


Matriz de confusión (fila=verdadero ⇒ columna=predicho):
[[ 681  360]
 [1161  752]]

Classification report:
              precision    recall  f1-score   support

           0     0.3697    0.6542    0.4724      1041
           1     0.6763    0.3931    0.4972      1913

    accuracy                         0.4851      2954
   macro avg     0.5230    0.5236    0.4848      2954
weighted avg     0.5682    0.4851    0.4885      2954



In [None]:
def add_pos_ratios(df):
    df = df.copy()
    noun, verb, adj, adv = [], [], [], []
    for row in df['pos_freq']:
        try:
            d = ast.literal_eval(row) if isinstance(row, str) else dict(row)
        except Exception:
            d = {}
        total = sum(d.values()) or 1
        noun.append(d.get('NOUN', 0) / total)
        verb.append(d.get('VERB', 0) / total)
        adj .append(d.get('ADJ',  0) / total)
        adv .append(d.get('ADV',  0) / total)
    df['noun_ratio'] = noun
    df['verb_ratio'] = verb
    df['adj_ratio']  = adj
    df['adv_ratio']  = adv
    return df

X_test = add_pos_ratios(df_test.drop(columns=['id', 'statement']))

y_pred = pipe.predict(X_test)

submission = pd.DataFrame({
    'id':    df_test['id'],
    'label': y_pred
})

filename = f"gaussiannb_proporciones_de_POS_submission_{datetime.datetime.now().strftime('%Y%m%d')}.csv"
submission.to_csv(filename, columns=['id', 'label'], index=False)

print(f" Submission generada correctamente: '{filename}'")
print(submission.head())

✅ Submission generada correctamente: 'gaussiannb_proporciones_de_POS_submission_20250513.csv'
            id  label
0  dc32e5ffa8b      0
1  aa49bb41cab      1
2  dddc8d12ac1      0
3  bcfe8f51667      1
4  eedbbaff5ab      0


### 2.3.- Intento 3: Variables demográficas y de speaker


In [None]:
import ast
import datetime
import numpy as np
import pandas as pd
import category_encoders as ce

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    FunctionTransformer, StandardScaler,
    OneHotEncoder, QuantileTransformer
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    confusion_matrix, classification_report,
    make_scorer, f1_score
)

X = df[['subject','speaker_type', 'speaker_job', 'party_affiliation_uni']]
y = df['label']

num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
], remainder='drop')

pipe = Pipeline([
    ('pre', preprocessor),
    ('clf', GaussianNB())
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    random_state=42,
    stratify=y
)

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print("Matriz de confusión (fila=verdadero, columna=predicho):")
print(confusion_matrix(y_test, y_pred))
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))


Matriz de confusión (fila=verdadero, columna=predicho):
[[ 966   75]
 [1718  195]]

Classification report:
              precision    recall  f1-score   support

           0     0.3599    0.9280    0.5187      1041
           1     0.7222    0.1019    0.1787      1913

    accuracy                         0.3930      2954
   macro avg     0.5411    0.5149    0.3487      2954
weighted avg     0.5945    0.3930    0.2985      2954



In [None]:
X_test = df_test[['subject','speaker_type', 'speaker_job', 'party_affiliation_uni']]

y_pred = pipe.predict(X_test)

submission = pd.DataFrame({
    'id':    df_test['id'],
    'label': y_pred
})


filename = f"gaussiannb_submission_{datetime.datetime.now().strftime('%Y%m%d_%H%M')}.csv"
submission.to_csv(filename, columns=['id', 'label'], index=False)

print(f" Submission generada correctamente: '{filename}'")
print(submission.head())


✅ Submission generada correctamente: 'gaussiannb_submission_20250513_2207.csv'
            id  label
0  dc32e5ffa8b      0
1  aa49bb41cab      0
2  dddc8d12ac1      0
3  bcfe8f51667      0
4  eedbbaff5ab      0


In [36]:
print(df_test.columns)

Index(['id', 'statement', 'subject', 'speaker', 'speaker_job', 'state_info',
       'party_affiliation', 'party_affiliation_uni',
       'party_affiliation_category_map', 'statement_tokens', 'num_tokens',
       'num_sentences', 'pos_info', 'pos_freq', 'lemma_freq', 'tag_freq',
       'entities', 'stopwords', 'statement_tokens_without_stopwords',
       'num_tokens_without_stopwords', 'pos_info_without_stopwords',
       'pos_freq_without_stopwords', 'lemma_freq_without_stopwords',
       'tag_freq_without_stopwords', 'processed_subject', 'speaker_entities',
       'speaker_type', 'speaker_job_tokens', 'state_info_tokens',
       'party_affiliation_tokens'],
      dtype='object')


### 2.4.- Intento 4: MultinomialNB con TF-IDF y búsqueda de parámetros

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
import datetime

In [None]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('nb',    MultinomialNB())
])

param_grid = {
    'tfidf__max_features': [2000, 5000, None],
    'tfidf__ngram_range': [(1,1), (1,2)],
    'nb__alpha': [0.1, 0.5, 1.0]
}

grid = GridSearchCV(pipe, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid.fit(df['statement'], df['label'])

print("Mejor F1 (CV):", grid.best_score_)
print("Mejores parámetros:", grid.best_params_)

y_pred = grid.predict(df['statement'])
print(classification_report(df['label'], y_pred))
print("Matriz de confusión:\n", confusion_matrix(df['label'], y_pred))

Mejor F1 (CV): 0.7857383249956282
Mejores parámetros: {'nb__alpha': 1.0, 'tfidf__max_features': None, 'tfidf__ngram_range': (1, 2)}
              precision    recall  f1-score   support

           0       1.00      0.37      0.54      3155
           1       0.74      1.00      0.85      5795

    accuracy                           0.78      8950
   macro avg       0.87      0.68      0.69      8950
weighted avg       0.83      0.78      0.74      8950

Matriz de confusión:
 [[1157 1998]
 [   0 5795]]


In [None]:
y_pred = grid.predict(df_test['statement'])

submission = pd.DataFrame({
    'id':    df_test['id'],
    'label': y_pred
})

filename = f"submission_nb_{datetime.datetime.now():%Y%m%d_%H%M}.csv"
submission.to_csv(filename, index=False, columns=['id','label'])

print(f" Submission generada: {filename}")
print(submission.head())

✅ Submission generada: submission_nb_20250517_1424.csv
            id  label
0  dc32e5ffa8b      1
1  aa49bb41cab      1
2  dddc8d12ac1      1
3  bcfe8f51667      1
4  eedbbaff5ab      1


### 2.5.- Intento 5: pipelines mixtos y BERT


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, f1_score
from scipy.sparse import hstack
import joblib


In [None]:
def list_to_text(x):
    if isinstance(x, str):
        x = x.strip("[]").replace("'", "").replace('"', "")
        tokens = [tok.strip() for tok in x.split(",") if tok.strip()]
        return " ".join(tokens)
    else:
        return ""

for col in ['statement_tokens_without_stopwords', 'processed_subject', 'party_affiliation_tokens', 'speaker_job_tokens']:
    df[col] = df[col].apply(list_to_text)
    df_test[col] = df_test[col].apply(list_to_text)


text_col = 'statement_tokens_without_stopwords'
cat_cols = ['processed_subject', 'party_affiliation_tokens', 'speaker_job_tokens']

X_train_text = df[text_col]
X_train_cat = df[cat_cols]
y_train = df['label']

X_test_text = df_test[text_col]
X_test_cat = df_test[cat_cols]


from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.key].values

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, keys):
        self.keys = keys
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.keys]


text_pipeline = Pipeline([
    ('selector', TextSelector(text_col)),
    ('tfidf', TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1,2), sublinear_tf=True))
])


cat_pipeline = Pipeline([
    ('selector', ColumnSelector(cat_cols)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

X_text_train_tfidf = text_pipeline.fit_transform(df)
X_cat_train_oh = cat_pipeline.fit_transform(df)

X_text_test_tfidf = text_pipeline.transform(df_test)
X_cat_test_oh = cat_pipeline.transform(df_test)

from scipy.sparse import hstack

X_train_combined = hstack([X_text_train_tfidf, X_cat_train_oh])
X_test_combined = hstack([X_text_test_tfidf, X_cat_test_oh])

X_train_dense = X_train_combined.toarray()
X_test_dense = X_test_combined.toarray()

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_dense)
X_test_scaled = scaler.transform(X_test_dense)

gnb = GaussianNB()
gnb.fit(X_train_scaled, y_train)

y_train_pred = gnb.predict(X_train_scaled)
print("Train classification report GNB:")
print(classification_report(y_train, y_train_pred))

y_test_pred = gnb.predict(X_test_scaled)

joblib.dump(gnb, 'gnb_fake_news_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(text_pipeline, 'text_pipeline.pkl')
joblib.dump(cat_pipeline, 'cat_pipeline.pkl')

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

from datasets import Dataset

def concat_text(row):
    cats = []
    for c in cat_cols:
        cats.append(row[c])
    cats_concat = " ".join(cats)
    return row['statement'] + " " + cats_concat

df['bert_input'] = df.apply(concat_text, axis=1)
df_test['bert_input'] = df_test.apply(concat_text, axis=1)

hf_train_dataset = Dataset.from_pandas(df[['bert_input','label']])
hf_test_dataset = Dataset.from_pandas(df_test[['bert_input']])

def tokenize_function(examples):
    return tokenizer(examples['bert_input'], padding="max_length", truncation=True, max_length=256)

tokenized_train = hf_train_dataset.map(tokenize_function, batched=True)
tokenized_test = hf_test_dataset.map(tokenize_function, batched=True)


model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir='./bert_fake_news_model',
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=500,
    save_steps=500,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
) 

import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    f1 = f1_score(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    precision, recall, f1_w, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_train,  
    compute_metrics=compute_metrics,
)

trainer.train()

model.save_pretrained('./bert_fake_news_model')
tokenizer.save_pretrained('./bert_fake_news_model')


predictions = trainer.predict(tokenized_test)
pred_labels = np.argmax(predictions.predictions, axis=-1)


df_test['label_pred'] = pred_labels
df_test[['id', 'label_pred']].to_csv('test_predictions_bert.csv', index=False)
filename = f"submission_nb_{datetime.datetime.now():%Y%m%d_%H%M}.csv"
df_test.to_csv(filename, index=False, columns=['id','label'])

Train classification report GNB:
              precision    recall  f1-score   support

           0       0.56      1.00      0.72      3155
           1       1.00      0.57      0.73      5795

    accuracy                           0.72      8950
   macro avg       0.78      0.79      0.72      8950
weighted avg       0.84      0.72      0.72      8950



Map: 100%|██████████| 8950/8950 [00:00<00:00, 10430.27 examples/s]
Map: 100%|██████████| 3836/3836 [00:00<00:00, 10553.57 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

## **Conclusión**

El recorrido evidencia que, sin enriquecer el modelo con características lingüísticas relevantes o sin equilibrar el sesgo de clases, un Naive Bayes básico se estanca alrededor del 40–50 % de exactitud y favorece desproporcionadamente una etiqueta. Añadir razones POS impulsó el rendimiento, pero el verdadero salto llegará al explotar métodos de lenguaje profundo. 