In [12]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [13]:
# Ejemplo de datos sintéticos:
# Supondremos que tenemos correos marcados como spam o no spam (ham).
data = {
    'text': [
        "Hello friend, how are you today?",
        "WIN a brand new car by clicking this link!",
        "Just checking in to see how things are going",
        "CONGRATULATIONS, YOU WON a FREE vacation!",
        "Can we schedule a meeting for next week?",
        "This is not spam, just wanted to say hi",
        "Make money fast!!! Click here to learn how!",
        "Sale is on, buy one get one free, don't miss out"
    ],
    'label': [
        "ham",
        "spam",
        "ham",
        "spam",
        "ham",
        "ham",
        "spam",
        "spam"
    ]
}

df = pd.DataFrame(data)

In [14]:
# Preprocesamiento básico: pasar a minúsculas
df['text'] = df['text'].str.lower()

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered = [w for w in words if w not in stop_words]
    return " ".join(filtered)

df['text'] = df['text'].apply(remove_stopwords)


In [15]:
# Convertir el texto a una matriz de conteos de palabras
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['text'])  # Matriz de recuentos
y = df['label']


In [16]:
# Dividir datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Crear y entrenar el clasificador Naive Bayes
clf = MultinomialNB()
clf.fit(X_train, y_train)

In [18]:
# Predecir en el conjunto de prueba
y_pred = clf.predict(X_test)

In [19]:
# Evaluar exactitud
acc = accuracy_score(y_test, y_pred)
print("Exactitud:", acc)

Exactitud: 0.5


In [20]:
# Probar con un correo nuevo
nuevo_correo = "win a free gift now!!!"
nuevo_correo = nuevo_correo.lower()
nuevo_correo = remove_stopwords(nuevo_correo)
nuevo_vec = vectorizer.transform([nuevo_correo])

In [21]:
# Calcular las probabilidades de cada clase
probs = clf.predict_proba(nuevo_vec)
clases = clf.classes_
for c, p in zip(clases, probs[0]):
    print(f"Probabilidad de {c}: {p:.4f}")

Probabilidad de ham: 0.2977
Probabilidad de spam: 0.7023


In [22]:
print("Predicción final:", clf.predict(nuevo_vec)[0])

Predicción final: spam
