In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the spam/ham dataset from disk, decoding the file as Latin-1.
email_dataset = pd.read_csv(
    filepath_or_buffer='spam.csv',
    encoding='latin1',
)
# Preview the first five rows to see which columns the CSV contains.
email_dataset.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Print each column's dtype and non-null count for the freshly loaded frame.
email_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [4]:
# Re-display the first five rows; nothing has modified the frame since it was loaded.
email_dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
# Keep only the label (v1) and the message text (v2); the three "Unnamed" columns are
# almost entirely null (50, 12 and 6 non-null values out of 5572 rows).
# errors="ignore" makes the cell idempotent: re-running it after the columns are
# already gone is a no-op instead of a KeyError.
email_dataset = email_dataset.drop(
    columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"],
    errors="ignore",
)
email_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
# Confirm that only the v1 and v2 columns remain after the drop.
email_dataset.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Inspect the distinct values present in the label column v1.
email_dataset["v1"].unique()

array(['ham', 'spam'], dtype=object)

In [8]:
# Encode the text labels as integers: spam -> 1, ham -> 0.
# Values that are not keys of the mapping pass through unchanged, so re-running this
# cell on already-encoded labels is a no-op. A plain dict-based .map() would instead
# turn every unmapped value (including the already-encoded 0/1) into NaN.
label_to_int = {"spam": 1, "ham": 0}
email_dataset["v1"] = email_dataset["v1"].map(
    lambda label: label_to_int.get(label, label)
)

# Fail loudly if the column holds anything other than the two expected classes.
assert email_dataset["v1"].isin([0, 1]).all(), "v1 contains labels other than 'spam'/'ham'"

In [9]:
# Check that v1 now holds the integer-encoded labels (1 = spam, 0 = ham).
email_dataset.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# Features are the message text (v2); the target is the label column (v1).
X, y = email_dataset["v2"], email_dataset["v1"]


In [11]:
# TfidfVectorizer converts the free text into a numeric TF-IDF feature matrix.

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# Read from the source column rather than from X so the cell can be re-run safely:
# X is overwritten with the resulting matrix here, and feeding that matrix back into
# fit_transform on a second run would fail because it is no longer raw text.
X = vectorizer.fit_transform(email_dataset["v2"])

In [12]:
# Sanity check: the feature matrix and the target must have the same number of rows.
print(f"X shape: {X.shape}", f"y shape: {y.shape}", sep="\n")

X shape: (5572, 8672)
y shape: (5572,)


In [13]:
from sklearn.model_selection import train_test_split

SPLIT_SEED = 42
# Still seed NumPy's global RNG: estimators created without an explicit random_state
# draw from it.
np.random.seed(SPLIT_SEED)

# Split the raw text rather than a matrix vectorised on the full corpus, so that the
# TF-IDF vocabulary and IDF weights can be learned from the training rows only.
# random_state pins the split explicitly instead of relying on global RNG state, and
# stratify=y keeps the spam/ham ratio the same in both partitions.
text_train, text_test, y_train, y_test = train_test_split(
    email_dataset["v2"],
    y,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=y,
)

# Fit on the training messages only, then apply the fitted transform to the held-out
# messages. This prevents test-set information from leaking into the features.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)



In [14]:
# Core of the notebook: the idea is to try several different models, which is why
# they are collected in a dict that can be iterated over.
# Spam detection is a binary classification task, so every entry must be a
# classifier; a RandomForestRegressor would predict continuous values instead of the
# 0/1 class labels.

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor  # no longer used here; import kept so the name stays available
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


estimators = {
    "LogisticRegression": LogisticRegression(),
    "RandomForestClassifier": RandomForestClassifier(),
    "SVC": SVC()
}

In [15]:
# Core of the notebook: the idea is to try several different models, so each entry
# pairs an estimator ("model") with the hyper-parameter grid ("params") to search.

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


models = {
    "LogisticRegression": {
        # max_iter is raised from the default of 100 because the solver may not
        # converge within 100 iterations at weak regularisation (large C).
        "model": LogisticRegression(max_iter=1000),
        "params": {
            "C": [0.01, 0.1, 1, 10, 100],
            "solver": ["lbfgs", "liblinear"]
        }
    },
    "RandomForestClassifier": {
        # Fixed seed: without it every fit builds different trees, so scores and the
        # selected hyper-parameters change from run to run.
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5, 10]
        }
    },
    "SVC": {
        "model": SVC(),
        "params": {
            "C": [0.1, 1, 10],
            "kernel": ["linear", "rbf", "poly"]
        }
    }
}

In [None]:

from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# One fitted GridSearchCV per model name, kept so the overall winner can be selected
# once every candidate has been evaluated.
searches = {}

for name, model_info in models.items():
    model = model_info["model"]
    param_grid = model_info["params"]

    # 5-fold cross-validated search over the grid, using all available cores.
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
    grid_search.fit(X_train, y_train)
    searches[name] = grid_search

    # Best configuration for THIS model only (refit on the whole training set).
    fitted_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    # Evaluate on the held-out test set.
    test_score = fitted_model.score(X_test, y_test)

    # Report with per-class precision, recall and F1-score.
    y_pred = fitted_model.predict(X_test)
    classification = classification_report(y_test, y_pred)

    print(f"🔹 Modelo: {name}")
    print(f"   ✅ Mejores parámetros: {best_params}")
    print(f"   🔥 Mejor score en validación cruzada: {best_score:.4f}")
    print(f"   🎯 Score en test: {test_score:.4f}")
    print(f"Reporte clasificación: {classification}")
    print("-" * 50)

# Select the overall winner by cross-validation score. Choosing on the test score
# would leak test information into model selection. Without this step `best_model`
# would simply be whichever model happened to be iterated last.
best_name = max(searches, key=lambda key: searches[key].best_score_)
best_model = searches[best_name].best_estimator_
best_params = searches[best_name].best_params_
best_score = searches[best_name].best_score_
print(f"Mejor modelo global: {best_name} (score en validación cruzada: {best_score:.4f})")

## Probando el modelo con un caso real

In [None]:
# Classify one hand-written message with the already-fitted vectorizer and model.
mensaje = ["Congratulations! You have won a free iPhone. Click here to claim."]

# transform (not fit_transform): reuse the fitted vectorizer as it is.
mensaje_vectorizado = vectorizer.transform(mensaje)
prediccion = best_model.predict(mensaje_vectorizado)

# The message is reported as spam when the predicted value equals 1.
es_spam = prediccion[0] == 1
print("Spam" if es_spam else "Ham")