# Explore here

In [1]:
import pandas as pd

# Cargar datos
url = "https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv"
data = pd.read_csv(url)

# Inspeccionar las primeras filas
print(data.head())
print(data.info())


                                                 url  is_spam
0  https://briefingday.us8.list-manage.com/unsubs...     True
1                             https://www.hvper.com/     True
2                 https://briefingday.com/m/v4n3i4f3     True
3   https://briefingday.com/n/20200618/m#commentform    False
4                        https://briefingday.com/fan     True
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      2999 non-null   object
 1   is_spam  2999 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 26.5+ KB
None


Procesar los enlaces

In [3]:
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Descargar recursos 
nltk.download('stopwords')
nltk.download('wordnet')

# Función para preprocesar las URLs
def preprocess_url(url):
    # Convertir a minúsculas
    url = url.lower()
    # Dividir por signos 
    url = re.split(r'\W+', url)
    # Eliminar stopwords
    stop_words = set(stopwords.words('english'))
    url = [word for word in url if word not in stop_words]
    # Lematizar
    lemmatizer = WordNetLemmatizer()
    url = [lemmatizer.lemmatize(word) for word in url]
    return ' '.join(url)

# Aplicar el preprocesamiento 
data['processed_url'] = data['url'].apply(preprocess_url)

# Dividir el conjunto de datos 
X = data['processed_url']
y = data['is_spam']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorización 
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("Preprocesamiento y vectorización completados.")
print(f"Conjunto de entrenamiento: {X_train_tfidf.shape}")
print(f"Conjunto de prueba: {X_test_tfidf.shape}")


[nltk_data] Downloading package stopwords to /home/gitpod/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/gitpod/nltk_data...


Preprocesamiento y vectorización completados.
Conjunto de entrenamiento: (2399, 5448)
Conjunto de prueba: (600, 5448)


Construcción de un SVM

In [4]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Crear el modelo
svm_model = SVC()

# Entrenar el modelo 
svm_model.fit(X_train_tfidf, y_train)

# Realizar predicciones 
y_pred = svm_model.predict(X_test_tfidf)

# Evaluar el modelo
print("Matriz de confusión:")
print(confusion_matrix(y_test, y_pred))
print("\nReporte de clasificación:")
print(classification_report(y_test, y_pred))
print("\nPrecisión del modelo:")
print(accuracy_score(y_test, y_pred))


Matriz de confusión:
[[442  13]
 [ 18 127]]

Reporte de clasificación:
              precision    recall  f1-score   support

       False       0.96      0.97      0.97       455
        True       0.91      0.88      0.89       145

    accuracy                           0.95       600
   macro avg       0.93      0.92      0.93       600
weighted avg       0.95      0.95      0.95       600


Precisión del modelo:
0.9483333333333334


 Optimización del modelo

In [5]:
from sklearn.model_selection import GridSearchCV

# Definir los hiperparámetros
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

# Configurar GridSearchCV
grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', verbose=2)

# Entrenar GridSearchCV
grid_search.fit(X_train_tfidf, y_train)

# Imprimir los mejores parámetros y la mejor puntuación
print("Mejores parámetros encontrados:")
print(grid_search.best_params_)
print("\nMejor precisión obtenida:")
print(grid_search.best_score_)

# Evaluar el modelo optimizado en el conjunto de prueba
best_svm_model = grid_search.best_estimator_
y_pred_optimized = best_svm_model.predict(X_test_tfidf)

print("\nMatriz de confusión (modelo optimizado):")
print(confusion_matrix(y_test, y_pred_optimized))
print("\nReporte de clasificación (modelo optimizado):")
print(classification_report(y_test, y_pred_optimized))
print("\nPrecisión del modelo optimizado:")
print(accuracy_score(y_test, y_pred_optimized))


Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.2s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.2s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.2s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.2s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.2s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.3s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.3s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.3s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.3s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.3s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=   0.4s
[CV] END ....................C=0.1, gamma=scale