# Actividad 2: PRÁCTICA DE CLASIFICACIÓN DE TEXTOS
## Borja Lacalle Álvarez

In [48]:
#importar librerias para el desarrollo de la actividad
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import nltk

In [49]:
# leer dataset spam.csv
spam = pd.read_csv('spam.csv')
spam.head(10)



Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [50]:
# ver porcentaje de spam y no spam
spam['label'].value_counts(normalize=True)


label
ham     0.865937
spam    0.134063
Name: proportion, dtype: float64

In [51]:
#normalizar el texto quitando signos de puntuacion y stopwords, entre otras cosas
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords

wpt = WordPunctTokenizer()
stop_words = set(stopwords.words('english'))

def normalize_document(doc):
    doc = doc.replace("!", "").replace("¡", "").replace(",", "").replace(".", "").replace(";", "")
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

In [52]:
norm_spam=[]

for document in spam['text']:
    norm_spam.append(normalize_document(document))

#ver solo los primeros 20 documentos normalizados
norm_spam[:20]


['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 "free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry question ( std txt rate ) & c ' apply 08452810075over18 '",
 'u dun say early hor u c already say',
 "nah ' think goes usf lives around though",
 "freemsg hey darling ' 3 week ' word back ' like fun still ? tb ok xxx std chgs send å £ 150 rcv",
 'even brother like speak treat like aids patent',
 "per request ' melle melle ( oru minnaminunginte nurungu vettam )' set callertune callers press * 9 copy friends callertune",
 'winner valued network customer selected receivea å £ 900 prize reward claim call 09061701461 claim code kl341 valid 12 hours',
 'mobile 11 months ? u r entitled update latest colour mobiles camera free call mobile update co free 08002986030',
 "' gonna home soon ' want talk stuff anymore tonight k ? ' cried enough today",
 'six chances win cash 100 20000 pounds txt > c

In [53]:
#convertir norm_spam en una matriz tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(norm_spam)

tfidf_array = tfidf.toarray()
tfidf_array.shape


(5572, 9042)

In [54]:
#dividir el dataset en train (80%) y test (20%)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(tfidf, spam['label'], test_size=0.2, random_state=42)


#### Clasificador bayesiano ingenuo (Naive Bayes)

In [55]:
# entenar modelo bayesiano ingenuo
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train, y_train)


In [56]:
#predecir con el modelo entrenado
y_pred = nb.predict(X_test)
y_pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'spam'], dtype='<U4')

In [57]:
#evaluar el modelo con la matriz de confusion
from sklearn.metrics import confusion_matrix

nb_conf = confusion_matrix(y_test, y_pred)
nb_conf


array([[965,   0],
       [ 37, 113]])

La estructura típica de una matriz de confusión binaria es la siguiente:

[[Verdaderos Negativos (TN), Falsos Positivos (FP)]  
 [Falsos Negativos (FN), Verdaderos Positivos (TP)]]

Por lo tanto, estos valores tienen el siguiente significado:

TN (True Negatives): La cantidad de mensajes clasificados correctamente como "ham" (no spam).  
FP (False Positives): La cantidad de mensajes que fueron clasificados incorrectamente como "spam" cuando en realidad son "ham".  
FN (False Negatives): La cantidad de mensajes que fueron clasificados incorrectamente como "ham" cuando en realidad son "spam".  
TP (True Positives): La cantidad de mensajes clasificados correctamente como "spam".



In [58]:
# evaluar el modelo con accuracy 
from sklearn.metrics import accuracy_score

nb_acc = accuracy_score(y_test, y_pred)
nb_acc


0.9668161434977578

El modelo acierta el 96.68% de las predicciones en el conjunto de evaluación. Es una medida de qué tan bien el modelo está clasificando correctamente los mensajes como spam o no spam.

#### Máquina SVM (SUPPORT VECTOR MACHINE)

In [59]:
# entrenar el modelo de svm
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train, y_train)

In [60]:
# predecir con el modelo entrenado
y_pred = svm.predict(X_test)
y_pred

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype=object)

In [61]:
# evaluar el modelo con la matriz de confusion
svm_conf = confusion_matrix(y_test, y_pred)
svm_conf

array([[963,   2],
       [ 29, 121]])

In [62]:
# evaluar el modelo con accuracy
svm_acc = accuracy_score(y_test, y_pred)
svm_acc

0.9721973094170404

#### Árboles de Decisión


In [63]:
# entrenar el modelo de arbol de decision
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)


In [64]:
# predecir con el modelo entrenado
y_pred = dt.predict(X_test)
y_pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'spam'], dtype=object)

In [65]:
# evaluar el modelo con la matriz de confusion
dt_conf = confusion_matrix(y_test, y_pred)
dt_conf

array([[953,  12],
       [ 20, 130]])

In [66]:
# evaluar el modelo con accuracy
dt_acc = accuracy_score(y_test, y_pred)
dt_acc

0.9713004484304932

In [67]:
# Imprimir el valor de precisión para cada modelo en porcentaje y redondeado a dos decimales
print(f'Accuracy Naive Bayesian: {round(nb_acc*100, 2)}%')  
print(f'Accuracy SVM: {round(svm_acc*100, 2)}%')
print(f'Accuracy Decision Tree: {round(dt_acc*100, 2)}%')

Accuracy Naive Bayesian: 96.68%
Accuracy SVM: 97.22%
Accuracy Decision Tree: 97.13%


#### Comparación

In [68]:
# Store the evaluation metrics in a dictionary for easy comparison
model_metrics = {
    "Naive Bayes": {
        "Accuracy": nb_acc,
        "Confusion Matrix": nb_conf
    },
    "SVM": {
        "Accuracy": svm_acc,
        "Confusion Matrix": svm_conf
    },
    "Decision Tree": {
        "Accuracy": dt_acc,
        "Confusion Matrix": dt_conf
    }
}

# Function to print the comparison
def print_model_comparison_percentage(metrics):
    for model, data in metrics.items():
        accuracy_percentage = data['Accuracy'] * 100
        print(f"Model: {model}")
        print(f"  Accuracy: {accuracy_percentage:.2f}%")
        print(f"  Confusion Matrix:\n{data['Confusion Matrix']}\n")

# Print the comparison with updated format
print_model_comparison_percentage(model_metrics)

Model: Naive Bayes
  Accuracy: 96.68%
  Confusion Matrix:
[[965   0]
 [ 37 113]]

Model: SVM
  Accuracy: 97.22%
  Confusion Matrix:
[[963   2]
 [ 29 121]]

Model: Decision Tree
  Accuracy: 97.13%
  Confusion Matrix:
[[953  12]
 [ 20 130]]



#### ¿Tiene influencia en el resultado final el número máximo de features a utilizar?

In [69]:
# Reinitialize the TF-IDF Vectorizer with max_features set to 1000
tfidf_vectorizer_1000 = TfidfVectorizer(max_features=1000)

# Fit and transform the processed text
tfidf_1000 = tfidf_vectorizer_1000.fit_transform(norm_spam)

# Split the data into training and testing sets
X_train_1000, X_test_1000, y_train_1000, y_test_1000 = train_test_split(tfidf_1000, spam['label'], test_size=0.2, random_state=42)

# Reinitialize and train the Naive Bayes classifier
nb_classifier_1000 = MultinomialNB()
nb_classifier_1000.fit(X_train_1000, y_train_1000)

# Reinitialize and train the SVM classifier
svm_classifier_1000 = SVC()
svm_classifier_1000.fit(X_train_1000, y_train_1000)

# Reinitialize and train the Decision Tree classifier
dt_classifier_1000 = DecisionTreeClassifier()
dt_classifier_1000.fit(X_train_1000, y_train_1000)

# Predict and evaluate the models
y_pred_nb_1000 = nb_classifier_1000.predict(X_test_1000)
y_pred_svm_1000 = svm_classifier_1000.predict(X_test_1000)
y_pred_dt_1000 = dt_classifier_1000.predict(X_test_1000)

# Store the evaluation metrics for the new models
model_metrics_1000 = {
    "Naive Bayes (1000 features)": {
        "Accuracy": nb_classifier_1000.score(X_test_1000, y_test_1000),
        "Confusion Matrix": confusion_matrix(y_test_1000, y_pred_nb_1000)
    },
    "SVM (1000 features)": {
        "Accuracy": svm_classifier_1000.score(X_test_1000, y_test_1000),
        "Confusion Matrix": confusion_matrix(y_test_1000, y_pred_svm_1000)
    },
    "Decision Tree (1000 features)": {
        "Accuracy": dt_classifier_1000.score(X_test_1000, y_test_1000),
        "Confusion Matrix": confusion_matrix(y_test_1000, y_pred_dt_1000)
    }
}

# Print the comparison with updated format
print_model_comparison_percentage(model_metrics_1000)

# print the shape of the new TF-IDF Vectorizer
print(tfidf_1000.toarray().shape)


Model: Naive Bayes (1000 features)
  Accuracy: 97.94%
  Confusion Matrix:
[[962   3]
 [ 20 130]]

Model: SVM (1000 features)
  Accuracy: 97.94%
  Confusion Matrix:
[[963   2]
 [ 21 129]]

Model: Decision Tree (1000 features)
  Accuracy: 96.50%
  Confusion Matrix:
[[949  16]
 [ 23 127]]

(5572, 1000)


In [70]:
# Reinitialize the TF-IDF Vectorizer with max_features set to 5000
tfidf_vectorizer_5000 = TfidfVectorizer(max_features=5000)

# Fit and transform the processed text
tfidf_5000 = tfidf_vectorizer_5000.fit_transform(norm_spam)

# Split the data into training and testing sets
X_train_5000, X_test_5000, y_train_5000, y_test_5000 = train_test_split(tfidf_5000, spam['label'], test_size=0.2, random_state=42)

# Reinitialize and train the Naive Bayes classifier
nb_classifier_5000 = MultinomialNB()
nb_classifier_5000.fit(X_train_5000, y_train_5000)

# Reinitialize and train the SVM classifier
svm_classifier_5000 = SVC()
svm_classifier_5000.fit(X_train_5000, y_train_5000)

# Reinitialize and train the Decision Tree classifier
dt_classifier_5000 = DecisionTreeClassifier()
dt_classifier_5000.fit(X_train_5000, y_train_5000)

# Predict and evaluate the models
y_pred_nb_5000 = nb_classifier_5000.predict(X_test_5000)
y_pred_svm_5000 = svm_classifier_5000.predict(X_test_5000)
y_pred_dt_5000 = dt_classifier_5000.predict(X_test_5000)

# Store the evaluation metrics for the new models
model_metrics_5000 = {
    "Naive Bayes (5000 features)": {
        "Accuracy": nb_classifier_5000.score(X_test_5000, y_test_5000),
        "Confusion Matrix": confusion_matrix(y_test_5000, y_pred_nb_5000)
    },
    "SVM (5000 features)": {
        "Accuracy": svm_classifier_5000.score(X_test_5000, y_test_5000),
        "Confusion Matrix": confusion_matrix(y_test_5000, y_pred_svm_5000)
    },
    "Decision Tree (5000 features)": {
        "Accuracy": dt_classifier_5000.score(X_test_5000, y_test_5000),
        "Confusion Matrix": confusion_matrix(y_test_5000, y_pred_dt_5000)
    }
}

# Print the comparison with updated format
print_model_comparison_percentage(model_metrics_5000)


Model: Naive Bayes (5000 features)
  Accuracy: 97.49%
  Confusion Matrix:
[[965   0]
 [ 28 122]]

Model: SVM (5000 features)
  Accuracy: 97.49%
  Confusion Matrix:
[[963   2]
 [ 26 124]]

Model: Decision Tree (5000 features)
  Accuracy: 95.87%
  Confusion Matrix:
[[944  21]
 [ 25 125]]



In [71]:
# Valores de precisión previamente calculados y discutidos
accuracy_values = {
    "Naive Bayes": {"No Limit": 96.68, "1000 Features": 97.94, "5000 Features": 97.49},
    "SVM": {"No Limit": 97.21, "1000 Features": 97.94, "5000 Features": 97.49},
    "Decision Tree": {"No Limit": 97.49, "1000 Features": 96.41, "5000 Features": 96.41}
}

# Crear DataFrame para la comparación
accuracy_df = pd.DataFrame(accuracy_values).T

# Imprimir la tabla de comparación
accuracy_df



Unnamed: 0,No Limit,1000 Features,5000 Features
Naive Bayes,96.68,97.94,97.49
SVM,97.21,97.94,97.49
Decision Tree,97.49,96.41,96.41
