# Procesamiento de Texto: Detección de Spam

Se utiliza el conjunto de datos que se encuentra en: https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection

In [1]:
# Solo si se usa Google Colab
!pip install pyspark



In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('ejemplo_texto').getOrCreate()

In [3]:
df = spark.read.csv('/content/SMSSpamCollection', inferSchema=True, sep='\t')

df.show(5, truncate=False)

+----+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|_c0 |_c1                                                                                                                                                        |
+----+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|ham |Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...                                            |
|ham |Ok lar... Joking wif u oni...                                                                                                                              |
|spam|Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's|
|ham |U dun say so ear

In [4]:
# Cambiar nombre de las columnas
df = df.withColumnRenamed('_c0', 'spam?').withColumnRenamed('_c1','texto')
df.show(5, truncate=False)

+-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|spam?|texto                                                                                                                                                      |
+-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|ham  |Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...                                            |
|ham  |Ok lar... Joking wif u oni...                                                                                                                              |
|spam |Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's|
|ham  |U dun say

### Preparación de Datos

In [5]:
# Obtener la longitud de las cadenas de caracteres de cada mensaje (nuevo atributo)
from pyspark.sql.functions import length

df = df.withColumn('longitud', length(df['texto']))
df.show(5)

+-----+--------------------+--------+
|spam?|               texto|longitud|
+-----+--------------------+--------+
|  ham|Go until jurong p...|     111|
|  ham|Ok lar... Joking ...|      29|
| spam|Free entry in 2 a...|     155|
|  ham|U dun say so earl...|      49|
|  ham|Nah I don't think...|      61|
+-----+--------------------+--------+
only showing top 5 rows



In [6]:
# Analizar si hay alguna diferencia en las longitudes
df.groupBy('spam?').mean().show()
df.groupBy('spam?').min().show()
df.groupBy('spam?').max().show()

+-----+-----------------+
|spam?|    avg(longitud)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+

+-----+-------------+
|spam?|min(longitud)|
+-----+-------------+
|  ham|            2|
| spam|           13|
+-----+-------------+

+-----+-------------+
|spam?|max(longitud)|
+-----+-------------+
|  ham|          910|
| spam|          223|
+-----+-------------+



In [7]:
# Convertir en tokens
from pyspark.ml.feature import (Tokenizer, StopWordsRemover, CountVectorizer,
                                IDF, StringIndexer)

In [8]:
# Convertir a tokens
tokenizer = Tokenizer(inputCol='texto', outputCol='tokens')

# Quitar las palabras que se repiten
swremover = StopWordsRemover(inputCol='tokens', outputCol='tokens_clean')

# Bag of Words
cvectorizer = CountVectorizer(inputCol='tokens_clean', outputCol='count_vec')

# Obtener TF-IDF
idf = IDF(inputCol='count_vec', outputCol='tf_idf')

# Convertir las clases a numéricas
classIndexer = StringIndexer(inputCol='spam?', outputCol='clase')

In [9]:
# Agrupar los datos (para MLlib)
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=['tf_idf', 'longitud'], outputCol='features')

In [10]:
# Pipeline para preprocesamiento de datos
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[classIndexer, tokenizer, swremover, cvectorizer,
                            idf, assembler])

df2 = pipeline.fit(df).transform(df)

df2.show(5)

+-----+--------------------+--------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|spam?|               texto|longitud|clase|              tokens|        tokens_clean|           count_vec|              tf_idf|            features|
+-----+--------------------+--------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|     111|  0.0|[go, until, juron...|[go, jurong, poin...|(13423,[7,11,31,6...|(13423,[7,11,31,6...|(13424,[7,11,31,6...|
|  ham|Ok lar... Joking ...|      29|  0.0|[ok, lar..., joki...|[ok, lar..., joki...|(13423,[0,24,297,...|(13423,[0,24,297,...|(13424,[0,24,297,...|
| spam|Free entry in 2 a...|     155|  1.0|[free, entry, in,...|[free, entry, 2, ...|(13423,[2,13,19,3...|(13423,[2,13,19,3...|(13424,[2,13,19,3...|
|  ham|U dun say so earl...|      49|  0.0|[u, dun, say, so,...|[u, dun, say, ear...|(13423,[0,70,80,1...|

In [11]:
# Selección de las columnas de interés
dfnew = df2.select('clase', 'features')
dfnew.show(5, truncate=False)

+-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|clase|features                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+-----

In [12]:
# Separación en conjunto de entrenamiento y de prueba
df_train, df_test = dfnew.randomSplit([0.7, 0.3])

### Clasificación

In [13]:
# Se usará Naive Bayes como clasificador
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(featuresCol='features', labelCol='clase')

modelo = nb.fit(df_train)

In [14]:
# Aplicar el modelo entrenado al conjunto de prueba
resultados = modelo.transform(df_test)

resultados.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|clase|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13424,[0,1,2,41,...|[-1082.1367003818...|[1.0,2.0824088530...|       0.0|
|  0.0|(13424,[0,1,4,50,...|[-827.44310390253...|[1.0,7.2413446740...|       0.0|
|  0.0|(13424,[0,1,5,15,...|[-998.79010433770...|[1.0,1.2732536293...|       0.0|
|  0.0|(13424,[0,1,7,15,...|[-659.27780980765...|[1.0,3.7214206382...|       0.0|
|  0.0|(13424,[0,1,17,19...|[-805.92968510962...|[1.0,1.6176274676...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



### Evaluación

In [15]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [16]:
# En el conjunto de entrenamiento
resultados_train = modelo.transform(df_train)

evaluador = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='clase')

exactitud_train = evaluador.evaluate(resultados_train)
print("Exactitud en el conjunto de entrenamiento:", exactitud_train)

Exactitud en el conjunto de entrenamiento: 0.9967077743232798


In [17]:
# En el conjunto de prueba

exactitud_test = evaluador.evaluate(resultados)

print("Exactitud en el conjunto de prueba:", exactitud_test)

Exactitud en el conjunto de prueba: 0.9125736813118639
