In [6]:
datos=spark.read.csv("SMSSpamCollection",inferSchema=True,sep="\t")

In [8]:
datos.show(truncate=False)

+----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|_c0 |_c1                                                                                                                                                                                                 |
+----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ham |Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...                                                                                     |
|ham |Ok lar... Joking wif u oni...                                                                                                                                                     

In [10]:
from pyspark.sql.functions import length
datos2=datos.withColumn("longitud",length(datos["_c1"]))

In [24]:
datos2.groupBy("_c0").mean("longitud").show()

+----+-----------------+
| _c0|    avg(longitud)|
+----+-----------------+
| ham|71.45431945307645|
|spam|138.6706827309237|
+----+-----------------+



In [25]:
datos2.groupBy("_c0").max("longitud").show()

+----+-------------+
| _c0|max(longitud)|
+----+-------------+
| ham|          910|
|spam|          223|
+----+-------------+



In [26]:
datos2.groupBy("_c0").min("longitud").show()

+----+-------------+
| _c0|min(longitud)|
+----+-------------+
| ham|            2|
|spam|           13|
+----+-------------+



In [28]:
from pyspark.ml.feature import Tokenizer,StopWordsRemover,StringIndexer

In [50]:
datosI=StringIndexer(inputCol="_c0",outputCol="label").fit(datos2).transform(datos2)
datosT=Tokenizer(inputCol="_c1",outputCol="palabras").transform(datosI)
datosS=StopWordsRemover(inputCol="palabras",outputCol="filtradas").transform(datosT)\
    .drop("palabras","_c1")

In [51]:
datosS.show()

+----+--------+-----+--------------------+
| _c0|longitud|label|           filtradas|
+----+--------+-----+--------------------+
| ham|     111|  0.0|[go, jurong, poin...|
| ham|      29|  0.0|[ok, lar..., joki...|
|spam|     155|  1.0|[free, entry, 2, ...|
| ham|      49|  0.0|[u, dun, say, ear...|
| ham|      61|  0.0|[nah, think, goes...|
|spam|     147|  1.0|[freemsg, hey, da...|
| ham|      77|  0.0|[even, brother, l...|
| ham|     160|  0.0|[per, request, 'm...|
|spam|     157|  1.0|[winner!!, valued...|
|spam|     154|  1.0|[mobile, 11, mont...|
| ham|     109|  0.0|[gonna, home, soo...|
|spam|     136|  1.0|[six, chances, wi...|
|spam|     155|  1.0|[urgent!, won, 1,...|
| ham|     196|  0.0|[searching, right...|
| ham|      35|  0.0|[date, sunday, wi...|
|spam|     149|  1.0|[xxxmobilemoviecl...|
| ham|      26|  0.0|[oh, k...i'm, wat...|
| ham|      81|  0.0|[eh, u, remember,...|
| ham|      56|  0.0|[fine, thats, wa...|
|spam|     155|  1.0|[england, v, mace...|
+----+-----

In [57]:
from pyspark.ml.feature import CountVectorizer,IDF,VectorAssembler

In [53]:
datoscv=CountVectorizer(inputCol="filtradas",outputCol="cv").fit(datosS)\
    .transform(datosS)

In [54]:
datoscv.show(truncate=False,vertical=True)

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 _c0       | ham                                                                                                                                                                                        
 longitud  | 111                                                                                                                                                                                        
 label     | 0.0                                                                                                                                                                                        
 filtradas | [go, jurong, point,, crazy.., available, bugis, n, great, world, la, e, buffet..., cine, got, amore, wat...]                                                                           

In [55]:
datosIdf=IDF(inputCol="cv",outputCol="idf").fit(datoscv).transform(datoscv)

In [56]:
datosIdf.show(truncate=False,vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 _c0       | ham                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          
 longitud  | 111  

In [58]:
from pyspark.ml.classification import LogisticRegression

In [61]:
constructor=VectorAssembler(inputCols=["longitud","idf"],outputCol="features")

In [81]:
datosNuevos=constructor.transform(datosIdf)

In [82]:
entrena,evalua=datosNuevos.randomSplit([0.8,0.2])

In [83]:
modelo=LogisticRegression().fit(entrena)

In [84]:
res=modelo.evaluate(evalua)

In [85]:
res.predictions.select("label","prediction").show(100)

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|     

In [86]:
res.areaUnderROC

0.9657088148900543

In [87]:
modeloOK=LogisticRegression().fit(datosNuevos)

In [88]:
modeloOK.summary.areaUnderROC

0.9998972479934239