In [2]:
# Load the tab-separated SMS file. No header is read, so the columns get the
# default names seen in the output below (_c0 and _c1).
datos = (
    spark.read
    .option("sep", "\t")
    .option("inferSchema", True)
    .csv("SMSSpamCollection")
)

In [3]:
# Preview the first 20 rows; long strings are cut off in the rendered table.
datos.show(n=20, truncate=True)

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
|spam|FreeMsg Hey there...|
| ham|Even my brother i...|
| ham|As per your reque...|
|spam|WINNER!! As a val...|
|spam|Had your mobile 1...|
| ham|I'm gonna be home...|
|spam|SIX chances to wi...|
|spam|URGENT! You have ...|
| ham|I've been searchi...|
| ham|I HAVE A DATE ON ...|
|spam|XXXMobileMovieClu...|
| ham|Oh k...i'm watchi...|
| ham|Eh u remember how...|
| ham|Fine if thats th...|
|spam|England v Macedon...|
+----+--------------------+
only showing top 20 rows



In [4]:
# Total number of rows loaded into the DataFrame.
datos.count()

5574

In [6]:
# Number of rows whose first column (_c0) is exactly "spam".
datos.filter(datos["_c0"] == "spam").count()

747

In [7]:
# Number of rows whose first column (_c0) is exactly "ham".
datos.filter(datos["_c0"] == "ham").count()

4827

In [8]:
from pyspark.sql.functions import length

In [9]:
# Add a "longitud" column holding the character length of the message text (_c1).
longitud_mensaje = length(datos["_c1"])
datos2 = datos.withColumn("longitud", longitud_mensaje)

In [10]:
# Preview the first 20 rows, now including the "longitud" column.
datos2.show(n=20, truncate=True)

+----+--------------------+--------+
| _c0|                 _c1|longitud|
+----+--------------------+--------+
| ham|Go until jurong p...|     111|
| ham|Ok lar... Joking ...|      29|
|spam|Free entry in 2 a...|     155|
| ham|U dun say so earl...|      49|
| ham|Nah I don't think...|      61|
|spam|FreeMsg Hey there...|     147|
| ham|Even my brother i...|      77|
| ham|As per your reque...|     160|
|spam|WINNER!! As a val...|     157|
|spam|Had your mobile 1...|     154|
| ham|I'm gonna be home...|     109|
|spam|SIX chances to wi...|     136|
|spam|URGENT! You have ...|     155|
| ham|I've been searchi...|     196|
| ham|I HAVE A DATE ON ...|      35|
|spam|XXXMobileMovieClu...|     149|
| ham|Oh k...i'm watchi...|      26|
| ham|Eh u remember how...|      81|
| ham|Fine if thats th...|      56|
|spam|England v Macedon...|     155|
+----+--------------------+--------+
only showing top 20 rows



In [11]:
# Average message length for each value of _c0.
por_clase = datos2.groupBy("_c0")
por_clase.avg("longitud").show()

+----+-----------------+
| _c0|    avg(longitud)|
+----+-----------------+
| ham|71.45431945307645|
|spam|138.6706827309237|
+----+-----------------+



In [12]:
# Shortest message length for each value of _c0.
datos2.groupBy("_c0").agg({"longitud": "min"}).show()

+----+-------------+
| _c0|min(longitud)|
+----+-------------+
| ham|            2|
|spam|           13|
+----+-------------+



In [13]:
# Longest message length for each value of _c0.
datos2.groupBy("_c0").agg({"longitud": "max"}).show()

+----+-------------+
| _c0|max(longitud)|
+----+-------------+
| ham|          910|
|spam|          223|
+----+-------------+



In [14]:
# Show the first 20 rows of datos2 again before the feature-engineering steps.
datos2.show(n=20, truncate=True)

+----+--------------------+--------+
| _c0|                 _c1|longitud|
+----+--------------------+--------+
| ham|Go until jurong p...|     111|
| ham|Ok lar... Joking ...|      29|
|spam|Free entry in 2 a...|     155|
| ham|U dun say so earl...|      49|
| ham|Nah I don't think...|      61|
|spam|FreeMsg Hey there...|     147|
| ham|Even my brother i...|      77|
| ham|As per your reque...|     160|
|spam|WINNER!! As a val...|     157|
|spam|Had your mobile 1...|     154|
| ham|I'm gonna be home...|     109|
|spam|SIX chances to wi...|     136|
|spam|URGENT! You have ...|     155|
| ham|I've been searchi...|     196|
| ham|I HAVE A DATE ON ...|      35|
|spam|XXXMobileMovieClu...|     149|
| ham|Oh k...i'm watchi...|      26|
| ham|Eh u remember how...|      81|
| ham|Fine if thats th...|      56|
|spam|England v Macedon...|     155|
+----+--------------------+--------+
only showing top 20 rows



In [15]:
from pyspark.ml.feature import (StringIndexer,VectorAssembler,
        RegexTokenizer,StopWordsRemover,CountVectorizer,IDF)

In [20]:
# Encode the string class column _c0 as a numeric "label" column, then drop
# the original. In the output below, "ham" rows get 0.0 and "spam" rows 1.0.
indexador = StringIndexer(inputCol="_c0", outputCol="label")
modelo_indexador = indexador.fit(datos2)
datosI = modelo_indexador.transform(datos2).drop("_c0")

In [21]:
# Preview the first 20 rows after label encoding.
datosI.show(n=20, truncate=True)

+--------------------+--------+-----+
|                 _c1|longitud|label|
+--------------------+--------+-----+
|Go until jurong p...|     111|  0.0|
|Ok lar... Joking ...|      29|  0.0|
|Free entry in 2 a...|     155|  1.0|
|U dun say so earl...|      49|  0.0|
|Nah I don't think...|      61|  0.0|
|FreeMsg Hey there...|     147|  1.0|
|Even my brother i...|      77|  0.0|
|As per your reque...|     160|  0.0|
|WINNER!! As a val...|     157|  1.0|
|Had your mobile 1...|     154|  1.0|
|I'm gonna be home...|     109|  0.0|
|SIX chances to wi...|     136|  1.0|
|URGENT! You have ...|     155|  1.0|
|I've been searchi...|     196|  0.0|
|I HAVE A DATE ON ...|      35|  0.0|
|XXXMobileMovieClu...|     149|  1.0|
|Oh k...i'm watchi...|      26|  0.0|
|Eh u remember how...|      81|  0.0|
|Fine if thats th...|      56|  0.0|
|England v Macedon...|     155|  1.0|
+--------------------+--------+-----+
only showing top 20 rows



In [22]:
# Split the message text (_c1) into a "palabras" token array, treating any run
# of spaces or the punctuation , . ? ! ; ( ) as the separator.
# The pattern is a raw string: sequences such as "\," or "\." are invalid
# escapes in a normal Python literal and emit a warning on recent Python
# versions. The characters passed to the tokenizer are unchanged.
reTokenizer = RegexTokenizer(
    inputCol="_c1",
    outputCol="palabras",
    pattern=r"[ \,\.\?\!\;\(\)]+",
)

In [23]:
# Apply the tokenizer, which adds the "palabras" column to the labelled data.
datosT = reTokenizer.transform(dataset=datosI)

In [24]:
# Preview the first 20 rows with the new "palabras" token column.
datosT.show(n=20, truncate=True)

+--------------------+--------+-----+--------------------+
|                 _c1|longitud|label|            palabras|
+--------------------+--------+-----+--------------------+
|Go until jurong p...|     111|  0.0|[go, until, juron...|
|Ok lar... Joking ...|      29|  0.0|[ok, lar, joking,...|
|Free entry in 2 a...|     155|  1.0|[free, entry, in,...|
|U dun say so earl...|      49|  0.0|[u, dun, say, so,...|
|Nah I don't think...|      61|  0.0|[nah, i, don't, t...|
|FreeMsg Hey there...|     147|  1.0|[freemsg, hey, th...|
|Even my brother i...|      77|  0.0|[even, my, brothe...|
|As per your reque...|     160|  0.0|[as, per, your, r...|
|WINNER!! As a val...|     157|  1.0|[winner, as, a, v...|
|Had your mobile 1...|     154|  1.0|[had, your, mobil...|
|I'm gonna be home...|     109|  0.0|[i'm, gonna, be, ...|
|SIX chances to wi...|     136|  1.0|[six, chances, to...|
|URGENT! You have ...|     155|  1.0|[urgent, you, hav...|
|I've been searchi...|     196|  0.0|[i've, been, sear..

In [28]:
# List the built-in English stop words. loadDefaultStopWords is a static
# method, so it is called on the class without building a throwaway instance.
StopWordsRemover.loadDefaultStopWords("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 '

In [29]:
# Stop-word filter: reads the "palabras" token column and writes "filtradas".
filtro = StopWordsRemover(inputCol="palabras", outputCol="filtradas")

In [32]:
# Apply the stop-word filter, then drop the raw message text (_c1).
datos_filtrados = filtro.transform(datosT)
datosF = datos_filtrados.drop("_c1")

In [33]:
# Preview the first 20 rows with both the raw and the filtered token columns.
datosF.show(n=20, truncate=True)

+--------+-----+--------------------+--------------------+
|longitud|label|            palabras|           filtradas|
+--------+-----+--------------------+--------------------+
|     111|  0.0|[go, until, juron...|[go, jurong, poin...|
|      29|  0.0|[ok, lar, joking,...|[ok, lar, joking,...|
|     155|  1.0|[free, entry, in,...|[free, entry, 2, ...|
|      49|  0.0|[u, dun, say, so,...|[u, dun, say, ear...|
|      61|  0.0|[nah, i, don't, t...|[nah, think, goes...|
|     147|  1.0|[freemsg, hey, th...|[freemsg, hey, da...|
|      77|  0.0|[even, my, brothe...|[even, brother, l...|
|     160|  0.0|[as, per, your, r...|[per, request, 'm...|
|     157|  1.0|[winner, as, a, v...|[winner, valued, ...|
|     154|  1.0|[had, your, mobil...|[mobile, 11, mont...|
|     109|  0.0|[i'm, gonna, be, ...|[gonna, home, soo...|
|     136|  1.0|[six, chances, to...|[six, chances, wi...|
|     155|  1.0|[urgent, you, hav...|[urgent, won, 1, ...|
|     196|  0.0|[i've, been, sear...|[searching, right..