In [None]:
!pip install pyspark

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("NLP").getOrCreate()

In [2]:
data = spark.read.csv("../../data/SMSSpamCollection",
                      inferSchema = True, sep = "\t")

data.show(5, truncate = False)

+----+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|_c0 |_c1                                                                                                                                                        |
+----+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|ham |Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...                                            |
|ham |Ok lar... Joking wif u oni...                                                                                                                              |
|spam|Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's|
|ham |U dun say so ear

In [3]:
data = data.withColumnRenamed(existing = "_c0", new = "class")\
           .withColumnRenamed(existing = "_c1", new = "text")

data.show(3, truncate = False)

+-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|class|text                                                                                                                                                       |
+-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|ham  |Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...                                            |
|ham  |Ok lar... Joking wif u oni...                                                                                                                              |
|spam |Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's|
+-----+---------

In [4]:
from pyspark.sql.functions import length

data = data.withColumn(colName = "length", col = length(data["text"]))

data.show()

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
| spam|FreeMsg Hey there...|   147|
|  ham|Even my brother i...|    77|
|  ham|As per your reque...|   160|
| spam|WINNER!! As a val...|   157|
| spam|Had your mobile 1...|   154|
|  ham|I'm gonna be home...|   109|
| spam|SIX chances to wi...|   136|
| spam|URGENT! You have ...|   155|
|  ham|I've been searchi...|   196|
|  ham|I HAVE A DATE ON ...|    35|
| spam|XXXMobileMovieClu...|   149|
|  ham|Oh k...i'm watchi...|    26|
|  ham|Eh u remember how...|    81|
|  ham|Fine if thats th...|    56|
| spam|England v Macedon...|   155|
+-----+--------------------+------+
only showing top 20 rows



In [5]:
data.groupby("class").mean().show()

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+



In [7]:
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF
from pyspark.ml.feature import StringIndexer

from pyspark.ml.feature import StandardScaler

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [8]:
tokenizer = Tokenizer(inputCol = "text",
                      outputCol = "token_text") # tokenizamos las sentencias

remover = StopWordsRemover(inputCol = "token_text",
                           outputCol = "stop_tokens") #eliminamos stop words

cv = CountVectorizer(inputCol = "stop_tokens",
                            outputCol = "count_vec") # le pasamos el countventorizer

idf = IDF(inputCol = "count_vec",
          outputCol = "tf_idf") # transformamos con IDF

class_indexer = StringIndexer(inputCol = "class",
                              outputCol = "label") # label encoder para codificar nuestras clases

assembler = VectorAssembler(inputCols = ["tf_idf", "length"],
                            outputCol = "features") # constructor del sparsher vector

scaler = StandardScaler(inputCol = "features",
                        outputCol = "scaled_features",
                        withStd = True,
                        withMean = False) # aplicamos el standar scaler a nuestros datos

### NaiveBayes

In [9]:
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(featuresCol = "scaled_features",
                labelCol = "label",
                predictionCol = "prediction")

### Pipeline

In [10]:
from pyspark.ml import Pipeline

data_pipeline = Pipeline(stages = [class_indexer, tokenizer, remover, cv, idf, assembler, scaler])

df = data_pipeline.fit(data).transform(data)

df.show()

+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|class|                text|length|label|          token_text|         stop_tokens|           count_vec|              tf_idf|            features|     scaled_features|
+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|   111|  0.0|[go, until, juron...|[go, jurong, poin...|(13423,[7,11,31,6...|(13423,[7,11,31,6...|(13424,[7,11,31,6...|(13424,[7,11,31,6...|
|  ham|Ok lar... Joking ...|    29|  0.0|[ok, lar..., joki...|[ok, lar..., joki...|(13423,[0,24,297,...|(13423,[0,24,297,...|(13424,[0,24,297,...|(13424,[0,24,297,...|
| spam|Free entry in 2 a...|   155|  1.0|[free, entry, in,...|[free, entry, 2, ...|(13423,[2,13,19,3...|(13423,[2,13,19,3...|(13424,[2,13,19,3...|(13424,[2,13,1

In [11]:
df = df.select("label", "scaled_features")
df.show()

+-----+--------------------+
|label|     scaled_features|
+-----+--------------------+
|  0.0|(13424,[7,11,31,6...|
|  0.0|(13424,[0,24,297,...|
|  1.0|(13424,[2,13,19,3...|
|  0.0|(13424,[0,70,80,1...|
|  0.0|(13424,[36,134,31...|
|  1.0|(13424,[10,60,139...|
|  0.0|(13424,[10,53,103...|
|  0.0|(13424,[125,184,4...|
|  1.0|(13424,[1,47,118,...|
|  1.0|(13424,[0,1,13,27...|
|  0.0|(13424,[18,43,120...|
|  1.0|(13424,[8,17,37,8...|
|  1.0|(13424,[13,30,47,...|
|  0.0|(13424,[39,96,217...|
|  0.0|(13424,[552,1697,...|
|  1.0|(13424,[30,109,11...|
|  0.0|(13424,[82,214,47...|
|  0.0|(13424,[0,2,49,13...|
|  0.0|(13424,[0,74,105,...|
|  1.0|(13424,[4,30,33,5...|
+-----+--------------------+
only showing top 20 rows



In [17]:
train, test = df.randomSplit(weights = [0.7, 0.3], seed = 42)

In [13]:
model = nb.fit(train)

y_hat = model.transform(test)

y_hat.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|     scaled_features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13424,[0,1,2,41,...|[-6226.5365510546...|[1.0,3.3317145166...|       0.0|
|  0.0|(13424,[0,1,5,20,...|[-2988.9347593964...|[1.0,1.5042166897...|       0.0|
|  0.0|(13424,[0,1,7,8,1...|[-5712.5187450676...|           [1.0,0.0]|       0.0|
|  0.0|(13424,[0,1,7,15,...|[-2093.1648499213...|[1.0,2.2619322386...|       0.0|
|  0.0|(13424,[0,1,12,33...|[-1203.2516574584...|[1.0,4.9040892080...|       0.0|
|  0.0|(13424,[0,1,14,18...|[-8246.6942360396...|[1.0,1.8263902087...|       0.0|
|  0.0|(13424,[0,1,14,31...|[-351.55828013012...|[1.0,1.2459092621...|       0.0|
|  0.0|(13424,[0,1,18,20...|[-3498.1648008885...|[1.0,2.1889348879...|       0.0|
|  0.0|(13424,[0,1,21,27...|[-2159.7887451919...|[1.0,6.0675741146...|       0.0|
|  0.0|(13424,[0

In [14]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol = "prediction", 
                                              labelCol = "label",
                                              metricName = "accuracy")
accuracy = evaluator.evaluate(y_hat)

accuracy

0.8060263653483992

In [15]:
f1_evaluator = MulticlassClassificationEvaluator(predictionCol = "prediction", 
                                              labelCol = "label",
                                              metricName = "f1")
f1 = f1_evaluator.evaluate(y_hat)

f1

0.8344303522772644

## Haciendo la división de Train-Test antes de escalar y transformar los datos

In [39]:
train, test = data.randomSplit(weights = [0.7, 0.3], seed = 42)

In [40]:
data_pipeline_2 = Pipeline(stages = [class_indexer, tokenizer, remover, cv, idf, assembler, scaler])

pipe_trained = data_pipeline_2.fit(train)
train = pipe_trained.transform(train)
test = pipe_trained.transform(test)

train.show()

+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|class|                text|length|label|          token_text|         stop_tokens|           count_vec|              tf_idf|            features|     scaled_features|
+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  ham| &lt;#&gt;  in mc...|    36|  0.0|[, &lt;#&gt;, , i...|[, &lt;#&gt;, , m...|(11001,[3,8,2824,...|(11001,[3,8,2824,...|(11002,[3,8,2824,...|(11002,[3,8,2824,...|
|  ham| &lt;#&gt;  mins ...|    51|  0.0|[, &lt;#&gt;, , m...|[, &lt;#&gt;, , m...|(11001,[3,8,42,20...|(11001,[3,8,42,20...|(11002,[3,8,42,20...|(11002,[3,8,42,20...|
|  ham| and  picking the...|    41|  0.0|[, and, , picking...|[, , picking, var...|(11001,[3,887,316...|(11001,[3,887,316...|(11002,[3,887,316...|(11002,[3,887,

In [42]:
test.toPandas()

Unnamed: 0,class,text,length,label,token_text,stop_tokens,count_vec,tf_idf,features,scaled_features
0,ham,&lt;DECIMAL&gt; m but its not a common car he...,132,0.0,"[, &lt;decimal&gt;, m, but, its, not, a, commo...","[, &lt;decimal&gt;, m, common, car, better, bu...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 3.1021536787833863, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 3.1021536787833863, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 2.0960046808042807, 0.0, 0.0, ..."
1,ham,"said kiss, kiss, i can't do the sound effects...",133,0.0,"[, said, kiss,, kiss,, i, can't, do, the, soun...","[, said, kiss,, kiss,, sound, effects!, gorgeo...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 3.1021536787833863, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 3.1021536787833863, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 2.0960046808042807, 0.0, 0.0, ..."
2,ham,what number do u live at? Is it 11?,36,0.0,"[, what, number, do, u, live, at?, is, it, 11?]","[, number, u, live, at?, 11?]","(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(2.0204432009178803, 0.0, 0.0, 3.1021536787833...","(2.0204432009178803, 0.0, 0.0, 3.1021536787833...","(1.8598083446711655, 0.0, 0.0, 2.0960046808042..."
3,ham,"""Gimme a few"" was &lt;#&gt; minutes ago",41,0.0,"[""gimme, a, few"", was, , &lt;#&gt;, , minutes,...","[""gimme, few"", , &lt;#&gt;, , minutes, ago]","(0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","(0.0, 0.0, 0.0, 6.204307357566773, 0.0, 0.0, 0...","(0.0, 0.0, 0.0, 6.204307357566773, 0.0, 0.0, 0...","(0.0, 0.0, 0.0, 4.192009361608561, 0.0, 0.0, 0..."
4,ham,"""Response"" is one of d powerful weapon 2 occup...",154,0.0,"[""response"", is, one, of, d, powerful, weapon,...","[""response"", one, d, powerful, weapon, 2, occu...","(0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 0.0, 5.367474836656289, 0.0, 0.0, 0.0, 0...","(0.0, 0.0, 5.367474836656289, 0.0, 0.0, 0.0, 0...","(0.0, 0.0, 6.091934415732526, 0.0, 0.0, 0.0, 0..."
...,...,...,...,...,...,...,...,...,...,...
1588,spam,"complimentary 4 STAR Ibiza Holiday or £10,000 ...",149,1.0,"[complimentary, 4, star, ibiza, holiday, or, £...","[complimentary, 4, star, ibiza, holiday, £10,0...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.11338975...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.11338975...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.38357660..."
1589,spam,dating:i have had two of these. Only started a...,139,1.0,"[dating:i, have, had, two, of, these., only, s...","[dating:i, two, these., started, sent, text, t...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1590,spam,important information 4 orange user 0789xxxxxx...,163,1.0,"[important, information, 4, orange, user, 0789...","[important, information, 4, orange, user, 0789...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.11338975...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.11338975...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.38357660..."
1591,spam,sexy sexy cum and text me im wet and warm and ...,144,1.0,"[sexy, sexy, cum, and, text, me, im, wet, and,...","[sexy, sexy, cum, text, im, wet, warm, ready, ...","(1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(2.0204432009178803, 0.0, 2.6837374183281444, ...","(2.0204432009178803, 0.0, 2.6837374183281444, ...","(1.8598083446711655, 0.0, 3.045967207866263, 0..."


In [43]:
train = train.select("label", "scaled_features")
test = test.select("label", "scaled_features")

In [44]:
model = nb.fit(train)

y_hat = model.transform(test)

y_hat.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|     scaled_features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(11002,[3,81,128,...|[-2487.7633768525...|           [1.0,0.0]|       0.0|
|  0.0|(11002,[3,115,200...|[-2395.5188627004...|           [1.0,0.0]|       0.0|
|  0.0|(11002,[0,3,86,19...|[-798.40120183507...|[1.0,1.0453269184...|       0.0|
|  0.0|(11002,[3,8,302,1...|[-415.41060815331...|[1.0,2.8823362140...|       0.0|
|  0.0|(11002,[2,7,22,55...|[-2520.9205913190...|[1.0,3.6800840412...|       0.0|
|  0.0|(11002,[0,5,10,33...|[-2262.0873761743...|[1.0,6.1645992048...|       0.0|
|  0.0|(11002,[97,122,24...|[-413.52210074161...|[1.0,1.8854556421...|       0.0|
|  0.0|(11002,[3,8,33,68...|[-3588.8353247867...|           [1.0,0.0]|       0.0|
|  0.0|(11002,[3,8,294,6...|[-586.34290420575...|[1.0,1.9741630381...|       0.0|
|  0.0|(11002,[8

### Metricas en train

In [47]:
train_hat = model.transform(train)
f1_evaluator = MulticlassClassificationEvaluator(predictionCol = "prediction", 
                                              labelCol = "label",
                                              metricName = "f1")
f1 = f1_evaluator.evaluate(train_hat)

f1

0.996999768480733

In [48]:
evaluator = MulticlassClassificationEvaluator(predictionCol = "prediction", 
                                              labelCol = "label",
                                              metricName = "accuracy")
accuracy = evaluator.evaluate(train_hat)

accuracy

0.9969856819894499

### Metricas en Test

In [45]:
f1_evaluator = MulticlassClassificationEvaluator(predictionCol = "prediction", 
                                              labelCol = "label",
                                              metricName = "f1")
f1 = f1_evaluator.evaluate(y_hat)

f1

0.9613723626898588

In [46]:
evaluator = MulticlassClassificationEvaluator(predictionCol = "prediction", 
                                              labelCol = "label",
                                              metricName = "accuracy")
accuracy = evaluator.evaluate(y_hat)

accuracy

0.96045197740113

In [None]:
################################################################################################################################