# NLP: SMS Spam Classification with PySpark

In [21]:
from pyspark.sql import SparkSession

In [22]:
# Obtain the notebook's Spark session (creating one if none is active),
# registered under the application name 'nlp'.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

In [23]:
# Load the tab-separated SMS Spam Collection file. No header option is set,
# so the columns arrive with Spark's generated names (_c0, _c1).
# NOTE(review): the CSV reader's default quote character is '"'; messages
# containing double quotes may be parsed unexpectedly -- TODO confirm row count.
data = spark.read.csv(
    "./files/smsspamcollection/SMSSpamCollection",
    sep='\t',
    inferSchema=True,
)

In [24]:
# Replace the generated column names with meaningful ones:
# _c0 carries the ham/spam tag and _c1 the message body.
data = (
    data
    .withColumnRenamed('_c0', 'class')
    .withColumnRenamed('_c1', 'text')
)
data.show()

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
| spam|FreeMsg Hey there...|
|  ham|Even my brother i...|
|  ham|As per your reque...|
| spam|WINNER!! As a val...|
| spam|Had your mobile 1...|
|  ham|I'm gonna be home...|
| spam|SIX chances to wi...|
| spam|URGENT! You have ...|
|  ham|I've been searchi...|
|  ham|I HAVE A DATE ON ...|
| spam|XXXMobileMovieClu...|
|  ham|Oh k...i'm watchi...|
|  ham|Eh u remember how...|
|  ham|Fine if thats th...|
| spam|England v Macedon...|
+-----+--------------------+


In [25]:
from pyspark.sql.functions import length


In [26]:
# Add the character count of every message as a numeric 'length' column.
data = data.withColumn(
    'length',
    length(data['text']),
)
data.show()

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
| spam|FreeMsg Hey there...|   147|
|  ham|Even my brother i...|    77|
|  ham|As per your reque...|   160|
| spam|WINNER!! As a val...|   157|
| spam|Had your mobile 1...|   154|
|  ham|I'm gonna be home...|   109|
| spam|SIX chances to wi...|   136|
| spam|URGENT! You have ...|   155|
|  ham|I've been searchi...|   196|
|  ham|I HAVE A DATE ON ...|    35|
| spam|XXXMobileMovieClu...|   149|
|  ham|Oh k...i'm watchi...|    26|
|  ham|Eh u remember how...|    81|
|  ham|Fine if thats th...|    56|
| spam|England v Macedon...|   155|
+-----+--------------------+------+


In [27]:
# Per-class average of the numeric columns; 'length' is the only one, so this
# compares the typical ham message length against the typical spam length.
data.groupBy('class').avg().show()

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+


In [28]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer

In [29]:
# Stage 1: break each message in 'text' into a list of tokens ('token_text').
tokenizer = Tokenizer().setInputCol('text').setOutputCol('token_text')
result = tokenizer.transform(data)

In [30]:
# Stage 2: filter the remover's default stop-word list out of the token lists.
stop_remover = StopWordsRemover(
    inputCol='token_text',
    outputCol='clean_tokens',
)
result = stop_remover.transform(result)
result.show()

+-----+--------------------+------+--------------------+--------------------+
|class|                text|length|          token_text|        clean_tokens|
+-----+--------------------+------+--------------------+--------------------+
|  ham|Go until jurong p...|   111|[go, until, juron...|[go, jurong, poin...|
|  ham|Ok lar... Joking ...|    29|[ok, lar..., joki...|[ok, lar..., joki...|
| spam|Free entry in 2 a...|   155|[free, entry, in,...|[free, entry, 2, ...|
|  ham|U dun say so earl...|    49|[u, dun, say, so,...|[u, dun, say, ear...|
|  ham|Nah I don't think...|    61|[nah, i, don't, t...|[nah, think, goes...|
| spam|FreeMsg Hey there...|   147|[freemsg, hey, th...|[freemsg, hey, da...|
|  ham|Even my brother i...|    77|[even, my, brothe...|[even, brother, l...|
|  ham|As per your reque...|   160|[as, per, your, r...|[per, request, 'm...|
| spam|WINNER!! As a val...|   157|[winner!!, as, a,...|[winner!!, valued...|
| spam|Had your mobile 1...|   154|[had, your, mobil...|[mobile,

In [31]:
# Stage 3: learn a vocabulary from the cleaned tokens, then encode every
# message as a sparse term-count vector in the 'vector' column.
count_vec = CountVectorizer(
    inputCol='clean_tokens',
    outputCol='vector',
)
count_vec_model = count_vec.fit(result)
result = count_vec_model.transform(result)
result.show()

+-----+--------------------+------+--------------------+--------------------+--------------------+
|class|                text|length|          token_text|        clean_tokens|              vector|
+-----+--------------------+------+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|   111|[go, until, juron...|[go, jurong, poin...|(13423,[7,11,31,6...|
|  ham|Ok lar... Joking ...|    29|[ok, lar..., joki...|[ok, lar..., joki...|(13423,[0,24,301,...|
| spam|Free entry in 2 a...|   155|[free, entry, in,...|[free, entry, 2, ...|(13423,[2,13,19,3...|
|  ham|U dun say so earl...|    49|[u, dun, say, so,...|[u, dun, say, ear...|(13423,[0,70,80,1...|
|  ham|Nah I don't think...|    61|[nah, i, don't, t...|[nah, think, goes...|(13423,[36,134,31...|
| spam|FreeMsg Hey there...|   147|[freemsg, hey, th...|[freemsg, hey, da...|(13423,[10,60,140...|
|  ham|Even my brother i...|    77|[even, my, brothe...|[even, brother, l...|(13423,[10,53,102...|
|  ham|As 

In [32]:
# Stage 4: rescale the raw term counts by inverse document frequency,
# writing the weighted vectors to 'tf_idf'.
idf = IDF(
    inputCol='vector',
    outputCol='tf_idf',
)
idf_model = idf.fit(result)
result = idf_model.transform(result)
result.show()

+-----+--------------------+------+--------------------+--------------------+--------------------+--------------------+
|class|                text|length|          token_text|        clean_tokens|              vector|              tf_idf|
+-----+--------------------+------+--------------------+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|   111|[go, until, juron...|[go, jurong, poin...|(13423,[7,11,31,6...|(13423,[7,11,31,6...|
|  ham|Ok lar... Joking ...|    29|[ok, lar..., joki...|[ok, lar..., joki...|(13423,[0,24,301,...|(13423,[0,24,301,...|
| spam|Free entry in 2 a...|   155|[free, entry, in,...|[free, entry, 2, ...|(13423,[2,13,19,3...|(13423,[2,13,19,3...|
|  ham|U dun say so earl...|    49|[u, dun, say, so,...|[u, dun, say, ear...|(13423,[0,70,80,1...|(13423,[0,70,80,1...|
|  ham|Nah I don't think...|    61|[nah, i, don't, t...|[nah, think, goes...|(13423,[36,134,31...|(13423,[36,134,31...|
| spam|FreeMsg Hey there...|   147|[free

In [33]:
# Stage 5: encode the string 'class' column as a numeric 'label' column
# (in the displayed output, ham maps to 0.0 and spam to 1.0).
indexer = StringIndexer(
    inputCol='class',
    outputCol='label',
)
indexer_model = indexer.fit(result)
result = indexer_model.transform(result)
result.show()

+-----+--------------------+------+--------------------+--------------------+--------------------+--------------------+-----+
|class|                text|length|          token_text|        clean_tokens|              vector|              tf_idf|label|
+-----+--------------------+------+--------------------+--------------------+--------------------+--------------------+-----+
|  ham|Go until jurong p...|   111|[go, until, juron...|[go, jurong, poin...|(13423,[7,11,31,6...|(13423,[7,11,31,6...|  0.0|
|  ham|Ok lar... Joking ...|    29|[ok, lar..., joki...|[ok, lar..., joki...|(13423,[0,24,301,...|(13423,[0,24,301,...|  0.0|
| spam|Free entry in 2 a...|   155|[free, entry, in,...|[free, entry, 2, ...|(13423,[2,13,19,3...|(13423,[2,13,19,3...|  1.0|
|  ham|U dun say so earl...|    49|[u, dun, say, so,...|[u, dun, say, ear...|(13423,[0,70,80,1...|(13423,[0,70,80,1...|  0.0|
|  ham|Nah I don't think...|    61|[nah, i, don't, t...|[nah, think, goes...|(13423,[36,134,31...|(13423,[36,134,31...

In [34]:
from pyspark.ml.feature import VectorAssembler

In [35]:
# Stage 6: concatenate the TF-IDF vector and the scalar message length into a
# single 'features' vector, the input column expected by the classifier.
assembler = VectorAssembler(
    inputCols=['tf_idf', 'length'],
    outputCol='features',
)
result = assembler.transform(result)
result.show()

+-----+--------------------+------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+
|class|                text|length|          token_text|        clean_tokens|              vector|              tf_idf|label|            features|
+-----+--------------------+------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+
|  ham|Go until jurong p...|   111|[go, until, juron...|[go, jurong, poin...|(13423,[7,11,31,6...|(13423,[7,11,31,6...|  0.0|(13424,[7,11,31,6...|
|  ham|Ok lar... Joking ...|    29|[ok, lar..., joki...|[ok, lar..., joki...|(13423,[0,24,301,...|(13423,[0,24,301,...|  0.0|(13424,[0,24,301,...|
| spam|Free entry in 2 a...|   155|[free, entry, in,...|[free, entry, 2, ...|(13423,[2,13,19,3...|(13423,[2,13,19,3...|  1.0|(13424,[2,13,19,3...|
|  ham|U dun say so earl...|    49|[u, dun, say, so,...|[u, dun, say, ear...|(13423,[0,70,80,1...|(13423,[0,70,80,1...

In [36]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes

In [38]:
# Sanity check before building the pipeline: the step-by-step cells wrote to
# `result`, so `data` should still hold only class / text / length.
data.show(n=20, truncate=True)

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
| spam|FreeMsg Hey there...|   147|
|  ham|Even my brother i...|    77|
|  ham|As per your reque...|   160|
| spam|WINNER!! As a val...|   157|
| spam|Had your mobile 1...|   154|
|  ham|I'm gonna be home...|   109|
| spam|SIX chances to wi...|   136|
| spam|URGENT! You have ...|   155|
|  ham|I've been searchi...|   196|
|  ham|I HAVE A DATE ON ...|    35|
| spam|XXXMobileMovieClu...|   149|
|  ham|Oh k...i'm watchi...|    26|
|  ham|Eh u remember how...|    81|
|  ham|Fine if thats th...|    56|
| spam|England v Macedon...|   155|
+-----+--------------------+------+


In [39]:
# Chain the feature-engineering stages so they can be fitted on the training
# split only and then applied unchanged to the test split.
pipeline = Pipeline(stages=[
    tokenizer,      # text         -> token_text
    stop_remover,   # token_text   -> clean_tokens
    count_vec,      # clean_tokens -> vector (term counts)
    idf,            # vector       -> tf_idf
    indexer,        # class        -> label
    assembler,      # tf_idf + length -> features
])

# 70/30 train/test split. A fixed seed makes the split, and therefore every
# metric computed below, reproducible across kernel restarts.
train, test = data.randomSplit([0.7, 0.3], seed=42)

In [42]:
# Fit the feature pipeline on the training split only, so the vocabulary, IDF
# weights and label index are learned without seeing the test rows.
# The fitted model gets its own name: rebinding `pipeline` to the fitted
# PipelineModel would make this cell fail when re-run (a PipelineModel has no
# `fit`) and would discard the unfitted estimator.
pipeline_model = pipeline.fit(train)

# Apply the same fitted transformations to both splits.
train_transformed = pipeline_model.transform(train)
test_transformed = pipeline_model.transform(test)

In [43]:
# Keep only the two columns the classifier and evaluator read.
train_transformed = train_transformed.select('features', 'label')
test_transformed = test_transformed.select('features', 'label')

In [45]:
# Train a Naive Bayes classifier with default settings on the training split.
naive_bayes = NaiveBayes()
model = naive_bayes.fit(train_transformed)

                                                                                

In [46]:
# Score both splits with the trained model (train first, then test).
train_predictions, test_predictions = (
    model.transform(split)
    for split in (train_transformed, test_transformed)
)

In [47]:
# Inspect the first rows of the training predictions alongside their labels.
train_predictions.show(n=20, truncate=True)

24/06/16 17:24:23 WARN DAGScheduler: Broadcasting large task binary with size 1120.8 KiB


+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(10800,[3,6,36,22...|  0.0|[-294.52390941693...|[1.0,1.8728937129...|       0.0|
|(10800,[3,92,95,2...|  0.0|[-809.82392072852...|[1.0,1.1421553015...|       0.0|
|(10800,[3,900,202...|  0.0|[-255.63134836049...|[1.0,2.0373222414...|       0.0|
|(10800,[3,10,177,...|  0.0|[-984.61871104314...|[1.0,2.0406154539...|       0.0|
|(10800,[3,108,223...|  0.0|[-946.99169727809...|[1.0,1.9758302697...|       0.0|
|(10800,[0,3,13,17...|  0.0|[-1117.1196461143...|[1.0,7.1694161263...|       0.0|
|(10800,[4,24,26,1...|  0.0|[-1116.8676841315...|[1.0,1.8997643707...|       0.0|
|(10800,[2,8,26,49...|  0.0|[-1302.7843397029...|[1.0,4.2344701660...|       0.0|
|(10800,[16,122,27...|  0.0|[-822.34184584247...|[1.0,3.0218180620...|       0.0|
|(10800,[0,78,21

24/06/16 17:24:24 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [48]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [49]:
# Measure accuracy on each split; comparing the two numbers shows how much the
# model overfits the training data.
evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
train_accuracy, test_accuracy = (
    evaluator.evaluate(predictions)
    for predictions in (train_predictions, test_predictions)
)

24/06/16 17:25:47 WARN DAGScheduler: Broadcasting large task binary with size 1125.8 KiB
24/06/16 17:25:48 WARN DAGScheduler: Broadcasting large task binary with size 1125.8 KiB
                                                                                

In [50]:
# Accuracy of the model on the training split it was fitted on.
train_accuracy

0.9959204487506375

In [51]:
# Accuracy of the model on the held-out test split.
test_accuracy

0.9769975786924939