# Spark
## Tools for NLP
### Code along

In [30]:
import findspark

from pyspark import SparkConf

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [31]:
findspark.init()
findspark.find()

'/home/alvarodiego/miniconda3/envs/pyspark/lib/python3.12/site-packages/pyspark'

In [32]:
conf = SparkConf() \
    .setAppName('nlp') \
    #.setMaster('local') \
    #.set('spark.executor.memory', '8g') \
    #.set('spark.driver.maxResultSize', '8g') \
    #.set("spark.memory.fraction", "0.6") \
    #.set("spark.memory.storageFraction", "0.5") \
    #.set("spark.sql.shuffle.partitions", "5") \
    #.set("spark.memory.offHeap.enabled", "false") \
    #.set("spark.reducer.maxSizeInFlight", "96m") \
    #.set("spark.shuffle.file.buffer", "256k") \
    #.set("spark.sql.debug.maxToStringFields", "100") \
    #.set('spark.sql.autoBroadcastJoinThreshold', '-1')

In [33]:
%%capture

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [34]:
data = spark.read.csv('../data/smsspamcollection/SMSSpamCollection', inferSchema=True, sep='\t', header=False)
data.show(5, truncate=False)

+----+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|_c0 |_c1                                                                                                                                                        |
+----+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|ham |Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...                                            |
|ham |Ok lar... Joking wif u oni...                                                                                                                              |
|spam|Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's|
|ham |U dun say so ear

In [35]:
data = data.withColumnRenamed('_c0', 'class').withColumnRenamed('_c1', 'text')
data.show(5)

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
+-----+--------------------+
only showing top 5 rows



In [36]:
import pyspark.sql.functions as F

In [37]:
data = data.withColumn('lenght', F.length(data['text']))
data.show()

+-----+--------------------+------+
|class|                text|lenght|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
| spam|FreeMsg Hey there...|   147|
|  ham|Even my brother i...|    77|
|  ham|As per your reque...|   160|
| spam|WINNER!! As a val...|   157|
| spam|Had your mobile 1...|   154|
|  ham|I'm gonna be home...|   109|
| spam|SIX chances to wi...|   136|
| spam|URGENT! You have ...|   155|
|  ham|I've been searchi...|   196|
|  ham|I HAVE A DATE ON ...|    35|
| spam|XXXMobileMovieClu...|   149|
|  ham|Oh k...i'm watchi...|    26|
|  ham|Eh u remember how...|    81|
|  ham|Fine if thatÂ’s th...|    56|
| spam|England v Macedon...|   155|
+-----+--------------------+------+
only showing top 20 rows



In [38]:
data.groupBy('class').mean().show()

+-----+-----------------+
|class|      avg(lenght)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+



In [70]:
from pyspark.ml.feature import (
    Tokenizer,
    StopWordsRemover,
    CountVectorizer,
    IDF,
    StringIndexer
)

In [71]:
tokenizer = Tokenizer(
    inputCol='text',
    outputCol='token_text'
)

stop_remove = StopWordsRemover(
    inputCol='token_text',
    outputCol='stop_token'
)

count_vec = CountVectorizer(
    inputCol='stop_token',
    outputCol='c_vec'
)

idf = IDF(
    inputCol='c_vec',
    outputCol='tf_idf'
)

ham_spam_to_numeric = StringIndexer(
    inputCol='class',
    outputCol='label'
)

In [72]:
from pyspark.ml.feature import VectorAssembler

In [73]:
clean_up = VectorAssembler(
    inputCols=[
        'tf_idf',
        'lenght'
    ],
    outputCol='features'
)

In [74]:
from pyspark.ml.classification import NaiveBayes

In [75]:
nb = NaiveBayes()

In [76]:
from pyspark.ml import Pipeline

In [77]:
data_prep_pipe = Pipeline(
    stages=[
        ham_spam_to_numeric,
        tokenizer,
        stop_remove,
        count_vec,
        idf,
        clean_up
    ]
)

In [78]:
cleaner = data_prep_pipe.fit(data)

In [79]:
clean_data = cleaner.transform(data)
clean_data.show(5)

+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|class|                text|lenght|label|          token_text|          stop_token|               c_vec|              tf_idf|            features|
+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|   111|  0.0|[go, until, juron...|[go, jurong, poin...|(13423,[7,11,31,6...|(13423,[7,11,31,6...|(13424,[7,11,31,6...|
|  ham|Ok lar... Joking ...|    29|  0.0|[ok, lar..., joki...|[ok, lar..., joki...|(13423,[0,24,301,...|(13423,[0,24,301,...|(13424,[0,24,301,...|
| spam|Free entry in 2 a...|   155|  1.0|[free, entry, in,...|[free, entry, 2, ...|(13423,[2,13,19,3...|(13423,[2,13,19,3...|(13424,[2,13,19,3...|
|  ham|U dun say so earl...|    49|  0.0|[u, dun, say, so,...|[u, dun, say, ear...|(13423,[0,70,80,1...|(13423,[0,70,8

In [80]:
clean_data.columns

['class',
 'text',
 'lenght',
 'label',
 'token_text',
 'stop_token',
 'c_vec',
 'tf_idf',
 'features']

In [81]:
input_data = clean_data.select('label', 'features').show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(13424,[7,11,31,6...|
|  0.0|(13424,[0,24,301,...|
|  1.0|(13424,[2,13,19,3...|
|  0.0|(13424,[0,70,80,1...|
|  0.0|(13424,[36,134,31...|
+-----+--------------------+
only showing top 5 rows



In [82]:
train, test = clean_data.randomSplit([0.7, 0.3])

In [83]:
spam_detector = nb.fit(train)

23/11/10 14:29:29 WARN DAGScheduler: Broadcasting large task binary with size 1191.2 KiB
23/11/10 14:29:31 WARN DAGScheduler: Broadcasting large task binary with size 1150.6 KiB
                                                                                

In [84]:
test_result = spam_detector.transform(test)
test_result.show(5)

23/11/10 14:29:55 WARN DAGScheduler: Broadcasting large task binary with size 1409.6 KiB
[Stage 78:>                                                         (0 + 1) / 1]

+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|class|                text|lenght|label|          token_text|          stop_token|               c_vec|              tf_idf|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  ham| &lt;DECIMAL&gt; ...|   132|  0.0|[, &lt;decimal&gt...|[, &lt;decimal&gt...|(13423,[3,84,115,...|(13423,[3,84,115,...|(13424,[3,84,115,...|[-895.18085866927...|[1.0,9.1838325072...|       0.0|
|  ham| and  picking the...|    41|  0.0|[, and, , picking...|[, , picking, var...|(13423,[3,723,200...|(13423,[3,723,200...|(13424,[3,723,200...|[-268.63138177122...|[1.0,9.5092597136...|       0.0|


                                                                                

In [85]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [86]:
acc_eval = MulticlassClassificationEvaluator()

In [87]:
acc = acc_eval.evaluate(test_result)

23/11/10 14:30:37 WARN DAGScheduler: Broadcasting large task binary with size 1401.1 KiB
                                                                                

In [89]:
acc

0.925541627659008