### Classificacao de SPAM

In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import IDF, HashingTF, Tokenizer
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
spSession = SparkSession.builder.master("local").appName("classifier-spam").getOrCreate()

In [3]:
spamRRR = sc.textFile("data/SMSSpamCollection.csv")
spamRRR.cache()

data/SMSSpamCollection.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [4]:
spamRRR.take(5)

['ham,Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...,,,,,,,,,',
 'ham,Ok lar... Joking wif u oni...,,,,,,,,,,',
 'ham,U dun say so early hor... U c already then say...,,,,,,,,,,',
 "ham,Nah I don't think he goes to usf, he lives around here though,,,,,,,,,",
 'ham,Even my brother is not like to speak with me. They treat me like aids patent.,,,,,,,,,,']

### Pré processamento de dados

In [5]:
def transformToVector(line):
    lineList = line.split(",")
    smsType = 0.0 if lineList[0] == "ham" else 1.0
    
    return [smsType, lineList[1]]

In [6]:
spamRRR2 = spamRRR.map(transformToVector)
spamRRR2.take(2)

[[0.0, 'Go until jurong point'], [0.0, 'Ok lar... Joking wif u oni...']]

In [7]:
spamDF = spSession.createDataFrame(spamRRR2, ["label", "message"])
spamDF.cache()
spamDF.select("*").show()

+-----+--------------------+
|label|             message|
+-----+--------------------+
|  0.0|Go until jurong p...|
|  0.0|Ok lar... Joking ...|
|  0.0|U dun say so earl...|
|  0.0|Nah I don't think...|
|  0.0|Even my brother i...|
|  0.0|As per your reque...|
|  0.0|I'm gonna be home...|
|  0.0|I've been searchi...|
|  0.0|I HAVE A DATE ON ...|
|  0.0|Oh k...i'm watchi...|
|  0.0|Eh u remember how...|
|  0.0|Fine if thats th...|
|  0.0|Is that seriously...|
|  0.0|I‘m going to try ...|
|  0.0|So ü pay first la...|
|  0.0|Aft i finish my l...|
|  0.0|Ffffffffff. Alrig...|
|  0.0|Just forced mysel...|
|  0.0|Lol your always s...|
|  0.0|Did you catch the...|
+-----+--------------------+
only showing top 20 rows



### Processamento de Linguagem Natural
* Aplicação do TF-IDF
* TF - Term Frequency
* IDF - Inverse Document Frequency


In [8]:
# menor estrutura dentro de um texto
tokenizer = Tokenizer(inputCol="message", outputCol="words")

In [9]:
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="tempfeatures")

In [10]:
idf = IDF(inputCol = hashingTF.getOutputCol(), outputCol="features")

### Machine Learning

In [11]:
df_train, df_test = spamDF.randomSplit([0.7, 0.3])

In [12]:
df_train.count()

730

In [13]:
df_test.count()

270

In [14]:
classifier = NaiveBayes()

In [15]:
pipeline = Pipeline(stages = [tokenizer, hashingTF, idf, classifier])

In [16]:
model = pipeline.fit(df_train)

In [17]:
predictions = model.transform(df_test)
predictions.select("*").take(1)

[Row(label=0.0, message='4 oclock at mine. Just to bash out a flat plan.', words=['4', 'oclock', 'at', 'mine.', 'just', 'to', 'bash', 'out', 'a', 'flat', 'plan.'], tempfeatures=SparseVector(262144, {10269: 1.0, 37542: 1.0, 38107: 1.0, 97171: 1.0, 114290: 1.0, 140390: 1.0, 140854: 1.0, 176964: 1.0, 205044: 1.0, 227410: 1.0, 233878: 1.0}), features=SparseVector(262144, {10269: 6.5944, 37542: 6.5944, 38107: 6.5944, 97171: 2.7026, 114290: 6.5944, 140390: 3.0681, 140854: 6.5944, 176964: 3.1287, 205044: 1.0034, 227410: 1.3317, 233878: 2.6824}), rawPrediction=DenseVector([-535.5643, -528.3838]), probability=DenseVector([0.0008, 0.9992]), prediction=1.0)]

In [18]:
predictions.columns

['label',
 'message',
 'words',
 'tempfeatures',
 'features',
 'rawPrediction',
 'probability',
 'prediction']

In [23]:
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
    labelCol='label',
    metricName='accuracy')

In [24]:
evaluator.evaluate(predictions)

0.9

In [27]:
predictions.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  126|
|  0.0|       1.0|   13|
|  1.0|       0.0|   14|
|  0.0|       0.0|  117|
+-----+----------+-----+

