In [None]:
%matplotlib inline

# Classification with Spark

In [None]:
import os 

from operator import add 
from operator import itemgetter 

In [None]:
# Input locations; the data directory is resolved against the current
# working directory and stored as an absolute path.
DATA = os.path.abspath(os.path.join(os.pardir, "data"))
TEXT = os.path.join(DATA, "spam_classifier")

# Integer identifiers for the two classes.
SPAM, HAM = 0, 1

# Human-readable name for each class identifier.
LABELS = {SPAM: "spam", HAM: "ham"}

## Loading Data and Extracting Features

In [None]:
# Read each corpus file and pair every line with its class identifier, so
# both RDDs hold (label, line) tuples.
# NOTE(review): `sc` is not defined in the cells shown here; presumably the
# kernel provides a SparkContext under that name -- confirm.
spam = sc.textFile(os.path.join(TEXT, "spam.txt")).map(lambda line: (SPAM, line))
ham = sc.textFile(os.path.join(TEXT, "ham.txt")).map(lambda line: (HAM, line))

# Combine the two labeled RDDs into a single dataset.
text = spam.union(ham)

In [None]:
# Convert the labeled (label, line) records into a DataFrame with named columns.
# The DataFrame is built from the labeled RDDs rather than from `text` itself:
# this cell rebinds `text` from an RDD to a DataFrame, so deriving the new
# value from the old one would make the cell fail when it is run a second
# time (DataFrame.toDF takes column names as separate arguments, not a list).
text = spam.union(ham).toDF(["label", "line"])

In [None]:
# Display the first row of the DataFrame as a quick sanity check.
text.head()

In [None]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [None]:
# Configure the three feature-extraction stages up front:
#   line -> words -> rawFeatures (hashed term counts, 200 buckets) -> features
tokenizer = Tokenizer(inputCol="line", outputCol="words")
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=200,
)
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Apply the stages in order. IDF is an estimator, so it is fitted first and
# the fitted model performs the rescaling.
# NOTE(review): the IDF model is fitted on the full dataset, before the
# train/test split made in a later cell -- confirm this is intended.
words = tokenizer.transform(text)
features = hashingTF.transform(words)
idfModel = idf.fit(features)
rescaled = idfModel.transform(features)

# Preview the columns the classifier will consume.
rescaled.select("label", "features").show()

## Training a Naive Bayes Model

In [None]:
# Import the model family 
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Keep just the two columns the classifier consumes.
data = rescaled.select("label", "features")

# Randomly partition the rows with weights 0.8 (train) and 0.2 (test),
# using a fixed seed of 42 for the sampling.
splits = data.randomSplit(weights=[0.8, 0.2], seed=42)
train, test = splits

In [None]:
# Configure the estimator and the evaluation metric before touching the data.
nb = NaiveBayes(modelType="multinomial", smoothing=1.0)
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

# Fit on the training split, then score the held-out test split.
model = nb.fit(train)
predictions = model.transform(test)
predictions.show()

# Report the accuracy of the predictions on the test split.
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = {:0.3f}".format(accuracy))