In [1]:
#Code Snippet 39
# Spam/ham text classification with Spark ML: load a tab-separated file,
# turn the message text into TF-IDF features, train a logistic regression
# classifier on a 70/30 split and report its accuracy on the held-out part.
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import (
    CountVectorizer,
    IDF,
    StopWordsRemover,
    StringIndexer,
    Tokenizer,
    VectorAssembler,
)
from pyspark.sql import SparkSession

#Step 1 - Importing data
spark = SparkSession.builder.appName('SpamHamNLP').getOrCreate()
# The input file is tab-separated and has a header row.
data = spark.read.csv('spam_ham_nlp.csv', header=True, inferSchema=True, sep='\t')
print("Initial Data")
data.show(5)

#Step 2 - Data pre-processing and applying NLP data format techniques
#Converting our category into a numeric label
category_to_numeric = StringIndexer(inputCol='category', outputCol='label')
#Tokenizing our content
tokenizer = Tokenizer(inputCol='content', outputCol='tokens')
#Removing the stop words
stopWords_removed = StopWordsRemover(inputCol='tokens', outputCol='stpWrd_tokens')
#Converting tokens to vectors of token count
count_vectors = CountVectorizer(inputCol='stpWrd_tokens', outputCol='countVec')
#Performing IDF
idf = IDF(inputCol='countVec', outputCol='tf-idf')
#Consolidating the features into the single 'features' column
consolidated_data = VectorAssembler(inputCols=['tf-idf'], outputCol='features')

#Transforming and finalizing our data to spark accepted format
# NOTE(review): the feature pipeline (vocabulary and IDF weights) is fit on
# the full data set before the train/test split below, so held-out rows
# influence the features. Consider splitting `data` first and fitting the
# pipeline on the training part only.
pipeline_object = Pipeline(stages=[
    category_to_numeric,
    tokenizer,
    stopWords_removed,
    count_vectors,
    idf,
    consolidated_data,
])
pipeline_data_model = pipeline_object.fit(data)
final_data = pipeline_data_model.transform(data)
# Keep only the two columns the classifier and the evaluator read.
final_data = final_data.select('features', 'label')
print('Final Data')
final_data.show(5)

#Step 3 - Applying Machine learning algorithm to our data
#Using a Logistic Regression Classifier as our classification algorithm
log_reg = LogisticRegression()
# Splitting the data into 70 and 30 percent; the fixed seed makes the split
# (and therefore the reported score) repeatable between runs.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)
spam_detector = log_reg.fit(train_data)
predictions = spam_detector.transform(test_data)
print("Predictions")
predictions.show(5)

#Step 4 - Evaluating our Model
# The evaluator's default metric is F1, not accuracy, so the metric is
# requested explicitly to match the message printed below.
eval_object = MulticlassClassificationEvaluator(metricName='accuracy')
accuracy = eval_object.evaluate(predictions)
print("The Accuracy is {}".format(accuracy))

Initial Data
+--------+--------------------+
|category|             content|
+--------+--------------------+
|     ham|Meet me at Willys...|
|     ham|Let us go to the ...|
|    spam|Free entry in 2 a...|
|     ham|I have sent you t...|
|     ham|Lets meet at 7pm ...|
+--------+--------------------+
only showing top 5 rows

Final Data
+--------------------+-----+
|            features|label|
+--------------------+-----+
|(13497,[73,82,940...|  0.0|
|(13497,[7,85,127,...|  0.0|
|(13497,[2,13,19,2...|  1.0|
|(13497,[95,472,75...|  0.0|
|(13497,[73,491,34...|  0.0|
+--------------------+-----+
only showing top 5 rows

Predictions
+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(13497,[0,1,2,4,3...|  1.0|[-26.086348683110...|[4.68643625282683...|       1.0|
|(13497,[0,1,2,5,5...|  1.0|[37.12750