In [1]:
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.feature import HashingTF

# Read the two corpora from S3: spam and ham (non-spam).
# Every line of each file holds the text of a single email.
spam = sc.textFile("s3://hw2yh/data/spam.txt")
ham = sc.textFile("s3://hw2yh/data/ham.txt")

# HashingTF hashes tokens into a fixed-size vector; 100 buckets are used here.
tf = HashingTF(numFeatures=100)


def _email_to_features(email):
    """Split one email on single spaces and hash the tokens with ``tf``."""
    return tf.transform(email.split(" "))


# Turn every email of both corpora into a hashed feature vector.
spamFeatures = spam.map(_email_to_features)
hamFeatures = ham.map(_email_to_features)

# Attach class labels to the feature vectors:
#   1 -> spam (the positive class), 0 -> ham (the negative class).
positiveExamples = spamFeatures.map(lambda vec: LabeledPoint(1, vec))
negativeExamples = hamFeatures.map(lambda vec: LabeledPoint(0, vec))

# Combine both classes into one RDD and cache it, because the iterative
# training below reads the data repeatedly. cache() returns the RDD itself.
training_data = positiveExamples.union(negativeExamples).cache()

# Fit logistic regression with the SGD optimizer. No hyperparameters are
# passed, so the library defaults (including regularization) apply.
model = LogisticRegressionWithSGD.train(training_data)

# Smoke-test the model on one spam-like (positive) and one ham-like (negative)
# sentence. Both go through the same HashingTF instance used for training so
# that tokens land in the same feature buckets.
posTestExample = tf.transform("O M G GET cheap stuff by sending money to ...".split(" "))
negTestExample = tf.transform("Hi Dad, I started studying Spark the other ...".split(" "))

# Print the predicted class (1 = spam, 0 = ham) for each example.
for message, example in (
    ("Prediction for positive test example: %g", posTestExample),
    ("Prediction for negative test example: %g", negTestExample),
):
    print(message % model.predict(example))

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1,application_1582794258936_0002,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Prediction for positive test example: 0
Prediction for negative test example: 1

In [22]:
def _hash_email(text):
    """Split ``text`` on single spaces and hash the tokens with the training-time ``tf``."""
    return tf.transform(text.split(" "))


# test1-test5: ordinary course / university correspondence.
test1 = _hash_email("Dear all, I hope you all enjoying the long weekend before the start of Spring I classes. Earlier today,  ...")
test2 = _hash_email("Hi everyone, As we are heading in to the second half of the semester I hope everyone is enjoying and doing well in their classes. My group is ...")
test3 = _hash_email("Dear Yvonne Hao, Hope this Email finds you well. In order to have an effective consultation, please consider the following questions carefully ...")
test4 = _hash_email("Dear students, You are receiving this email because you are enrolled in the course “Data Science and Business Intelligence,” which would be offered in Spring ...")
test5 = _hash_email("Dear Yvonee, Please resubmit each of the hws in its right place. I will ask the TA to not discount late penalty.-Diana ...")

# test6-test11: gift-card, account-suspension and "click here" style messages.
test6 = _hash_email("I'm in a meeting and need help getting some Amazon Gift Cards ...")
test7 = _hash_email("Dear User, This email is enclosed in the Marquette University secure network, hence access it below Access the documents here ...")
test8 = _hash_email("Your PayPal account has been suspended due to suspicious activity. Please contact us immediately at 1-409-123-457 ...")
test9 = _hash_email("Dear Subscriber, Your Microsoft account has been compromised. You must update it immediately or your account will be closed ...")
test10 = _hash_email("Dear user of mikesdomain.org gateway e-mail server,Your e-mail account has been temporary disabled because of ...")
test11 = _hash_email("An package has been sent to your nearest Fedex. Click here to Login to track your order ...")


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# test1 is ordinary course mail (ham). Training labelled ham 0, the negative
# class, so the message says "negative"; a correct prediction is 0.
print("Prediction for negative test example: %g" % model.predict(test1))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Prediction for positive test example: 0

In [11]:
# test2 is ordinary course mail (ham). Training labelled ham 0, the negative
# class, so the message says "negative"; a correct prediction is 0.
print("Prediction for negative test example: %g" % model.predict(test2))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Prediction for positive test example: 0

In [12]:
# test3 is a consultation-scheduling email (ham). Training labelled ham 0, the
# negative class, so the message says "negative"; a correct prediction is 0.
print("Prediction for negative test example: %g" % model.predict(test3))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Prediction for positive test example: 1

In [13]:
# test4 is a course-enrollment notice (ham). Training labelled ham 0, the
# negative class, so the message says "negative"; a correct prediction is 0.
print("Prediction for negative test example: %g" % model.predict(test4))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Prediction for positive test example: 0

In [14]:
# test5 is a homework-resubmission email (ham). Training labelled ham 0, the
# negative class, so the message says "negative"; a correct prediction is 0.
print("Prediction for negative test example: %g" % model.predict(test5))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Prediction for positive test example: 1

In [15]:
# test6 is a gift-card request (spam-like). Training labelled spam 1, the
# positive class, so the message says "positive"; a correct prediction is 1.
print("Prediction for positive test example: %g" % model.predict(test6))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Prediction for negative test example: 0

In [16]:
# test7 is an "access the documents here" message (spam-like). Training labelled
# spam 1, the positive class, so the message says "positive"; a correct prediction is 1.
print("Prediction for positive test example: %g" % model.predict(test7))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Prediction for negative test example: 0

In [17]:
# test8 is an account-suspension notice (spam-like). Training labelled spam 1,
# the positive class, so the message says "positive"; a correct prediction is 1.
print("Prediction for positive test example: %g" % model.predict(test8))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Prediction for negative test example: 1

In [18]:
# test9 is an account-compromised notice (spam-like). Training labelled spam 1,
# the positive class, so the message says "positive"; a correct prediction is 1.
print("Prediction for positive test example: %g" % model.predict(test9))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Prediction for negative test example: 0

In [19]:
# test10 is an account-disabled notice (spam-like). Training labelled spam 1,
# the positive class, so the message says "positive"; a correct prediction is 1.
print("Prediction for positive test example: %g" % model.predict(test10))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Prediction for negative test example: 1

In [23]:
# test11 is a "click here to login" delivery notice (spam-like). Training labelled
# spam 1, the positive class, so the message says "positive"; a correct prediction is 1.
print("Prediction for positive test example: %g" % model.predict(test11))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Prediction for negative test example: 1

In [26]:
# Stop the Spark context once all prediction cells above have been run.
# NOTE(review): `sc` is never created in this notebook; presumably the
# notebook session provides it — confirm before running outside that setup.
sc.stop()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…